fe_x25519_asm.S 403 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079408040814082408340844085408640874088408940904091409240934094409540964097409840994100410141024103410441054106410741084109411041114112411341144115411641174118411941204121412241234124412541264127412841294130413141324133413441354136413741384139414041414142414341444145414641474148414941504151415241534154415541564157415841594160416141624163416441654166416741684169417041714172417341744175417641774178417941804181418241834184418541864187418841894190419141924193419441954196419741984199420042014202420342044205420642074208420942104211421242134214421542164217421842194220422142224223422442254226422742284229423042314232423342344235423642374238423942404241424242434244424542464247424842494250425142524253425442554256425742584259426042614262426342644265426642674268426942704271427242734274427542764277427842794280428142824283428442854286428742884289429042914292429342944295429642974298429943004301430243034304430543064307430843094310431143124313431443154316431743184319432043214322432343244325432643274328432943304331433243334334433543364337433843394340434143424343434443454346434743484349435043514352435343544355435643574358435943604361436243634364436543664367436843694370437143724373437443754376437743784379438043814382438343844385438643874388438943904391439243934394439543964397439843994400440144024403440444054406440744084409441044114412441344144415441644174418441944204421442244234424442544264427442844294430443144324433443444354436443744384439444044414442444344444445444644474448444944504451445244534454445544564457445844594460446144624463446444654466446744684469447044714472447344744475447644774478447944804481448244834484448544864487448844894490449144924493449444954496449744984499450045014502450345044505450645074508450945104511451245134514451545164517451845194520452145224523452445254526452745284529453045314532453345344535453645374538453945404541454245434544454545464547454845494550455145524553455445554556455745584559456045614562456345644565456645674568456945704571457245734574457545764577457845794580458145824583458445854586458745884589459045914592459345944595459645974598459946004601460246034604460546064607460846094610461146124613461446154616461746184619462046214622462346244625462646274628462946304631463246334634463546364637463846394640464146424643464446454646464746484649465046514652465346544655465646574658465946604661466246634664466546664667466846694670467146724673467446754676467746784679468046814682468346844685468646874688468946904691469246934694469546964697469846994700470147024703470447054706470747084709471047114712471347144715471647174718471947204721472247234724472547264727472847294730473147324733473447354736473747384739474047414742474347444745474647474748474947504751475247534754475547564757475847594760476147624763476447654766476747684769477047714772477347744775477647774778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527752785279528052815282528352845285528652875288528952905291529252935294529552965297529852995300530153025303530453055306530753085309531053115312531353145315531653175318531953205321532253235324532553265327532853295330533153325333533453355336533753385339534053415342534353445345534653475348534953505351535253535354535553565357535853595360536153625363536453655366536753685369537053715372537353745375537653775378537953805381538253835384538553865387538853895390539153925393539453955396539753985399540054015402540354045405540654075408540954105411541254135414541554165417541854195420542154225423542454255426542754285429543054315432543354345435543654375438543954405441544254435444544554465447544854495450545154525453545454555456545754585459546054615462546354645465546654675468546954705471547254735474547554765477547854795480548154825483548454855486548754885489549054915492549354945495549654975498549955005501550255035504550555065507550855095510551155125513551455155516551755185519552055215522552355245525552655275528552955305531553255335534553555365537553855395540554155425543554455455546554755485549555055515552555355545555555655575558555955605561556255635564556555665567556855695570557155725573557455755576557755785579558055815582558355845585558655875588558955905591559255935594559555965597559855995600560156025603560456055606560756085609561056115612561356145615561656175618561956205621562256235624562556265627562856295630563156325633563456355636563756385639564056415642564356445645564656475648564956505651565256535654565556565657565856595660566156625663566456655666566756685669567056715672567356745675567656775678567956805681568256835684568556865687568856895690569156925693569456955696569756985699570057015702570357045705570657075708570957105711571257135714571557165717571857195720572157225723572457255726572757285729573057315732573357345735573657375738573957405741574257435744574557465747574857495750575157525753575457555756575757585759576057615762576357645765576657675768576957705771577257735774577557765777577857795780578157825783578457855786578757885789579057915792579357945795579657975798579958005801580258035804580558065807580858095810581158125813581458155816581758185819582058215822582358245825582658275828582958305831583258335834583558365837583858395840584158425843584458455846584758485849585058515852585358545855585658575858585958605861586258635864586558665867586858695870587158725873587458755876587758785879588058815882588358845885588658875888588958905891589258935894589558965897589858995900590159025903590459055906590759085909591059115912591359145915591659175918591959205921592259235924592559265927592859295930593159325933593459355936593759385939594059415942594359445945594659475948594959505951595259535954595559565957595859595960596159625963596459655966596759685969597059715972597359745975597659775978597959805981598259835984598559865987598859895990599159925993599459955996599759985999600060016002600360046005600660076008600960106011601260136014601560166017601860196020602160226023602460256026602760286029603060316032603360346035603660376038603960406041604260436044604560466047604860496050605160526053605460556056605760586059606060616062606360646065606660676068606960706071607260736074607560766077607860796080608160826083608460856086608760886089609060916092609360946095609660976098609961006101610261036104610561066107610861096110611161126113611461156116611761186119612061216122612361246125612661276128612961306131613261336134613561366137613861396140614161426143614461456146614761486149615061516152615361546155615661576158615961606161616261636164616561666167616861696170617161726173617461756176617761786179618061816182618361846185618661876188618961906191619261936194619561966197619861996200620162026203620462056206620762086209621062116212621362146215621662176218621962206221622262236224622562266227622862296230623162326233623462356236623762386239624062416242624362446245624662476248624962506251625262536254625562566257625862596260626162626263626462656266626762686269627062716272627362746275627662776278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677767786779678067816782678367846785678667876788678967906791679267936794679567966797679867996800680168026803680468056806680768086809681068116812681368146815681668176818681968206821682268236824682568266827682868296830683168326833683468356836683768386839684068416842684368446845684668476848684968506851685268536854685568566857685868596860686168626863686468656866686768686869687068716872687368746875687668776878687968806881688268836884688568866887688868896890689168926893689468956896689768986899690069016902690369046905690669076908690969106911691269136914691569166917691869196920692169226923692469256926692769286929693069316932693369346935693669376938693969406941694269436944694569466947694869496950695169526953695469556956695769586959696069616962696369646965696669676968696969706971697269736974697569766977697869796980698169826983698469856986698769886989699069916992699369946995699669976998699970007001700270037004700570067007700870097010701170127013701470157016701770187019702070217022702370247025702670277028702970307031703270337034703570367037703870397040704170427043704470457046704770487049705070517052705370547055705670577058705970607061706270637064706570667067706870697070707170727073707470757076707770787079708070817082708370847085708670877088708970907091709270937094709570967097709870997100710171027103710471057106710771087109711071117112711371147115711671177118711971207121712271237124712571267127712871297130713171327133713471357136713771387139714071417142714371447145714671477148714971507151715271537154715571567157715871597160716171627163716471657166716771687169717071717172717371747175717671777178717971807181718271837184718571867187718871897190719171927193719471957196719771987199720072017202720372047205720672077208720972107211721272137214721572167217721872197220722172227223722472257226722772287229723072317232723372347235723672377238723972407241724272437244724572467247724872497250725172527253725472557256725772587259726072617262726372647265726672677268726972707271727272737274727572767277727872797280728172827283728472857286728772887289729072917292729372947295729672977298729973007301730273037304730573067307730873097310731173127313731473157316731773187319732073217322732373247325732673277328732973307331733273337334733573367337733873397340734173427343734473457346734773487349735073517352735373547355735673577358735973607361736273637364736573667367736873697370737173727373737473757376737773787379738073817382738373847385738673877388738973907391739273937394739573967397739873997400740174027403740474057406740774087409741074117412741374147415741674177418741974207421742274237424742574267427742874297430743174327433743474357436743774387439744074417442744374447445744674477448744974507451745274537454745574567457745874597460746174627463746474657466746774687469747074717472747374747475747674777478747974807481748274837484748574867487748874897490749174927493749474957496749774987499750075017502750375047505750675077508750975107511751275137514751575167517751875197520752175227523752475257526752775287529753075317532753375347535753675377538753975407541754275437544754575467547754875497550755175527553755475557556755775587559756075617562756375647565756675677568756975707571757275737574757575767577757875797580758175827583758475857586758775887589759075917592759375947595759675977598759976007601760276037604760576067607760876097610761176127613761476157616761776187619762076217622762376247625762676277628762976307631763276337634763576367637763876397640764176427643764476457646764776487649765076517652765376547655765676577658765976607661766276637664766576667667766876697670767176727673767476757676767776787679768076817682768376847685768676877688768976907691769276937694769576967697769876997700770177027703770477057706770777087709771077117712771377147715771677177718771977207721772277237724772577267727772877297730773177327733773477357736773777387739774077417742774377447745774677477748774977507751775277537754775577567757775877597760776177627763776477657766776777687769777077717772777377747775777677777778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827782788279828082818282828382848285828682878288828982908291829282938294829582968297829882998300830183028303830483058306830783088309831083118312831383148315831683178318831983208321832283238324832583268327832883298330833183328333833483358336833783388339834083418342834383448345834683478348834983508351835283538354835583568357835883598360836183628363836483658366836783688369837083718372837383748375837683778378837983808381838283838384838583868387838883898390839183928393839483958396839783988399840084018402840384048405840684078408840984108411841284138414841584168417841884198420842184228423842484258426842784288429843084318432843384348435843684378438843984408441844284438444844584468447844884498450845184528453845484558456845784588459846084618462846384648465846684678468846984708471847284738474847584768477847884798480848184828483848484858486848784888489849084918492849384948495849684978498849985008501850285038504850585068507850885098510851185128513851485158516851785188519852085218522852385248525852685278528852985308531853285338534853585368537853885398540854185428543854485458546854785488549855085518552855385548555855685578558855985608561856285638564856585668567856885698570857185728573857485758576857785788579858085818582858385848585858685878588858985908591859285938594859585968597859885998600860186028603860486058606860786088609861086118612861386148615861686178618861986208621862286238624862586268627862886298630863186328633863486358636863786388639864086418642864386448645864686478648864986508651865286538654865586568657865886598660866186628663866486658666866786688669867086718672867386748675867686778678867986808681868286838684868586868687868886898690869186928693869486958696869786988699870087018702870387048705870687078708870987108711871287138714871587168717871887198720872187228723872487258726872787288729873087318732873387348735873687378738873987408741874287438744874587468747874887498750875187528753875487558756875787588759876087618762876387648765876687678768876987708771877287738774877587768777877887798780878187828783878487858786878787888789879087918792879387948795879687978798879988008801880288038804880588068807880888098810881188128813881488158816881788188819882088218822882388248825882688278828882988308831883288338834883588368837883888398840884188428843884488458846884788488849885088518852885388548855885688578858885988608861886288638864886588668867886888698870887188728873887488758876887788788879888088818882888388848885888688878888888988908891889288938894889588968897889888998900890189028903890489058906890789088909891089118912891389148915891689178918891989208921892289238924892589268927892889298930893189328933893489358936893789388939894089418942894389448945894689478948894989508951895289538954895589568957895889598960896189628963896489658966896789688969897089718972897389748975897689778978897989808981898289838984898589868987898889898990899189928993899489958996899789988999900090019002900390049005900690079008900990109011901290139014901590169017901890199020902190229023902490259026902790289029903090319032903390349035903690379038903990409041904290439044904590469047904890499050905190529053905490559056905790589059906090619062906390649065906690679068906990709071907290739074907590769077907890799080908190829083908490859086908790889089909090919092909390949095909690979098909991009101910291039104910591069107910891099110911191129113911491159116911791189119912091219122912391249125912691279128912991309131913291339134913591369137913891399140914191429143914491459146914791489149915091519152915391549155915691579158915991609161916291639164916591669167916891699170917191729173917491759176917791789179918091819182918391849185918691879188918991909191919291939194919591969197919891999200920192029203920492059206920792089209921092119212921392149215921692179218921992209221922292239224922592269227922892299230923192329233923492359236923792389239924092419242924392449245924692479248924992509251925292539254925592569257925892599260926192629263926492659266926792689269927092719272927392749275927692779278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977797789779978097819782978397849785978697879788978997909791979297939794979597969797979897999800980198029803980498059806980798089809981098119812981398149815981698179818981998209821982298239824982598269827982898299830983198329833983498359836983798389839984098419842984398449845984698479848984998509851985298539854985598569857985898599860986198629863986498659866986798689869987098719872987398749875987698779878987998809881988298839884988598869887988898899890989198929893989498959896989798989899990099019902990399049905990699079908990999109911991299139914991599169917991899199920992199229923992499259926992799289929993099319932993399349935993699379938993999409941994299439944994599469947994899499950995199529953995499559956995799589959996099619962996399649965996699679968996999709971997299739974997599769977997899799980998199829983998499859986998799889989999099919992999399949995999699979998999910000100011000210003100041000510006100071000810009100101001110012100131001410015100161001710018100191002010021100221002310024100251002610027100281002910030100311003210033100341003510036100371003810039100401004110042100431004410045100461004710048100491005010051100521005310054100551005610057100581005910060100611006210063100641006510066100671006810069100701007110072100731007410075100761007710078100791008010081100821008310084100851008610087100881008910090100911009210093100941009510096100971009810099101001010110102101031010410105101061010710108101091011010111101121011310114101151011610117101181011910120101211012210123101241012510126101271012810129101301013110132101331013410135101361013710138101391014010141101421014310144101451014610147101481014910150101511015210153101541015510156101571015810159101601016110162101631016410165101661016710168101691017010171101721017310174101751017610177101781017910180101811018210183101841018510186101871018810189101901019110192101931019410195101961019710198101991020010201102021020310204102051020610207102081020910210102111021210213102141021510216102171021810219102201022110222102231022410225102261022710228102291023010231102321023310234102351023610237102381023910240102411024210243102441024510246102471024810249102501025110252102531025410255102561025710258102591026010261102621026310264102651026610267102681026910270102711027210273102741027510276102771027810279102801028110282102831028410285102861028710288102891029010291102921029310294102951029610297102981029910300103011030210303103041030510306103071030810309103101031110312103131031410315103161031710318103191032010321103221032310324103251032610327103281032910330103311033210333103341033510336103371033810339103401034110342103431034410345103461034710348103491035010351103521035310354103551035610357103581035910360103611036210363103641036510366103671036810369103701037110372103731037410375103761037710378103791038010381103821038310384103851038610387103881038910390103911039210393103941039510396103971039810399104001040110402104031040410405104061040710408104091041010411104121041310414104151041610417104181041910420104211042210423104241042510426104271042810429104301043110432104331043410435104361043710438104391044010441104421044310444104451044610447104481044910450104511045210453104541045510456104571045810459104601046110462104631046410465104661046710468104691047010471104721047310474104751047610477104781047910480104811048210483104841048510486104871048810489104901049110492104931049410495104961049710498104991050010501105021050310504105051050610507105081050910510105111051210513105141051510516105171051810519105201052110522105231052410525105261052710528105291053010531105321053310534105351053610537105381053910540105411054210543105441054510546105471054810549105501055110552105531055410555105561055710558105591056010561105621056310564105651056610567105681056910570105711057210573105741057510576105771057810579105801058110582105831058410585105861058710588105891059010591105921059310594105951059610597105981059910600106011060210603106041060510606106071060810609106101061110612106131061410615106161061710618106191062010621106221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211102211023110241102511026110271102811029110301103111032110331103411035110361103711038110391104011041110421104311044110451104611047110481104911050110511105211053110541105511056110571105811059110601106111062110631106411065110661106711068110691107011071110721107311074110751107611077110781107911080110811108211083110841108511086110871108811089110901109111092110931109411095110961109711098110991110011101111021110311104111051110611107111081110911110111111111211113111141111511116111171111811119111201112111122111231112411125111261112711128111291113011131111321113311134111351113611137111381113911140111411114211143111441114511146111471114811149111501115111152111531115411155111561115711158111591116011161111621116311164111651116611167111681116911170111711117211173111741117511176111771117811179111801118111182111831118411185111861118711188111891119011191111921119311194111951119611197111981119911200112011120211203112041120511206112071120811209112101121111212112131121411215112161121711218112191122011221112221122311224112251122611227112281122911230112311123211233112341123511236112371123811239112401124111242112431124411245112461124711248112491125011251112521125311254112551125611257112581125911260112611126211263112641126511266112671126811269112701127111272112731127411275112761127711278112791128011281112821128311284112851128611287112881128911290112911129211293112941129511296112971129811299113001130111302113031130411305113061130711308113091131011311113121131311314113151131611317113181131911320113211132211323113241132511326113271132811329113301133111332113331133411335113361133711338113391134011341113421134311344113451134611347113481134911350113511135211353113541135511356113571135811359113601136111362113631136411365113661136711368113691137011371113721137311374113751137611377113781137911380113811138211383113841138511386113871138811389113901139111392113931139411395113961139711398113991140011401114021140311404114051140611407114081140911410114111141211413114141141511416114171141811419114201142111422114231142411425114261142711428114291143011431114321143311434114351143611437114381143911440114411144211443114441144511446114471144811449114501145111452114531145411455114561145711458114591146011461114621146311464114651146611467114681146911470114711147211473114741147511476114771147811479114801148111482114831148411485114861148711488114891149011491114921149311494114951149611497114981149911500115011150211503115041150511506115071150811509115101151111512115131151411515115161151711518115191152011521115221152311524115251152611527115281152911530115311153211533115341153511536115371153811539115401154111542115431154411545115461154711548115491155011551115521155311554115551155611557115581155911560115611156211563115641156511566115671156811569115701157111572115731157411575115761157711578115791158011581115821158311584115851158611587115881158911590115911159211593115941159511596115971159811599116001160111602116031160411605116061160711608116091161011611116121161311614116151161611617116181161911620116211162211623116241162511626116271162811629116301163111632116331163411635116361163711638116391164011641116421164311644116451164611647116481164911650116511165211653116541165511656116571165811659116601166111662116631166411665116661166711668116691167011671116721167311674116751167611677116781167911680116811168211683116841168511686116871168811689116901169111692116931169411695116961169711698116991170011701117021170311704117051170611707117081170911710117111171211713117141171511716117171171811719117201172111722117231172411725117261172711728117291173011731117321173311734117351173611737117381173911740117411174211743117441174511746117471174811749117501175111752117531175411755117561175711758117591176011761117621176311764117651176611767117681176911770117711177211773117741177511776117771177811779117801178111782117831178411785117861178711788117891179011791117921179311794117951179611797117981179911800118011180211803118041180511806118071180811809118101181111812118131181411815118161181711818118191182011821118221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211222212223122241222512226122271222812229122301223112232122331223412235122361223712238122391224012241122421224312244122451224612247122481224912250122511225212253122541225512256122571225812259122601226112262122631226412265122661226712268122691227012271122721227312274122751227612277122781227912280122811228212283122841228512286122871228812289122901229112292122931229412295122961229712298122991230012301123021230312304123051230612307123081230912310123111231212313123141231512316123171231812319123201232112322123231232412325123261232712328123291233012331123321233312334123351233612337123381233912340123411234212343123441234512346123471234812349123501235112352123531235412355123561235712358123591236012361123621236312364123651236612367123681236912370123711237212373123741237512376123771237812379123801238112382123831238412385123861238712388123891239012391123921239312394123951239612397123981239912400124011240212403124041240512406124071240812409124101241112412124131241412415124161241712418124191242012421124221242312424124251242612427124281242912430124311243212433124341243512436124371243812439124401244112442124431244412445124461244712448124491245012451124521245312454124551245612457124581245912460124611246212463124641246512466124671246812469124701247112472124731247412475124761247712478124791248012481124821248312484124851248612487124881248912490124911249212493124941249512496124971249812499125001250112502125031250412505125061250712508125091251012511125121251312514125151251612517125181251912520125211252212523125241252512526125271252812529125301253112532125331253412535125361253712538125391254012541125421254312544125451254612547125481254912550125511255212553125541255512556125571255812559125601256112562125631256412565125661256712568125691257012571125721257312574125751257612577125781257912580125811258212583125841258512586125871258812589125901259112592125931259412595125961259712598125991260012601126021260312604126051260612607126081260912610126111261212613126141261512616126171261812619126201262112622126231262412625126261262712628126291263012631126321263312634126351263612637126381263912640126411264212643126441264512646126471264812649126501265112652126531265412655126561265712658126591266012661126621266312664126651266612667126681266912670126711267212673126741267512676126771267812679126801268112682126831268412685126861268712688126891269012691126921269312694126951269612697126981269912700127011270212703127041270512706127071270812709127101271112712127131271412715127161271712718127191272012721127221272312724127251272612727127281272912730127311273212733127341273512736127371273812739127401274112742127431274412745127461274712748127491275012751127521275312754127551275612757127581275912760127611276212763127641276512766127671276812769127701277112772127731277412775127761277712778127791278012781127821278312784127851278612787127881278912790127911279212793127941279512796127971279812799128001280112802128031280412805128061280712808128091281012811128121281312814128151281612817128181281912820128211282212823128241282512826128271282812829128301283112832128331283412835128361283712838128391284012841128421284312844128451284612847128481284912850128511285212853128541285512856128571285812859128601286112862128631286412865128661286712868128691287012871128721287312874128751287612877128781287912880128811288212883128841288512886128871288812889128901289112892128931289412895128961289712898128991290012901129021290312904129051290612907129081290912910129111291212913129141291512916129171291812919129201292112922129231292412925129261292712928129291293012931129321293312934129351293612937129381293912940129411294212943129441294512946129471294812949129501295112952129531295412955129561295712958129591296012961129621296312964129651296612967129681296912970129711297212973129741297512976129771297812979129801298112982129831298412985129861298712988129891299012991129921299312994129951299612997129981299913000130011300213003130041300513006130071300813009130101301113012130131301413015130161301713018130191302013021130221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211342213423134241342513426134271342813429134301343113432134331343413435134361343713438134391344013441134421344313444134451344613447134481344913450134511345213453134541345513456134571345813459134601346113462134631346413465134661346713468134691347013471134721347313474134751347613477134781347913480134811348213483134841348513486134871348813489134901349113492134931349413495134961349713498134991350013501135021350313504135051350613507135081350913510135111351213513135141351513516135171351813519135201352113522135231352413525135261352713528135291353013531135321353313534135351353613537135381353913540135411354213543135441354513546135471354813549135501355113552135531355413555135561355713558135591356013561135621356313564135651356613567135681356913570135711357213573135741357513576135771357813579135801358113582135831358413585135861358713588135891359013591135921359313594135951359613597135981359913600136011360213603136041360513606136071360813609136101361113612136131361413615136161361713618136191362013621136221362313624136251362613627136281362913630136311363213633136341363513636136371363813639136401364113642136431364413645136461364713648136491365013651136521365313654136551365613657136581365913660136611366213663136641366513666136671366813669136701367113672136731367413675136761367713678136791368013681136821368313684136851368613687136881368913690136911369213693136941369513696136971369813699137001370113702137031370413705137061370713708137091371013711137121371313714137151371613717137181371913720137211372213723137241372513726137271372813729137301373113732137331373413735137361373713738137391374013741137421374313744137451374613747137481374913750137511375213753137541375513756137571375813759137601376113762137631376413765137661376713768137691377013771137721377313774137751377613777137781377913780137811378213783137841378513786137871378813789137901379113792137931379413795137961379713798137991380013801138021380313804138051380613807138081380913810138111381213813138141381513816138171381813819138201382113822138231382413825138261382713828138291383013831138321383313834138351383613837138381383913840138411384213843138441384513846138471384813849138501385113852138531385413855138561385713858138591386013861138621386313864138651386613867138681386913870138711387213873138741387513876138771387813879138801388113882138831388413885138861388713888138891389013891138921389313894138951389613897138981389913900139011390213903139041390513906139071390813909139101391113912139131391413915139161391713918139191392013921139221392313924139251392613927139281392913930139311393213933139341393513936139371393813939139401394113942139431394413945139461394713948139491395013951139521395313954139551395613957139581395913960139611396213963139641396513966139671396813969139701397113972139731397413975139761397713978139791398013981139821398313984139851398613987139881398913990139911399213993139941399513996139971399813999140001400114002140031400414005140061400714008140091401014011140121401314014140151401614017140181401914020140211402214023140241402514026140271402814029140301403114032140331403414035140361403714038140391404014041140421404314044140451404614047140481404914050140511405214053140541405514056140571405814059140601406114062140631406414065140661406714068140691407014071140721407314074140751407614077140781407914080140811408214083140841408514086140871408814089140901409114092140931409414095140961409714098140991410014101141021410314104141051410614107141081410914110141111411214113141141411514116141171411814119141201412114122141231412414125141261412714128141291413014131141321413314134141351413614137141381413914140141411414214143141441414514146141471414814149141501415114152141531415414155141561415714158141591416014161141621416314164141651416614167141681416914170141711417214173141741417514176141771417814179141801418114182141831418414185141861418714188141891419014191141921419314194141951419614197141981419914200142011420214203142041420514206142071420814209142101421114212142131421414215142161421714218142191422014221142221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211462214623146241462514626146271462814629146301463114632146331463414635146361463714638146391464014641146421464314644146451464614647146481464914650146511465214653146541465514656146571465814659146601466114662146631466414665146661466714668146691467014671146721467314674146751467614677146781467914680146811468214683146841468514686146871468814689146901469114692146931469414695146961469714698146991470014701147021470314704147051470614707147081470914710147111471214713147141471514716147171471814719147201472114722147231472414725147261472714728147291473014731147321473314734147351473614737147381473914740147411474214743147441474514746147471474814749147501475114752147531475414755147561475714758147591476014761147621476314764147651476614767147681476914770147711477214773147741477514776147771477814779147801478114782147831478414785147861478714788147891479014791147921479314794147951479614797147981479914800148011480214803148041480514806148071480814809148101481114812148131481414815148161481714818148191482014821148221482314824148251482614827148281482914830148311483214833148341483514836148371483814839148401484114842148431484414845148461484714848148491485014851148521485314854148551485614857148581485914860148611486214863148641486514866148671486814869148701487114872148731487414875148761487714878148791488014881148821488314884148851488614887148881488914890148911489214893148941489514896148971489814899149001490114902149031490414905149061490714908149091491014911149121491314914149151491614917149181491914920149211492214923149241492514926149271492814929149301493114932149331493414935149361493714938149391494014941149421494314944149451494614947149481494914950149511495214953149541495514956149571495814959149601496114962149631496414965149661496714968149691497014971149721497314974149751497614977149781497914980149811498214983149841498514986149871498814989149901499114992149931499414995149961499714998149991500015001150021500315004150051500615007150081500915010150111501215013150141501515016150171501815019150201502115022150231502415025150261502715028150291503015031150321503315034150351503615037150381503915040150411504215043150441504515046150471504815049150501505115052150531505415055150561505715058150591506015061150621506315064150651506615067150681506915070150711507215073150741507515076150771507815079150801508115082150831508415085150861508715088150891509015091150921509315094150951509615097150981509915100151011510215103151041510515106151071510815109151101511115112151131511415115151161511715118151191512015121151221512315124151251512615127151281512915130151311513215133151341513515136151371513815139151401514115142151431514415145151461514715148151491515015151151521515315154151551515615157151581515915160151611516215163151641516515166151671516815169151701517115172151731517415175151761517715178151791518015181151821518315184151851518615187151881518915190151911519215193151941519515196151971519815199152001520115202152031520415205152061520715208152091521015211152121521315214152151521615217152181521915220152211522215223152241522515226152271522815229152301523115232152331523415235152361523715238152391524015241152421524315244152451524615247152481524915250152511525215253152541525515256152571525815259152601526115262152631526415265152661526715268152691527015271152721527315274152751527615277152781527915280152811528215283152841528515286152871528815289152901529115292152931529415295152961529715298152991530015301153021530315304153051530615307153081530915310153111531215313153141531515316153171531815319153201532115322153231532415325153261532715328153291533015331153321533315334153351533615337153381533915340153411534215343153441534515346153471534815349153501535115352153531535415355153561535715358153591536015361153621536315364153651536615367153681536915370153711537215373153741537515376153771537815379153801538115382153831538415385153861538715388153891539015391153921539315394153951539615397153981539915400154011540215403154041540515406154071540815409154101541115412154131541415415154161541715418154191542015421154221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211582215823158241582515826158271582815829158301583115832158331583415835158361583715838158391584015841158421584315844158451584615847158481584915850158511585215853158541585515856158571585815859158601586115862158631586415865158661586715868158691587015871158721587315874158751587615877158781587915880158811588215883158841588515886158871588815889158901589115892158931589415895158961589715898158991590015901159021590315904159051590615907159081590915910159111591215913159141591515916159171591815919159201592115922159231592415925159261592715928159291593015931159321593315934159351593615937159381593915940159411594215943159441594515946159471594815949159501595115952159531595415955159561595715958159591596015961159621596315964159651596615967159681596915970159711597215973159741597515976159771597815979159801598115982159831598415985159861598715988159891599015991159921599315994159951599615997159981599916000160011600216003160041600516006160071600816009160101601116012160131601416015160161601716018160191602016021160221602316024160251602616027160281602916030160311603216033160341603516036160371603816039160401604116042160431604416045160461604716048160491605016051160521605316054160551605616057160581605916060160611606216063160641606516066160671606816069160701607116072160731607416075160761607716078160791608016081160821608316084160851608616087160881608916090160911609216093160941609516096160971609816099161001610116102161031610416105161061610716108161091611016111161121611316114161151611616117161181611916120161211612216123161241612516126161271612816129161301613116132161331613416135161361613716138161391614016141161421614316144161451614616147161481614916150161511615216153161541615516156161571615816159161601616116162161631616416165161661616716168161691617016171161721617316174161751617616177161781617916180161811618216183161841618516186161871618816189161901619116192161931619416195161961619716198161991620016201162021620316204162051620616207162081620916210162111621216213162141621516216162171621816219162201622116222162231622416225162261622716228162291623016231162321623316234162351623616237162381623916240162411624216243162441624516246162471624816249162501625116252162531625416255162561625716258162591626016261162621626316264162651626616267162681626916270162711627216273162741627516276162771627816279162801628116282162831628416285162861628716288162891629016291162921629316294162951629616297162981629916300163011630216303163041630516306163071630816309163101631116312163131631416315163161631716318163191632016321163221632316324163251632616327163281632916330163311633216333163341633516336163371633816339163401634116342163431634416345163461634716348163491635016351163521635316354163551635616357163581635916360163611636216363163641636516366163671636816369163701637116372163731637416375163761637716378163791638016381163821638316384163851638616387163881638916390163911639216393163941639516396163971639816399164001640116402164031640416405164061640716408164091641016411164121641316414164151641616417164181641916420164211642216423164241642516426164271642816429164301643116432164331643416435164361643716438164391644016441164421644316444164451644616447164481644916450164511645216453164541645516456164571645816459164601646116462164631646416465164661646716468164691647016471164721647316474164751647616477164781647916480164811648216483164841648516486164871648816489164901649116492164931649416495164961649716498164991650016501165021650316504165051650616507165081650916510165111651216513165141651516516165171651816519165201652116522165231652416525165261652716528165291653016531165321653316534165351653616537165381653916540165411654216543165441654516546
  1. /* fe_x25519_asm
  2. *
  3. * Copyright (C) 2006-2020 wolfSSL Inc.
  4. *
  5. * This file is part of wolfSSL.
  6. *
  7. * wolfSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * wolfSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. */
  21. #ifndef HAVE_INTEL_AVX1
  22. #define HAVE_INTEL_AVX1
  23. #endif /* HAVE_INTEL_AVX1 */
  24. #ifndef NO_AVX2_SUPPORT
  25. #define HAVE_INTEL_AVX2
  26. #endif /* NO_AVX2_SUPPORT */
  27. #ifndef __APPLE__
  28. .text
  29. .globl fe_init
  30. .type fe_init,@function
  31. .align 16
  32. fe_init:
  33. #else
  34. .section __TEXT,__text
  35. .globl _fe_init
  36. .p2align 4
  37. _fe_init:
  38. #endif /* __APPLE__ */
  39. #ifdef HAVE_INTEL_AVX2
  40. #ifndef __APPLE__
  41. movq cpuFlagsSet@GOTPCREL(%rip), %rax
  42. movl (%rax), %eax
  43. #else
  44. movl _cpuFlagsSet(%rip), %eax
  45. #endif /* __APPLE__ */
  46. testl %eax, %eax
  47. je L_fe_init_get_flags
  48. repz retq
  49. L_fe_init_get_flags:
  50. #ifndef __APPLE__
  51. callq cpuid_get_flags@plt
  52. #else
  53. callq _cpuid_get_flags
  54. #endif /* __APPLE__ */
  55. #ifndef __APPLE__
  56. movq intelFlags@GOTPCREL(%rip), %rdx
  57. movl %eax, (%rdx)
  58. #else
  59. movl %eax, _intelFlags(%rip)
  60. #endif /* __APPLE__ */
  61. andl $0x50, %eax
  62. cmpl $0x50, %eax
  63. jne L_fe_init_flags_done
  64. #ifndef __APPLE__
  65. movq fe_mul_avx2@GOTPCREL(%rip), %rax
  66. #else
  67. leaq _fe_mul_avx2(%rip), %rax
  68. #endif /* __APPLE__ */
  69. #ifndef __APPLE__
  70. movq fe_mul_p@GOTPCREL(%rip), %rdx
  71. movq %rax, (%rdx)
  72. #else
  73. movq %rax, _fe_mul_p(%rip)
  74. #endif /* __APPLE__ */
  75. #ifndef __APPLE__
  76. movq fe_sq_avx2@GOTPCREL(%rip), %rax
  77. #else
  78. leaq _fe_sq_avx2(%rip), %rax
  79. #endif /* __APPLE__ */
  80. #ifndef __APPLE__
  81. movq fe_sq_p@GOTPCREL(%rip), %rdx
  82. movq %rax, (%rdx)
  83. #else
  84. movq %rax, _fe_sq_p(%rip)
  85. #endif /* __APPLE__ */
  86. #ifndef __APPLE__
  87. movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
  88. #else
  89. leaq _fe_mul121666_avx2(%rip), %rax
  90. #endif /* __APPLE__ */
  91. #ifndef __APPLE__
  92. movq fe_mul121666_p@GOTPCREL(%rip), %rdx
  93. movq %rax, (%rdx)
  94. #else
  95. movq %rax, _fe_mul121666_p(%rip)
  96. #endif /* __APPLE__ */
  97. #ifndef __APPLE__
  98. movq fe_sq2_avx2@GOTPCREL(%rip), %rax
  99. #else
  100. leaq _fe_sq2_avx2(%rip), %rax
  101. #endif /* __APPLE__ */
  102. #ifndef __APPLE__
  103. movq fe_sq2_p@GOTPCREL(%rip), %rdx
  104. movq %rax, (%rdx)
  105. #else
  106. movq %rax, _fe_sq2_p(%rip)
  107. #endif /* __APPLE__ */
  108. #ifndef __APPLE__
  109. movq fe_invert_avx2@GOTPCREL(%rip), %rax
  110. #else
  111. leaq _fe_invert_avx2(%rip), %rax
  112. #endif /* __APPLE__ */
  113. #ifndef __APPLE__
  114. movq fe_invert_p@GOTPCREL(%rip), %rdx
  115. movq %rax, (%rdx)
  116. #else
  117. movq %rax, _fe_invert_p(%rip)
  118. #endif /* __APPLE__ */
  119. #ifndef __APPLE__
  120. movq curve25519_avx2@GOTPCREL(%rip), %rax
  121. #else
  122. leaq _curve25519_avx2(%rip), %rax
  123. #endif /* __APPLE__ */
  124. #ifndef __APPLE__
  125. movq curve25519_p@GOTPCREL(%rip), %rdx
  126. movq %rax, (%rdx)
  127. #else
  128. movq %rax, _curve25519_p(%rip)
  129. #endif /* __APPLE__ */
  130. #ifndef __APPLE__
  131. movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
  132. #else
  133. leaq _fe_pow22523_avx2(%rip), %rax
  134. #endif /* __APPLE__ */
  135. #ifndef __APPLE__
  136. movq fe_pow22523_p@GOTPCREL(%rip), %rdx
  137. movq %rax, (%rdx)
  138. #else
  139. movq %rax, _fe_pow22523_p(%rip)
  140. #endif /* __APPLE__ */
  141. #ifndef __APPLE__
  142. movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
  143. #else
  144. leaq _fe_ge_to_p2_avx2(%rip), %rax
  145. #endif /* __APPLE__ */
  146. #ifndef __APPLE__
  147. movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
  148. movq %rax, (%rdx)
  149. #else
  150. movq %rax, _fe_ge_to_p2_p(%rip)
  151. #endif /* __APPLE__ */
  152. #ifndef __APPLE__
  153. movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
  154. #else
  155. leaq _fe_ge_to_p3_avx2(%rip), %rax
  156. #endif /* __APPLE__ */
  157. #ifndef __APPLE__
  158. movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
  159. movq %rax, (%rdx)
  160. #else
  161. movq %rax, _fe_ge_to_p3_p(%rip)
  162. #endif /* __APPLE__ */
  163. #ifndef __APPLE__
  164. movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
  165. #else
  166. leaq _fe_ge_dbl_avx2(%rip), %rax
  167. #endif /* __APPLE__ */
  168. #ifndef __APPLE__
  169. movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
  170. movq %rax, (%rdx)
  171. #else
  172. movq %rax, _fe_ge_dbl_p(%rip)
  173. #endif /* __APPLE__ */
  174. #ifndef __APPLE__
  175. movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
  176. #else
  177. leaq _fe_ge_madd_avx2(%rip), %rax
  178. #endif /* __APPLE__ */
  179. #ifndef __APPLE__
  180. movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
  181. movq %rax, (%rdx)
  182. #else
  183. movq %rax, _fe_ge_madd_p(%rip)
  184. #endif /* __APPLE__ */
  185. #ifndef __APPLE__
  186. movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
  187. #else
  188. leaq _fe_ge_msub_avx2(%rip), %rax
  189. #endif /* __APPLE__ */
  190. #ifndef __APPLE__
  191. movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
  192. movq %rax, (%rdx)
  193. #else
  194. movq %rax, _fe_ge_msub_p(%rip)
  195. #endif /* __APPLE__ */
  196. #ifndef __APPLE__
  197. movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
  198. #else
  199. leaq _fe_ge_add_avx2(%rip), %rax
  200. #endif /* __APPLE__ */
  201. #ifndef __APPLE__
  202. movq fe_ge_add_p@GOTPCREL(%rip), %rdx
  203. movq %rax, (%rdx)
  204. #else
  205. movq %rax, _fe_ge_add_p(%rip)
  206. #endif /* __APPLE__ */
  207. #ifndef __APPLE__
  208. movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
  209. #else
  210. leaq _fe_ge_sub_avx2(%rip), %rax
  211. #endif /* __APPLE__ */
  212. #ifndef __APPLE__
  213. movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
  214. movq %rax, (%rdx)
  215. #else
  216. movq %rax, _fe_ge_sub_p(%rip)
  217. #endif /* __APPLE__ */
  218. L_fe_init_flags_done:
  219. #ifndef __APPLE__
  220. movq cpuFlagsSet@GOTPCREL(%rip), %rdx
  221. movl $0x1, (%rdx)
  222. #else
  223. movl $0x1, _cpuFlagsSet(%rip)
  224. #endif /* __APPLE__ */
  225. #endif /* HAVE_INTEL_AVX2 */
  226. repz retq
  227. #ifndef __APPLE__
  228. .size fe_init,.-fe_init
  229. #endif /* __APPLE__ */
  230. #ifndef __APPLE__
  231. .text
  232. .globl fe_frombytes
  233. .type fe_frombytes,@function
  234. .align 16
  235. fe_frombytes:
  236. #else
  237. .section __TEXT,__text
  238. .globl _fe_frombytes
  239. .p2align 4
  240. _fe_frombytes:
  241. #endif /* __APPLE__ */
  242. movq $0x7fffffffffffffff, %r9
  243. movq (%rsi), %rdx
  244. movq 8(%rsi), %rax
  245. movq 16(%rsi), %rcx
  246. movq 24(%rsi), %r8
  247. andq %r9, %r8
  248. movq %rdx, (%rdi)
  249. movq %rax, 8(%rdi)
  250. movq %rcx, 16(%rdi)
  251. movq %r8, 24(%rdi)
  252. repz retq
  253. #ifndef __APPLE__
  254. .size fe_frombytes,.-fe_frombytes
  255. #endif /* __APPLE__ */
  256. #ifndef __APPLE__
  257. .text
  258. .globl fe_tobytes
  259. .type fe_tobytes,@function
  260. .align 16
  261. fe_tobytes:
  262. #else
  263. .section __TEXT,__text
  264. .globl _fe_tobytes
  265. .p2align 4
  266. _fe_tobytes:
  267. #endif /* __APPLE__ */
  268. movq $0x7fffffffffffffff, %r10
  269. movq (%rsi), %rdx
  270. movq 8(%rsi), %rax
  271. movq 16(%rsi), %rcx
  272. movq 24(%rsi), %r8
  273. addq $19, %rdx
  274. adcq $0x00, %rax
  275. adcq $0x00, %rcx
  276. adcq $0x00, %r8
  277. shrq $63, %r8
  278. imulq $19, %r8, %r9
  279. movq (%rsi), %rdx
  280. movq 8(%rsi), %rax
  281. movq 16(%rsi), %rcx
  282. movq 24(%rsi), %r8
  283. addq %r9, %rdx
  284. adcq $0x00, %rax
  285. adcq $0x00, %rcx
  286. adcq $0x00, %r8
  287. andq %r10, %r8
  288. movq %rdx, (%rdi)
  289. movq %rax, 8(%rdi)
  290. movq %rcx, 16(%rdi)
  291. movq %r8, 24(%rdi)
  292. repz retq
  293. #ifndef __APPLE__
  294. .size fe_tobytes,.-fe_tobytes
  295. #endif /* __APPLE__ */
  296. #ifndef __APPLE__
  297. .text
  298. .globl fe_1
  299. .type fe_1,@function
  300. .align 16
  301. fe_1:
  302. #else
  303. .section __TEXT,__text
  304. .globl _fe_1
  305. .p2align 4
  306. _fe_1:
  307. #endif /* __APPLE__ */
  308. # Set one
  309. movq $0x01, (%rdi)
  310. movq $0x00, 8(%rdi)
  311. movq $0x00, 16(%rdi)
  312. movq $0x00, 24(%rdi)
  313. repz retq
  314. #ifndef __APPLE__
  315. .size fe_1,.-fe_1
  316. #endif /* __APPLE__ */
  317. #ifndef __APPLE__
  318. .text
  319. .globl fe_0
  320. .type fe_0,@function
  321. .align 16
  322. fe_0:
  323. #else
  324. .section __TEXT,__text
  325. .globl _fe_0
  326. .p2align 4
  327. _fe_0:
  328. #endif /* __APPLE__ */
  329. # Set zero
  330. movq $0x00, (%rdi)
  331. movq $0x00, 8(%rdi)
  332. movq $0x00, 16(%rdi)
  333. movq $0x00, 24(%rdi)
  334. repz retq
  335. #ifndef __APPLE__
  336. .size fe_0,.-fe_0
  337. #endif /* __APPLE__ */
  338. #ifndef __APPLE__
  339. .text
  340. .globl fe_copy
  341. .type fe_copy,@function
  342. .align 16
  343. fe_copy:
  344. #else
  345. .section __TEXT,__text
  346. .globl _fe_copy
  347. .p2align 4
  348. _fe_copy:
  349. #endif /* __APPLE__ */
  350. # Copy
  351. movq (%rsi), %rdx
  352. movq 8(%rsi), %rax
  353. movq 16(%rsi), %rcx
  354. movq 24(%rsi), %r8
  355. movq %rdx, (%rdi)
  356. movq %rax, 8(%rdi)
  357. movq %rcx, 16(%rdi)
  358. movq %r8, 24(%rdi)
  359. repz retq
  360. #ifndef __APPLE__
  361. .size fe_copy,.-fe_copy
  362. #endif /* __APPLE__ */
  363. #ifndef __APPLE__
  364. .text
  365. .globl fe_sub
  366. .type fe_sub,@function
  367. .align 16
  368. fe_sub:
  369. #else
  370. .section __TEXT,__text
  371. .globl _fe_sub
  372. .p2align 4
  373. _fe_sub:
  374. #endif /* __APPLE__ */
  375. pushq %r12
  376. # Sub
  377. movq (%rsi), %rax
  378. movq 8(%rsi), %rcx
  379. movq 16(%rsi), %r8
  380. movq 24(%rsi), %r9
  381. subq (%rdx), %rax
  382. movq $0x00, %r10
  383. sbbq 8(%rdx), %rcx
  384. movq $-19, %r11
  385. sbbq 16(%rdx), %r8
  386. movq $0x7fffffffffffffff, %r12
  387. sbbq 24(%rdx), %r9
  388. sbbq $0x00, %r10
  389. # Mask the modulus
  390. andq %r10, %r11
  391. andq %r10, %r12
  392. # Add modulus (if underflow)
  393. addq %r11, %rax
  394. adcq %r10, %rcx
  395. adcq %r10, %r8
  396. adcq %r12, %r9
  397. movq %rax, (%rdi)
  398. movq %rcx, 8(%rdi)
  399. movq %r8, 16(%rdi)
  400. movq %r9, 24(%rdi)
  401. popq %r12
  402. repz retq
  403. #ifndef __APPLE__
  404. .size fe_sub,.-fe_sub
  405. #endif /* __APPLE__ */
  406. #ifndef __APPLE__
  407. .text
  408. .globl fe_add
  409. .type fe_add,@function
  410. .align 16
  411. fe_add:
  412. #else
  413. .section __TEXT,__text
  414. .globl _fe_add
  415. .p2align 4
  416. _fe_add:
  417. #endif /* __APPLE__ */
  418. pushq %r12
  419. # Add
  420. movq (%rsi), %rax
  421. movq 8(%rsi), %rcx
  422. addq (%rdx), %rax
  423. movq 16(%rsi), %r8
  424. adcq 8(%rdx), %rcx
  425. movq 24(%rsi), %r10
  426. adcq 16(%rdx), %r8
  427. movq $-19, %r11
  428. adcq 24(%rdx), %r10
  429. movq $0x7fffffffffffffff, %r12
  430. movq %r10, %r9
  431. sarq $63, %r10
  432. # Mask the modulus
  433. andq %r10, %r11
  434. andq %r10, %r12
  435. # Sub modulus (if overflow)
  436. subq %r11, %rax
  437. sbbq %r10, %rcx
  438. sbbq %r10, %r8
  439. sbbq %r12, %r9
  440. movq %rax, (%rdi)
  441. movq %rcx, 8(%rdi)
  442. movq %r8, 16(%rdi)
  443. movq %r9, 24(%rdi)
  444. popq %r12
  445. repz retq
  446. #ifndef __APPLE__
  447. .size fe_add,.-fe_add
  448. #endif /* __APPLE__ */
  449. #ifndef __APPLE__
  450. .text
  451. .globl fe_neg
  452. .type fe_neg,@function
  453. .align 16
  454. fe_neg:
  455. #else
  456. .section __TEXT,__text
  457. .globl _fe_neg
  458. .p2align 4
  459. _fe_neg:
  460. #endif /* __APPLE__ */
  461. movq $-19, %rdx
  462. movq $-1, %rax
  463. movq $-1, %rcx
  464. movq $0x7fffffffffffffff, %r8
  465. subq (%rsi), %rdx
  466. sbbq 8(%rsi), %rax
  467. sbbq 16(%rsi), %rcx
  468. sbbq 24(%rsi), %r8
  469. movq %rdx, (%rdi)
  470. movq %rax, 8(%rdi)
  471. movq %rcx, 16(%rdi)
  472. movq %r8, 24(%rdi)
  473. repz retq
  474. #ifndef __APPLE__
  475. .size fe_neg,.-fe_neg
  476. #endif /* __APPLE__ */
  477. #ifndef __APPLE__
  478. .text
  479. .globl fe_cmov
  480. .type fe_cmov,@function
  481. .align 16
  482. fe_cmov:
  483. #else
  484. .section __TEXT,__text
  485. .globl _fe_cmov
  486. .p2align 4
  487. _fe_cmov:
  488. #endif /* __APPLE__ */
  489. cmpl $0x01, %edx
  490. movq (%rdi), %rcx
  491. movq 8(%rdi), %r8
  492. movq 16(%rdi), %r9
  493. movq 24(%rdi), %r10
  494. cmoveq (%rsi), %rcx
  495. cmoveq 8(%rsi), %r8
  496. cmoveq 16(%rsi), %r9
  497. cmoveq 24(%rsi), %r10
  498. movq %rcx, (%rdi)
  499. movq %r8, 8(%rdi)
  500. movq %r9, 16(%rdi)
  501. movq %r10, 24(%rdi)
  502. repz retq
  503. #ifndef __APPLE__
  504. .size fe_cmov,.-fe_cmov
  505. #endif /* __APPLE__ */
  506. #ifndef __APPLE__
  507. .text
  508. .globl fe_isnonzero
  509. .type fe_isnonzero,@function
  510. .align 16
  511. fe_isnonzero:
  512. #else
  513. .section __TEXT,__text
  514. .globl _fe_isnonzero
  515. .p2align 4
  516. _fe_isnonzero:
  517. #endif /* __APPLE__ */
  518. movq $0x7fffffffffffffff, %r10
  519. movq (%rdi), %rax
  520. movq 8(%rdi), %rdx
  521. movq 16(%rdi), %rcx
  522. movq 24(%rdi), %r8
  523. addq $19, %rax
  524. adcq $0x00, %rdx
  525. adcq $0x00, %rcx
  526. adcq $0x00, %r8
  527. shrq $63, %r8
  528. imulq $19, %r8, %r9
  529. movq (%rdi), %rax
  530. movq 8(%rdi), %rdx
  531. movq 16(%rdi), %rcx
  532. movq 24(%rdi), %r8
  533. addq %r9, %rax
  534. adcq $0x00, %rdx
  535. adcq $0x00, %rcx
  536. adcq $0x00, %r8
  537. andq %r10, %r8
  538. orq %rdx, %rax
  539. orq %rcx, %rax
  540. orq %r8, %rax
  541. repz retq
  542. #ifndef __APPLE__
  543. .size fe_isnonzero,.-fe_isnonzero
  544. #endif /* __APPLE__ */
  545. #ifndef __APPLE__
  546. .text
  547. .globl fe_isnegative
  548. .type fe_isnegative,@function
  549. .align 16
  550. fe_isnegative:
  551. #else
  552. .section __TEXT,__text
  553. .globl _fe_isnegative
  554. .p2align 4
  555. _fe_isnegative:
  556. #endif /* __APPLE__ */
  557. movq $0x7fffffffffffffff, %r11
  558. movq (%rdi), %rdx
  559. movq 8(%rdi), %rcx
  560. movq 16(%rdi), %r8
  561. movq 24(%rdi), %r9
  562. movq %rdx, %rax
  563. addq $19, %rdx
  564. adcq $0x00, %rcx
  565. adcq $0x00, %r8
  566. adcq $0x00, %r9
  567. shrq $63, %r9
  568. imulq $19, %r9, %r10
  569. addq %r10, %rax
  570. andq $0x01, %rax
  571. repz retq
  572. #ifndef __APPLE__
  573. .size fe_isnegative,.-fe_isnegative
  574. #endif /* __APPLE__ */
  575. #ifndef __APPLE__
  576. .text
  577. .globl fe_cmov_table
  578. .type fe_cmov_table,@function
  579. .align 16
  580. fe_cmov_table:
  581. #else
  582. .section __TEXT,__text
  583. .globl _fe_cmov_table
  584. .p2align 4
  585. _fe_cmov_table:
  586. #endif /* __APPLE__ */
  587. pushq %r12
  588. pushq %r13
  589. pushq %r14
  590. pushq %r15
  591. movq %rdx, %rcx
  592. movsbq %cl, %rax
  593. cdq
  594. xorb %dl, %al
  595. subb %dl, %al
  596. movb %al, %r15b
  597. movq $0x01, %rax
  598. xorq %rdx, %rdx
  599. xorq %r8, %r8
  600. xorq %r9, %r9
  601. movq $0x01, %r10
  602. xorq %r11, %r11
  603. xorq %r12, %r12
  604. xorq %r13, %r13
  605. cmpb $0x01, %r15b
  606. movq (%rsi), %r14
  607. cmoveq %r14, %rax
  608. movq 8(%rsi), %r14
  609. cmoveq %r14, %rdx
  610. movq 16(%rsi), %r14
  611. cmoveq %r14, %r8
  612. movq 24(%rsi), %r14
  613. cmoveq %r14, %r9
  614. movq 32(%rsi), %r14
  615. cmoveq %r14, %r10
  616. movq 40(%rsi), %r14
  617. cmoveq %r14, %r11
  618. movq 48(%rsi), %r14
  619. cmoveq %r14, %r12
  620. movq 56(%rsi), %r14
  621. cmoveq %r14, %r13
  622. cmpb $2, %r15b
  623. movq 96(%rsi), %r14
  624. cmoveq %r14, %rax
  625. movq 104(%rsi), %r14
  626. cmoveq %r14, %rdx
  627. movq 112(%rsi), %r14
  628. cmoveq %r14, %r8
  629. movq 120(%rsi), %r14
  630. cmoveq %r14, %r9
  631. movq 128(%rsi), %r14
  632. cmoveq %r14, %r10
  633. movq 136(%rsi), %r14
  634. cmoveq %r14, %r11
  635. movq 144(%rsi), %r14
  636. cmoveq %r14, %r12
  637. movq 152(%rsi), %r14
  638. cmoveq %r14, %r13
  639. cmpb $3, %r15b
  640. movq 192(%rsi), %r14
  641. cmoveq %r14, %rax
  642. movq 200(%rsi), %r14
  643. cmoveq %r14, %rdx
  644. movq 208(%rsi), %r14
  645. cmoveq %r14, %r8
  646. movq 216(%rsi), %r14
  647. cmoveq %r14, %r9
  648. movq 224(%rsi), %r14
  649. cmoveq %r14, %r10
  650. movq 232(%rsi), %r14
  651. cmoveq %r14, %r11
  652. movq 240(%rsi), %r14
  653. cmoveq %r14, %r12
  654. movq 248(%rsi), %r14
  655. cmoveq %r14, %r13
  656. cmpb $4, %r15b
  657. movq 288(%rsi), %r14
  658. cmoveq %r14, %rax
  659. movq 296(%rsi), %r14
  660. cmoveq %r14, %rdx
  661. movq 304(%rsi), %r14
  662. cmoveq %r14, %r8
  663. movq 312(%rsi), %r14
  664. cmoveq %r14, %r9
  665. movq 320(%rsi), %r14
  666. cmoveq %r14, %r10
  667. movq 328(%rsi), %r14
  668. cmoveq %r14, %r11
  669. movq 336(%rsi), %r14
  670. cmoveq %r14, %r12
  671. movq 344(%rsi), %r14
  672. cmoveq %r14, %r13
  673. cmpb $5, %r15b
  674. movq 384(%rsi), %r14
  675. cmoveq %r14, %rax
  676. movq 392(%rsi), %r14
  677. cmoveq %r14, %rdx
  678. movq 400(%rsi), %r14
  679. cmoveq %r14, %r8
  680. movq 408(%rsi), %r14
  681. cmoveq %r14, %r9
  682. movq 416(%rsi), %r14
  683. cmoveq %r14, %r10
  684. movq 424(%rsi), %r14
  685. cmoveq %r14, %r11
  686. movq 432(%rsi), %r14
  687. cmoveq %r14, %r12
  688. movq 440(%rsi), %r14
  689. cmoveq %r14, %r13
  690. cmpb $6, %r15b
  691. movq 480(%rsi), %r14
  692. cmoveq %r14, %rax
  693. movq 488(%rsi), %r14
  694. cmoveq %r14, %rdx
  695. movq 496(%rsi), %r14
  696. cmoveq %r14, %r8
  697. movq 504(%rsi), %r14
  698. cmoveq %r14, %r9
  699. movq 512(%rsi), %r14
  700. cmoveq %r14, %r10
  701. movq 520(%rsi), %r14
  702. cmoveq %r14, %r11
  703. movq 528(%rsi), %r14
  704. cmoveq %r14, %r12
  705. movq 536(%rsi), %r14
  706. cmoveq %r14, %r13
  707. cmpb $7, %r15b
  708. movq 576(%rsi), %r14
  709. cmoveq %r14, %rax
  710. movq 584(%rsi), %r14
  711. cmoveq %r14, %rdx
  712. movq 592(%rsi), %r14
  713. cmoveq %r14, %r8
  714. movq 600(%rsi), %r14
  715. cmoveq %r14, %r9
  716. movq 608(%rsi), %r14
  717. cmoveq %r14, %r10
  718. movq 616(%rsi), %r14
  719. cmoveq %r14, %r11
  720. movq 624(%rsi), %r14
  721. cmoveq %r14, %r12
  722. movq 632(%rsi), %r14
  723. cmoveq %r14, %r13
  724. cmpb $8, %r15b
  725. movq 672(%rsi), %r14
  726. cmoveq %r14, %rax
  727. movq 680(%rsi), %r14
  728. cmoveq %r14, %rdx
  729. movq 688(%rsi), %r14
  730. cmoveq %r14, %r8
  731. movq 696(%rsi), %r14
  732. cmoveq %r14, %r9
  733. movq 704(%rsi), %r14
  734. cmoveq %r14, %r10
  735. movq 712(%rsi), %r14
  736. cmoveq %r14, %r11
  737. movq 720(%rsi), %r14
  738. cmoveq %r14, %r12
  739. movq 728(%rsi), %r14
  740. cmoveq %r14, %r13
  741. cmpb $0x00, %cl
  742. movq %rax, %r14
  743. cmovlq %r10, %rax
  744. cmovlq %r14, %r10
  745. movq %rdx, %r14
  746. cmovlq %r11, %rdx
  747. cmovlq %r14, %r11
  748. movq %r8, %r14
  749. cmovlq %r12, %r8
  750. cmovlq %r14, %r12
  751. movq %r9, %r14
  752. cmovlq %r13, %r9
  753. cmovlq %r14, %r13
  754. movq %rax, (%rdi)
  755. movq %rdx, 8(%rdi)
  756. movq %r8, 16(%rdi)
  757. movq %r9, 24(%rdi)
  758. movq %r10, 32(%rdi)
  759. movq %r11, 40(%rdi)
  760. movq %r12, 48(%rdi)
  761. movq %r13, 56(%rdi)
  762. xorq %rax, %rax
  763. xorq %rdx, %rdx
  764. xorq %r8, %r8
  765. xorq %r9, %r9
  766. cmpb $0x01, %r15b
  767. movq 64(%rsi), %r14
  768. cmoveq %r14, %rax
  769. movq 72(%rsi), %r14
  770. cmoveq %r14, %rdx
  771. movq 80(%rsi), %r14
  772. cmoveq %r14, %r8
  773. movq 88(%rsi), %r14
  774. cmoveq %r14, %r9
  775. cmpb $2, %r15b
  776. movq 160(%rsi), %r14
  777. cmoveq %r14, %rax
  778. movq 168(%rsi), %r14
  779. cmoveq %r14, %rdx
  780. movq 176(%rsi), %r14
  781. cmoveq %r14, %r8
  782. movq 184(%rsi), %r14
  783. cmoveq %r14, %r9
  784. cmpb $3, %r15b
  785. movq 256(%rsi), %r14
  786. cmoveq %r14, %rax
  787. movq 264(%rsi), %r14
  788. cmoveq %r14, %rdx
  789. movq 272(%rsi), %r14
  790. cmoveq %r14, %r8
  791. movq 280(%rsi), %r14
  792. cmoveq %r14, %r9
  793. cmpb $4, %r15b
  794. movq 352(%rsi), %r14
  795. cmoveq %r14, %rax
  796. movq 360(%rsi), %r14
  797. cmoveq %r14, %rdx
  798. movq 368(%rsi), %r14
  799. cmoveq %r14, %r8
  800. movq 376(%rsi), %r14
  801. cmoveq %r14, %r9
  802. cmpb $5, %r15b
  803. movq 448(%rsi), %r14
  804. cmoveq %r14, %rax
  805. movq 456(%rsi), %r14
  806. cmoveq %r14, %rdx
  807. movq 464(%rsi), %r14
  808. cmoveq %r14, %r8
  809. movq 472(%rsi), %r14
  810. cmoveq %r14, %r9
  811. cmpb $6, %r15b
  812. movq 544(%rsi), %r14
  813. cmoveq %r14, %rax
  814. movq 552(%rsi), %r14
  815. cmoveq %r14, %rdx
  816. movq 560(%rsi), %r14
  817. cmoveq %r14, %r8
  818. movq 568(%rsi), %r14
  819. cmoveq %r14, %r9
  820. cmpb $7, %r15b
  821. movq 640(%rsi), %r14
  822. cmoveq %r14, %rax
  823. movq 648(%rsi), %r14
  824. cmoveq %r14, %rdx
  825. movq 656(%rsi), %r14
  826. cmoveq %r14, %r8
  827. movq 664(%rsi), %r14
  828. cmoveq %r14, %r9
  829. cmpb $8, %r15b
  830. movq 736(%rsi), %r14
  831. cmoveq %r14, %rax
  832. movq 744(%rsi), %r14
  833. cmoveq %r14, %rdx
  834. movq 752(%rsi), %r14
  835. cmoveq %r14, %r8
  836. movq 760(%rsi), %r14
  837. cmoveq %r14, %r9
  838. movq $-19, %r10
  839. movq $-1, %r11
  840. movq $-1, %r12
  841. movq $0x7fffffffffffffff, %r13
  842. subq %rax, %r10
  843. sbbq %rdx, %r11
  844. sbbq %r8, %r12
  845. sbbq %r9, %r13
  846. cmpb $0x00, %cl
  847. cmovlq %r10, %rax
  848. cmovlq %r11, %rdx
  849. cmovlq %r12, %r8
  850. cmovlq %r13, %r9
  851. movq %rax, 64(%rdi)
  852. movq %rdx, 72(%rdi)
  853. movq %r8, 80(%rdi)
  854. movq %r9, 88(%rdi)
  855. popq %r15
  856. popq %r14
  857. popq %r13
  858. popq %r12
  859. repz retq
  860. #ifndef __APPLE__
  861. .size fe_cmov_table,.-fe_cmov_table
  862. #endif /* __APPLE__ */
  863. #ifndef __APPLE__
  864. .text
  865. .globl fe_mul
  866. .type fe_mul,@function
  867. .align 16
  868. fe_mul:
  869. #else
  870. .section __TEXT,__text
  871. .globl _fe_mul
  872. .p2align 4
  873. _fe_mul:
  874. #endif /* __APPLE__ */
  875. #ifndef __APPLE__
  876. jmpq *fe_mul_p(%rip)
  877. #else
  878. jmpq *_fe_mul_p(%rip)
  879. #endif /* __APPLE__ */
  880. #ifndef __APPLE__
  881. .size fe_mul,.-fe_mul
  882. #endif /* __APPLE__ */
  883. #ifndef __APPLE__
  884. .text
  885. .globl fe_sq
  886. .type fe_sq,@function
  887. .align 16
  888. fe_sq:
  889. #else
  890. .section __TEXT,__text
  891. .globl _fe_sq
  892. .p2align 4
  893. _fe_sq:
  894. #endif /* __APPLE__ */
  895. #ifndef __APPLE__
  896. jmpq *fe_sq_p(%rip)
  897. #else
  898. jmpq *_fe_sq_p(%rip)
  899. #endif /* __APPLE__ */
  900. #ifndef __APPLE__
  901. .size fe_sq,.-fe_sq
  902. #endif /* __APPLE__ */
  903. #ifndef __APPLE__
  904. .text
  905. .globl fe_mul121666
  906. .type fe_mul121666,@function
  907. .align 16
  908. fe_mul121666:
  909. #else
  910. .section __TEXT,__text
  911. .globl _fe_mul121666
  912. .p2align 4
  913. _fe_mul121666:
  914. #endif /* __APPLE__ */
  915. #ifndef __APPLE__
  916. jmpq *fe_mul121666_p(%rip)
  917. #else
  918. jmpq *_fe_mul121666_p(%rip)
  919. #endif /* __APPLE__ */
  920. #ifndef __APPLE__
  921. .size fe_mul121666,.-fe_mul121666
  922. #endif /* __APPLE__ */
  923. #ifndef __APPLE__
  924. .text
  925. .globl fe_sq2
  926. .type fe_sq2,@function
  927. .align 16
  928. fe_sq2:
  929. #else
  930. .section __TEXT,__text
  931. .globl _fe_sq2
  932. .p2align 4
  933. _fe_sq2:
  934. #endif /* __APPLE__ */
  935. #ifndef __APPLE__
  936. jmpq *fe_sq2_p(%rip)
  937. #else
  938. jmpq *_fe_sq2_p(%rip)
  939. #endif /* __APPLE__ */
  940. #ifndef __APPLE__
  941. .size fe_sq2,.-fe_sq2
  942. #endif /* __APPLE__ */
  943. #ifndef __APPLE__
  944. .text
  945. .globl fe_invert
  946. .type fe_invert,@function
  947. .align 16
  948. fe_invert:
  949. #else
  950. .section __TEXT,__text
  951. .globl _fe_invert
  952. .p2align 4
  953. _fe_invert:
  954. #endif /* __APPLE__ */
  955. #ifndef __APPLE__
  956. jmpq *fe_invert_p(%rip)
  957. #else
  958. jmpq *_fe_invert_p(%rip)
  959. #endif /* __APPLE__ */
  960. #ifndef __APPLE__
  961. .size fe_invert,.-fe_invert
  962. #endif /* __APPLE__ */
  963. #ifndef __APPLE__
  964. .text
  965. .globl curve25519
  966. .type curve25519,@function
  967. .align 16
  968. curve25519:
  969. #else
  970. .section __TEXT,__text
  971. .globl _curve25519
  972. .p2align 4
  973. _curve25519:
  974. #endif /* __APPLE__ */
  975. #ifndef __APPLE__
  976. jmpq *curve25519_p(%rip)
  977. #else
  978. jmpq *_curve25519_p(%rip)
  979. #endif /* __APPLE__ */
  980. #ifndef __APPLE__
  981. .size curve25519,.-curve25519
  982. #endif /* __APPLE__ */
  983. #ifndef __APPLE__
  984. .text
  985. .globl fe_pow22523
  986. .type fe_pow22523,@function
  987. .align 16
  988. fe_pow22523:
  989. #else
  990. .section __TEXT,__text
  991. .globl _fe_pow22523
  992. .p2align 4
  993. _fe_pow22523:
  994. #endif /* __APPLE__ */
  995. #ifndef __APPLE__
  996. jmpq *fe_pow22523_p(%rip)
  997. #else
  998. jmpq *_fe_pow22523_p(%rip)
  999. #endif /* __APPLE__ */
  1000. #ifndef __APPLE__
  1001. .size fe_pow22523,.-fe_pow22523
  1002. #endif /* __APPLE__ */
  1003. #ifndef __APPLE__
  1004. .text
  1005. .globl fe_ge_to_p2
  1006. .type fe_ge_to_p2,@function
  1007. .align 16
  1008. fe_ge_to_p2:
  1009. #else
  1010. .section __TEXT,__text
  1011. .globl _fe_ge_to_p2
  1012. .p2align 4
  1013. _fe_ge_to_p2:
  1014. #endif /* __APPLE__ */
  1015. #ifndef __APPLE__
  1016. jmpq *fe_ge_to_p2_p(%rip)
  1017. #else
  1018. jmpq *_fe_ge_to_p2_p(%rip)
  1019. #endif /* __APPLE__ */
  1020. #ifndef __APPLE__
  1021. .size fe_ge_to_p2,.-fe_ge_to_p2
  1022. #endif /* __APPLE__ */
  1023. #ifndef __APPLE__
  1024. .text
  1025. .globl fe_ge_to_p3
  1026. .type fe_ge_to_p3,@function
  1027. .align 16
  1028. fe_ge_to_p3:
  1029. #else
  1030. .section __TEXT,__text
  1031. .globl _fe_ge_to_p3
  1032. .p2align 4
  1033. _fe_ge_to_p3:
  1034. #endif /* __APPLE__ */
  1035. #ifndef __APPLE__
  1036. jmpq *fe_ge_to_p3_p(%rip)
  1037. #else
  1038. jmpq *_fe_ge_to_p3_p(%rip)
  1039. #endif /* __APPLE__ */
  1040. #ifndef __APPLE__
  1041. .size fe_ge_to_p3,.-fe_ge_to_p3
  1042. #endif /* __APPLE__ */
  1043. #ifndef __APPLE__
  1044. .text
  1045. .globl fe_ge_dbl
  1046. .type fe_ge_dbl,@function
  1047. .align 16
  1048. fe_ge_dbl:
  1049. #else
  1050. .section __TEXT,__text
  1051. .globl _fe_ge_dbl
  1052. .p2align 4
  1053. _fe_ge_dbl:
  1054. #endif /* __APPLE__ */
  1055. #ifndef __APPLE__
  1056. jmpq *fe_ge_dbl_p(%rip)
  1057. #else
  1058. jmpq *_fe_ge_dbl_p(%rip)
  1059. #endif /* __APPLE__ */
  1060. #ifndef __APPLE__
  1061. .size fe_ge_dbl,.-fe_ge_dbl
  1062. #endif /* __APPLE__ */
  1063. #ifndef __APPLE__
  1064. .text
  1065. .globl fe_ge_madd
  1066. .type fe_ge_madd,@function
  1067. .align 16
  1068. fe_ge_madd:
  1069. #else
  1070. .section __TEXT,__text
  1071. .globl _fe_ge_madd
  1072. .p2align 4
  1073. _fe_ge_madd:
  1074. #endif /* __APPLE__ */
  1075. #ifndef __APPLE__
  1076. jmpq *fe_ge_madd_p(%rip)
  1077. #else
  1078. jmpq *_fe_ge_madd_p(%rip)
  1079. #endif /* __APPLE__ */
  1080. #ifndef __APPLE__
  1081. .size fe_ge_madd,.-fe_ge_madd
  1082. #endif /* __APPLE__ */
  1083. #ifndef __APPLE__
  1084. .text
  1085. .globl fe_ge_msub
  1086. .type fe_ge_msub,@function
  1087. .align 16
  1088. fe_ge_msub:
  1089. #else
  1090. .section __TEXT,__text
  1091. .globl _fe_ge_msub
  1092. .p2align 4
  1093. _fe_ge_msub:
  1094. #endif /* __APPLE__ */
  1095. #ifndef __APPLE__
  1096. jmpq *fe_ge_msub_p(%rip)
  1097. #else
  1098. jmpq *_fe_ge_msub_p(%rip)
  1099. #endif /* __APPLE__ */
  1100. #ifndef __APPLE__
  1101. .size fe_ge_msub,.-fe_ge_msub
  1102. #endif /* __APPLE__ */
  1103. #ifndef __APPLE__
  1104. .text
  1105. .globl fe_ge_add
  1106. .type fe_ge_add,@function
  1107. .align 16
  1108. fe_ge_add:
  1109. #else
  1110. .section __TEXT,__text
  1111. .globl _fe_ge_add
  1112. .p2align 4
  1113. _fe_ge_add:
  1114. #endif /* __APPLE__ */
  1115. #ifndef __APPLE__
  1116. jmpq *fe_ge_add_p(%rip)
  1117. #else
  1118. jmpq *_fe_ge_add_p(%rip)
  1119. #endif /* __APPLE__ */
  1120. #ifndef __APPLE__
  1121. .size fe_ge_add,.-fe_ge_add
  1122. #endif /* __APPLE__ */
  1123. #ifndef __APPLE__
  1124. .text
  1125. .globl fe_ge_sub
  1126. .type fe_ge_sub,@function
  1127. .align 16
  1128. fe_ge_sub:
  1129. #else
  1130. .section __TEXT,__text
  1131. .globl _fe_ge_sub
  1132. .p2align 4
  1133. _fe_ge_sub:
  1134. #endif /* __APPLE__ */
  1135. #ifndef __APPLE__
  1136. jmpq *fe_ge_sub_p(%rip)
  1137. #else
  1138. jmpq *_fe_ge_sub_p(%rip)
  1139. #endif /* __APPLE__ */
  1140. #ifndef __APPLE__
  1141. .size fe_ge_sub,.-fe_ge_sub
  1142. #endif /* __APPLE__ */
  1143. #ifndef __APPLE__
  1144. .data
  1145. .type cpuFlagsSet, @object
  1146. .size cpuFlagsSet,4
  1147. cpuFlagsSet:
  1148. .long 0
  1149. #else
  1150. .section __DATA,__data
  1151. .p2align 2
  1152. _cpuFlagsSet:
  1153. .long 0
  1154. #endif /* __APPLE__ */
  1155. #ifndef __APPLE__
  1156. .data
  1157. .type intelFlags, @object
  1158. .size intelFlags,4
  1159. intelFlags:
  1160. .long 0
  1161. #else
  1162. .section __DATA,__data
  1163. .p2align 2
  1164. _intelFlags:
  1165. .long 0
  1166. #endif /* __APPLE__ */
  1167. #ifndef __APPLE__
  1168. .data
  1169. .type fe_mul_p, @object
  1170. .size fe_mul_p,8
  1171. fe_mul_p:
  1172. .quad fe_mul_x64
  1173. #else
  1174. .section __DATA,__data
  1175. .p2align 2
  1176. _fe_mul_p:
  1177. .quad _fe_mul_x64
  1178. #endif /* __APPLE__ */
  1179. #ifndef __APPLE__
  1180. .data
  1181. .type fe_sq_p, @object
  1182. .size fe_sq_p,8
  1183. fe_sq_p:
  1184. .quad fe_sq_x64
  1185. #else
  1186. .section __DATA,__data
  1187. .p2align 2
  1188. _fe_sq_p:
  1189. .quad _fe_sq_x64
  1190. #endif /* __APPLE__ */
  1191. #ifndef __APPLE__
  1192. .data
  1193. .type fe_mul121666_p, @object
  1194. .size fe_mul121666_p,8
  1195. fe_mul121666_p:
  1196. .quad fe_mul121666_x64
  1197. #else
  1198. .section __DATA,__data
  1199. .p2align 2
  1200. _fe_mul121666_p:
  1201. .quad _fe_mul121666_x64
  1202. #endif /* __APPLE__ */
  1203. #ifndef __APPLE__
  1204. .data
  1205. .type fe_sq2_p, @object
  1206. .size fe_sq2_p,8
  1207. fe_sq2_p:
  1208. .quad fe_sq2_x64
  1209. #else
  1210. .section __DATA,__data
  1211. .p2align 2
  1212. _fe_sq2_p:
  1213. .quad _fe_sq2_x64
  1214. #endif /* __APPLE__ */
  1215. #ifndef __APPLE__
  1216. .data
  1217. .type fe_invert_p, @object
  1218. .size fe_invert_p,8
  1219. fe_invert_p:
  1220. .quad fe_invert_x64
  1221. #else
  1222. .section __DATA,__data
  1223. .p2align 2
  1224. _fe_invert_p:
  1225. .quad _fe_invert_x64
  1226. #endif /* __APPLE__ */
  1227. #ifndef __APPLE__
  1228. .data
  1229. .type curve25519_p, @object
  1230. .size curve25519_p,8
  1231. curve25519_p:
  1232. .quad curve25519_x64
  1233. #else
  1234. .section __DATA,__data
  1235. .p2align 2
  1236. _curve25519_p:
  1237. .quad _curve25519_x64
  1238. #endif /* __APPLE__ */
  1239. #ifndef __APPLE__
  1240. .data
  1241. .type fe_pow22523_p, @object
  1242. .size fe_pow22523_p,8
  1243. fe_pow22523_p:
  1244. .quad fe_pow22523_x64
  1245. #else
  1246. .section __DATA,__data
  1247. .p2align 2
  1248. _fe_pow22523_p:
  1249. .quad _fe_pow22523_x64
  1250. #endif /* __APPLE__ */
  1251. #ifndef __APPLE__
  1252. .data
  1253. .type fe_ge_to_p2_p, @object
  1254. .size fe_ge_to_p2_p,8
  1255. fe_ge_to_p2_p:
  1256. .quad fe_ge_to_p2_x64
  1257. #else
  1258. .section __DATA,__data
  1259. .p2align 2
  1260. _fe_ge_to_p2_p:
  1261. .quad _fe_ge_to_p2_x64
  1262. #endif /* __APPLE__ */
  1263. #ifndef __APPLE__
  1264. .data
  1265. .type fe_ge_to_p3_p, @object
  1266. .size fe_ge_to_p3_p,8
  1267. fe_ge_to_p3_p:
  1268. .quad fe_ge_to_p3_x64
  1269. #else
  1270. .section __DATA,__data
  1271. .p2align 2
  1272. _fe_ge_to_p3_p:
  1273. .quad _fe_ge_to_p3_x64
  1274. #endif /* __APPLE__ */
  1275. #ifndef __APPLE__
  1276. .data
  1277. .type fe_ge_dbl_p, @object
  1278. .size fe_ge_dbl_p,8
  1279. fe_ge_dbl_p:
  1280. .quad fe_ge_dbl_x64
  1281. #else
  1282. .section __DATA,__data
  1283. .p2align 2
  1284. _fe_ge_dbl_p:
  1285. .quad _fe_ge_dbl_x64
  1286. #endif /* __APPLE__ */
  1287. #ifndef __APPLE__
  1288. .data
  1289. .type fe_ge_madd_p, @object
  1290. .size fe_ge_madd_p,8
  1291. fe_ge_madd_p:
  1292. .quad fe_ge_madd_x64
  1293. #else
  1294. .section __DATA,__data
  1295. .p2align 2
  1296. _fe_ge_madd_p:
  1297. .quad _fe_ge_madd_x64
  1298. #endif /* __APPLE__ */
  1299. #ifndef __APPLE__
  1300. .data
  1301. .type fe_ge_msub_p, @object
  1302. .size fe_ge_msub_p,8
  1303. fe_ge_msub_p:
  1304. .quad fe_ge_msub_x64
  1305. #else
  1306. .section __DATA,__data
  1307. .p2align 2
  1308. _fe_ge_msub_p:
  1309. .quad _fe_ge_msub_x64
  1310. #endif /* __APPLE__ */
  1311. #ifndef __APPLE__
  1312. .data
  1313. .type fe_ge_add_p, @object
  1314. .size fe_ge_add_p,8
  1315. fe_ge_add_p:
  1316. .quad fe_ge_add_x64
  1317. #else
  1318. .section __DATA,__data
  1319. .p2align 2
  1320. _fe_ge_add_p:
  1321. .quad _fe_ge_add_x64
  1322. #endif /* __APPLE__ */
  1323. #ifndef __APPLE__
  1324. .data
  1325. .type fe_ge_sub_p, @object
  1326. .size fe_ge_sub_p,8
  1327. fe_ge_sub_p:
  1328. .quad fe_ge_sub_x64
  1329. #else
  1330. .section __DATA,__data
  1331. .p2align 2
  1332. _fe_ge_sub_p:
  1333. .quad _fe_ge_sub_x64
  1334. #endif /* __APPLE__ */
  1335. #ifndef __APPLE__
  1336. .text
  1337. .globl fe_mul_x64
  1338. .type fe_mul_x64,@function
  1339. .align 16
  1340. fe_mul_x64:
  1341. #else
  1342. .section __TEXT,__text
  1343. .globl _fe_mul_x64
  1344. .p2align 4
  1345. _fe_mul_x64:
  1346. #endif /* __APPLE__ */
  1347. pushq %r12
  1348. pushq %r13
  1349. pushq %r14
  1350. pushq %r15
  1351. pushq %rbx
  1352. movq %rdx, %rcx
  1353. # Multiply
  1354. # A[0] * B[0]
  1355. movq (%rcx), %rax
  1356. mulq (%rsi)
  1357. movq %rax, %r8
  1358. movq %rdx, %r9
  1359. # A[0] * B[1]
  1360. movq 8(%rcx), %rax
  1361. mulq (%rsi)
  1362. xorq %r10, %r10
  1363. addq %rax, %r9
  1364. adcq %rdx, %r10
  1365. # A[1] * B[0]
  1366. movq (%rcx), %rax
  1367. mulq 8(%rsi)
  1368. xorq %r11, %r11
  1369. addq %rax, %r9
  1370. adcq %rdx, %r10
  1371. adcq $0x00, %r11
  1372. # A[0] * B[2]
  1373. movq 16(%rcx), %rax
  1374. mulq (%rsi)
  1375. addq %rax, %r10
  1376. adcq %rdx, %r11
  1377. # A[1] * B[1]
  1378. movq 8(%rcx), %rax
  1379. mulq 8(%rsi)
  1380. xorq %r12, %r12
  1381. addq %rax, %r10
  1382. adcq %rdx, %r11
  1383. adcq $0x00, %r12
  1384. # A[2] * B[0]
  1385. movq (%rcx), %rax
  1386. mulq 16(%rsi)
  1387. addq %rax, %r10
  1388. adcq %rdx, %r11
  1389. adcq $0x00, %r12
  1390. # A[0] * B[3]
  1391. movq 24(%rcx), %rax
  1392. mulq (%rsi)
  1393. xorq %r13, %r13
  1394. addq %rax, %r11
  1395. adcq %rdx, %r12
  1396. adcq $0x00, %r13
  1397. # A[1] * B[2]
  1398. movq 16(%rcx), %rax
  1399. mulq 8(%rsi)
  1400. addq %rax, %r11
  1401. adcq %rdx, %r12
  1402. adcq $0x00, %r13
  1403. # A[2] * B[1]
  1404. movq 8(%rcx), %rax
  1405. mulq 16(%rsi)
  1406. addq %rax, %r11
  1407. adcq %rdx, %r12
  1408. adcq $0x00, %r13
  1409. # A[3] * B[0]
  1410. movq (%rcx), %rax
  1411. mulq 24(%rsi)
  1412. addq %rax, %r11
  1413. adcq %rdx, %r12
  1414. adcq $0x00, %r13
  1415. # A[1] * B[3]
  1416. movq 24(%rcx), %rax
  1417. mulq 8(%rsi)
  1418. xorq %r14, %r14
  1419. addq %rax, %r12
  1420. adcq %rdx, %r13
  1421. adcq $0x00, %r14
  1422. # A[2] * B[2]
  1423. movq 16(%rcx), %rax
  1424. mulq 16(%rsi)
  1425. addq %rax, %r12
  1426. adcq %rdx, %r13
  1427. adcq $0x00, %r14
  1428. # A[3] * B[1]
  1429. movq 8(%rcx), %rax
  1430. mulq 24(%rsi)
  1431. addq %rax, %r12
  1432. adcq %rdx, %r13
  1433. adcq $0x00, %r14
  1434. # A[2] * B[3]
  1435. movq 24(%rcx), %rax
  1436. mulq 16(%rsi)
  1437. xorq %r15, %r15
  1438. addq %rax, %r13
  1439. adcq %rdx, %r14
  1440. adcq $0x00, %r15
  1441. # A[3] * B[2]
  1442. movq 16(%rcx), %rax
  1443. mulq 24(%rsi)
  1444. addq %rax, %r13
  1445. adcq %rdx, %r14
  1446. adcq $0x00, %r15
  1447. # A[3] * B[3]
  1448. movq 24(%rcx), %rax
  1449. mulq 24(%rsi)
  1450. addq %rax, %r14
  1451. adcq %rdx, %r15
  1452. # Reduce
  1453. movq $0x7fffffffffffffff, %rbx
  1454. # Move top half into t4-t7 and remove top bit from t3
  1455. shldq $0x01, %r14, %r15
  1456. shldq $0x01, %r13, %r14
  1457. shldq $0x01, %r12, %r13
  1458. shldq $0x01, %r11, %r12
  1459. andq %rbx, %r11
  1460. # Multiply top half by 19
  1461. movq $19, %rax
  1462. mulq %r12
  1463. xorq %r12, %r12
  1464. addq %rax, %r8
  1465. movq $19, %rax
  1466. adcq %rdx, %r12
  1467. mulq %r13
  1468. xorq %r13, %r13
  1469. addq %rax, %r9
  1470. movq $19, %rax
  1471. adcq %rdx, %r13
  1472. mulq %r14
  1473. xorq %r14, %r14
  1474. addq %rax, %r10
  1475. movq $19, %rax
  1476. adcq %rdx, %r14
  1477. mulq %r15
  1478. # Add remaining product results in
  1479. addq %r12, %r9
  1480. adcq %r13, %r10
  1481. adcq %r14, %r11
  1482. adcq %rax, %r11
  1483. adcq $0x00, %rdx
  1484. # Overflow
  1485. shldq $0x01, %r11, %rdx
  1486. imulq $19, %rdx, %rax
  1487. andq %rbx, %r11
  1488. addq %rax, %r8
  1489. adcq $0x00, %r9
  1490. adcq $0x00, %r10
  1491. adcq $0x00, %r11
  1492. # Reduce if top bit set
  1493. movq %r11, %rdx
  1494. shrq $63, %rdx
  1495. imulq $19, %rdx, %rax
  1496. andq %rbx, %r11
  1497. addq %rax, %r8
  1498. adcq $0x00, %r9
  1499. adcq $0x00, %r10
  1500. adcq $0x00, %r11
  1501. # Store
  1502. movq %r8, (%rdi)
  1503. movq %r9, 8(%rdi)
  1504. movq %r10, 16(%rdi)
  1505. movq %r11, 24(%rdi)
  1506. popq %rbx
  1507. popq %r15
  1508. popq %r14
  1509. popq %r13
  1510. popq %r12
  1511. repz retq
  1512. #ifndef __APPLE__
  1513. .size fe_mul_x64,.-fe_mul_x64
  1514. #endif /* __APPLE__ */
  1515. #ifndef __APPLE__
  1516. .text
  1517. .globl fe_sq_x64
  1518. .type fe_sq_x64,@function
  1519. .align 16
  1520. fe_sq_x64:
  1521. #else
  1522. .section __TEXT,__text
  1523. .globl _fe_sq_x64
  1524. .p2align 4
  1525. _fe_sq_x64:
  1526. #endif /* __APPLE__ */
  1527. pushq %r12
  1528. pushq %r13
  1529. pushq %r14
  1530. pushq %r15
  1531. # Square
  1532. # A[0] * A[1]
  1533. movq (%rsi), %rax
  1534. mulq 8(%rsi)
  1535. movq %rax, %r8
  1536. movq %rdx, %r9
  1537. # A[0] * A[2]
  1538. movq (%rsi), %rax
  1539. mulq 16(%rsi)
  1540. xorq %r10, %r10
  1541. addq %rax, %r9
  1542. adcq %rdx, %r10
  1543. # A[0] * A[3]
  1544. movq (%rsi), %rax
  1545. mulq 24(%rsi)
  1546. xorq %r11, %r11
  1547. addq %rax, %r10
  1548. adcq %rdx, %r11
  1549. # A[1] * A[2]
  1550. movq 8(%rsi), %rax
  1551. mulq 16(%rsi)
  1552. xorq %r12, %r12
  1553. addq %rax, %r10
  1554. adcq %rdx, %r11
  1555. adcq $0x00, %r12
  1556. # A[1] * A[3]
  1557. movq 8(%rsi), %rax
  1558. mulq 24(%rsi)
  1559. addq %rax, %r11
  1560. adcq %rdx, %r12
  1561. # A[2] * A[3]
  1562. movq 16(%rsi), %rax
  1563. mulq 24(%rsi)
  1564. xorq %r13, %r13
  1565. addq %rax, %r12
  1566. adcq %rdx, %r13
  1567. # Double
  1568. xorq %r14, %r14
  1569. addq %r8, %r8
  1570. adcq %r9, %r9
  1571. adcq %r10, %r10
  1572. adcq %r11, %r11
  1573. adcq %r12, %r12
  1574. adcq %r13, %r13
  1575. adcq $0x00, %r14
  1576. # A[0] * A[0]
  1577. movq (%rsi), %rax
  1578. mulq %rax
  1579. movq %rax, %rcx
  1580. movq %rdx, %r15
  1581. # A[1] * A[1]
  1582. movq 8(%rsi), %rax
  1583. mulq %rax
  1584. addq %r15, %r8
  1585. adcq %rax, %r9
  1586. adcq $0x00, %rdx
  1587. movq %rdx, %r15
  1588. # A[2] * A[2]
  1589. movq 16(%rsi), %rax
  1590. mulq %rax
  1591. addq %r15, %r10
  1592. adcq %rax, %r11
  1593. adcq $0x00, %rdx
  1594. movq %rdx, %r15
  1595. # A[3] * A[3]
  1596. movq 24(%rsi), %rax
  1597. mulq %rax
  1598. addq %rax, %r13
  1599. adcq %rdx, %r14
  1600. addq %r15, %r12
  1601. adcq $0x00, %r13
  1602. adcq $0x00, %r14
  1603. # Reduce
  1604. movq $0x7fffffffffffffff, %r15
  1605. # Move top half into t4-t7 and remove top bit from t3
  1606. shldq $0x01, %r13, %r14
  1607. shldq $0x01, %r12, %r13
  1608. shldq $0x01, %r11, %r12
  1609. shldq $0x01, %r10, %r11
  1610. andq %r15, %r10
  1611. # Multiply top half by 19
  1612. movq $19, %rax
  1613. mulq %r11
  1614. xorq %r11, %r11
  1615. addq %rax, %rcx
  1616. movq $19, %rax
  1617. adcq %rdx, %r11
  1618. mulq %r12
  1619. xorq %r12, %r12
  1620. addq %rax, %r8
  1621. movq $19, %rax
  1622. adcq %rdx, %r12
  1623. mulq %r13
  1624. xorq %r13, %r13
  1625. addq %rax, %r9
  1626. movq $19, %rax
  1627. adcq %rdx, %r13
  1628. mulq %r14
  1629. # Add remaining product results in
  1630. addq %r11, %r8
  1631. adcq %r12, %r9
  1632. adcq %r13, %r10
  1633. adcq %rax, %r10
  1634. adcq $0x00, %rdx
  1635. # Overflow
  1636. shldq $0x01, %r10, %rdx
  1637. imulq $19, %rdx, %rax
  1638. andq %r15, %r10
  1639. addq %rax, %rcx
  1640. adcq $0x00, %r8
  1641. adcq $0x00, %r9
  1642. adcq $0x00, %r10
  1643. # Reduce if top bit set
  1644. movq %r10, %rdx
  1645. shrq $63, %rdx
  1646. imulq $19, %rdx, %rax
  1647. andq %r15, %r10
  1648. addq %rax, %rcx
  1649. adcq $0x00, %r8
  1650. adcq $0x00, %r9
  1651. adcq $0x00, %r10
  1652. # Store
  1653. movq %rcx, (%rdi)
  1654. movq %r8, 8(%rdi)
  1655. movq %r9, 16(%rdi)
  1656. movq %r10, 24(%rdi)
  1657. popq %r15
  1658. popq %r14
  1659. popq %r13
  1660. popq %r12
  1661. repz retq
  1662. #ifndef __APPLE__
  1663. .size fe_sq_x64,.-fe_sq_x64
  1664. #endif /* __APPLE__ */
  1665. #ifndef __APPLE__
  1666. .text
  1667. .globl fe_sq_n_x64
  1668. .type fe_sq_n_x64,@function
  1669. .align 16
  1670. fe_sq_n_x64:
  1671. #else
  1672. .section __TEXT,__text
  1673. .globl _fe_sq_n_x64
  1674. .p2align 4
  1675. _fe_sq_n_x64:
  1676. #endif /* __APPLE__ */
  1677. pushq %r12
  1678. pushq %r13
  1679. pushq %r14
  1680. pushq %r15
  1681. pushq %rbx
  1682. movq %rdx, %rcx
  1683. L_fe_sq_n_x64:
  1684. # Square
  1685. # A[0] * A[1]
  1686. movq (%rsi), %rax
  1687. mulq 8(%rsi)
  1688. movq %rax, %r9
  1689. movq %rdx, %r10
  1690. # A[0] * A[2]
  1691. movq (%rsi), %rax
  1692. mulq 16(%rsi)
  1693. xorq %r11, %r11
  1694. addq %rax, %r10
  1695. adcq %rdx, %r11
  1696. # A[0] * A[3]
  1697. movq (%rsi), %rax
  1698. mulq 24(%rsi)
  1699. xorq %r12, %r12
  1700. addq %rax, %r11
  1701. adcq %rdx, %r12
  1702. # A[1] * A[2]
  1703. movq 8(%rsi), %rax
  1704. mulq 16(%rsi)
  1705. xorq %r13, %r13
  1706. addq %rax, %r11
  1707. adcq %rdx, %r12
  1708. adcq $0x00, %r13
  1709. # A[1] * A[3]
  1710. movq 8(%rsi), %rax
  1711. mulq 24(%rsi)
  1712. addq %rax, %r12
  1713. adcq %rdx, %r13
  1714. # A[2] * A[3]
  1715. movq 16(%rsi), %rax
  1716. mulq 24(%rsi)
  1717. xorq %r14, %r14
  1718. addq %rax, %r13
  1719. adcq %rdx, %r14
  1720. # Double
  1721. xorq %r15, %r15
  1722. addq %r9, %r9
  1723. adcq %r10, %r10
  1724. adcq %r11, %r11
  1725. adcq %r12, %r12
  1726. adcq %r13, %r13
  1727. adcq %r14, %r14
  1728. adcq $0x00, %r15
  1729. # A[0] * A[0]
  1730. movq (%rsi), %rax
  1731. mulq %rax
  1732. movq %rax, %r8
  1733. movq %rdx, %rbx
  1734. # A[1] * A[1]
  1735. movq 8(%rsi), %rax
  1736. mulq %rax
  1737. addq %rbx, %r9
  1738. adcq %rax, %r10
  1739. adcq $0x00, %rdx
  1740. movq %rdx, %rbx
  1741. # A[2] * A[2]
  1742. movq 16(%rsi), %rax
  1743. mulq %rax
  1744. addq %rbx, %r11
  1745. adcq %rax, %r12
  1746. adcq $0x00, %rdx
  1747. movq %rdx, %rbx
  1748. # A[3] * A[3]
  1749. movq 24(%rsi), %rax
  1750. mulq %rax
  1751. addq %rax, %r14
  1752. adcq %rdx, %r15
  1753. addq %rbx, %r13
  1754. adcq $0x00, %r14
  1755. adcq $0x00, %r15
  1756. # Reduce
  1757. movq $0x7fffffffffffffff, %rbx
  1758. # Move top half into t4-t7 and remove top bit from t3
  1759. shldq $0x01, %r14, %r15
  1760. shldq $0x01, %r13, %r14
  1761. shldq $0x01, %r12, %r13
  1762. shldq $0x01, %r11, %r12
  1763. andq %rbx, %r11
  1764. # Multiply top half by 19
  1765. movq $19, %rax
  1766. mulq %r12
  1767. xorq %r12, %r12
  1768. addq %rax, %r8
  1769. movq $19, %rax
  1770. adcq %rdx, %r12
  1771. mulq %r13
  1772. xorq %r13, %r13
  1773. addq %rax, %r9
  1774. movq $19, %rax
  1775. adcq %rdx, %r13
  1776. mulq %r14
  1777. xorq %r14, %r14
  1778. addq %rax, %r10
  1779. movq $19, %rax
  1780. adcq %rdx, %r14
  1781. mulq %r15
  1782. # Add remaining product results in
  1783. addq %r12, %r9
  1784. adcq %r13, %r10
  1785. adcq %r14, %r11
  1786. adcq %rax, %r11
  1787. adcq $0x00, %rdx
  1788. # Overflow
  1789. shldq $0x01, %r11, %rdx
  1790. imulq $19, %rdx, %rax
  1791. andq %rbx, %r11
  1792. addq %rax, %r8
  1793. adcq $0x00, %r9
  1794. adcq $0x00, %r10
  1795. adcq $0x00, %r11
  1796. # Reduce if top bit set
  1797. movq %r11, %rdx
  1798. shrq $63, %rdx
  1799. imulq $19, %rdx, %rax
  1800. andq %rbx, %r11
  1801. addq %rax, %r8
  1802. adcq $0x00, %r9
  1803. adcq $0x00, %r10
  1804. adcq $0x00, %r11
  1805. # Store
  1806. movq %r8, (%rdi)
  1807. movq %r9, 8(%rdi)
  1808. movq %r10, 16(%rdi)
  1809. movq %r11, 24(%rdi)
  1810. decb %cl
  1811. jnz L_fe_sq_n_x64
  1812. popq %rbx
  1813. popq %r15
  1814. popq %r14
  1815. popq %r13
  1816. popq %r12
  1817. repz retq
  1818. #ifndef __APPLE__
  1819. .size fe_sq_n_x64,.-fe_sq_n_x64
  1820. #endif /* __APPLE__ */
  1821. #ifndef __APPLE__
  1822. .text
  1823. .globl fe_mul121666_x64
  1824. .type fe_mul121666_x64,@function
  1825. .align 16
  1826. fe_mul121666_x64:
  1827. #else
  1828. .section __TEXT,__text
  1829. .globl _fe_mul121666_x64
  1830. .p2align 4
  1831. _fe_mul121666_x64:
  1832. #endif /* __APPLE__ */
  1833. pushq %r12
  1834. # Multiply by 121666
  1835. movq $0x1db42, %rax
  1836. mulq (%rsi)
  1837. xorq %r10, %r10
  1838. movq %rax, %r8
  1839. movq %rdx, %r9
  1840. movq $0x1db42, %rax
  1841. mulq 8(%rsi)
  1842. xorq %r11, %r11
  1843. addq %rax, %r9
  1844. adcq %rdx, %r10
  1845. movq $0x1db42, %rax
  1846. mulq 16(%rsi)
  1847. xorq %r12, %r12
  1848. addq %rax, %r10
  1849. adcq %rdx, %r11
  1850. movq $0x1db42, %rax
  1851. mulq 24(%rsi)
  1852. movq $0x7fffffffffffffff, %rcx
  1853. addq %rax, %r11
  1854. adcq %rdx, %r12
  1855. shldq $0x01, %r11, %r12
  1856. andq %rcx, %r11
  1857. movq $19, %rax
  1858. mulq %r12
  1859. addq %rax, %r8
  1860. adcq $0x00, %r9
  1861. adcq $0x00, %r10
  1862. adcq $0x00, %r11
  1863. movq %r8, (%rdi)
  1864. movq %r9, 8(%rdi)
  1865. movq %r10, 16(%rdi)
  1866. movq %r11, 24(%rdi)
  1867. popq %r12
  1868. repz retq
  1869. #ifndef __APPLE__
  1870. .size fe_mul121666_x64,.-fe_mul121666_x64
  1871. #endif /* __APPLE__ */
  1872. #ifndef __APPLE__
  1873. .text
  1874. .globl fe_sq2_x64
  1875. .type fe_sq2_x64,@function
  1876. .align 16
  1877. fe_sq2_x64:
  1878. #else
  1879. .section __TEXT,__text
  1880. .globl _fe_sq2_x64
  1881. .p2align 4
  1882. _fe_sq2_x64:
  1883. #endif /* __APPLE__ */
  1884. pushq %r12
  1885. pushq %r13
  1886. pushq %r14
  1887. pushq %r15
  1888. pushq %rbx
  1889. # Square * 2
  1890. # A[0] * A[1]
  1891. movq (%rsi), %rax
  1892. mulq 8(%rsi)
  1893. movq %rax, %r8
  1894. movq %rdx, %r9
  1895. # A[0] * A[2]
  1896. movq (%rsi), %rax
  1897. mulq 16(%rsi)
  1898. xorq %r10, %r10
  1899. addq %rax, %r9
  1900. adcq %rdx, %r10
  1901. # A[0] * A[3]
  1902. movq (%rsi), %rax
  1903. mulq 24(%rsi)
  1904. xorq %r11, %r11
  1905. addq %rax, %r10
  1906. adcq %rdx, %r11
  1907. # A[1] * A[2]
  1908. movq 8(%rsi), %rax
  1909. mulq 16(%rsi)
  1910. xorq %r12, %r12
  1911. addq %rax, %r10
  1912. adcq %rdx, %r11
  1913. adcq $0x00, %r12
  1914. # A[1] * A[3]
  1915. movq 8(%rsi), %rax
  1916. mulq 24(%rsi)
  1917. addq %rax, %r11
  1918. adcq %rdx, %r12
  1919. # A[2] * A[3]
  1920. movq 16(%rsi), %rax
  1921. mulq 24(%rsi)
  1922. xorq %r13, %r13
  1923. addq %rax, %r12
  1924. adcq %rdx, %r13
  1925. # Double
  1926. xorq %r14, %r14
  1927. addq %r8, %r8
  1928. adcq %r9, %r9
  1929. adcq %r10, %r10
  1930. adcq %r11, %r11
  1931. adcq %r12, %r12
  1932. adcq %r13, %r13
  1933. adcq $0x00, %r14
  1934. # A[0] * A[0]
  1935. movq (%rsi), %rax
  1936. mulq %rax
  1937. movq %rax, %rcx
  1938. movq %rdx, %r15
  1939. # A[1] * A[1]
  1940. movq 8(%rsi), %rax
  1941. mulq %rax
  1942. addq %r15, %r8
  1943. adcq %rax, %r9
  1944. adcq $0x00, %rdx
  1945. movq %rdx, %r15
  1946. # A[2] * A[2]
  1947. movq 16(%rsi), %rax
  1948. mulq %rax
  1949. addq %r15, %r10
  1950. adcq %rax, %r11
  1951. adcq $0x00, %rdx
  1952. movq %rdx, %r15
  1953. # A[3] * A[3]
  1954. movq 24(%rsi), %rax
  1955. mulq %rax
  1956. addq %rax, %r13
  1957. adcq %rdx, %r14
  1958. addq %r15, %r12
  1959. adcq $0x00, %r13
  1960. adcq $0x00, %r14
  1961. # Reduce
  1962. movq $0x7fffffffffffffff, %rbx
  1963. xorq %rax, %rax
  1964. # Move top half into t4-t7 and remove top bit from t3
  1965. shldq $3, %r14, %rax
  1966. shldq $2, %r13, %r14
  1967. shldq $2, %r12, %r13
  1968. shldq $2, %r11, %r12
  1969. shldq $2, %r10, %r11
  1970. shldq $0x01, %r9, %r10
  1971. shldq $0x01, %r8, %r9
  1972. shldq $0x01, %rcx, %r8
  1973. shlq $0x01, %rcx
  1974. andq %rbx, %r10
  1975. # Two out left, one in right
  1976. andq %rbx, %r14
  1977. # Multiply top bits by 19*19
  1978. imulq $0x169, %rax, %r15
  1979. # Multiply top half by 19
  1980. movq $19, %rax
  1981. mulq %r11
  1982. xorq %r11, %r11
  1983. addq %rax, %rcx
  1984. movq $19, %rax
  1985. adcq %rdx, %r11
  1986. mulq %r12
  1987. xorq %r12, %r12
  1988. addq %rax, %r8
  1989. movq $19, %rax
  1990. adcq %rdx, %r12
  1991. mulq %r13
  1992. xorq %r13, %r13
  1993. addq %rax, %r9
  1994. movq $19, %rax
  1995. adcq %rdx, %r13
  1996. mulq %r14
  1997. # Add remaining produce results in
  1998. addq %r15, %rcx
  1999. adcq %r11, %r8
  2000. adcq %r12, %r9
  2001. adcq %r13, %r10
  2002. adcq %rax, %r10
  2003. adcq $0x00, %rdx
  2004. # Overflow
  2005. shldq $0x01, %r10, %rdx
  2006. imulq $19, %rdx, %rax
  2007. andq %rbx, %r10
  2008. addq %rax, %rcx
  2009. adcq $0x00, %r8
  2010. adcq $0x00, %r9
  2011. adcq $0x00, %r10
  2012. # Reduce if top bit set
  2013. movq %r10, %rdx
  2014. shrq $63, %rdx
  2015. imulq $19, %rdx, %rax
  2016. andq %rbx, %r10
  2017. addq %rax, %rcx
  2018. adcq $0x00, %r8
  2019. adcq $0x00, %r9
  2020. adcq $0x00, %r10
  2021. # Store
  2022. movq %rcx, (%rdi)
  2023. movq %r8, 8(%rdi)
  2024. movq %r9, 16(%rdi)
  2025. movq %r10, 24(%rdi)
  2026. popq %rbx
  2027. popq %r15
  2028. popq %r14
  2029. popq %r13
  2030. popq %r12
  2031. repz retq
  2032. #ifndef __APPLE__
  2033. .size fe_sq2_x64,.-fe_sq2_x64
  2034. #endif /* __APPLE__ */
  2035. #ifndef __APPLE__
  2036. .text
  2037. .globl fe_invert_x64
  2038. .type fe_invert_x64,@function
  2039. .align 16
  2040. fe_invert_x64:
  2041. #else
  2042. .section __TEXT,__text
  2043. .globl _fe_invert_x64
  2044. .p2align 4
  2045. _fe_invert_x64:
  2046. #endif /* __APPLE__ */
  2047. subq $0x90, %rsp
  2048. # Invert
  2049. movq %rdi, 128(%rsp)
  2050. movq %rsi, 136(%rsp)
  2051. movq %rsp, %rdi
  2052. movq 136(%rsp), %rsi
  2053. #ifndef __APPLE__
  2054. callq fe_sq_x64@plt
  2055. #else
  2056. callq _fe_sq_x64
  2057. #endif /* __APPLE__ */
  2058. leaq 32(%rsp), %rdi
  2059. movq %rsp, %rsi
  2060. #ifndef __APPLE__
  2061. callq fe_sq_x64@plt
  2062. #else
  2063. callq _fe_sq_x64
  2064. #endif /* __APPLE__ */
  2065. leaq 32(%rsp), %rdi
  2066. leaq 32(%rsp), %rsi
  2067. #ifndef __APPLE__
  2068. callq fe_sq_x64@plt
  2069. #else
  2070. callq _fe_sq_x64
  2071. #endif /* __APPLE__ */
  2072. leaq 32(%rsp), %rdi
  2073. movq 136(%rsp), %rsi
  2074. leaq 32(%rsp), %rdx
  2075. #ifndef __APPLE__
  2076. callq fe_mul_x64@plt
  2077. #else
  2078. callq _fe_mul_x64
  2079. #endif /* __APPLE__ */
  2080. movq %rsp, %rdi
  2081. movq %rsp, %rsi
  2082. leaq 32(%rsp), %rdx
  2083. #ifndef __APPLE__
  2084. callq fe_mul_x64@plt
  2085. #else
  2086. callq _fe_mul_x64
  2087. #endif /* __APPLE__ */
  2088. leaq 64(%rsp), %rdi
  2089. movq %rsp, %rsi
  2090. #ifndef __APPLE__
  2091. callq fe_sq_x64@plt
  2092. #else
  2093. callq _fe_sq_x64
  2094. #endif /* __APPLE__ */
  2095. leaq 32(%rsp), %rdi
  2096. leaq 32(%rsp), %rsi
  2097. leaq 64(%rsp), %rdx
  2098. #ifndef __APPLE__
  2099. callq fe_mul_x64@plt
  2100. #else
  2101. callq _fe_mul_x64
  2102. #endif /* __APPLE__ */
  2103. leaq 64(%rsp), %rdi
  2104. leaq 32(%rsp), %rsi
  2105. #ifndef __APPLE__
  2106. callq fe_sq_x64@plt
  2107. #else
  2108. callq _fe_sq_x64
  2109. #endif /* __APPLE__ */
  2110. leaq 64(%rsp), %rdi
  2111. leaq 64(%rsp), %rsi
  2112. movq $4, %rdx
  2113. #ifndef __APPLE__
  2114. callq fe_sq_n_x64@plt
  2115. #else
  2116. callq _fe_sq_n_x64
  2117. #endif /* __APPLE__ */
  2118. leaq 32(%rsp), %rdi
  2119. leaq 64(%rsp), %rsi
  2120. leaq 32(%rsp), %rdx
  2121. #ifndef __APPLE__
  2122. callq fe_mul_x64@plt
  2123. #else
  2124. callq _fe_mul_x64
  2125. #endif /* __APPLE__ */
  2126. leaq 64(%rsp), %rdi
  2127. leaq 32(%rsp), %rsi
  2128. #ifndef __APPLE__
  2129. callq fe_sq_x64@plt
  2130. #else
  2131. callq _fe_sq_x64
  2132. #endif /* __APPLE__ */
  2133. leaq 64(%rsp), %rdi
  2134. leaq 64(%rsp), %rsi
  2135. movq $9, %rdx
  2136. #ifndef __APPLE__
  2137. callq fe_sq_n_x64@plt
  2138. #else
  2139. callq _fe_sq_n_x64
  2140. #endif /* __APPLE__ */
  2141. leaq 64(%rsp), %rdi
  2142. leaq 64(%rsp), %rsi
  2143. leaq 32(%rsp), %rdx
  2144. #ifndef __APPLE__
  2145. callq fe_mul_x64@plt
  2146. #else
  2147. callq _fe_mul_x64
  2148. #endif /* __APPLE__ */
  2149. leaq 96(%rsp), %rdi
  2150. leaq 64(%rsp), %rsi
  2151. #ifndef __APPLE__
  2152. callq fe_sq_x64@plt
  2153. #else
  2154. callq _fe_sq_x64
  2155. #endif /* __APPLE__ */
  2156. leaq 96(%rsp), %rdi
  2157. leaq 96(%rsp), %rsi
  2158. movq $19, %rdx
  2159. #ifndef __APPLE__
  2160. callq fe_sq_n_x64@plt
  2161. #else
  2162. callq _fe_sq_n_x64
  2163. #endif /* __APPLE__ */
  2164. leaq 64(%rsp), %rdi
  2165. leaq 96(%rsp), %rsi
  2166. leaq 64(%rsp), %rdx
  2167. #ifndef __APPLE__
  2168. callq fe_mul_x64@plt
  2169. #else
  2170. callq _fe_mul_x64
  2171. #endif /* __APPLE__ */
  2172. leaq 64(%rsp), %rdi
  2173. leaq 64(%rsp), %rsi
  2174. #ifndef __APPLE__
  2175. callq fe_sq_x64@plt
  2176. #else
  2177. callq _fe_sq_x64
  2178. #endif /* __APPLE__ */
  2179. leaq 64(%rsp), %rdi
  2180. leaq 64(%rsp), %rsi
  2181. movq $9, %rdx
  2182. #ifndef __APPLE__
  2183. callq fe_sq_n_x64@plt
  2184. #else
  2185. callq _fe_sq_n_x64
  2186. #endif /* __APPLE__ */
  2187. leaq 32(%rsp), %rdi
  2188. leaq 64(%rsp), %rsi
  2189. leaq 32(%rsp), %rdx
  2190. #ifndef __APPLE__
  2191. callq fe_mul_x64@plt
  2192. #else
  2193. callq _fe_mul_x64
  2194. #endif /* __APPLE__ */
  2195. leaq 64(%rsp), %rdi
  2196. leaq 32(%rsp), %rsi
  2197. #ifndef __APPLE__
  2198. callq fe_sq_x64@plt
  2199. #else
  2200. callq _fe_sq_x64
  2201. #endif /* __APPLE__ */
  2202. leaq 64(%rsp), %rdi
  2203. leaq 64(%rsp), %rsi
  2204. movq $49, %rdx
  2205. #ifndef __APPLE__
  2206. callq fe_sq_n_x64@plt
  2207. #else
  2208. callq _fe_sq_n_x64
  2209. #endif /* __APPLE__ */
  2210. leaq 64(%rsp), %rdi
  2211. leaq 64(%rsp), %rsi
  2212. leaq 32(%rsp), %rdx
  2213. #ifndef __APPLE__
  2214. callq fe_mul_x64@plt
  2215. #else
  2216. callq _fe_mul_x64
  2217. #endif /* __APPLE__ */
  2218. leaq 96(%rsp), %rdi
  2219. leaq 64(%rsp), %rsi
  2220. #ifndef __APPLE__
  2221. callq fe_sq_x64@plt
  2222. #else
  2223. callq _fe_sq_x64
  2224. #endif /* __APPLE__ */
  2225. leaq 96(%rsp), %rdi
  2226. leaq 96(%rsp), %rsi
  2227. movq $0x63, %rdx
  2228. #ifndef __APPLE__
  2229. callq fe_sq_n_x64@plt
  2230. #else
  2231. callq _fe_sq_n_x64
  2232. #endif /* __APPLE__ */
  2233. leaq 64(%rsp), %rdi
  2234. leaq 96(%rsp), %rsi
  2235. leaq 64(%rsp), %rdx
  2236. #ifndef __APPLE__
  2237. callq fe_mul_x64@plt
  2238. #else
  2239. callq _fe_mul_x64
  2240. #endif /* __APPLE__ */
  2241. leaq 64(%rsp), %rdi
  2242. leaq 64(%rsp), %rsi
  2243. #ifndef __APPLE__
  2244. callq fe_sq_x64@plt
  2245. #else
  2246. callq _fe_sq_x64
  2247. #endif /* __APPLE__ */
  2248. leaq 64(%rsp), %rdi
  2249. leaq 64(%rsp), %rsi
  2250. movq $49, %rdx
  2251. #ifndef __APPLE__
  2252. callq fe_sq_n_x64@plt
  2253. #else
  2254. callq _fe_sq_n_x64
  2255. #endif /* __APPLE__ */
  2256. leaq 32(%rsp), %rdi
  2257. leaq 64(%rsp), %rsi
  2258. leaq 32(%rsp), %rdx
  2259. #ifndef __APPLE__
  2260. callq fe_mul_x64@plt
  2261. #else
  2262. callq _fe_mul_x64
  2263. #endif /* __APPLE__ */
  2264. leaq 32(%rsp), %rdi
  2265. leaq 32(%rsp), %rsi
  2266. #ifndef __APPLE__
  2267. callq fe_sq_x64@plt
  2268. #else
  2269. callq _fe_sq_x64
  2270. #endif /* __APPLE__ */
  2271. leaq 32(%rsp), %rdi
  2272. leaq 32(%rsp), %rsi
  2273. movq $4, %rdx
  2274. #ifndef __APPLE__
  2275. callq fe_sq_n_x64@plt
  2276. #else
  2277. callq _fe_sq_n_x64
  2278. #endif /* __APPLE__ */
  2279. movq 128(%rsp), %rdi
  2280. leaq 32(%rsp), %rsi
  2281. movq %rsp, %rdx
  2282. #ifndef __APPLE__
  2283. callq fe_mul_x64@plt
  2284. #else
  2285. callq _fe_mul_x64
  2286. #endif /* __APPLE__ */
  2287. movq 136(%rsp), %rsi
  2288. movq 128(%rsp), %rdi
  2289. addq $0x90, %rsp
  2290. repz retq
  2291. #ifndef __APPLE__
  2292. .text
  2293. .globl curve25519_x64
  2294. .type curve25519_x64,@function
  2295. .align 16
  2296. curve25519_x64:
  2297. #else
  2298. .section __TEXT,__text
  2299. .globl _curve25519_x64
  2300. .p2align 4
  2301. _curve25519_x64:
  2302. #endif /* __APPLE__ */
  2303. pushq %r12
  2304. pushq %r13
  2305. pushq %r14
  2306. pushq %r15
  2307. pushq %rbx
  2308. pushq %rbp
  2309. movq %rdx, %r8
  2310. subq $0xb8, %rsp
  2311. xorq %rbx, %rbx
  2312. movq %rdi, 176(%rsp)
  2313. # Set one
  2314. movq $0x01, (%rdi)
  2315. movq $0x00, 8(%rdi)
  2316. movq $0x00, 16(%rdi)
  2317. movq $0x00, 24(%rdi)
  2318. # Set zero
  2319. movq $0x00, (%rsp)
  2320. movq $0x00, 8(%rsp)
  2321. movq $0x00, 16(%rsp)
  2322. movq $0x00, 24(%rsp)
  2323. # Set one
  2324. movq $0x01, 32(%rsp)
  2325. movq $0x00, 40(%rsp)
  2326. movq $0x00, 48(%rsp)
  2327. movq $0x00, 56(%rsp)
  2328. # Copy
  2329. movq (%r8), %rcx
  2330. movq 8(%r8), %r9
  2331. movq 16(%r8), %r10
  2332. movq 24(%r8), %r11
  2333. movq %rcx, 64(%rsp)
  2334. movq %r9, 72(%rsp)
  2335. movq %r10, 80(%rsp)
  2336. movq %r11, 88(%rsp)
  2337. movb $62, 168(%rsp)
  2338. movq $3, 160(%rsp)
  2339. L_curve25519_x64_words:
  2340. L_curve25519_x64_bits:
  2341. movq 160(%rsp), %r9
  2342. movb 168(%rsp), %cl
  2343. movq (%rsi,%r9,8), %rbp
  2344. shrq %cl, %rbp
  2345. andq $0x01, %rbp
  2346. xorq %rbp, %rbx
  2347. negq %rbx
  2348. # Conditional Swap
  2349. movq (%rdi), %rcx
  2350. movq 8(%rdi), %r9
  2351. movq 16(%rdi), %r10
  2352. movq 24(%rdi), %r11
  2353. xorq 64(%rsp), %rcx
  2354. xorq 72(%rsp), %r9
  2355. xorq 80(%rsp), %r10
  2356. xorq 88(%rsp), %r11
  2357. andq %rbx, %rcx
  2358. andq %rbx, %r9
  2359. andq %rbx, %r10
  2360. andq %rbx, %r11
  2361. xorq %rcx, (%rdi)
  2362. xorq %r9, 8(%rdi)
  2363. xorq %r10, 16(%rdi)
  2364. xorq %r11, 24(%rdi)
  2365. xorq %rcx, 64(%rsp)
  2366. xorq %r9, 72(%rsp)
  2367. xorq %r10, 80(%rsp)
  2368. xorq %r11, 88(%rsp)
  2369. # Conditional Swap
  2370. movq (%rsp), %rcx
  2371. movq 8(%rsp), %r9
  2372. movq 16(%rsp), %r10
  2373. movq 24(%rsp), %r11
  2374. xorq 32(%rsp), %rcx
  2375. xorq 40(%rsp), %r9
  2376. xorq 48(%rsp), %r10
  2377. xorq 56(%rsp), %r11
  2378. andq %rbx, %rcx
  2379. andq %rbx, %r9
  2380. andq %rbx, %r10
  2381. andq %rbx, %r11
  2382. xorq %rcx, (%rsp)
  2383. xorq %r9, 8(%rsp)
  2384. xorq %r10, 16(%rsp)
  2385. xorq %r11, 24(%rsp)
  2386. xorq %rcx, 32(%rsp)
  2387. xorq %r9, 40(%rsp)
  2388. xorq %r10, 48(%rsp)
  2389. xorq %r11, 56(%rsp)
  2390. movq %rbp, %rbx
  2391. # Add
  2392. movq (%rdi), %rcx
  2393. movq 8(%rdi), %r9
  2394. movq 16(%rdi), %r10
  2395. movq 24(%rdi), %rbp
  2396. movq %rcx, %r12
  2397. addq (%rsp), %rcx
  2398. movq %r9, %r13
  2399. adcq 8(%rsp), %r9
  2400. movq %r10, %r14
  2401. adcq 16(%rsp), %r10
  2402. movq %rbp, %r15
  2403. adcq 24(%rsp), %rbp
  2404. movq $-19, %rax
  2405. movq %rbp, %r11
  2406. movq $0x7fffffffffffffff, %rdx
  2407. sarq $63, %rbp
  2408. # Mask the modulus
  2409. andq %rbp, %rax
  2410. andq %rbp, %rdx
  2411. # Sub modulus (if overflow)
  2412. subq %rax, %rcx
  2413. sbbq %rbp, %r9
  2414. sbbq %rbp, %r10
  2415. sbbq %rdx, %r11
  2416. # Sub
  2417. subq (%rsp), %r12
  2418. movq $0x00, %rbp
  2419. sbbq 8(%rsp), %r13
  2420. movq $-19, %rax
  2421. sbbq 16(%rsp), %r14
  2422. movq $0x7fffffffffffffff, %rdx
  2423. sbbq 24(%rsp), %r15
  2424. sbbq $0x00, %rbp
  2425. # Mask the modulus
  2426. andq %rbp, %rax
  2427. andq %rbp, %rdx
  2428. # Add modulus (if underflow)
  2429. addq %rax, %r12
  2430. adcq %rbp, %r13
  2431. adcq %rbp, %r14
  2432. adcq %rdx, %r15
  2433. movq %rcx, (%rdi)
  2434. movq %r9, 8(%rdi)
  2435. movq %r10, 16(%rdi)
  2436. movq %r11, 24(%rdi)
  2437. movq %r12, 128(%rsp)
  2438. movq %r13, 136(%rsp)
  2439. movq %r14, 144(%rsp)
  2440. movq %r15, 152(%rsp)
  2441. # Add
  2442. movq 64(%rsp), %rcx
  2443. movq 72(%rsp), %r9
  2444. movq 80(%rsp), %r10
  2445. movq 88(%rsp), %rbp
  2446. movq %rcx, %r12
  2447. addq 32(%rsp), %rcx
  2448. movq %r9, %r13
  2449. adcq 40(%rsp), %r9
  2450. movq %r10, %r14
  2451. adcq 48(%rsp), %r10
  2452. movq %rbp, %r15
  2453. adcq 56(%rsp), %rbp
  2454. movq $-19, %rax
  2455. movq %rbp, %r11
  2456. movq $0x7fffffffffffffff, %rdx
  2457. sarq $63, %rbp
  2458. # Mask the modulus
  2459. andq %rbp, %rax
  2460. andq %rbp, %rdx
  2461. # Sub modulus (if overflow)
  2462. subq %rax, %rcx
  2463. sbbq %rbp, %r9
  2464. sbbq %rbp, %r10
  2465. sbbq %rdx, %r11
  2466. # Sub
  2467. subq 32(%rsp), %r12
  2468. movq $0x00, %rbp
  2469. sbbq 40(%rsp), %r13
  2470. movq $-19, %rax
  2471. sbbq 48(%rsp), %r14
  2472. movq $0x7fffffffffffffff, %rdx
  2473. sbbq 56(%rsp), %r15
  2474. sbbq $0x00, %rbp
  2475. # Mask the modulus
  2476. andq %rbp, %rax
  2477. andq %rbp, %rdx
  2478. # Add modulus (if underflow)
  2479. addq %rax, %r12
  2480. adcq %rbp, %r13
  2481. adcq %rbp, %r14
  2482. adcq %rdx, %r15
  2483. movq %rcx, (%rsp)
  2484. movq %r9, 8(%rsp)
  2485. movq %r10, 16(%rsp)
  2486. movq %r11, 24(%rsp)
  2487. movq %r12, 96(%rsp)
  2488. movq %r13, 104(%rsp)
  2489. movq %r14, 112(%rsp)
  2490. movq %r15, 120(%rsp)
  2491. # Multiply
  2492. # A[0] * B[0]
  2493. movq (%rdi), %rax
  2494. mulq 96(%rsp)
  2495. movq %rax, %rcx
  2496. movq %rdx, %r9
  2497. # A[0] * B[1]
  2498. movq 8(%rdi), %rax
  2499. mulq 96(%rsp)
  2500. xorq %r10, %r10
  2501. addq %rax, %r9
  2502. adcq %rdx, %r10
  2503. # A[1] * B[0]
  2504. movq (%rdi), %rax
  2505. mulq 104(%rsp)
  2506. xorq %r11, %r11
  2507. addq %rax, %r9
  2508. adcq %rdx, %r10
  2509. adcq $0x00, %r11
  2510. # A[0] * B[2]
  2511. movq 16(%rdi), %rax
  2512. mulq 96(%rsp)
  2513. addq %rax, %r10
  2514. adcq %rdx, %r11
  2515. # A[1] * B[1]
  2516. movq 8(%rdi), %rax
  2517. mulq 104(%rsp)
  2518. xorq %r12, %r12
  2519. addq %rax, %r10
  2520. adcq %rdx, %r11
  2521. adcq $0x00, %r12
  2522. # A[2] * B[0]
  2523. movq (%rdi), %rax
  2524. mulq 112(%rsp)
  2525. addq %rax, %r10
  2526. adcq %rdx, %r11
  2527. adcq $0x00, %r12
  2528. # A[0] * B[3]
  2529. movq 24(%rdi), %rax
  2530. mulq 96(%rsp)
  2531. xorq %r13, %r13
  2532. addq %rax, %r11
  2533. adcq %rdx, %r12
  2534. adcq $0x00, %r13
  2535. # A[1] * B[2]
  2536. movq 16(%rdi), %rax
  2537. mulq 104(%rsp)
  2538. addq %rax, %r11
  2539. adcq %rdx, %r12
  2540. adcq $0x00, %r13
  2541. # A[2] * B[1]
  2542. movq 8(%rdi), %rax
  2543. mulq 112(%rsp)
  2544. addq %rax, %r11
  2545. adcq %rdx, %r12
  2546. adcq $0x00, %r13
  2547. # A[3] * B[0]
  2548. movq (%rdi), %rax
  2549. mulq 120(%rsp)
  2550. addq %rax, %r11
  2551. adcq %rdx, %r12
  2552. adcq $0x00, %r13
  2553. # A[1] * B[3]
  2554. movq 24(%rdi), %rax
  2555. mulq 104(%rsp)
  2556. xorq %r14, %r14
  2557. addq %rax, %r12
  2558. adcq %rdx, %r13
  2559. adcq $0x00, %r14
  2560. # A[2] * B[2]
  2561. movq 16(%rdi), %rax
  2562. mulq 112(%rsp)
  2563. addq %rax, %r12
  2564. adcq %rdx, %r13
  2565. adcq $0x00, %r14
  2566. # A[3] * B[1]
  2567. movq 8(%rdi), %rax
  2568. mulq 120(%rsp)
  2569. addq %rax, %r12
  2570. adcq %rdx, %r13
  2571. adcq $0x00, %r14
  2572. # A[2] * B[3]
  2573. movq 24(%rdi), %rax
  2574. mulq 112(%rsp)
  2575. xorq %r15, %r15
  2576. addq %rax, %r13
  2577. adcq %rdx, %r14
  2578. adcq $0x00, %r15
  2579. # A[3] * B[2]
  2580. movq 16(%rdi), %rax
  2581. mulq 120(%rsp)
  2582. addq %rax, %r13
  2583. adcq %rdx, %r14
  2584. adcq $0x00, %r15
  2585. # A[3] * B[3]
  2586. movq 24(%rdi), %rax
  2587. mulq 120(%rsp)
  2588. addq %rax, %r14
  2589. adcq %rdx, %r15
  2590. # Reduce
  2591. movq $0x7fffffffffffffff, %rbp
  2592. # Move top half into t4-t7 and remove top bit from t3
  2593. shldq $0x01, %r14, %r15
  2594. shldq $0x01, %r13, %r14
  2595. shldq $0x01, %r12, %r13
  2596. shldq $0x01, %r11, %r12
  2597. andq %rbp, %r11
  2598. # Multiply top half by 19
  2599. movq $19, %rax
  2600. mulq %r12
  2601. xorq %r12, %r12
  2602. addq %rax, %rcx
  2603. movq $19, %rax
  2604. adcq %rdx, %r12
  2605. mulq %r13
  2606. xorq %r13, %r13
  2607. addq %rax, %r9
  2608. movq $19, %rax
  2609. adcq %rdx, %r13
  2610. mulq %r14
  2611. xorq %r14, %r14
  2612. addq %rax, %r10
  2613. movq $19, %rax
  2614. adcq %rdx, %r14
  2615. mulq %r15
  2616. # Add remaining product results in
  2617. addq %r12, %r9
  2618. adcq %r13, %r10
  2619. adcq %r14, %r11
  2620. adcq %rax, %r11
  2621. adcq $0x00, %rdx
  2622. # Overflow
  2623. shldq $0x01, %r11, %rdx
  2624. imulq $19, %rdx, %rax
  2625. andq %rbp, %r11
  2626. addq %rax, %rcx
  2627. adcq $0x00, %r9
  2628. adcq $0x00, %r10
  2629. adcq $0x00, %r11
  2630. # Reduce if top bit set
  2631. movq %r11, %rdx
  2632. shrq $63, %rdx
  2633. imulq $19, %rdx, %rax
  2634. andq %rbp, %r11
  2635. addq %rax, %rcx
  2636. adcq $0x00, %r9
  2637. adcq $0x00, %r10
  2638. adcq $0x00, %r11
  2639. # Store
  2640. movq %rcx, 32(%rsp)
  2641. movq %r9, 40(%rsp)
  2642. movq %r10, 48(%rsp)
  2643. movq %r11, 56(%rsp)
  2644. # Multiply
  2645. # A[0] * B[0]
  2646. movq 128(%rsp), %rax
  2647. mulq (%rsp)
  2648. movq %rax, %rcx
  2649. movq %rdx, %r9
  2650. # A[0] * B[1]
  2651. movq 136(%rsp), %rax
  2652. mulq (%rsp)
  2653. xorq %r10, %r10
  2654. addq %rax, %r9
  2655. adcq %rdx, %r10
  2656. # A[1] * B[0]
  2657. movq 128(%rsp), %rax
  2658. mulq 8(%rsp)
  2659. xorq %r11, %r11
  2660. addq %rax, %r9
  2661. adcq %rdx, %r10
  2662. adcq $0x00, %r11
  2663. # A[0] * B[2]
  2664. movq 144(%rsp), %rax
  2665. mulq (%rsp)
  2666. addq %rax, %r10
  2667. adcq %rdx, %r11
  2668. # A[1] * B[1]
  2669. movq 136(%rsp), %rax
  2670. mulq 8(%rsp)
  2671. xorq %r12, %r12
  2672. addq %rax, %r10
  2673. adcq %rdx, %r11
  2674. adcq $0x00, %r12
  2675. # A[2] * B[0]
  2676. movq 128(%rsp), %rax
  2677. mulq 16(%rsp)
  2678. addq %rax, %r10
  2679. adcq %rdx, %r11
  2680. adcq $0x00, %r12
  2681. # A[0] * B[3]
  2682. movq 152(%rsp), %rax
  2683. mulq (%rsp)
  2684. xorq %r13, %r13
  2685. addq %rax, %r11
  2686. adcq %rdx, %r12
  2687. adcq $0x00, %r13
  2688. # A[1] * B[2]
  2689. movq 144(%rsp), %rax
  2690. mulq 8(%rsp)
  2691. addq %rax, %r11
  2692. adcq %rdx, %r12
  2693. adcq $0x00, %r13
  2694. # A[2] * B[1]
  2695. movq 136(%rsp), %rax
  2696. mulq 16(%rsp)
  2697. addq %rax, %r11
  2698. adcq %rdx, %r12
  2699. adcq $0x00, %r13
  2700. # A[3] * B[0]
  2701. movq 128(%rsp), %rax
  2702. mulq 24(%rsp)
  2703. addq %rax, %r11
  2704. adcq %rdx, %r12
  2705. adcq $0x00, %r13
  2706. # A[1] * B[3]
  2707. movq 152(%rsp), %rax
  2708. mulq 8(%rsp)
  2709. xorq %r14, %r14
  2710. addq %rax, %r12
  2711. adcq %rdx, %r13
  2712. adcq $0x00, %r14
  2713. # A[2] * B[2]
  2714. movq 144(%rsp), %rax
  2715. mulq 16(%rsp)
  2716. addq %rax, %r12
  2717. adcq %rdx, %r13
  2718. adcq $0x00, %r14
  2719. # A[3] * B[1]
  2720. movq 136(%rsp), %rax
  2721. mulq 24(%rsp)
  2722. addq %rax, %r12
  2723. adcq %rdx, %r13
  2724. adcq $0x00, %r14
  2725. # A[2] * B[3]
  2726. movq 152(%rsp), %rax
  2727. mulq 16(%rsp)
  2728. xorq %r15, %r15
  2729. addq %rax, %r13
  2730. adcq %rdx, %r14
  2731. adcq $0x00, %r15
  2732. # A[3] * B[2]
  2733. movq 144(%rsp), %rax
  2734. mulq 24(%rsp)
  2735. addq %rax, %r13
  2736. adcq %rdx, %r14
  2737. adcq $0x00, %r15
  2738. # A[3] * B[3]
  2739. movq 152(%rsp), %rax
  2740. mulq 24(%rsp)
  2741. addq %rax, %r14
  2742. adcq %rdx, %r15
  2743. # Reduce
  2744. movq $0x7fffffffffffffff, %rbp
  2745. # Move top half into t4-t7 and remove top bit from t3
  2746. shldq $0x01, %r14, %r15
  2747. shldq $0x01, %r13, %r14
  2748. shldq $0x01, %r12, %r13
  2749. shldq $0x01, %r11, %r12
  2750. andq %rbp, %r11
  2751. # Multiply top half by 19
  2752. movq $19, %rax
  2753. mulq %r12
  2754. xorq %r12, %r12
  2755. addq %rax, %rcx
  2756. movq $19, %rax
  2757. adcq %rdx, %r12
  2758. mulq %r13
  2759. xorq %r13, %r13
  2760. addq %rax, %r9
  2761. movq $19, %rax
  2762. adcq %rdx, %r13
  2763. mulq %r14
  2764. xorq %r14, %r14
  2765. addq %rax, %r10
  2766. movq $19, %rax
  2767. adcq %rdx, %r14
  2768. mulq %r15
  2769. # Add remaining product results in
  2770. addq %r12, %r9
  2771. adcq %r13, %r10
  2772. adcq %r14, %r11
  2773. adcq %rax, %r11
  2774. adcq $0x00, %rdx
  2775. # Overflow
  2776. shldq $0x01, %r11, %rdx
  2777. imulq $19, %rdx, %rax
  2778. andq %rbp, %r11
  2779. addq %rax, %rcx
  2780. adcq $0x00, %r9
  2781. adcq $0x00, %r10
  2782. adcq $0x00, %r11
  2783. # Reduce if top bit set
  2784. movq %r11, %rdx
  2785. shrq $63, %rdx
  2786. imulq $19, %rdx, %rax
  2787. andq %rbp, %r11
  2788. addq %rax, %rcx
  2789. adcq $0x00, %r9
  2790. adcq $0x00, %r10
  2791. adcq $0x00, %r11
  2792. # Store
  2793. movq %rcx, (%rsp)
  2794. movq %r9, 8(%rsp)
  2795. movq %r10, 16(%rsp)
  2796. movq %r11, 24(%rsp)
  2797. # Square
  2798. # A[0] * A[1]
  2799. movq 128(%rsp), %rax
  2800. mulq 136(%rsp)
  2801. movq %rax, %r9
  2802. movq %rdx, %r10
  2803. # A[0] * A[2]
  2804. movq 128(%rsp), %rax
  2805. mulq 144(%rsp)
  2806. xorq %r11, %r11
  2807. addq %rax, %r10
  2808. adcq %rdx, %r11
  2809. # A[0] * A[3]
  2810. movq 128(%rsp), %rax
  2811. mulq 152(%rsp)
  2812. xorq %r12, %r12
  2813. addq %rax, %r11
  2814. adcq %rdx, %r12
  2815. # A[1] * A[2]
  2816. movq 136(%rsp), %rax
  2817. mulq 144(%rsp)
  2818. xorq %r13, %r13
  2819. addq %rax, %r11
  2820. adcq %rdx, %r12
  2821. adcq $0x00, %r13
  2822. # A[1] * A[3]
  2823. movq 136(%rsp), %rax
  2824. mulq 152(%rsp)
  2825. addq %rax, %r12
  2826. adcq %rdx, %r13
  2827. # A[2] * A[3]
  2828. movq 144(%rsp), %rax
  2829. mulq 152(%rsp)
  2830. xorq %r14, %r14
  2831. addq %rax, %r13
  2832. adcq %rdx, %r14
  2833. # Double
  2834. xorq %r15, %r15
  2835. addq %r9, %r9
  2836. adcq %r10, %r10
  2837. adcq %r11, %r11
  2838. adcq %r12, %r12
  2839. adcq %r13, %r13
  2840. adcq %r14, %r14
  2841. adcq $0x00, %r15
  2842. # A[0] * A[0]
  2843. movq 128(%rsp), %rax
  2844. mulq %rax
  2845. movq %rax, %rcx
  2846. movq %rdx, %rbp
  2847. # A[1] * A[1]
  2848. movq 136(%rsp), %rax
  2849. mulq %rax
  2850. addq %rbp, %r9
  2851. adcq %rax, %r10
  2852. adcq $0x00, %rdx
  2853. movq %rdx, %rbp
  2854. # A[2] * A[2]
  2855. movq 144(%rsp), %rax
  2856. mulq %rax
  2857. addq %rbp, %r11
  2858. adcq %rax, %r12
  2859. adcq $0x00, %rdx
  2860. movq %rdx, %rbp
  2861. # A[3] * A[3]
  2862. movq 152(%rsp), %rax
  2863. mulq %rax
  2864. addq %rax, %r14
  2865. adcq %rdx, %r15
  2866. addq %rbp, %r13
  2867. adcq $0x00, %r14
  2868. adcq $0x00, %r15
  2869. # Reduce
  2870. movq $0x7fffffffffffffff, %rbp
  2871. # Move top half into t4-t7 and remove top bit from t3
  2872. shldq $0x01, %r14, %r15
  2873. shldq $0x01, %r13, %r14
  2874. shldq $0x01, %r12, %r13
  2875. shldq $0x01, %r11, %r12
  2876. andq %rbp, %r11
  2877. # Multiply top half by 19
  2878. movq $19, %rax
  2879. mulq %r12
  2880. xorq %r12, %r12
  2881. addq %rax, %rcx
  2882. movq $19, %rax
  2883. adcq %rdx, %r12
  2884. mulq %r13
  2885. xorq %r13, %r13
  2886. addq %rax, %r9
  2887. movq $19, %rax
  2888. adcq %rdx, %r13
  2889. mulq %r14
  2890. xorq %r14, %r14
  2891. addq %rax, %r10
  2892. movq $19, %rax
  2893. adcq %rdx, %r14
  2894. mulq %r15
  2895. # Add remaining product results in
  2896. addq %r12, %r9
  2897. adcq %r13, %r10
  2898. adcq %r14, %r11
  2899. adcq %rax, %r11
  2900. adcq $0x00, %rdx
  2901. # Overflow
  2902. shldq $0x01, %r11, %rdx
  2903. imulq $19, %rdx, %rax
  2904. andq %rbp, %r11
  2905. addq %rax, %rcx
  2906. adcq $0x00, %r9
  2907. adcq $0x00, %r10
  2908. adcq $0x00, %r11
  2909. # Reduce if top bit set
  2910. movq %r11, %rdx
  2911. shrq $63, %rdx
  2912. imulq $19, %rdx, %rax
  2913. andq %rbp, %r11
  2914. addq %rax, %rcx
  2915. adcq $0x00, %r9
  2916. adcq $0x00, %r10
  2917. adcq $0x00, %r11
  2918. # Store
  2919. movq %rcx, 96(%rsp)
  2920. movq %r9, 104(%rsp)
  2921. movq %r10, 112(%rsp)
  2922. movq %r11, 120(%rsp)
  2923. # Square
  2924. # A[0] * A[1]
  2925. movq (%rdi), %rax
  2926. mulq 8(%rdi)
  2927. movq %rax, %r9
  2928. movq %rdx, %r10
  2929. # A[0] * A[2]
  2930. movq (%rdi), %rax
  2931. mulq 16(%rdi)
  2932. xorq %r11, %r11
  2933. addq %rax, %r10
  2934. adcq %rdx, %r11
  2935. # A[0] * A[3]
  2936. movq (%rdi), %rax
  2937. mulq 24(%rdi)
  2938. xorq %r12, %r12
  2939. addq %rax, %r11
  2940. adcq %rdx, %r12
  2941. # A[1] * A[2]
  2942. movq 8(%rdi), %rax
  2943. mulq 16(%rdi)
  2944. xorq %r13, %r13
  2945. addq %rax, %r11
  2946. adcq %rdx, %r12
  2947. adcq $0x00, %r13
  2948. # A[1] * A[3]
  2949. movq 8(%rdi), %rax
  2950. mulq 24(%rdi)
  2951. addq %rax, %r12
  2952. adcq %rdx, %r13
  2953. # A[2] * A[3]
  2954. movq 16(%rdi), %rax
  2955. mulq 24(%rdi)
  2956. xorq %r14, %r14
  2957. addq %rax, %r13
  2958. adcq %rdx, %r14
  2959. # Double
  2960. xorq %r15, %r15
  2961. addq %r9, %r9
  2962. adcq %r10, %r10
  2963. adcq %r11, %r11
  2964. adcq %r12, %r12
  2965. adcq %r13, %r13
  2966. adcq %r14, %r14
  2967. adcq $0x00, %r15
  2968. # A[0] * A[0]
  2969. movq (%rdi), %rax
  2970. mulq %rax
  2971. movq %rax, %rcx
  2972. movq %rdx, %rbp
  2973. # A[1] * A[1]
  2974. movq 8(%rdi), %rax
  2975. mulq %rax
  2976. addq %rbp, %r9
  2977. adcq %rax, %r10
  2978. adcq $0x00, %rdx
  2979. movq %rdx, %rbp
  2980. # A[2] * A[2]
  2981. movq 16(%rdi), %rax
  2982. mulq %rax
  2983. addq %rbp, %r11
  2984. adcq %rax, %r12
  2985. adcq $0x00, %rdx
  2986. movq %rdx, %rbp
  2987. # A[3] * A[3]
  2988. movq 24(%rdi), %rax
  2989. mulq %rax
  2990. addq %rax, %r14
  2991. adcq %rdx, %r15
  2992. addq %rbp, %r13
  2993. adcq $0x00, %r14
  2994. adcq $0x00, %r15
  2995. # Reduce
  2996. movq $0x7fffffffffffffff, %rbp
  2997. # Move top half into t4-t7 and remove top bit from t3
  2998. shldq $0x01, %r14, %r15
  2999. shldq $0x01, %r13, %r14
  3000. shldq $0x01, %r12, %r13
  3001. shldq $0x01, %r11, %r12
  3002. andq %rbp, %r11
  3003. # Multiply top half by 19
  3004. movq $19, %rax
  3005. mulq %r12
  3006. xorq %r12, %r12
  3007. addq %rax, %rcx
  3008. movq $19, %rax
  3009. adcq %rdx, %r12
  3010. mulq %r13
  3011. xorq %r13, %r13
  3012. addq %rax, %r9
  3013. movq $19, %rax
  3014. adcq %rdx, %r13
  3015. mulq %r14
  3016. xorq %r14, %r14
  3017. addq %rax, %r10
  3018. movq $19, %rax
  3019. adcq %rdx, %r14
  3020. mulq %r15
  3021. # Add remaining product results in
  3022. addq %r12, %r9
  3023. adcq %r13, %r10
  3024. adcq %r14, %r11
  3025. adcq %rax, %r11
  3026. adcq $0x00, %rdx
  3027. # Overflow
  3028. shldq $0x01, %r11, %rdx
  3029. imulq $19, %rdx, %rax
  3030. andq %rbp, %r11
  3031. addq %rax, %rcx
  3032. adcq $0x00, %r9
  3033. adcq $0x00, %r10
  3034. adcq $0x00, %r11
  3035. # Reduce if top bit set
  3036. movq %r11, %rdx
  3037. shrq $63, %rdx
  3038. imulq $19, %rdx, %rax
  3039. andq %rbp, %r11
  3040. addq %rax, %rcx
  3041. adcq $0x00, %r9
  3042. adcq $0x00, %r10
  3043. adcq $0x00, %r11
  3044. # Store
  3045. movq %rcx, 128(%rsp)
  3046. movq %r9, 136(%rsp)
  3047. movq %r10, 144(%rsp)
  3048. movq %r11, 152(%rsp)
  3049. # Add
  3050. movq 32(%rsp), %rcx
  3051. movq 40(%rsp), %r9
  3052. movq 48(%rsp), %r10
  3053. movq 56(%rsp), %rbp
  3054. movq %rcx, %r12
  3055. addq (%rsp), %rcx
  3056. movq %r9, %r13
  3057. adcq 8(%rsp), %r9
  3058. movq %r10, %r14
  3059. adcq 16(%rsp), %r10
  3060. movq %rbp, %r15
  3061. adcq 24(%rsp), %rbp
  3062. movq $-19, %rax
  3063. movq %rbp, %r11
  3064. movq $0x7fffffffffffffff, %rdx
  3065. sarq $63, %rbp
  3066. # Mask the modulus
  3067. andq %rbp, %rax
  3068. andq %rbp, %rdx
  3069. # Sub modulus (if overflow)
  3070. subq %rax, %rcx
  3071. sbbq %rbp, %r9
  3072. sbbq %rbp, %r10
  3073. sbbq %rdx, %r11
  3074. # Sub
  3075. subq (%rsp), %r12
  3076. movq $0x00, %rbp
  3077. sbbq 8(%rsp), %r13
  3078. movq $-19, %rax
  3079. sbbq 16(%rsp), %r14
  3080. movq $0x7fffffffffffffff, %rdx
  3081. sbbq 24(%rsp), %r15
  3082. sbbq $0x00, %rbp
  3083. # Mask the modulus
  3084. andq %rbp, %rax
  3085. andq %rbp, %rdx
  3086. # Add modulus (if underflow)
  3087. addq %rax, %r12
  3088. adcq %rbp, %r13
  3089. adcq %rbp, %r14
  3090. adcq %rdx, %r15
  3091. movq %rcx, 64(%rsp)
  3092. movq %r9, 72(%rsp)
  3093. movq %r10, 80(%rsp)
  3094. movq %r11, 88(%rsp)
  3095. movq %r12, (%rsp)
  3096. movq %r13, 8(%rsp)
  3097. movq %r14, 16(%rsp)
  3098. movq %r15, 24(%rsp)
  3099. # Multiply
  3100. # A[0] * B[0]
  3101. movq 96(%rsp), %rax
  3102. mulq 128(%rsp)
  3103. movq %rax, %rcx
  3104. movq %rdx, %r9
  3105. # A[0] * B[1]
  3106. movq 104(%rsp), %rax
  3107. mulq 128(%rsp)
  3108. xorq %r10, %r10
  3109. addq %rax, %r9
  3110. adcq %rdx, %r10
  3111. # A[1] * B[0]
  3112. movq 96(%rsp), %rax
  3113. mulq 136(%rsp)
  3114. xorq %r11, %r11
  3115. addq %rax, %r9
  3116. adcq %rdx, %r10
  3117. adcq $0x00, %r11
  3118. # A[0] * B[2]
  3119. movq 112(%rsp), %rax
  3120. mulq 128(%rsp)
  3121. addq %rax, %r10
  3122. adcq %rdx, %r11
  3123. # A[1] * B[1]
  3124. movq 104(%rsp), %rax
  3125. mulq 136(%rsp)
  3126. xorq %r12, %r12
  3127. addq %rax, %r10
  3128. adcq %rdx, %r11
  3129. adcq $0x00, %r12
  3130. # A[2] * B[0]
  3131. movq 96(%rsp), %rax
  3132. mulq 144(%rsp)
  3133. addq %rax, %r10
  3134. adcq %rdx, %r11
  3135. adcq $0x00, %r12
  3136. # A[0] * B[3]
  3137. movq 120(%rsp), %rax
  3138. mulq 128(%rsp)
  3139. xorq %r13, %r13
  3140. addq %rax, %r11
  3141. adcq %rdx, %r12
  3142. adcq $0x00, %r13
  3143. # A[1] * B[2]
  3144. movq 112(%rsp), %rax
  3145. mulq 136(%rsp)
  3146. addq %rax, %r11
  3147. adcq %rdx, %r12
  3148. adcq $0x00, %r13
  3149. # A[2] * B[1]
  3150. movq 104(%rsp), %rax
  3151. mulq 144(%rsp)
  3152. addq %rax, %r11
  3153. adcq %rdx, %r12
  3154. adcq $0x00, %r13
  3155. # A[3] * B[0]
  3156. movq 96(%rsp), %rax
  3157. mulq 152(%rsp)
  3158. addq %rax, %r11
  3159. adcq %rdx, %r12
  3160. adcq $0x00, %r13
  3161. # A[1] * B[3]
  3162. movq 120(%rsp), %rax
  3163. mulq 136(%rsp)
  3164. xorq %r14, %r14
  3165. addq %rax, %r12
  3166. adcq %rdx, %r13
  3167. adcq $0x00, %r14
  3168. # A[2] * B[2]
  3169. movq 112(%rsp), %rax
  3170. mulq 144(%rsp)
  3171. addq %rax, %r12
  3172. adcq %rdx, %r13
  3173. adcq $0x00, %r14
  3174. # A[3] * B[1]
  3175. movq 104(%rsp), %rax
  3176. mulq 152(%rsp)
  3177. addq %rax, %r12
  3178. adcq %rdx, %r13
  3179. adcq $0x00, %r14
  3180. # A[2] * B[3]
  3181. movq 120(%rsp), %rax
  3182. mulq 144(%rsp)
  3183. xorq %r15, %r15
  3184. addq %rax, %r13
  3185. adcq %rdx, %r14
  3186. adcq $0x00, %r15
  3187. # A[3] * B[2]
  3188. movq 112(%rsp), %rax
  3189. mulq 152(%rsp)
  3190. addq %rax, %r13
  3191. adcq %rdx, %r14
  3192. adcq $0x00, %r15
  3193. # A[3] * B[3]
  3194. movq 120(%rsp), %rax
  3195. mulq 152(%rsp)
  3196. addq %rax, %r14
  3197. adcq %rdx, %r15
  3198. # Reduce
  3199. movq $0x7fffffffffffffff, %rbp
  3200. # Move top half into t4-t7 and remove top bit from t3
  3201. shldq $0x01, %r14, %r15
  3202. shldq $0x01, %r13, %r14
  3203. shldq $0x01, %r12, %r13
  3204. shldq $0x01, %r11, %r12
  3205. andq %rbp, %r11
  3206. # Multiply top half by 19
  3207. movq $19, %rax
  3208. mulq %r12
  3209. xorq %r12, %r12
  3210. addq %rax, %rcx
  3211. movq $19, %rax
  3212. adcq %rdx, %r12
  3213. mulq %r13
  3214. xorq %r13, %r13
  3215. addq %rax, %r9
  3216. movq $19, %rax
  3217. adcq %rdx, %r13
  3218. mulq %r14
  3219. xorq %r14, %r14
  3220. addq %rax, %r10
  3221. movq $19, %rax
  3222. adcq %rdx, %r14
  3223. mulq %r15
  3224. # Add remaining product results in
  3225. addq %r12, %r9
  3226. adcq %r13, %r10
  3227. adcq %r14, %r11
  3228. adcq %rax, %r11
  3229. adcq $0x00, %rdx
  3230. # Overflow
  3231. shldq $0x01, %r11, %rdx
  3232. imulq $19, %rdx, %rax
  3233. andq %rbp, %r11
  3234. addq %rax, %rcx
  3235. adcq $0x00, %r9
  3236. adcq $0x00, %r10
  3237. adcq $0x00, %r11
  3238. # Reduce if top bit set
  3239. movq %r11, %rdx
  3240. shrq $63, %rdx
  3241. imulq $19, %rdx, %rax
  3242. andq %rbp, %r11
  3243. addq %rax, %rcx
  3244. adcq $0x00, %r9
  3245. adcq $0x00, %r10
  3246. adcq $0x00, %r11
  3247. # Store
  3248. movq %rcx, (%rdi)
  3249. movq %r9, 8(%rdi)
  3250. movq %r10, 16(%rdi)
  3251. movq %r11, 24(%rdi)
  3252. # Sub
  3253. movq 128(%rsp), %rcx
  3254. movq 136(%rsp), %r9
  3255. movq 144(%rsp), %r10
  3256. movq 152(%rsp), %r11
  3257. subq 96(%rsp), %rcx
  3258. movq $0x00, %rbp
  3259. sbbq 104(%rsp), %r9
  3260. movq $-19, %rax
  3261. sbbq 112(%rsp), %r10
  3262. movq $0x7fffffffffffffff, %rdx
  3263. sbbq 120(%rsp), %r11
  3264. sbbq $0x00, %rbp
  3265. # Mask the modulus
  3266. andq %rbp, %rax
  3267. andq %rbp, %rdx
  3268. # Add modulus (if underflow)
  3269. addq %rax, %rcx
  3270. adcq %rbp, %r9
  3271. adcq %rbp, %r10
  3272. adcq %rdx, %r11
  3273. movq %rcx, 128(%rsp)
  3274. movq %r9, 136(%rsp)
  3275. movq %r10, 144(%rsp)
  3276. movq %r11, 152(%rsp)
  3277. # Square
  3278. # A[0] * A[1]
  3279. movq (%rsp), %rax
  3280. mulq 8(%rsp)
  3281. movq %rax, %r9
  3282. movq %rdx, %r10
  3283. # A[0] * A[2]
  3284. movq (%rsp), %rax
  3285. mulq 16(%rsp)
  3286. xorq %r11, %r11
  3287. addq %rax, %r10
  3288. adcq %rdx, %r11
  3289. # A[0] * A[3]
  3290. movq (%rsp), %rax
  3291. mulq 24(%rsp)
  3292. xorq %r12, %r12
  3293. addq %rax, %r11
  3294. adcq %rdx, %r12
  3295. # A[1] * A[2]
  3296. movq 8(%rsp), %rax
  3297. mulq 16(%rsp)
  3298. xorq %r13, %r13
  3299. addq %rax, %r11
  3300. adcq %rdx, %r12
  3301. adcq $0x00, %r13
  3302. # A[1] * A[3]
  3303. movq 8(%rsp), %rax
  3304. mulq 24(%rsp)
  3305. addq %rax, %r12
  3306. adcq %rdx, %r13
  3307. # A[2] * A[3]
  3308. movq 16(%rsp), %rax
  3309. mulq 24(%rsp)
  3310. xorq %r14, %r14
  3311. addq %rax, %r13
  3312. adcq %rdx, %r14
  3313. # Double
  3314. xorq %r15, %r15
  3315. addq %r9, %r9
  3316. adcq %r10, %r10
  3317. adcq %r11, %r11
  3318. adcq %r12, %r12
  3319. adcq %r13, %r13
  3320. adcq %r14, %r14
  3321. adcq $0x00, %r15
  3322. # A[0] * A[0]
  3323. movq (%rsp), %rax
  3324. mulq %rax
  3325. movq %rax, %rcx
  3326. movq %rdx, %rbp
  3327. # A[1] * A[1]
  3328. movq 8(%rsp), %rax
  3329. mulq %rax
  3330. addq %rbp, %r9
  3331. adcq %rax, %r10
  3332. adcq $0x00, %rdx
  3333. movq %rdx, %rbp
  3334. # A[2] * A[2]
  3335. movq 16(%rsp), %rax
  3336. mulq %rax
  3337. addq %rbp, %r11
  3338. adcq %rax, %r12
  3339. adcq $0x00, %rdx
  3340. movq %rdx, %rbp
  3341. # A[3] * A[3]
  3342. movq 24(%rsp), %rax
  3343. mulq %rax
  3344. addq %rax, %r14
  3345. adcq %rdx, %r15
  3346. addq %rbp, %r13
  3347. adcq $0x00, %r14
  3348. adcq $0x00, %r15
  3349. # Reduce
  3350. movq $0x7fffffffffffffff, %rbp
  3351. # Move top half into t4-t7 and remove top bit from t3
  3352. shldq $0x01, %r14, %r15
  3353. shldq $0x01, %r13, %r14
  3354. shldq $0x01, %r12, %r13
  3355. shldq $0x01, %r11, %r12
  3356. andq %rbp, %r11
  3357. # Multiply top half by 19
  3358. movq $19, %rax
  3359. mulq %r12
  3360. xorq %r12, %r12
  3361. addq %rax, %rcx
  3362. movq $19, %rax
  3363. adcq %rdx, %r12
  3364. mulq %r13
  3365. xorq %r13, %r13
  3366. addq %rax, %r9
  3367. movq $19, %rax
  3368. adcq %rdx, %r13
  3369. mulq %r14
  3370. xorq %r14, %r14
  3371. addq %rax, %r10
  3372. movq $19, %rax
  3373. adcq %rdx, %r14
  3374. mulq %r15
  3375. # Add remaining product results in
  3376. addq %r12, %r9
  3377. adcq %r13, %r10
  3378. adcq %r14, %r11
  3379. adcq %rax, %r11
  3380. adcq $0x00, %rdx
  3381. # Overflow
  3382. shldq $0x01, %r11, %rdx
  3383. imulq $19, %rdx, %rax
  3384. andq %rbp, %r11
  3385. addq %rax, %rcx
  3386. adcq $0x00, %r9
  3387. adcq $0x00, %r10
  3388. adcq $0x00, %r11
  3389. # Reduce if top bit set
  3390. movq %r11, %rdx
  3391. shrq $63, %rdx
  3392. imulq $19, %rdx, %rax
  3393. andq %rbp, %r11
  3394. addq %rax, %rcx
  3395. adcq $0x00, %r9
  3396. adcq $0x00, %r10
  3397. adcq $0x00, %r11
  3398. # Store
  3399. movq %rcx, (%rsp)
  3400. movq %r9, 8(%rsp)
  3401. movq %r10, 16(%rsp)
  3402. movq %r11, 24(%rsp)
  3403. # Multiply by 121666
  3404. movq $0x1db42, %rax
  3405. mulq 128(%rsp)
  3406. xorq %r10, %r10
  3407. movq %rax, %rcx
  3408. movq %rdx, %r9
  3409. movq $0x1db42, %rax
  3410. mulq 136(%rsp)
  3411. xorq %r11, %r11
  3412. addq %rax, %r9
  3413. adcq %rdx, %r10
  3414. movq $0x1db42, %rax
  3415. mulq 144(%rsp)
  3416. xorq %r13, %r13
  3417. addq %rax, %r10
  3418. adcq %rdx, %r11
  3419. movq $0x1db42, %rax
  3420. mulq 152(%rsp)
  3421. movq $0x7fffffffffffffff, %r12
  3422. addq %rax, %r11
  3423. adcq %rdx, %r13
  3424. shldq $0x01, %r11, %r13
  3425. andq %r12, %r11
  3426. movq $19, %rax
  3427. mulq %r13
  3428. addq %rax, %rcx
  3429. adcq $0x00, %r9
  3430. adcq $0x00, %r10
  3431. adcq $0x00, %r11
  3432. movq %rcx, 32(%rsp)
  3433. movq %r9, 40(%rsp)
  3434. movq %r10, 48(%rsp)
  3435. movq %r11, 56(%rsp)
  3436. # Square
  3437. # A[0] * A[1]
  3438. movq 64(%rsp), %rax
  3439. mulq 72(%rsp)
  3440. movq %rax, %r9
  3441. movq %rdx, %r10
  3442. # A[0] * A[2]
  3443. movq 64(%rsp), %rax
  3444. mulq 80(%rsp)
  3445. xorq %r11, %r11
  3446. addq %rax, %r10
  3447. adcq %rdx, %r11
  3448. # A[0] * A[3]
  3449. movq 64(%rsp), %rax
  3450. mulq 88(%rsp)
  3451. xorq %r12, %r12
  3452. addq %rax, %r11
  3453. adcq %rdx, %r12
  3454. # A[1] * A[2]
  3455. movq 72(%rsp), %rax
  3456. mulq 80(%rsp)
  3457. xorq %r13, %r13
  3458. addq %rax, %r11
  3459. adcq %rdx, %r12
  3460. adcq $0x00, %r13
  3461. # A[1] * A[3]
  3462. movq 72(%rsp), %rax
  3463. mulq 88(%rsp)
  3464. addq %rax, %r12
  3465. adcq %rdx, %r13
  3466. # A[2] * A[3]
  3467. movq 80(%rsp), %rax
  3468. mulq 88(%rsp)
  3469. xorq %r14, %r14
  3470. addq %rax, %r13
  3471. adcq %rdx, %r14
  3472. # Double
  3473. xorq %r15, %r15
  3474. addq %r9, %r9
  3475. adcq %r10, %r10
  3476. adcq %r11, %r11
  3477. adcq %r12, %r12
  3478. adcq %r13, %r13
  3479. adcq %r14, %r14
  3480. adcq $0x00, %r15
  3481. # A[0] * A[0]
  3482. movq 64(%rsp), %rax
  3483. mulq %rax
  3484. movq %rax, %rcx
  3485. movq %rdx, %rbp
  3486. # A[1] * A[1]
  3487. movq 72(%rsp), %rax
  3488. mulq %rax
  3489. addq %rbp, %r9
  3490. adcq %rax, %r10
  3491. adcq $0x00, %rdx
  3492. movq %rdx, %rbp
  3493. # A[2] * A[2]
  3494. movq 80(%rsp), %rax
  3495. mulq %rax
  3496. addq %rbp, %r11
  3497. adcq %rax, %r12
  3498. adcq $0x00, %rdx
  3499. movq %rdx, %rbp
  3500. # A[3] * A[3]
  3501. movq 88(%rsp), %rax
  3502. mulq %rax
  3503. addq %rax, %r14
  3504. adcq %rdx, %r15
  3505. addq %rbp, %r13
  3506. adcq $0x00, %r14
  3507. adcq $0x00, %r15
  3508. # Reduce
  3509. movq $0x7fffffffffffffff, %rbp
  3510. # Move top half into t4-t7 and remove top bit from t3
  3511. shldq $0x01, %r14, %r15
  3512. shldq $0x01, %r13, %r14
  3513. shldq $0x01, %r12, %r13
  3514. shldq $0x01, %r11, %r12
  3515. andq %rbp, %r11
  3516. # Multiply top half by 19
  3517. movq $19, %rax
  3518. mulq %r12
  3519. xorq %r12, %r12
  3520. addq %rax, %rcx
  3521. movq $19, %rax
  3522. adcq %rdx, %r12
  3523. mulq %r13
  3524. xorq %r13, %r13
  3525. addq %rax, %r9
  3526. movq $19, %rax
  3527. adcq %rdx, %r13
  3528. mulq %r14
  3529. xorq %r14, %r14
  3530. addq %rax, %r10
  3531. movq $19, %rax
  3532. adcq %rdx, %r14
  3533. mulq %r15
  3534. # Add remaining product results in
  3535. addq %r12, %r9
  3536. adcq %r13, %r10
  3537. adcq %r14, %r11
  3538. adcq %rax, %r11
  3539. adcq $0x00, %rdx
  3540. # Overflow
  3541. shldq $0x01, %r11, %rdx
  3542. imulq $19, %rdx, %rax
  3543. andq %rbp, %r11
  3544. addq %rax, %rcx
  3545. adcq $0x00, %r9
  3546. adcq $0x00, %r10
  3547. adcq $0x00, %r11
  3548. # Reduce if top bit set
  3549. movq %r11, %rdx
  3550. shrq $63, %rdx
  3551. imulq $19, %rdx, %rax
  3552. andq %rbp, %r11
  3553. addq %rax, %rcx
  3554. adcq $0x00, %r9
  3555. adcq $0x00, %r10
  3556. adcq $0x00, %r11
  3557. # Store
  3558. movq %rcx, 64(%rsp)
  3559. movq %r9, 72(%rsp)
  3560. movq %r10, 80(%rsp)
  3561. movq %r11, 88(%rsp)
  3562. # Add
  3563. movq 96(%rsp), %rcx
  3564. movq 104(%rsp), %r9
  3565. addq 32(%rsp), %rcx
  3566. movq 112(%rsp), %r10
  3567. adcq 40(%rsp), %r9
  3568. movq 120(%rsp), %rbp
  3569. adcq 48(%rsp), %r10
  3570. movq $-19, %rax
  3571. adcq 56(%rsp), %rbp
  3572. movq $0x7fffffffffffffff, %rdx
  3573. movq %rbp, %r11
  3574. sarq $63, %rbp
  3575. # Mask the modulus
  3576. andq %rbp, %rax
  3577. andq %rbp, %rdx
  3578. # Sub modulus (if overflow)
  3579. subq %rax, %rcx
  3580. sbbq %rbp, %r9
  3581. sbbq %rbp, %r10
  3582. sbbq %rdx, %r11
  3583. movq %rcx, 96(%rsp)
  3584. movq %r9, 104(%rsp)
  3585. movq %r10, 112(%rsp)
  3586. movq %r11, 120(%rsp)
  3587. # Multiply
  3588. # A[0] * B[0]
  3589. movq (%rsp), %rax
  3590. mulq (%r8)
  3591. movq %rax, %rcx
  3592. movq %rdx, %r9
  3593. # A[0] * B[1]
  3594. movq 8(%rsp), %rax
  3595. mulq (%r8)
  3596. xorq %r10, %r10
  3597. addq %rax, %r9
  3598. adcq %rdx, %r10
  3599. # A[1] * B[0]
  3600. movq (%rsp), %rax
  3601. mulq 8(%r8)
  3602. xorq %r11, %r11
  3603. addq %rax, %r9
  3604. adcq %rdx, %r10
  3605. adcq $0x00, %r11
  3606. # A[0] * B[2]
  3607. movq 16(%rsp), %rax
  3608. mulq (%r8)
  3609. addq %rax, %r10
  3610. adcq %rdx, %r11
  3611. # A[1] * B[1]
  3612. movq 8(%rsp), %rax
  3613. mulq 8(%r8)
  3614. xorq %r12, %r12
  3615. addq %rax, %r10
  3616. adcq %rdx, %r11
  3617. adcq $0x00, %r12
  3618. # A[2] * B[0]
  3619. movq (%rsp), %rax
  3620. mulq 16(%r8)
  3621. addq %rax, %r10
  3622. adcq %rdx, %r11
  3623. adcq $0x00, %r12
  3624. # A[0] * B[3]
  3625. movq 24(%rsp), %rax
  3626. mulq (%r8)
  3627. xorq %r13, %r13
  3628. addq %rax, %r11
  3629. adcq %rdx, %r12
  3630. adcq $0x00, %r13
  3631. # A[1] * B[2]
  3632. movq 16(%rsp), %rax
  3633. mulq 8(%r8)
  3634. addq %rax, %r11
  3635. adcq %rdx, %r12
  3636. adcq $0x00, %r13
  3637. # A[2] * B[1]
  3638. movq 8(%rsp), %rax
  3639. mulq 16(%r8)
  3640. addq %rax, %r11
  3641. adcq %rdx, %r12
  3642. adcq $0x00, %r13
  3643. # A[3] * B[0]
  3644. movq (%rsp), %rax
  3645. mulq 24(%r8)
  3646. addq %rax, %r11
  3647. adcq %rdx, %r12
  3648. adcq $0x00, %r13
  3649. # A[1] * B[3]
  3650. movq 24(%rsp), %rax
  3651. mulq 8(%r8)
  3652. xorq %r14, %r14
  3653. addq %rax, %r12
  3654. adcq %rdx, %r13
  3655. adcq $0x00, %r14
  3656. # A[2] * B[2]
  3657. movq 16(%rsp), %rax
  3658. mulq 16(%r8)
  3659. addq %rax, %r12
  3660. adcq %rdx, %r13
  3661. adcq $0x00, %r14
  3662. # A[3] * B[1]
  3663. movq 8(%rsp), %rax
  3664. mulq 24(%r8)
  3665. addq %rax, %r12
  3666. adcq %rdx, %r13
  3667. adcq $0x00, %r14
  3668. # A[2] * B[3]
  3669. movq 24(%rsp), %rax
  3670. mulq 16(%r8)
  3671. xorq %r15, %r15
  3672. addq %rax, %r13
  3673. adcq %rdx, %r14
  3674. adcq $0x00, %r15
  3675. # A[3] * B[2]
  3676. movq 16(%rsp), %rax
  3677. mulq 24(%r8)
  3678. addq %rax, %r13
  3679. adcq %rdx, %r14
  3680. adcq $0x00, %r15
  3681. # A[3] * B[3]
  3682. movq 24(%rsp), %rax
  3683. mulq 24(%r8)
  3684. addq %rax, %r14
  3685. adcq %rdx, %r15
  3686. # Reduce
  3687. movq $0x7fffffffffffffff, %rbp
  3688. # Move top half into t4-t7 and remove top bit from t3
  3689. shldq $0x01, %r14, %r15
  3690. shldq $0x01, %r13, %r14
  3691. shldq $0x01, %r12, %r13
  3692. shldq $0x01, %r11, %r12
  3693. andq %rbp, %r11
  3694. # Multiply top half by 19
  3695. movq $19, %rax
  3696. mulq %r12
  3697. xorq %r12, %r12
  3698. addq %rax, %rcx
  3699. movq $19, %rax
  3700. adcq %rdx, %r12
  3701. mulq %r13
  3702. xorq %r13, %r13
  3703. addq %rax, %r9
  3704. movq $19, %rax
  3705. adcq %rdx, %r13
  3706. mulq %r14
  3707. xorq %r14, %r14
  3708. addq %rax, %r10
  3709. movq $19, %rax
  3710. adcq %rdx, %r14
  3711. mulq %r15
  3712. # Add remaining product results in
  3713. addq %r12, %r9
  3714. adcq %r13, %r10
  3715. adcq %r14, %r11
  3716. adcq %rax, %r11
  3717. adcq $0x00, %rdx
  3718. # Overflow
  3719. shldq $0x01, %r11, %rdx
  3720. imulq $19, %rdx, %rax
  3721. andq %rbp, %r11
  3722. addq %rax, %rcx
  3723. adcq $0x00, %r9
  3724. adcq $0x00, %r10
  3725. adcq $0x00, %r11
  3726. # Reduce if top bit set
  3727. movq %r11, %rdx
  3728. shrq $63, %rdx
  3729. imulq $19, %rdx, %rax
  3730. andq %rbp, %r11
  3731. addq %rax, %rcx
  3732. adcq $0x00, %r9
  3733. adcq $0x00, %r10
  3734. adcq $0x00, %r11
  3735. # Store
  3736. movq %rcx, 32(%rsp)
  3737. movq %r9, 40(%rsp)
  3738. movq %r10, 48(%rsp)
  3739. movq %r11, 56(%rsp)
  3740. # Multiply
  3741. # A[0] * B[0]
  3742. movq 96(%rsp), %rax
  3743. mulq 128(%rsp)
  3744. movq %rax, %rcx
  3745. movq %rdx, %r9
  3746. # A[0] * B[1]
  3747. movq 104(%rsp), %rax
  3748. mulq 128(%rsp)
  3749. xorq %r10, %r10
  3750. addq %rax, %r9
  3751. adcq %rdx, %r10
  3752. # A[1] * B[0]
  3753. movq 96(%rsp), %rax
  3754. mulq 136(%rsp)
  3755. xorq %r11, %r11
  3756. addq %rax, %r9
  3757. adcq %rdx, %r10
  3758. adcq $0x00, %r11
  3759. # A[0] * B[2]
  3760. movq 112(%rsp), %rax
  3761. mulq 128(%rsp)
  3762. addq %rax, %r10
  3763. adcq %rdx, %r11
  3764. # A[1] * B[1]
  3765. movq 104(%rsp), %rax
  3766. mulq 136(%rsp)
  3767. xorq %r12, %r12
  3768. addq %rax, %r10
  3769. adcq %rdx, %r11
  3770. adcq $0x00, %r12
  3771. # A[2] * B[0]
  3772. movq 96(%rsp), %rax
  3773. mulq 144(%rsp)
  3774. addq %rax, %r10
  3775. adcq %rdx, %r11
  3776. adcq $0x00, %r12
  3777. # A[0] * B[3]
  3778. movq 120(%rsp), %rax
  3779. mulq 128(%rsp)
  3780. xorq %r13, %r13
  3781. addq %rax, %r11
  3782. adcq %rdx, %r12
  3783. adcq $0x00, %r13
  3784. # A[1] * B[2]
  3785. movq 112(%rsp), %rax
  3786. mulq 136(%rsp)
  3787. addq %rax, %r11
  3788. adcq %rdx, %r12
  3789. adcq $0x00, %r13
  3790. # A[2] * B[1]
  3791. movq 104(%rsp), %rax
  3792. mulq 144(%rsp)
  3793. addq %rax, %r11
  3794. adcq %rdx, %r12
  3795. adcq $0x00, %r13
  3796. # A[3] * B[0]
  3797. movq 96(%rsp), %rax
  3798. mulq 152(%rsp)
  3799. addq %rax, %r11
  3800. adcq %rdx, %r12
  3801. adcq $0x00, %r13
  3802. # A[1] * B[3]
  3803. movq 120(%rsp), %rax
  3804. mulq 136(%rsp)
  3805. xorq %r14, %r14
  3806. addq %rax, %r12
  3807. adcq %rdx, %r13
  3808. adcq $0x00, %r14
  3809. # A[2] * B[2]
  3810. movq 112(%rsp), %rax
  3811. mulq 144(%rsp)
  3812. addq %rax, %r12
  3813. adcq %rdx, %r13
  3814. adcq $0x00, %r14
  3815. # A[3] * B[1]
  3816. movq 104(%rsp), %rax
  3817. mulq 152(%rsp)
  3818. addq %rax, %r12
  3819. adcq %rdx, %r13
  3820. adcq $0x00, %r14
  3821. # A[2] * B[3]
  3822. movq 120(%rsp), %rax
  3823. mulq 144(%rsp)
  3824. xorq %r15, %r15
  3825. addq %rax, %r13
  3826. adcq %rdx, %r14
  3827. adcq $0x00, %r15
  3828. # A[3] * B[2]
  3829. movq 112(%rsp), %rax
  3830. mulq 152(%rsp)
  3831. addq %rax, %r13
  3832. adcq %rdx, %r14
  3833. adcq $0x00, %r15
  3834. # A[3] * B[3]
  3835. movq 120(%rsp), %rax
  3836. mulq 152(%rsp)
  3837. addq %rax, %r14
  3838. adcq %rdx, %r15
  3839. # Reduce
  3840. movq $0x7fffffffffffffff, %rbp
  3841. # Move top half into t4-t7 and remove top bit from t3
  3842. shldq $0x01, %r14, %r15
  3843. shldq $0x01, %r13, %r14
  3844. shldq $0x01, %r12, %r13
  3845. shldq $0x01, %r11, %r12
  3846. andq %rbp, %r11
  3847. # Multiply top half by 19
  3848. movq $19, %rax
  3849. mulq %r12
  3850. xorq %r12, %r12
  3851. addq %rax, %rcx
  3852. movq $19, %rax
  3853. adcq %rdx, %r12
  3854. mulq %r13
  3855. xorq %r13, %r13
  3856. addq %rax, %r9
  3857. movq $19, %rax
  3858. adcq %rdx, %r13
  3859. mulq %r14
  3860. xorq %r14, %r14
  3861. addq %rax, %r10
  3862. movq $19, %rax
  3863. adcq %rdx, %r14
  3864. mulq %r15
  3865. # Add remaining product results in
  3866. addq %r12, %r9
  3867. adcq %r13, %r10
  3868. adcq %r14, %r11
  3869. adcq %rax, %r11
  3870. adcq $0x00, %rdx
  3871. # Overflow
  3872. shldq $0x01, %r11, %rdx
  3873. imulq $19, %rdx, %rax
  3874. andq %rbp, %r11
  3875. addq %rax, %rcx
  3876. adcq $0x00, %r9
  3877. adcq $0x00, %r10
  3878. adcq $0x00, %r11
  3879. # Reduce if top bit set
  3880. movq %r11, %rdx
  3881. shrq $63, %rdx
  3882. imulq $19, %rdx, %rax
  3883. andq %rbp, %r11
  3884. addq %rax, %rcx
  3885. adcq $0x00, %r9
  3886. adcq $0x00, %r10
  3887. adcq $0x00, %r11
  3888. # Store
  3889. movq %rcx, (%rsp)
  3890. movq %r9, 8(%rsp)
  3891. movq %r10, 16(%rsp)
  3892. movq %r11, 24(%rsp)
  3893. decb 168(%rsp)
  3894. jge L_curve25519_x64_bits
  3895. movq $63, 168(%rsp)
  3896. decb 160(%rsp)
  3897. jge L_curve25519_x64_words
  3898. # Invert
  3899. leaq 32(%rsp), %rdi
  3900. movq %rsp, %rsi
  3901. #ifndef __APPLE__
  3902. callq fe_sq_x64@plt
  3903. #else
  3904. callq _fe_sq_x64
  3905. #endif /* __APPLE__ */
  3906. leaq 64(%rsp), %rdi
  3907. leaq 32(%rsp), %rsi
  3908. #ifndef __APPLE__
  3909. callq fe_sq_x64@plt
  3910. #else
  3911. callq _fe_sq_x64
  3912. #endif /* __APPLE__ */
  3913. leaq 64(%rsp), %rdi
  3914. leaq 64(%rsp), %rsi
  3915. #ifndef __APPLE__
  3916. callq fe_sq_x64@plt
  3917. #else
  3918. callq _fe_sq_x64
  3919. #endif /* __APPLE__ */
  3920. leaq 64(%rsp), %rdi
  3921. movq %rsp, %rsi
  3922. leaq 64(%rsp), %rdx
  3923. #ifndef __APPLE__
  3924. callq fe_mul_x64@plt
  3925. #else
  3926. callq _fe_mul_x64
  3927. #endif /* __APPLE__ */
  3928. leaq 32(%rsp), %rdi
  3929. leaq 32(%rsp), %rsi
  3930. leaq 64(%rsp), %rdx
  3931. #ifndef __APPLE__
  3932. callq fe_mul_x64@plt
  3933. #else
  3934. callq _fe_mul_x64
  3935. #endif /* __APPLE__ */
  3936. leaq 96(%rsp), %rdi
  3937. leaq 32(%rsp), %rsi
  3938. #ifndef __APPLE__
  3939. callq fe_sq_x64@plt
  3940. #else
  3941. callq _fe_sq_x64
  3942. #endif /* __APPLE__ */
  3943. leaq 64(%rsp), %rdi
  3944. leaq 64(%rsp), %rsi
  3945. leaq 96(%rsp), %rdx
  3946. #ifndef __APPLE__
  3947. callq fe_mul_x64@plt
  3948. #else
  3949. callq _fe_mul_x64
  3950. #endif /* __APPLE__ */
  3951. leaq 96(%rsp), %rdi
  3952. leaq 64(%rsp), %rsi
  3953. #ifndef __APPLE__
  3954. callq fe_sq_x64@plt
  3955. #else
  3956. callq _fe_sq_x64
  3957. #endif /* __APPLE__ */
  3958. leaq 96(%rsp), %rdi
  3959. leaq 96(%rsp), %rsi
  3960. movq $4, %rdx
  3961. #ifndef __APPLE__
  3962. callq fe_sq_n_x64@plt
  3963. #else
  3964. callq _fe_sq_n_x64
  3965. #endif /* __APPLE__ */
  3966. leaq 64(%rsp), %rdi
  3967. leaq 96(%rsp), %rsi
  3968. leaq 64(%rsp), %rdx
  3969. #ifndef __APPLE__
  3970. callq fe_mul_x64@plt
  3971. #else
  3972. callq _fe_mul_x64
  3973. #endif /* __APPLE__ */
  3974. leaq 96(%rsp), %rdi
  3975. leaq 64(%rsp), %rsi
  3976. #ifndef __APPLE__
  3977. callq fe_sq_x64@plt
  3978. #else
  3979. callq _fe_sq_x64
  3980. #endif /* __APPLE__ */
  3981. leaq 96(%rsp), %rdi
  3982. leaq 96(%rsp), %rsi
  3983. movq $9, %rdx
  3984. #ifndef __APPLE__
  3985. callq fe_sq_n_x64@plt
  3986. #else
  3987. callq _fe_sq_n_x64
  3988. #endif /* __APPLE__ */
  3989. leaq 96(%rsp), %rdi
  3990. leaq 96(%rsp), %rsi
  3991. leaq 64(%rsp), %rdx
  3992. #ifndef __APPLE__
  3993. callq fe_mul_x64@plt
  3994. #else
  3995. callq _fe_mul_x64
  3996. #endif /* __APPLE__ */
  3997. leaq 128(%rsp), %rdi
  3998. leaq 96(%rsp), %rsi
  3999. #ifndef __APPLE__
  4000. callq fe_sq_x64@plt
  4001. #else
  4002. callq _fe_sq_x64
  4003. #endif /* __APPLE__ */
  4004. leaq 128(%rsp), %rdi
  4005. leaq 128(%rsp), %rsi
  4006. movq $19, %rdx
  4007. #ifndef __APPLE__
  4008. callq fe_sq_n_x64@plt
  4009. #else
  4010. callq _fe_sq_n_x64
  4011. #endif /* __APPLE__ */
  4012. leaq 96(%rsp), %rdi
  4013. leaq 128(%rsp), %rsi
  4014. leaq 96(%rsp), %rdx
  4015. #ifndef __APPLE__
  4016. callq fe_mul_x64@plt
  4017. #else
  4018. callq _fe_mul_x64
  4019. #endif /* __APPLE__ */
  4020. leaq 96(%rsp), %rdi
  4021. leaq 96(%rsp), %rsi
  4022. #ifndef __APPLE__
  4023. callq fe_sq_x64@plt
  4024. #else
  4025. callq _fe_sq_x64
  4026. #endif /* __APPLE__ */
  4027. leaq 96(%rsp), %rdi
  4028. leaq 96(%rsp), %rsi
  4029. movq $9, %rdx
  4030. #ifndef __APPLE__
  4031. callq fe_sq_n_x64@plt
  4032. #else
  4033. callq _fe_sq_n_x64
  4034. #endif /* __APPLE__ */
  4035. leaq 64(%rsp), %rdi
  4036. leaq 96(%rsp), %rsi
  4037. leaq 64(%rsp), %rdx
  4038. #ifndef __APPLE__
  4039. callq fe_mul_x64@plt
  4040. #else
  4041. callq _fe_mul_x64
  4042. #endif /* __APPLE__ */
  4043. leaq 96(%rsp), %rdi
  4044. leaq 64(%rsp), %rsi
  4045. #ifndef __APPLE__
  4046. callq fe_sq_x64@plt
  4047. #else
  4048. callq _fe_sq_x64
  4049. #endif /* __APPLE__ */
  4050. leaq 96(%rsp), %rdi
  4051. leaq 96(%rsp), %rsi
  4052. movq $49, %rdx
  4053. #ifndef __APPLE__
  4054. callq fe_sq_n_x64@plt
  4055. #else
  4056. callq _fe_sq_n_x64
  4057. #endif /* __APPLE__ */
  4058. leaq 96(%rsp), %rdi
  4059. leaq 96(%rsp), %rsi
  4060. leaq 64(%rsp), %rdx
  4061. #ifndef __APPLE__
  4062. callq fe_mul_x64@plt
  4063. #else
  4064. callq _fe_mul_x64
  4065. #endif /* __APPLE__ */
  4066. leaq 128(%rsp), %rdi
  4067. leaq 96(%rsp), %rsi
  4068. #ifndef __APPLE__
  4069. callq fe_sq_x64@plt
  4070. #else
  4071. callq _fe_sq_x64
  4072. #endif /* __APPLE__ */
  4073. leaq 128(%rsp), %rdi
  4074. leaq 128(%rsp), %rsi
  4075. movq $0x63, %rdx
  4076. #ifndef __APPLE__
  4077. callq fe_sq_n_x64@plt
  4078. #else
  4079. callq _fe_sq_n_x64
  4080. #endif /* __APPLE__ */
  4081. leaq 96(%rsp), %rdi
  4082. leaq 128(%rsp), %rsi
  4083. leaq 96(%rsp), %rdx
  4084. #ifndef __APPLE__
  4085. callq fe_mul_x64@plt
  4086. #else
  4087. callq _fe_mul_x64
  4088. #endif /* __APPLE__ */
  4089. leaq 96(%rsp), %rdi
  4090. leaq 96(%rsp), %rsi
  4091. #ifndef __APPLE__
  4092. callq fe_sq_x64@plt
  4093. #else
  4094. callq _fe_sq_x64
  4095. #endif /* __APPLE__ */
  4096. leaq 96(%rsp), %rdi
  4097. leaq 96(%rsp), %rsi
  4098. movq $49, %rdx
  4099. #ifndef __APPLE__
  4100. callq fe_sq_n_x64@plt
  4101. #else
  4102. callq _fe_sq_n_x64
  4103. #endif /* __APPLE__ */
  4104. leaq 64(%rsp), %rdi
  4105. leaq 96(%rsp), %rsi
  4106. leaq 64(%rsp), %rdx
  4107. #ifndef __APPLE__
  4108. callq fe_mul_x64@plt
  4109. #else
  4110. callq _fe_mul_x64
  4111. #endif /* __APPLE__ */
  4112. leaq 64(%rsp), %rdi
  4113. leaq 64(%rsp), %rsi
  4114. #ifndef __APPLE__
  4115. callq fe_sq_x64@plt
  4116. #else
  4117. callq _fe_sq_x64
  4118. #endif /* __APPLE__ */
  4119. leaq 64(%rsp), %rdi
  4120. leaq 64(%rsp), %rsi
  4121. movq $4, %rdx
  4122. #ifndef __APPLE__
  4123. callq fe_sq_n_x64@plt
  4124. #else
  4125. callq _fe_sq_n_x64
  4126. #endif /* __APPLE__ */
  4127. movq %rsp, %rdi
  4128. leaq 64(%rsp), %rsi
  4129. leaq 32(%rsp), %rdx
  4130. #ifndef __APPLE__
  4131. callq fe_mul_x64@plt
  4132. #else
  4133. callq _fe_mul_x64
  4134. #endif /* __APPLE__ */
  4135. movq 176(%rsp), %rdi
  4136. # Multiply
  4137. # A[0] * B[0]
  4138. movq (%rsp), %rax
  4139. mulq (%rdi)
  4140. movq %rax, %rcx
  4141. movq %rdx, %r9
  4142. # A[0] * B[1]
  4143. movq 8(%rsp), %rax
  4144. mulq (%rdi)
  4145. xorq %r10, %r10
  4146. addq %rax, %r9
  4147. adcq %rdx, %r10
  4148. # A[1] * B[0]
  4149. movq (%rsp), %rax
  4150. mulq 8(%rdi)
  4151. xorq %r11, %r11
  4152. addq %rax, %r9
  4153. adcq %rdx, %r10
  4154. adcq $0x00, %r11
  4155. # A[0] * B[2]
  4156. movq 16(%rsp), %rax
  4157. mulq (%rdi)
  4158. addq %rax, %r10
  4159. adcq %rdx, %r11
  4160. # A[1] * B[1]
  4161. movq 8(%rsp), %rax
  4162. mulq 8(%rdi)
  4163. xorq %r12, %r12
  4164. addq %rax, %r10
  4165. adcq %rdx, %r11
  4166. adcq $0x00, %r12
  4167. # A[2] * B[0]
  4168. movq (%rsp), %rax
  4169. mulq 16(%rdi)
  4170. addq %rax, %r10
  4171. adcq %rdx, %r11
  4172. adcq $0x00, %r12
  4173. # A[0] * B[3]
  4174. movq 24(%rsp), %rax
  4175. mulq (%rdi)
  4176. xorq %r13, %r13
  4177. addq %rax, %r11
  4178. adcq %rdx, %r12
  4179. adcq $0x00, %r13
  4180. # A[1] * B[2]
  4181. movq 16(%rsp), %rax
  4182. mulq 8(%rdi)
  4183. addq %rax, %r11
  4184. adcq %rdx, %r12
  4185. adcq $0x00, %r13
  4186. # A[2] * B[1]
  4187. movq 8(%rsp), %rax
  4188. mulq 16(%rdi)
  4189. addq %rax, %r11
  4190. adcq %rdx, %r12
  4191. adcq $0x00, %r13
  4192. # A[3] * B[0]
  4193. movq (%rsp), %rax
  4194. mulq 24(%rdi)
  4195. addq %rax, %r11
  4196. adcq %rdx, %r12
  4197. adcq $0x00, %r13
  4198. # A[1] * B[3]
  4199. movq 24(%rsp), %rax
  4200. mulq 8(%rdi)
  4201. xorq %r14, %r14
  4202. addq %rax, %r12
  4203. adcq %rdx, %r13
  4204. adcq $0x00, %r14
  4205. # A[2] * B[2]
  4206. movq 16(%rsp), %rax
  4207. mulq 16(%rdi)
  4208. addq %rax, %r12
  4209. adcq %rdx, %r13
  4210. adcq $0x00, %r14
  4211. # A[3] * B[1]
  4212. movq 8(%rsp), %rax
  4213. mulq 24(%rdi)
  4214. addq %rax, %r12
  4215. adcq %rdx, %r13
  4216. adcq $0x00, %r14
  4217. # A[2] * B[3]
  4218. movq 24(%rsp), %rax
  4219. mulq 16(%rdi)
  4220. xorq %r15, %r15
  4221. addq %rax, %r13
  4222. adcq %rdx, %r14
  4223. adcq $0x00, %r15
  4224. # A[3] * B[2]
  4225. movq 16(%rsp), %rax
  4226. mulq 24(%rdi)
  4227. addq %rax, %r13
  4228. adcq %rdx, %r14
  4229. adcq $0x00, %r15
  4230. # A[3] * B[3]
  4231. movq 24(%rsp), %rax
  4232. mulq 24(%rdi)
  4233. addq %rax, %r14
  4234. adcq %rdx, %r15
  4235. # Reduce
  4236. movq $0x7fffffffffffffff, %rbp
  4237. # Move top half into t4-t7 and remove top bit from t3
  4238. shldq $0x01, %r14, %r15
  4239. shldq $0x01, %r13, %r14
  4240. shldq $0x01, %r12, %r13
  4241. shldq $0x01, %r11, %r12
  4242. andq %rbp, %r11
  4243. # Multiply top half by 19
  4244. movq $19, %rax
  4245. mulq %r12
  4246. xorq %r12, %r12
  4247. addq %rax, %rcx
  4248. movq $19, %rax
  4249. adcq %rdx, %r12
  4250. mulq %r13
  4251. xorq %r13, %r13
  4252. addq %rax, %r9
  4253. movq $19, %rax
  4254. adcq %rdx, %r13
  4255. mulq %r14
  4256. xorq %r14, %r14
  4257. addq %rax, %r10
  4258. movq $19, %rax
  4259. adcq %rdx, %r14
  4260. mulq %r15
  4261. # Add remaining product results in
  4262. addq %r12, %r9
  4263. adcq %r13, %r10
  4264. adcq %r14, %r11
  4265. adcq %rax, %r11
  4266. adcq $0x00, %rdx
  4267. # Overflow
  4268. shldq $0x01, %r11, %rdx
  4269. imulq $19, %rdx, %rax
  4270. andq %rbp, %r11
  4271. addq %rax, %rcx
  4272. adcq $0x00, %r9
  4273. adcq $0x00, %r10
  4274. adcq $0x00, %r11
  4275. # Reduce if top bit set
  4276. movq %r11, %rdx
  4277. shrq $63, %rdx
  4278. imulq $19, %rdx, %rax
  4279. andq %rbp, %r11
  4280. addq %rax, %rcx
  4281. adcq $0x00, %r9
  4282. adcq $0x00, %r10
  4283. adcq $0x00, %r11
  4284. # Store
  4285. movq %rcx, (%rdi)
  4286. movq %r9, 8(%rdi)
  4287. movq %r10, 16(%rdi)
  4288. movq %r11, 24(%rdi)
  4289. xorq %rax, %rax
  4290. addq $0xb8, %rsp
  4291. popq %rbp
  4292. popq %rbx
  4293. popq %r15
  4294. popq %r14
  4295. popq %r13
  4296. popq %r12
  4297. repz retq
  4298. #ifndef __APPLE__
  4299. .size curve25519_x64,.-curve25519_x64
  4300. #endif /* __APPLE__ */
  4301. #ifndef __APPLE__
  4302. .text
  4303. .globl fe_pow22523_x64
  4304. .type fe_pow22523_x64,@function
  4305. .align 16
  4306. fe_pow22523_x64:
  4307. #else
  4308. .section __TEXT,__text
  4309. .globl _fe_pow22523_x64
  4310. .p2align 4
  4311. _fe_pow22523_x64:
  4312. #endif /* __APPLE__ */
  4313. subq $0x70, %rsp
  4314. # pow22523
  4315. movq %rdi, 96(%rsp)
  4316. movq %rsi, 104(%rsp)
  4317. movq %rsp, %rdi
  4318. movq 104(%rsp), %rsi
  4319. #ifndef __APPLE__
  4320. callq fe_sq_x64@plt
  4321. #else
  4322. callq _fe_sq_x64
  4323. #endif /* __APPLE__ */
  4324. leaq 32(%rsp), %rdi
  4325. movq %rsp, %rsi
  4326. #ifndef __APPLE__
  4327. callq fe_sq_x64@plt
  4328. #else
  4329. callq _fe_sq_x64
  4330. #endif /* __APPLE__ */
  4331. leaq 32(%rsp), %rdi
  4332. leaq 32(%rsp), %rsi
  4333. #ifndef __APPLE__
  4334. callq fe_sq_x64@plt
  4335. #else
  4336. callq _fe_sq_x64
  4337. #endif /* __APPLE__ */
  4338. leaq 32(%rsp), %rdi
  4339. movq 104(%rsp), %rsi
  4340. leaq 32(%rsp), %rdx
  4341. #ifndef __APPLE__
  4342. callq fe_mul_x64@plt
  4343. #else
  4344. callq _fe_mul_x64
  4345. #endif /* __APPLE__ */
  4346. movq %rsp, %rdi
  4347. movq %rsp, %rsi
  4348. leaq 32(%rsp), %rdx
  4349. #ifndef __APPLE__
  4350. callq fe_mul_x64@plt
  4351. #else
  4352. callq _fe_mul_x64
  4353. #endif /* __APPLE__ */
  4354. movq %rsp, %rdi
  4355. movq %rsp, %rsi
  4356. #ifndef __APPLE__
  4357. callq fe_sq_x64@plt
  4358. #else
  4359. callq _fe_sq_x64
  4360. #endif /* __APPLE__ */
  4361. movq %rsp, %rdi
  4362. leaq 32(%rsp), %rsi
  4363. movq %rsp, %rdx
  4364. #ifndef __APPLE__
  4365. callq fe_mul_x64@plt
  4366. #else
  4367. callq _fe_mul_x64
  4368. #endif /* __APPLE__ */
  4369. leaq 32(%rsp), %rdi
  4370. movq %rsp, %rsi
  4371. #ifndef __APPLE__
  4372. callq fe_sq_x64@plt
  4373. #else
  4374. callq _fe_sq_x64
  4375. #endif /* __APPLE__ */
  4376. leaq 32(%rsp), %rdi
  4377. leaq 32(%rsp), %rsi
  4378. movq $4, %rdx
  4379. #ifndef __APPLE__
  4380. callq fe_sq_n_x64@plt
  4381. #else
  4382. callq _fe_sq_n_x64
  4383. #endif /* __APPLE__ */
  4384. movq %rsp, %rdi
  4385. leaq 32(%rsp), %rsi
  4386. movq %rsp, %rdx
  4387. #ifndef __APPLE__
  4388. callq fe_mul_x64@plt
  4389. #else
  4390. callq _fe_mul_x64
  4391. #endif /* __APPLE__ */
  4392. leaq 32(%rsp), %rdi
  4393. movq %rsp, %rsi
  4394. #ifndef __APPLE__
  4395. callq fe_sq_x64@plt
  4396. #else
  4397. callq _fe_sq_x64
  4398. #endif /* __APPLE__ */
  4399. leaq 32(%rsp), %rdi
  4400. leaq 32(%rsp), %rsi
  4401. movq $9, %rdx
  4402. #ifndef __APPLE__
  4403. callq fe_sq_n_x64@plt
  4404. #else
  4405. callq _fe_sq_n_x64
  4406. #endif /* __APPLE__ */
  4407. leaq 32(%rsp), %rdi
  4408. leaq 32(%rsp), %rsi
  4409. movq %rsp, %rdx
  4410. #ifndef __APPLE__
  4411. callq fe_mul_x64@plt
  4412. #else
  4413. callq _fe_mul_x64
  4414. #endif /* __APPLE__ */
  4415. leaq 64(%rsp), %rdi
  4416. leaq 32(%rsp), %rsi
  4417. #ifndef __APPLE__
  4418. callq fe_sq_x64@plt
  4419. #else
  4420. callq _fe_sq_x64
  4421. #endif /* __APPLE__ */
  4422. leaq 64(%rsp), %rdi
  4423. leaq 64(%rsp), %rsi
  4424. movq $19, %rdx
  4425. #ifndef __APPLE__
  4426. callq fe_sq_n_x64@plt
  4427. #else
  4428. callq _fe_sq_n_x64
  4429. #endif /* __APPLE__ */
  4430. leaq 32(%rsp), %rdi
  4431. leaq 64(%rsp), %rsi
  4432. leaq 32(%rsp), %rdx
  4433. #ifndef __APPLE__
  4434. callq fe_mul_x64@plt
  4435. #else
  4436. callq _fe_mul_x64
  4437. #endif /* __APPLE__ */
  4438. leaq 32(%rsp), %rdi
  4439. leaq 32(%rsp), %rsi
  4440. #ifndef __APPLE__
  4441. callq fe_sq_x64@plt
  4442. #else
  4443. callq _fe_sq_x64
  4444. #endif /* __APPLE__ */
  4445. leaq 32(%rsp), %rdi
  4446. leaq 32(%rsp), %rsi
  4447. movq $9, %rdx
  4448. #ifndef __APPLE__
  4449. callq fe_sq_n_x64@plt
  4450. #else
  4451. callq _fe_sq_n_x64
  4452. #endif /* __APPLE__ */
  4453. movq %rsp, %rdi
  4454. leaq 32(%rsp), %rsi
  4455. movq %rsp, %rdx
  4456. #ifndef __APPLE__
  4457. callq fe_mul_x64@plt
  4458. #else
  4459. callq _fe_mul_x64
  4460. #endif /* __APPLE__ */
  4461. leaq 32(%rsp), %rdi
  4462. movq %rsp, %rsi
  4463. #ifndef __APPLE__
  4464. callq fe_sq_x64@plt
  4465. #else
  4466. callq _fe_sq_x64
  4467. #endif /* __APPLE__ */
  4468. leaq 32(%rsp), %rdi
  4469. leaq 32(%rsp), %rsi
  4470. movq $49, %rdx
  4471. #ifndef __APPLE__
  4472. callq fe_sq_n_x64@plt
  4473. #else
  4474. callq _fe_sq_n_x64
  4475. #endif /* __APPLE__ */
  4476. leaq 32(%rsp), %rdi
  4477. leaq 32(%rsp), %rsi
  4478. movq %rsp, %rdx
  4479. #ifndef __APPLE__
  4480. callq fe_mul_x64@plt
  4481. #else
  4482. callq _fe_mul_x64
  4483. #endif /* __APPLE__ */
  4484. leaq 64(%rsp), %rdi
  4485. leaq 32(%rsp), %rsi
  4486. #ifndef __APPLE__
  4487. callq fe_sq_x64@plt
  4488. #else
  4489. callq _fe_sq_x64
  4490. #endif /* __APPLE__ */
  4491. leaq 64(%rsp), %rdi
  4492. leaq 64(%rsp), %rsi
  4493. movq $0x63, %rdx
  4494. #ifndef __APPLE__
  4495. callq fe_sq_n_x64@plt
  4496. #else
  4497. callq _fe_sq_n_x64
  4498. #endif /* __APPLE__ */
  4499. leaq 32(%rsp), %rdi
  4500. leaq 64(%rsp), %rsi
  4501. leaq 32(%rsp), %rdx
  4502. #ifndef __APPLE__
  4503. callq fe_mul_x64@plt
  4504. #else
  4505. callq _fe_mul_x64
  4506. #endif /* __APPLE__ */
  4507. leaq 32(%rsp), %rdi
  4508. leaq 32(%rsp), %rsi
  4509. #ifndef __APPLE__
  4510. callq fe_sq_x64@plt
  4511. #else
  4512. callq _fe_sq_x64
  4513. #endif /* __APPLE__ */
  4514. leaq 32(%rsp), %rdi
  4515. leaq 32(%rsp), %rsi
  4516. movq $49, %rdx
  4517. #ifndef __APPLE__
  4518. callq fe_sq_n_x64@plt
  4519. #else
  4520. callq _fe_sq_n_x64
  4521. #endif /* __APPLE__ */
  4522. movq %rsp, %rdi
  4523. leaq 32(%rsp), %rsi
  4524. movq %rsp, %rdx
  4525. #ifndef __APPLE__
  4526. callq fe_mul_x64@plt
  4527. #else
  4528. callq _fe_mul_x64
  4529. #endif /* __APPLE__ */
  4530. movq %rsp, %rdi
  4531. movq %rsp, %rsi
  4532. #ifndef __APPLE__
  4533. callq fe_sq_x64@plt
  4534. #else
  4535. callq _fe_sq_x64
  4536. #endif /* __APPLE__ */
  4537. movq %rsp, %rdi
  4538. movq %rsp, %rsi
  4539. #ifndef __APPLE__
  4540. callq fe_sq_x64@plt
  4541. #else
  4542. callq _fe_sq_x64
  4543. #endif /* __APPLE__ */
  4544. movq 96(%rsp), %rdi
  4545. movq %rsp, %rsi
  4546. movq 104(%rsp), %rdx
  4547. #ifndef __APPLE__
  4548. callq fe_mul_x64@plt
  4549. #else
  4550. callq _fe_mul_x64
  4551. #endif /* __APPLE__ */
  4552. movq 104(%rsp), %rsi
  4553. movq 96(%rsp), %rdi
  4554. addq $0x70, %rsp
  4555. repz retq
  4556. #ifndef __APPLE__
  4557. .text
  4558. .globl fe_ge_to_p2_x64
  4559. .type fe_ge_to_p2_x64,@function
  4560. .align 16
  4561. fe_ge_to_p2_x64:
  4562. #else
  4563. .section __TEXT,__text
  4564. .globl _fe_ge_to_p2_x64
  4565. .p2align 4
  4566. _fe_ge_to_p2_x64:
  4567. #endif /* __APPLE__ */
  4568. pushq %rbx
  4569. pushq %r12
  4570. pushq %r13
  4571. pushq %r14
  4572. pushq %r15
  4573. subq $40, %rsp
  4574. movq %rsi, (%rsp)
  4575. movq %rdx, 8(%rsp)
  4576. movq %rcx, 16(%rsp)
  4577. movq %r8, 24(%rsp)
  4578. movq %r9, 32(%rsp)
  4579. movq 16(%rsp), %rsi
  4580. movq 88(%rsp), %rbx
  4581. # Multiply
  4582. # A[0] * B[0]
  4583. movq (%rbx), %rax
  4584. mulq (%rsi)
  4585. movq %rax, %r8
  4586. movq %rdx, %r9
  4587. # A[0] * B[1]
  4588. movq 8(%rbx), %rax
  4589. mulq (%rsi)
  4590. xorq %r10, %r10
  4591. addq %rax, %r9
  4592. adcq %rdx, %r10
  4593. # A[1] * B[0]
  4594. movq (%rbx), %rax
  4595. mulq 8(%rsi)
  4596. xorq %r11, %r11
  4597. addq %rax, %r9
  4598. adcq %rdx, %r10
  4599. adcq $0x00, %r11
  4600. # A[0] * B[2]
  4601. movq 16(%rbx), %rax
  4602. mulq (%rsi)
  4603. addq %rax, %r10
  4604. adcq %rdx, %r11
  4605. # A[1] * B[1]
  4606. movq 8(%rbx), %rax
  4607. mulq 8(%rsi)
  4608. xorq %r12, %r12
  4609. addq %rax, %r10
  4610. adcq %rdx, %r11
  4611. adcq $0x00, %r12
  4612. # A[2] * B[0]
  4613. movq (%rbx), %rax
  4614. mulq 16(%rsi)
  4615. addq %rax, %r10
  4616. adcq %rdx, %r11
  4617. adcq $0x00, %r12
  4618. # A[0] * B[3]
  4619. movq 24(%rbx), %rax
  4620. mulq (%rsi)
  4621. xorq %r13, %r13
  4622. addq %rax, %r11
  4623. adcq %rdx, %r12
  4624. adcq $0x00, %r13
  4625. # A[1] * B[2]
  4626. movq 16(%rbx), %rax
  4627. mulq 8(%rsi)
  4628. addq %rax, %r11
  4629. adcq %rdx, %r12
  4630. adcq $0x00, %r13
  4631. # A[2] * B[1]
  4632. movq 8(%rbx), %rax
  4633. mulq 16(%rsi)
  4634. addq %rax, %r11
  4635. adcq %rdx, %r12
  4636. adcq $0x00, %r13
  4637. # A[3] * B[0]
  4638. movq (%rbx), %rax
  4639. mulq 24(%rsi)
  4640. addq %rax, %r11
  4641. adcq %rdx, %r12
  4642. adcq $0x00, %r13
  4643. # A[1] * B[3]
  4644. movq 24(%rbx), %rax
  4645. mulq 8(%rsi)
  4646. xorq %r14, %r14
  4647. addq %rax, %r12
  4648. adcq %rdx, %r13
  4649. adcq $0x00, %r14
  4650. # A[2] * B[2]
  4651. movq 16(%rbx), %rax
  4652. mulq 16(%rsi)
  4653. addq %rax, %r12
  4654. adcq %rdx, %r13
  4655. adcq $0x00, %r14
  4656. # A[3] * B[1]
  4657. movq 8(%rbx), %rax
  4658. mulq 24(%rsi)
  4659. addq %rax, %r12
  4660. adcq %rdx, %r13
  4661. adcq $0x00, %r14
  4662. # A[2] * B[3]
  4663. movq 24(%rbx), %rax
  4664. mulq 16(%rsi)
  4665. xorq %r15, %r15
  4666. addq %rax, %r13
  4667. adcq %rdx, %r14
  4668. adcq $0x00, %r15
  4669. # A[3] * B[2]
  4670. movq 16(%rbx), %rax
  4671. mulq 24(%rsi)
  4672. addq %rax, %r13
  4673. adcq %rdx, %r14
  4674. adcq $0x00, %r15
  4675. # A[3] * B[3]
  4676. movq 24(%rbx), %rax
  4677. mulq 24(%rsi)
  4678. addq %rax, %r14
  4679. adcq %rdx, %r15
  4680. # Reduce
  4681. movq $0x7fffffffffffffff, %rcx
  4682. # Move top half into t4-t7 and remove top bit from t3
  4683. shldq $0x01, %r14, %r15
  4684. shldq $0x01, %r13, %r14
  4685. shldq $0x01, %r12, %r13
  4686. shldq $0x01, %r11, %r12
  4687. andq %rcx, %r11
  4688. # Multiply top half by 19
  4689. movq $19, %rax
  4690. mulq %r12
  4691. xorq %r12, %r12
  4692. addq %rax, %r8
  4693. movq $19, %rax
  4694. adcq %rdx, %r12
  4695. mulq %r13
  4696. xorq %r13, %r13
  4697. addq %rax, %r9
  4698. movq $19, %rax
  4699. adcq %rdx, %r13
  4700. mulq %r14
  4701. xorq %r14, %r14
  4702. addq %rax, %r10
  4703. movq $19, %rax
  4704. adcq %rdx, %r14
  4705. mulq %r15
  4706. # Add remaining product results in
  4707. addq %r12, %r9
  4708. adcq %r13, %r10
  4709. adcq %r14, %r11
  4710. adcq %rax, %r11
  4711. adcq $0x00, %rdx
  4712. # Overflow
  4713. shldq $0x01, %r11, %rdx
  4714. imulq $19, %rdx, %rax
  4715. andq %rcx, %r11
  4716. addq %rax, %r8
  4717. adcq $0x00, %r9
  4718. adcq $0x00, %r10
  4719. adcq $0x00, %r11
  4720. # Reduce if top bit set
  4721. movq %r11, %rdx
  4722. shrq $63, %rdx
  4723. imulq $19, %rdx, %rax
  4724. andq %rcx, %r11
  4725. addq %rax, %r8
  4726. adcq $0x00, %r9
  4727. adcq $0x00, %r10
  4728. adcq $0x00, %r11
  4729. # Store
  4730. movq %r8, (%rdi)
  4731. movq %r9, 8(%rdi)
  4732. movq %r10, 16(%rdi)
  4733. movq %r11, 24(%rdi)
  4734. movq (%rsp), %rdi
  4735. movq 24(%rsp), %rsi
  4736. movq 32(%rsp), %rbx
  4737. # Multiply
  4738. # A[0] * B[0]
  4739. movq (%rbx), %rax
  4740. mulq (%rsi)
  4741. movq %rax, %r8
  4742. movq %rdx, %r9
  4743. # A[0] * B[1]
  4744. movq 8(%rbx), %rax
  4745. mulq (%rsi)
  4746. xorq %r10, %r10
  4747. addq %rax, %r9
  4748. adcq %rdx, %r10
  4749. # A[1] * B[0]
  4750. movq (%rbx), %rax
  4751. mulq 8(%rsi)
  4752. xorq %r11, %r11
  4753. addq %rax, %r9
  4754. adcq %rdx, %r10
  4755. adcq $0x00, %r11
  4756. # A[0] * B[2]
  4757. movq 16(%rbx), %rax
  4758. mulq (%rsi)
  4759. addq %rax, %r10
  4760. adcq %rdx, %r11
  4761. # A[1] * B[1]
  4762. movq 8(%rbx), %rax
  4763. mulq 8(%rsi)
  4764. xorq %r12, %r12
  4765. addq %rax, %r10
  4766. adcq %rdx, %r11
  4767. adcq $0x00, %r12
  4768. # A[2] * B[0]
  4769. movq (%rbx), %rax
  4770. mulq 16(%rsi)
  4771. addq %rax, %r10
  4772. adcq %rdx, %r11
  4773. adcq $0x00, %r12
  4774. # A[0] * B[3]
  4775. movq 24(%rbx), %rax
  4776. mulq (%rsi)
  4777. xorq %r13, %r13
  4778. addq %rax, %r11
  4779. adcq %rdx, %r12
  4780. adcq $0x00, %r13
  4781. # A[1] * B[2]
  4782. movq 16(%rbx), %rax
  4783. mulq 8(%rsi)
  4784. addq %rax, %r11
  4785. adcq %rdx, %r12
  4786. adcq $0x00, %r13
  4787. # A[2] * B[1]
  4788. movq 8(%rbx), %rax
  4789. mulq 16(%rsi)
  4790. addq %rax, %r11
  4791. adcq %rdx, %r12
  4792. adcq $0x00, %r13
  4793. # A[3] * B[0]
  4794. movq (%rbx), %rax
  4795. mulq 24(%rsi)
  4796. addq %rax, %r11
  4797. adcq %rdx, %r12
  4798. adcq $0x00, %r13
  4799. # A[1] * B[3]
  4800. movq 24(%rbx), %rax
  4801. mulq 8(%rsi)
  4802. xorq %r14, %r14
  4803. addq %rax, %r12
  4804. adcq %rdx, %r13
  4805. adcq $0x00, %r14
  4806. # A[2] * B[2]
  4807. movq 16(%rbx), %rax
  4808. mulq 16(%rsi)
  4809. addq %rax, %r12
  4810. adcq %rdx, %r13
  4811. adcq $0x00, %r14
  4812. # A[3] * B[1]
  4813. movq 8(%rbx), %rax
  4814. mulq 24(%rsi)
  4815. addq %rax, %r12
  4816. adcq %rdx, %r13
  4817. adcq $0x00, %r14
  4818. # A[2] * B[3]
  4819. movq 24(%rbx), %rax
  4820. mulq 16(%rsi)
  4821. xorq %r15, %r15
  4822. addq %rax, %r13
  4823. adcq %rdx, %r14
  4824. adcq $0x00, %r15
  4825. # A[3] * B[2]
  4826. movq 16(%rbx), %rax
  4827. mulq 24(%rsi)
  4828. addq %rax, %r13
  4829. adcq %rdx, %r14
  4830. adcq $0x00, %r15
  4831. # A[3] * B[3]
  4832. movq 24(%rbx), %rax
  4833. mulq 24(%rsi)
  4834. addq %rax, %r14
  4835. adcq %rdx, %r15
  4836. # Reduce
  4837. movq $0x7fffffffffffffff, %rcx
  4838. # Move top half into t4-t7 and remove top bit from t3
  4839. shldq $0x01, %r14, %r15
  4840. shldq $0x01, %r13, %r14
  4841. shldq $0x01, %r12, %r13
  4842. shldq $0x01, %r11, %r12
  4843. andq %rcx, %r11
  4844. # Multiply top half by 19
  4845. movq $19, %rax
  4846. mulq %r12
  4847. xorq %r12, %r12
  4848. addq %rax, %r8
  4849. movq $19, %rax
  4850. adcq %rdx, %r12
  4851. mulq %r13
  4852. xorq %r13, %r13
  4853. addq %rax, %r9
  4854. movq $19, %rax
  4855. adcq %rdx, %r13
  4856. mulq %r14
  4857. xorq %r14, %r14
  4858. addq %rax, %r10
  4859. movq $19, %rax
  4860. adcq %rdx, %r14
  4861. mulq %r15
  4862. # Add remaining product results in
  4863. addq %r12, %r9
  4864. adcq %r13, %r10
  4865. adcq %r14, %r11
  4866. adcq %rax, %r11
  4867. adcq $0x00, %rdx
  4868. # Overflow
  4869. shldq $0x01, %r11, %rdx
  4870. imulq $19, %rdx, %rax
  4871. andq %rcx, %r11
  4872. addq %rax, %r8
  4873. adcq $0x00, %r9
  4874. adcq $0x00, %r10
  4875. adcq $0x00, %r11
  4876. # Reduce if top bit set
  4877. movq %r11, %rdx
  4878. shrq $63, %rdx
  4879. imulq $19, %rdx, %rax
  4880. andq %rcx, %r11
  4881. addq %rax, %r8
  4882. adcq $0x00, %r9
  4883. adcq $0x00, %r10
  4884. adcq $0x00, %r11
  4885. # Store
  4886. movq %r8, (%rdi)
  4887. movq %r9, 8(%rdi)
  4888. movq %r10, 16(%rdi)
  4889. movq %r11, 24(%rdi)
  4890. movq 8(%rsp), %rdi
  4891. movq 32(%rsp), %rsi
  4892. movq 88(%rsp), %rbx
  4893. # Multiply
  4894. # A[0] * B[0]
  4895. movq (%rbx), %rax
  4896. mulq (%rsi)
  4897. movq %rax, %r8
  4898. movq %rdx, %r9
  4899. # A[0] * B[1]
  4900. movq 8(%rbx), %rax
  4901. mulq (%rsi)
  4902. xorq %r10, %r10
  4903. addq %rax, %r9
  4904. adcq %rdx, %r10
  4905. # A[1] * B[0]
  4906. movq (%rbx), %rax
  4907. mulq 8(%rsi)
  4908. xorq %r11, %r11
  4909. addq %rax, %r9
  4910. adcq %rdx, %r10
  4911. adcq $0x00, %r11
  4912. # A[0] * B[2]
  4913. movq 16(%rbx), %rax
  4914. mulq (%rsi)
  4915. addq %rax, %r10
  4916. adcq %rdx, %r11
  4917. # A[1] * B[1]
  4918. movq 8(%rbx), %rax
  4919. mulq 8(%rsi)
  4920. xorq %r12, %r12
  4921. addq %rax, %r10
  4922. adcq %rdx, %r11
  4923. adcq $0x00, %r12
  4924. # A[2] * B[0]
  4925. movq (%rbx), %rax
  4926. mulq 16(%rsi)
  4927. addq %rax, %r10
  4928. adcq %rdx, %r11
  4929. adcq $0x00, %r12
  4930. # A[0] * B[3]
  4931. movq 24(%rbx), %rax
  4932. mulq (%rsi)
  4933. xorq %r13, %r13
  4934. addq %rax, %r11
  4935. adcq %rdx, %r12
  4936. adcq $0x00, %r13
  4937. # A[1] * B[2]
  4938. movq 16(%rbx), %rax
  4939. mulq 8(%rsi)
  4940. addq %rax, %r11
  4941. adcq %rdx, %r12
  4942. adcq $0x00, %r13
  4943. # A[2] * B[1]
  4944. movq 8(%rbx), %rax
  4945. mulq 16(%rsi)
  4946. addq %rax, %r11
  4947. adcq %rdx, %r12
  4948. adcq $0x00, %r13
  4949. # A[3] * B[0]
  4950. movq (%rbx), %rax
  4951. mulq 24(%rsi)
  4952. addq %rax, %r11
  4953. adcq %rdx, %r12
  4954. adcq $0x00, %r13
  4955. # A[1] * B[3]
  4956. movq 24(%rbx), %rax
  4957. mulq 8(%rsi)
  4958. xorq %r14, %r14
  4959. addq %rax, %r12
  4960. adcq %rdx, %r13
  4961. adcq $0x00, %r14
  4962. # A[2] * B[2]
  4963. movq 16(%rbx), %rax
  4964. mulq 16(%rsi)
  4965. addq %rax, %r12
  4966. adcq %rdx, %r13
  4967. adcq $0x00, %r14
  4968. # A[3] * B[1]
  4969. movq 8(%rbx), %rax
  4970. mulq 24(%rsi)
  4971. addq %rax, %r12
  4972. adcq %rdx, %r13
  4973. adcq $0x00, %r14
  4974. # A[2] * B[3]
  4975. movq 24(%rbx), %rax
  4976. mulq 16(%rsi)
  4977. xorq %r15, %r15
  4978. addq %rax, %r13
  4979. adcq %rdx, %r14
  4980. adcq $0x00, %r15
  4981. # A[3] * B[2]
  4982. movq 16(%rbx), %rax
  4983. mulq 24(%rsi)
  4984. addq %rax, %r13
  4985. adcq %rdx, %r14
  4986. adcq $0x00, %r15
  4987. # A[3] * B[3]
  4988. movq 24(%rbx), %rax
  4989. mulq 24(%rsi)
  4990. addq %rax, %r14
  4991. adcq %rdx, %r15
  4992. # Reduce
  4993. movq $0x7fffffffffffffff, %rcx
  4994. # Move top half into t4-t7 and remove top bit from t3
  4995. shldq $0x01, %r14, %r15
  4996. shldq $0x01, %r13, %r14
  4997. shldq $0x01, %r12, %r13
  4998. shldq $0x01, %r11, %r12
  4999. andq %rcx, %r11
  5000. # Multiply top half by 19
  5001. movq $19, %rax
  5002. mulq %r12
  5003. xorq %r12, %r12
  5004. addq %rax, %r8
  5005. movq $19, %rax
  5006. adcq %rdx, %r12
  5007. mulq %r13
  5008. xorq %r13, %r13
  5009. addq %rax, %r9
  5010. movq $19, %rax
  5011. adcq %rdx, %r13
  5012. mulq %r14
  5013. xorq %r14, %r14
  5014. addq %rax, %r10
  5015. movq $19, %rax
  5016. adcq %rdx, %r14
  5017. mulq %r15
  5018. # Add remaining product results in
  5019. addq %r12, %r9
  5020. adcq %r13, %r10
  5021. adcq %r14, %r11
  5022. adcq %rax, %r11
  5023. adcq $0x00, %rdx
  5024. # Overflow
  5025. shldq $0x01, %r11, %rdx
  5026. imulq $19, %rdx, %rax
  5027. andq %rcx, %r11
  5028. addq %rax, %r8
  5029. adcq $0x00, %r9
  5030. adcq $0x00, %r10
  5031. adcq $0x00, %r11
  5032. # Reduce if top bit set
  5033. movq %r11, %rdx
  5034. shrq $63, %rdx
  5035. imulq $19, %rdx, %rax
  5036. andq %rcx, %r11
  5037. addq %rax, %r8
  5038. adcq $0x00, %r9
  5039. adcq $0x00, %r10
  5040. adcq $0x00, %r11
  5041. # Store
  5042. movq %r8, (%rdi)
  5043. movq %r9, 8(%rdi)
  5044. movq %r10, 16(%rdi)
  5045. movq %r11, 24(%rdi)
  5046. addq $40, %rsp
  5047. popq %r15
  5048. popq %r14
  5049. popq %r13
  5050. popq %r12
  5051. popq %rbx
  5052. repz retq
  5053. #ifndef __APPLE__
  5054. .size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
  5055. #endif /* __APPLE__ */
  5056. #ifndef __APPLE__
  5057. .text
  5058. .globl fe_ge_to_p3_x64
  5059. .type fe_ge_to_p3_x64,@function
  5060. .align 16
  5061. fe_ge_to_p3_x64:
  5062. #else
  5063. .section __TEXT,__text
  5064. .globl _fe_ge_to_p3_x64
  5065. .p2align 4
  5066. _fe_ge_to_p3_x64:
  5067. #endif /* __APPLE__ */
  5068. pushq %rbx
  5069. pushq %r12
  5070. pushq %r13
  5071. pushq %r14
  5072. pushq %r15
  5073. subq $40, %rsp
  5074. movq %rsi, (%rsp)
  5075. movq %rdx, 8(%rsp)
  5076. movq %rcx, 16(%rsp)
  5077. movq %r8, 24(%rsp)
  5078. movq %r9, 32(%rsp)
  5079. movq 24(%rsp), %rsi
  5080. movq 96(%rsp), %rbx
  5081. # Multiply
  5082. # A[0] * B[0]
  5083. movq (%rbx), %rax
  5084. mulq (%rsi)
  5085. movq %rax, %r8
  5086. movq %rdx, %r9
  5087. # A[0] * B[1]
  5088. movq 8(%rbx), %rax
  5089. mulq (%rsi)
  5090. xorq %r10, %r10
  5091. addq %rax, %r9
  5092. adcq %rdx, %r10
  5093. # A[1] * B[0]
  5094. movq (%rbx), %rax
  5095. mulq 8(%rsi)
  5096. xorq %r11, %r11
  5097. addq %rax, %r9
  5098. adcq %rdx, %r10
  5099. adcq $0x00, %r11
  5100. # A[0] * B[2]
  5101. movq 16(%rbx), %rax
  5102. mulq (%rsi)
  5103. addq %rax, %r10
  5104. adcq %rdx, %r11
  5105. # A[1] * B[1]
  5106. movq 8(%rbx), %rax
  5107. mulq 8(%rsi)
  5108. xorq %r12, %r12
  5109. addq %rax, %r10
  5110. adcq %rdx, %r11
  5111. adcq $0x00, %r12
  5112. # A[2] * B[0]
  5113. movq (%rbx), %rax
  5114. mulq 16(%rsi)
  5115. addq %rax, %r10
  5116. adcq %rdx, %r11
  5117. adcq $0x00, %r12
  5118. # A[0] * B[3]
  5119. movq 24(%rbx), %rax
  5120. mulq (%rsi)
  5121. xorq %r13, %r13
  5122. addq %rax, %r11
  5123. adcq %rdx, %r12
  5124. adcq $0x00, %r13
  5125. # A[1] * B[2]
  5126. movq 16(%rbx), %rax
  5127. mulq 8(%rsi)
  5128. addq %rax, %r11
  5129. adcq %rdx, %r12
  5130. adcq $0x00, %r13
  5131. # A[2] * B[1]
  5132. movq 8(%rbx), %rax
  5133. mulq 16(%rsi)
  5134. addq %rax, %r11
  5135. adcq %rdx, %r12
  5136. adcq $0x00, %r13
  5137. # A[3] * B[0]
  5138. movq (%rbx), %rax
  5139. mulq 24(%rsi)
  5140. addq %rax, %r11
  5141. adcq %rdx, %r12
  5142. adcq $0x00, %r13
  5143. # A[1] * B[3]
  5144. movq 24(%rbx), %rax
  5145. mulq 8(%rsi)
  5146. xorq %r14, %r14
  5147. addq %rax, %r12
  5148. adcq %rdx, %r13
  5149. adcq $0x00, %r14
  5150. # A[2] * B[2]
  5151. movq 16(%rbx), %rax
  5152. mulq 16(%rsi)
  5153. addq %rax, %r12
  5154. adcq %rdx, %r13
  5155. adcq $0x00, %r14
  5156. # A[3] * B[1]
  5157. movq 8(%rbx), %rax
  5158. mulq 24(%rsi)
  5159. addq %rax, %r12
  5160. adcq %rdx, %r13
  5161. adcq $0x00, %r14
  5162. # A[2] * B[3]
  5163. movq 24(%rbx), %rax
  5164. mulq 16(%rsi)
  5165. xorq %r15, %r15
  5166. addq %rax, %r13
  5167. adcq %rdx, %r14
  5168. adcq $0x00, %r15
  5169. # A[3] * B[2]
  5170. movq 16(%rbx), %rax
  5171. mulq 24(%rsi)
  5172. addq %rax, %r13
  5173. adcq %rdx, %r14
  5174. adcq $0x00, %r15
  5175. # A[3] * B[3]
  5176. movq 24(%rbx), %rax
  5177. mulq 24(%rsi)
  5178. addq %rax, %r14
  5179. adcq %rdx, %r15
  5180. # Reduce
  5181. movq $0x7fffffffffffffff, %rcx
  5182. # Move top half into t4-t7 and remove top bit from t3
  5183. shldq $0x01, %r14, %r15
  5184. shldq $0x01, %r13, %r14
  5185. shldq $0x01, %r12, %r13
  5186. shldq $0x01, %r11, %r12
  5187. andq %rcx, %r11
  5188. # Multiply top half by 19
  5189. movq $19, %rax
  5190. mulq %r12
  5191. xorq %r12, %r12
  5192. addq %rax, %r8
  5193. movq $19, %rax
  5194. adcq %rdx, %r12
  5195. mulq %r13
  5196. xorq %r13, %r13
  5197. addq %rax, %r9
  5198. movq $19, %rax
  5199. adcq %rdx, %r13
  5200. mulq %r14
  5201. xorq %r14, %r14
  5202. addq %rax, %r10
  5203. movq $19, %rax
  5204. adcq %rdx, %r14
  5205. mulq %r15
  5206. # Add remaining product results in
  5207. addq %r12, %r9
  5208. adcq %r13, %r10
  5209. adcq %r14, %r11
  5210. adcq %rax, %r11
  5211. adcq $0x00, %rdx
  5212. # Overflow
  5213. shldq $0x01, %r11, %rdx
  5214. imulq $19, %rdx, %rax
  5215. andq %rcx, %r11
  5216. addq %rax, %r8
  5217. adcq $0x00, %r9
  5218. adcq $0x00, %r10
  5219. adcq $0x00, %r11
  5220. # Reduce if top bit set
  5221. movq %r11, %rdx
  5222. shrq $63, %rdx
  5223. imulq $19, %rdx, %rax
  5224. andq %rcx, %r11
  5225. addq %rax, %r8
  5226. adcq $0x00, %r9
  5227. adcq $0x00, %r10
  5228. adcq $0x00, %r11
  5229. # Store
  5230. movq %r8, (%rdi)
  5231. movq %r9, 8(%rdi)
  5232. movq %r10, 16(%rdi)
  5233. movq %r11, 24(%rdi)
  5234. movq (%rsp), %rdi
  5235. movq 32(%rsp), %rsi
  5236. movq 88(%rsp), %rbx
  5237. # Multiply
  5238. # A[0] * B[0]
  5239. movq (%rbx), %rax
  5240. mulq (%rsi)
  5241. movq %rax, %r8
  5242. movq %rdx, %r9
  5243. # A[0] * B[1]
  5244. movq 8(%rbx), %rax
  5245. mulq (%rsi)
  5246. xorq %r10, %r10
  5247. addq %rax, %r9
  5248. adcq %rdx, %r10
  5249. # A[1] * B[0]
  5250. movq (%rbx), %rax
  5251. mulq 8(%rsi)
  5252. xorq %r11, %r11
  5253. addq %rax, %r9
  5254. adcq %rdx, %r10
  5255. adcq $0x00, %r11
  5256. # A[0] * B[2]
  5257. movq 16(%rbx), %rax
  5258. mulq (%rsi)
  5259. addq %rax, %r10
  5260. adcq %rdx, %r11
  5261. # A[1] * B[1]
  5262. movq 8(%rbx), %rax
  5263. mulq 8(%rsi)
  5264. xorq %r12, %r12
  5265. addq %rax, %r10
  5266. adcq %rdx, %r11
  5267. adcq $0x00, %r12
  5268. # A[2] * B[0]
  5269. movq (%rbx), %rax
  5270. mulq 16(%rsi)
  5271. addq %rax, %r10
  5272. adcq %rdx, %r11
  5273. adcq $0x00, %r12
  5274. # A[0] * B[3]
  5275. movq 24(%rbx), %rax
  5276. mulq (%rsi)
  5277. xorq %r13, %r13
  5278. addq %rax, %r11
  5279. adcq %rdx, %r12
  5280. adcq $0x00, %r13
  5281. # A[1] * B[2]
  5282. movq 16(%rbx), %rax
  5283. mulq 8(%rsi)
  5284. addq %rax, %r11
  5285. adcq %rdx, %r12
  5286. adcq $0x00, %r13
  5287. # A[2] * B[1]
  5288. movq 8(%rbx), %rax
  5289. mulq 16(%rsi)
  5290. addq %rax, %r11
  5291. adcq %rdx, %r12
  5292. adcq $0x00, %r13
  5293. # A[3] * B[0]
  5294. movq (%rbx), %rax
  5295. mulq 24(%rsi)
  5296. addq %rax, %r11
  5297. adcq %rdx, %r12
  5298. adcq $0x00, %r13
  5299. # A[1] * B[3]
  5300. movq 24(%rbx), %rax
  5301. mulq 8(%rsi)
  5302. xorq %r14, %r14
  5303. addq %rax, %r12
  5304. adcq %rdx, %r13
  5305. adcq $0x00, %r14
  5306. # A[2] * B[2]
  5307. movq 16(%rbx), %rax
  5308. mulq 16(%rsi)
  5309. addq %rax, %r12
  5310. adcq %rdx, %r13
  5311. adcq $0x00, %r14
  5312. # A[3] * B[1]
  5313. movq 8(%rbx), %rax
  5314. mulq 24(%rsi)
  5315. addq %rax, %r12
  5316. adcq %rdx, %r13
  5317. adcq $0x00, %r14
  5318. # A[2] * B[3]
  5319. movq 24(%rbx), %rax
  5320. mulq 16(%rsi)
  5321. xorq %r15, %r15
  5322. addq %rax, %r13
  5323. adcq %rdx, %r14
  5324. adcq $0x00, %r15
  5325. # A[3] * B[2]
  5326. movq 16(%rbx), %rax
  5327. mulq 24(%rsi)
  5328. addq %rax, %r13
  5329. adcq %rdx, %r14
  5330. adcq $0x00, %r15
  5331. # A[3] * B[3]
  5332. movq 24(%rbx), %rax
  5333. mulq 24(%rsi)
  5334. addq %rax, %r14
  5335. adcq %rdx, %r15
  5336. # Reduce
  5337. movq $0x7fffffffffffffff, %rcx
  5338. # Move top half into t4-t7 and remove top bit from t3
  5339. shldq $0x01, %r14, %r15
  5340. shldq $0x01, %r13, %r14
  5341. shldq $0x01, %r12, %r13
  5342. shldq $0x01, %r11, %r12
  5343. andq %rcx, %r11
  5344. # Multiply top half by 19
  5345. movq $19, %rax
  5346. mulq %r12
  5347. xorq %r12, %r12
  5348. addq %rax, %r8
  5349. movq $19, %rax
  5350. adcq %rdx, %r12
  5351. mulq %r13
  5352. xorq %r13, %r13
  5353. addq %rax, %r9
  5354. movq $19, %rax
  5355. adcq %rdx, %r13
  5356. mulq %r14
  5357. xorq %r14, %r14
  5358. addq %rax, %r10
  5359. movq $19, %rax
  5360. adcq %rdx, %r14
  5361. mulq %r15
  5362. # Add remaining product results in
  5363. addq %r12, %r9
  5364. adcq %r13, %r10
  5365. adcq %r14, %r11
  5366. adcq %rax, %r11
  5367. adcq $0x00, %rdx
  5368. # Overflow
  5369. shldq $0x01, %r11, %rdx
  5370. imulq $19, %rdx, %rax
  5371. andq %rcx, %r11
  5372. addq %rax, %r8
  5373. adcq $0x00, %r9
  5374. adcq $0x00, %r10
  5375. adcq $0x00, %r11
  5376. # Reduce if top bit set
  5377. movq %r11, %rdx
  5378. shrq $63, %rdx
  5379. imulq $19, %rdx, %rax
  5380. andq %rcx, %r11
  5381. addq %rax, %r8
  5382. adcq $0x00, %r9
  5383. adcq $0x00, %r10
  5384. adcq $0x00, %r11
  5385. # Store
  5386. movq %r8, (%rdi)
  5387. movq %r9, 8(%rdi)
  5388. movq %r10, 16(%rdi)
  5389. movq %r11, 24(%rdi)
  5390. movq 8(%rsp), %rdi
  5391. movq 88(%rsp), %rsi
  5392. movq 96(%rsp), %rbx
  5393. # Multiply
  5394. # A[0] * B[0]
  5395. movq (%rbx), %rax
  5396. mulq (%rsi)
  5397. movq %rax, %r8
  5398. movq %rdx, %r9
  5399. # A[0] * B[1]
  5400. movq 8(%rbx), %rax
  5401. mulq (%rsi)
  5402. xorq %r10, %r10
  5403. addq %rax, %r9
  5404. adcq %rdx, %r10
  5405. # A[1] * B[0]
  5406. movq (%rbx), %rax
  5407. mulq 8(%rsi)
  5408. xorq %r11, %r11
  5409. addq %rax, %r9
  5410. adcq %rdx, %r10
  5411. adcq $0x00, %r11
  5412. # A[0] * B[2]
  5413. movq 16(%rbx), %rax
  5414. mulq (%rsi)
  5415. addq %rax, %r10
  5416. adcq %rdx, %r11
  5417. # A[1] * B[1]
  5418. movq 8(%rbx), %rax
  5419. mulq 8(%rsi)
  5420. xorq %r12, %r12
  5421. addq %rax, %r10
  5422. adcq %rdx, %r11
  5423. adcq $0x00, %r12
  5424. # A[2] * B[0]
  5425. movq (%rbx), %rax
  5426. mulq 16(%rsi)
  5427. addq %rax, %r10
  5428. adcq %rdx, %r11
  5429. adcq $0x00, %r12
  5430. # A[0] * B[3]
  5431. movq 24(%rbx), %rax
  5432. mulq (%rsi)
  5433. xorq %r13, %r13
  5434. addq %rax, %r11
  5435. adcq %rdx, %r12
  5436. adcq $0x00, %r13
  5437. # A[1] * B[2]
  5438. movq 16(%rbx), %rax
  5439. mulq 8(%rsi)
  5440. addq %rax, %r11
  5441. adcq %rdx, %r12
  5442. adcq $0x00, %r13
  5443. # A[2] * B[1]
  5444. movq 8(%rbx), %rax
  5445. mulq 16(%rsi)
  5446. addq %rax, %r11
  5447. adcq %rdx, %r12
  5448. adcq $0x00, %r13
  5449. # A[3] * B[0]
  5450. movq (%rbx), %rax
  5451. mulq 24(%rsi)
  5452. addq %rax, %r11
  5453. adcq %rdx, %r12
  5454. adcq $0x00, %r13
  5455. # A[1] * B[3]
  5456. movq 24(%rbx), %rax
  5457. mulq 8(%rsi)
  5458. xorq %r14, %r14
  5459. addq %rax, %r12
  5460. adcq %rdx, %r13
  5461. adcq $0x00, %r14
  5462. # A[2] * B[2]
  5463. movq 16(%rbx), %rax
  5464. mulq 16(%rsi)
  5465. addq %rax, %r12
  5466. adcq %rdx, %r13
  5467. adcq $0x00, %r14
  5468. # A[3] * B[1]
  5469. movq 8(%rbx), %rax
  5470. mulq 24(%rsi)
  5471. addq %rax, %r12
  5472. adcq %rdx, %r13
  5473. adcq $0x00, %r14
  5474. # A[2] * B[3]
  5475. movq 24(%rbx), %rax
  5476. mulq 16(%rsi)
  5477. xorq %r15, %r15
  5478. addq %rax, %r13
  5479. adcq %rdx, %r14
  5480. adcq $0x00, %r15
  5481. # A[3] * B[2]
  5482. movq 16(%rbx), %rax
  5483. mulq 24(%rsi)
  5484. addq %rax, %r13
  5485. adcq %rdx, %r14
  5486. adcq $0x00, %r15
  5487. # A[3] * B[3]
  5488. movq 24(%rbx), %rax
  5489. mulq 24(%rsi)
  5490. addq %rax, %r14
  5491. adcq %rdx, %r15
  5492. # Reduce
  5493. movq $0x7fffffffffffffff, %rcx
  5494. # Move top half into t4-t7 and remove top bit from t3
  5495. shldq $0x01, %r14, %r15
  5496. shldq $0x01, %r13, %r14
  5497. shldq $0x01, %r12, %r13
  5498. shldq $0x01, %r11, %r12
  5499. andq %rcx, %r11
  5500. # Multiply top half by 19
  5501. movq $19, %rax
  5502. mulq %r12
  5503. xorq %r12, %r12
  5504. addq %rax, %r8
  5505. movq $19, %rax
  5506. adcq %rdx, %r12
  5507. mulq %r13
  5508. xorq %r13, %r13
  5509. addq %rax, %r9
  5510. movq $19, %rax
  5511. adcq %rdx, %r13
  5512. mulq %r14
  5513. xorq %r14, %r14
  5514. addq %rax, %r10
  5515. movq $19, %rax
  5516. adcq %rdx, %r14
  5517. mulq %r15
  5518. # Add remaining product results in
  5519. addq %r12, %r9
  5520. adcq %r13, %r10
  5521. adcq %r14, %r11
  5522. adcq %rax, %r11
  5523. adcq $0x00, %rdx
  5524. # Overflow
  5525. shldq $0x01, %r11, %rdx
  5526. imulq $19, %rdx, %rax
  5527. andq %rcx, %r11
  5528. addq %rax, %r8
  5529. adcq $0x00, %r9
  5530. adcq $0x00, %r10
  5531. adcq $0x00, %r11
  5532. # Reduce if top bit set
  5533. movq %r11, %rdx
  5534. shrq $63, %rdx
  5535. imulq $19, %rdx, %rax
  5536. andq %rcx, %r11
  5537. addq %rax, %r8
  5538. adcq $0x00, %r9
  5539. adcq $0x00, %r10
  5540. adcq $0x00, %r11
  5541. # Store
  5542. movq %r8, (%rdi)
  5543. movq %r9, 8(%rdi)
  5544. movq %r10, 16(%rdi)
  5545. movq %r11, 24(%rdi)
  5546. movq 16(%rsp), %rdi
  5547. movq 24(%rsp), %rsi
  5548. movq 32(%rsp), %rbx
  5549. # Multiply
  5550. # A[0] * B[0]
  5551. movq (%rbx), %rax
  5552. mulq (%rsi)
  5553. movq %rax, %r8
  5554. movq %rdx, %r9
  5555. # A[0] * B[1]
  5556. movq 8(%rbx), %rax
  5557. mulq (%rsi)
  5558. xorq %r10, %r10
  5559. addq %rax, %r9
  5560. adcq %rdx, %r10
  5561. # A[1] * B[0]
  5562. movq (%rbx), %rax
  5563. mulq 8(%rsi)
  5564. xorq %r11, %r11
  5565. addq %rax, %r9
  5566. adcq %rdx, %r10
  5567. adcq $0x00, %r11
  5568. # A[0] * B[2]
  5569. movq 16(%rbx), %rax
  5570. mulq (%rsi)
  5571. addq %rax, %r10
  5572. adcq %rdx, %r11
  5573. # A[1] * B[1]
  5574. movq 8(%rbx), %rax
  5575. mulq 8(%rsi)
  5576. xorq %r12, %r12
  5577. addq %rax, %r10
  5578. adcq %rdx, %r11
  5579. adcq $0x00, %r12
  5580. # A[2] * B[0]
  5581. movq (%rbx), %rax
  5582. mulq 16(%rsi)
  5583. addq %rax, %r10
  5584. adcq %rdx, %r11
  5585. adcq $0x00, %r12
  5586. # A[0] * B[3]
  5587. movq 24(%rbx), %rax
  5588. mulq (%rsi)
  5589. xorq %r13, %r13
  5590. addq %rax, %r11
  5591. adcq %rdx, %r12
  5592. adcq $0x00, %r13
  5593. # A[1] * B[2]
  5594. movq 16(%rbx), %rax
  5595. mulq 8(%rsi)
  5596. addq %rax, %r11
  5597. adcq %rdx, %r12
  5598. adcq $0x00, %r13
  5599. # A[2] * B[1]
  5600. movq 8(%rbx), %rax
  5601. mulq 16(%rsi)
  5602. addq %rax, %r11
  5603. adcq %rdx, %r12
  5604. adcq $0x00, %r13
  5605. # A[3] * B[0]
  5606. movq (%rbx), %rax
  5607. mulq 24(%rsi)
  5608. addq %rax, %r11
  5609. adcq %rdx, %r12
  5610. adcq $0x00, %r13
  5611. # A[1] * B[3]
  5612. movq 24(%rbx), %rax
  5613. mulq 8(%rsi)
  5614. xorq %r14, %r14
  5615. addq %rax, %r12
  5616. adcq %rdx, %r13
  5617. adcq $0x00, %r14
  5618. # A[2] * B[2]
  5619. movq 16(%rbx), %rax
  5620. mulq 16(%rsi)
  5621. addq %rax, %r12
  5622. adcq %rdx, %r13
  5623. adcq $0x00, %r14
  5624. # A[3] * B[1]
  5625. movq 8(%rbx), %rax
  5626. mulq 24(%rsi)
  5627. addq %rax, %r12
  5628. adcq %rdx, %r13
  5629. adcq $0x00, %r14
  5630. # A[2] * B[3]
  5631. movq 24(%rbx), %rax
  5632. mulq 16(%rsi)
  5633. xorq %r15, %r15
  5634. addq %rax, %r13
  5635. adcq %rdx, %r14
  5636. adcq $0x00, %r15
  5637. # A[3] * B[2]
  5638. movq 16(%rbx), %rax
  5639. mulq 24(%rsi)
  5640. addq %rax, %r13
  5641. adcq %rdx, %r14
  5642. adcq $0x00, %r15
  5643. # A[3] * B[3]
  5644. movq 24(%rbx), %rax
  5645. mulq 24(%rsi)
  5646. addq %rax, %r14
  5647. adcq %rdx, %r15
  5648. # Reduce
  5649. movq $0x7fffffffffffffff, %rcx
  5650. # Move top half into t4-t7 and remove top bit from t3
  5651. shldq $0x01, %r14, %r15
  5652. shldq $0x01, %r13, %r14
  5653. shldq $0x01, %r12, %r13
  5654. shldq $0x01, %r11, %r12
  5655. andq %rcx, %r11
  5656. # Multiply top half by 19
  5657. movq $19, %rax
  5658. mulq %r12
  5659. xorq %r12, %r12
  5660. addq %rax, %r8
  5661. movq $19, %rax
  5662. adcq %rdx, %r12
  5663. mulq %r13
  5664. xorq %r13, %r13
  5665. addq %rax, %r9
  5666. movq $19, %rax
  5667. adcq %rdx, %r13
  5668. mulq %r14
  5669. xorq %r14, %r14
  5670. addq %rax, %r10
  5671. movq $19, %rax
  5672. adcq %rdx, %r14
  5673. mulq %r15
  5674. # Add remaining product results in
  5675. addq %r12, %r9
  5676. adcq %r13, %r10
  5677. adcq %r14, %r11
  5678. adcq %rax, %r11
  5679. adcq $0x00, %rdx
  5680. # Overflow
  5681. shldq $0x01, %r11, %rdx
  5682. imulq $19, %rdx, %rax
  5683. andq %rcx, %r11
  5684. addq %rax, %r8
  5685. adcq $0x00, %r9
  5686. adcq $0x00, %r10
  5687. adcq $0x00, %r11
  5688. # Reduce if top bit set
  5689. movq %r11, %rdx
  5690. shrq $63, %rdx
  5691. imulq $19, %rdx, %rax
  5692. andq %rcx, %r11
  5693. addq %rax, %r8
  5694. adcq $0x00, %r9
  5695. adcq $0x00, %r10
  5696. adcq $0x00, %r11
  5697. # Store
  5698. movq %r8, (%rdi)
  5699. movq %r9, 8(%rdi)
  5700. movq %r10, 16(%rdi)
  5701. movq %r11, 24(%rdi)
  5702. addq $40, %rsp
  5703. popq %r15
  5704. popq %r14
  5705. popq %r13
  5706. popq %r12
  5707. popq %rbx
  5708. repz retq
  5709. #ifndef __APPLE__
  5710. .size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
  5711. #endif /* __APPLE__ */
  5712. #ifndef __APPLE__
  5713. .text
  5714. .globl fe_ge_dbl_x64
  5715. .type fe_ge_dbl_x64,@function
  5716. .align 16
  5717. fe_ge_dbl_x64:
  5718. #else
  5719. .section __TEXT,__text
  5720. .globl _fe_ge_dbl_x64
  5721. .p2align 4
  5722. _fe_ge_dbl_x64:
  5723. #endif /* __APPLE__ */
  5724. pushq %rbx
  5725. pushq %r12
  5726. pushq %r13
  5727. pushq %r14
  5728. pushq %r15
  5729. subq $0x50, %rsp
  5730. movq %rdi, (%rsp)
  5731. movq %rsi, 8(%rsp)
  5732. movq %rdx, 16(%rsp)
  5733. movq %rcx, 24(%rsp)
  5734. movq %r8, 32(%rsp)
  5735. movq %r9, 40(%rsp)
  5736. movq (%rsp), %rdi
  5737. movq 32(%rsp), %rsi
  5738. # Square
  5739. # A[0] * A[1]
  5740. movq (%rsi), %rax
  5741. mulq 8(%rsi)
  5742. movq %rax, %r9
  5743. movq %rdx, %r10
  5744. # A[0] * A[2]
  5745. movq (%rsi), %rax
  5746. mulq 16(%rsi)
  5747. xorq %r11, %r11
  5748. addq %rax, %r10
  5749. adcq %rdx, %r11
  5750. # A[0] * A[3]
  5751. movq (%rsi), %rax
  5752. mulq 24(%rsi)
  5753. xorq %r12, %r12
  5754. addq %rax, %r11
  5755. adcq %rdx, %r12
  5756. # A[1] * A[2]
  5757. movq 8(%rsi), %rax
  5758. mulq 16(%rsi)
  5759. xorq %r13, %r13
  5760. addq %rax, %r11
  5761. adcq %rdx, %r12
  5762. adcq $0x00, %r13
  5763. # A[1] * A[3]
  5764. movq 8(%rsi), %rax
  5765. mulq 24(%rsi)
  5766. addq %rax, %r12
  5767. adcq %rdx, %r13
  5768. # A[2] * A[3]
  5769. movq 16(%rsi), %rax
  5770. mulq 24(%rsi)
  5771. xorq %r14, %r14
  5772. addq %rax, %r13
  5773. adcq %rdx, %r14
  5774. # Double
  5775. xorq %r15, %r15
  5776. addq %r9, %r9
  5777. adcq %r10, %r10
  5778. adcq %r11, %r11
  5779. adcq %r12, %r12
  5780. adcq %r13, %r13
  5781. adcq %r14, %r14
  5782. adcq $0x00, %r15
  5783. # A[0] * A[0]
  5784. movq (%rsi), %rax
  5785. mulq %rax
  5786. movq %rax, %r8
  5787. movq %rdx, %rcx
  5788. # A[1] * A[1]
  5789. movq 8(%rsi), %rax
  5790. mulq %rax
  5791. addq %rcx, %r9
  5792. adcq %rax, %r10
  5793. adcq $0x00, %rdx
  5794. movq %rdx, %rcx
  5795. # A[2] * A[2]
  5796. movq 16(%rsi), %rax
  5797. mulq %rax
  5798. addq %rcx, %r11
  5799. adcq %rax, %r12
  5800. adcq $0x00, %rdx
  5801. movq %rdx, %rcx
  5802. # A[3] * A[3]
  5803. movq 24(%rsi), %rax
  5804. mulq %rax
  5805. addq %rax, %r14
  5806. adcq %rdx, %r15
  5807. addq %rcx, %r13
  5808. adcq $0x00, %r14
  5809. adcq $0x00, %r15
  5810. # Reduce
  5811. movq $0x7fffffffffffffff, %rcx
  5812. # Move top half into t4-t7 and remove top bit from t3
  5813. shldq $0x01, %r14, %r15
  5814. shldq $0x01, %r13, %r14
  5815. shldq $0x01, %r12, %r13
  5816. shldq $0x01, %r11, %r12
  5817. andq %rcx, %r11
  5818. # Multiply top half by 19
  5819. movq $19, %rax
  5820. mulq %r12
  5821. xorq %r12, %r12
  5822. addq %rax, %r8
  5823. movq $19, %rax
  5824. adcq %rdx, %r12
  5825. mulq %r13
  5826. xorq %r13, %r13
  5827. addq %rax, %r9
  5828. movq $19, %rax
  5829. adcq %rdx, %r13
  5830. mulq %r14
  5831. xorq %r14, %r14
  5832. addq %rax, %r10
  5833. movq $19, %rax
  5834. adcq %rdx, %r14
  5835. mulq %r15
  5836. # Add remaining product results in
  5837. addq %r12, %r9
  5838. adcq %r13, %r10
  5839. adcq %r14, %r11
  5840. adcq %rax, %r11
  5841. adcq $0x00, %rdx
  5842. # Overflow
  5843. shldq $0x01, %r11, %rdx
  5844. imulq $19, %rdx, %rax
  5845. andq %rcx, %r11
  5846. addq %rax, %r8
  5847. adcq $0x00, %r9
  5848. adcq $0x00, %r10
  5849. adcq $0x00, %r11
  5850. # Reduce if top bit set
  5851. movq %r11, %rdx
  5852. shrq $63, %rdx
  5853. imulq $19, %rdx, %rax
  5854. andq %rcx, %r11
  5855. addq %rax, %r8
  5856. adcq $0x00, %r9
  5857. adcq $0x00, %r10
  5858. adcq $0x00, %r11
  5859. # Store
  5860. movq %r8, (%rdi)
  5861. movq %r9, 8(%rdi)
  5862. movq %r10, 16(%rdi)
  5863. movq %r11, 24(%rdi)
  5864. movq 16(%rsp), %rdi
  5865. movq 40(%rsp), %rsi
  5866. # Square
  5867. # A[0] * A[1]
  5868. movq (%rsi), %rax
  5869. mulq 8(%rsi)
  5870. movq %rax, %r9
  5871. movq %rdx, %r10
  5872. # A[0] * A[2]
  5873. movq (%rsi), %rax
  5874. mulq 16(%rsi)
  5875. xorq %r11, %r11
  5876. addq %rax, %r10
  5877. adcq %rdx, %r11
  5878. # A[0] * A[3]
  5879. movq (%rsi), %rax
  5880. mulq 24(%rsi)
  5881. xorq %r12, %r12
  5882. addq %rax, %r11
  5883. adcq %rdx, %r12
  5884. # A[1] * A[2]
  5885. movq 8(%rsi), %rax
  5886. mulq 16(%rsi)
  5887. xorq %r13, %r13
  5888. addq %rax, %r11
  5889. adcq %rdx, %r12
  5890. adcq $0x00, %r13
  5891. # A[1] * A[3]
  5892. movq 8(%rsi), %rax
  5893. mulq 24(%rsi)
  5894. addq %rax, %r12
  5895. adcq %rdx, %r13
  5896. # A[2] * A[3]
  5897. movq 16(%rsi), %rax
  5898. mulq 24(%rsi)
  5899. xorq %r14, %r14
  5900. addq %rax, %r13
  5901. adcq %rdx, %r14
  5902. # Double
  5903. xorq %r15, %r15
  5904. addq %r9, %r9
  5905. adcq %r10, %r10
  5906. adcq %r11, %r11
  5907. adcq %r12, %r12
  5908. adcq %r13, %r13
  5909. adcq %r14, %r14
  5910. adcq $0x00, %r15
  5911. # A[0] * A[0]
  5912. movq (%rsi), %rax
  5913. mulq %rax
  5914. movq %rax, %r8
  5915. movq %rdx, %rcx
  5916. # A[1] * A[1]
  5917. movq 8(%rsi), %rax
  5918. mulq %rax
  5919. addq %rcx, %r9
  5920. adcq %rax, %r10
  5921. adcq $0x00, %rdx
  5922. movq %rdx, %rcx
  5923. # A[2] * A[2]
  5924. movq 16(%rsi), %rax
  5925. mulq %rax
  5926. addq %rcx, %r11
  5927. adcq %rax, %r12
  5928. adcq $0x00, %rdx
  5929. movq %rdx, %rcx
  5930. # A[3] * A[3]
  5931. movq 24(%rsi), %rax
  5932. mulq %rax
  5933. addq %rax, %r14
  5934. adcq %rdx, %r15
  5935. addq %rcx, %r13
  5936. adcq $0x00, %r14
  5937. adcq $0x00, %r15
  5938. # Reduce
  5939. movq $0x7fffffffffffffff, %rcx
  5940. # Move top half into t4-t7 and remove top bit from t3
  5941. shldq $0x01, %r14, %r15
  5942. shldq $0x01, %r13, %r14
  5943. shldq $0x01, %r12, %r13
  5944. shldq $0x01, %r11, %r12
  5945. andq %rcx, %r11
  5946. # Multiply top half by 19
  5947. movq $19, %rax
  5948. mulq %r12
  5949. xorq %r12, %r12
  5950. addq %rax, %r8
  5951. movq $19, %rax
  5952. adcq %rdx, %r12
  5953. mulq %r13
  5954. xorq %r13, %r13
  5955. addq %rax, %r9
  5956. movq $19, %rax
  5957. adcq %rdx, %r13
  5958. mulq %r14
  5959. xorq %r14, %r14
  5960. addq %rax, %r10
  5961. movq $19, %rax
  5962. adcq %rdx, %r14
  5963. mulq %r15
  5964. # Add remaining product results in
  5965. addq %r12, %r9
  5966. adcq %r13, %r10
  5967. adcq %r14, %r11
  5968. adcq %rax, %r11
  5969. adcq $0x00, %rdx
  5970. # Overflow
  5971. shldq $0x01, %r11, %rdx
  5972. imulq $19, %rdx, %rax
  5973. andq %rcx, %r11
  5974. addq %rax, %r8
  5975. adcq $0x00, %r9
  5976. adcq $0x00, %r10
  5977. adcq $0x00, %r11
  5978. # Reduce if top bit set
  5979. movq %r11, %rdx
  5980. shrq $63, %rdx
  5981. imulq $19, %rdx, %rax
  5982. andq %rcx, %r11
  5983. addq %rax, %r8
  5984. adcq $0x00, %r9
  5985. adcq $0x00, %r10
  5986. adcq $0x00, %r11
  5987. # Store
  5988. movq %r8, (%rdi)
  5989. movq %r9, 8(%rdi)
  5990. movq %r10, 16(%rdi)
  5991. movq %r11, 24(%rdi)
  5992. movq 24(%rsp), %rdi
  5993. movq 128(%rsp), %rsi
  5994. # Square * 2
  5995. # A[0] * A[1]
  5996. movq (%rsi), %rax
  5997. mulq 8(%rsi)
  5998. movq %rax, %r9
  5999. movq %rdx, %r10
  6000. # A[0] * A[2]
  6001. movq (%rsi), %rax
  6002. mulq 16(%rsi)
  6003. xorq %r11, %r11
  6004. addq %rax, %r10
  6005. adcq %rdx, %r11
  6006. # A[0] * A[3]
  6007. movq (%rsi), %rax
  6008. mulq 24(%rsi)
  6009. xorq %r12, %r12
  6010. addq %rax, %r11
  6011. adcq %rdx, %r12
  6012. # A[1] * A[2]
  6013. movq 8(%rsi), %rax
  6014. mulq 16(%rsi)
  6015. xorq %r13, %r13
  6016. addq %rax, %r11
  6017. adcq %rdx, %r12
  6018. adcq $0x00, %r13
  6019. # A[1] * A[3]
  6020. movq 8(%rsi), %rax
  6021. mulq 24(%rsi)
  6022. addq %rax, %r12
  6023. adcq %rdx, %r13
  6024. # A[2] * A[3]
  6025. movq 16(%rsi), %rax
  6026. mulq 24(%rsi)
  6027. xorq %r14, %r14
  6028. addq %rax, %r13
  6029. adcq %rdx, %r14
  6030. # Double
  6031. xorq %r15, %r15
  6032. addq %r9, %r9
  6033. adcq %r10, %r10
  6034. adcq %r11, %r11
  6035. adcq %r12, %r12
  6036. adcq %r13, %r13
  6037. adcq %r14, %r14
  6038. adcq $0x00, %r15
  6039. # A[0] * A[0]
  6040. movq (%rsi), %rax
  6041. mulq %rax
  6042. movq %rax, %r8
  6043. movq %rdx, %rcx
  6044. # A[1] * A[1]
  6045. movq 8(%rsi), %rax
  6046. mulq %rax
  6047. addq %rcx, %r9
  6048. adcq %rax, %r10
  6049. adcq $0x00, %rdx
  6050. movq %rdx, %rcx
  6051. # A[2] * A[2]
  6052. movq 16(%rsi), %rax
  6053. mulq %rax
  6054. addq %rcx, %r11
  6055. adcq %rax, %r12
  6056. adcq $0x00, %rdx
  6057. movq %rdx, %rcx
  6058. # A[3] * A[3]
  6059. movq 24(%rsi), %rax
  6060. mulq %rax
  6061. addq %rax, %r14
  6062. adcq %rdx, %r15
  6063. addq %rcx, %r13
  6064. adcq $0x00, %r14
  6065. adcq $0x00, %r15
  6066. # Reduce
  6067. movq $0x7fffffffffffffff, %rbx
  6068. xorq %rax, %rax
  6069. # Move top half into t4-t7 and remove top bit from t3
  6070. shldq $3, %r15, %rax
  6071. shldq $2, %r14, %r15
  6072. shldq $2, %r13, %r14
  6073. shldq $2, %r12, %r13
  6074. shldq $2, %r11, %r12
  6075. shldq $0x01, %r10, %r11
  6076. shldq $0x01, %r9, %r10
  6077. shldq $0x01, %r8, %r9
  6078. shlq $0x01, %r8
  6079. andq %rbx, %r11
  6080. # Two out left, one in right
  6081. andq %rbx, %r15
  6082. # Multiply top bits by 19*19
  6083. imulq $0x169, %rax, %rcx
  6084. # Multiply top half by 19
  6085. movq $19, %rax
  6086. mulq %r12
  6087. xorq %r12, %r12
  6088. addq %rax, %r8
  6089. movq $19, %rax
  6090. adcq %rdx, %r12
  6091. mulq %r13
  6092. xorq %r13, %r13
  6093. addq %rax, %r9
  6094. movq $19, %rax
  6095. adcq %rdx, %r13
  6096. mulq %r14
  6097. xorq %r14, %r14
  6098. addq %rax, %r10
  6099. movq $19, %rax
  6100. adcq %rdx, %r14
  6101. mulq %r15
  6102. # Add remaining produce results in
  6103. addq %rcx, %r8
  6104. adcq %r12, %r9
  6105. adcq %r13, %r10
  6106. adcq %r14, %r11
  6107. adcq %rax, %r11
  6108. adcq $0x00, %rdx
  6109. # Overflow
  6110. shldq $0x01, %r11, %rdx
  6111. imulq $19, %rdx, %rax
  6112. andq %rbx, %r11
  6113. addq %rax, %r8
  6114. adcq $0x00, %r9
  6115. adcq $0x00, %r10
  6116. adcq $0x00, %r11
  6117. # Reduce if top bit set
  6118. movq %r11, %rdx
  6119. shrq $63, %rdx
  6120. imulq $19, %rdx, %rax
  6121. andq %rbx, %r11
  6122. addq %rax, %r8
  6123. adcq $0x00, %r9
  6124. adcq $0x00, %r10
  6125. adcq $0x00, %r11
  6126. # Store
  6127. movq %r8, (%rdi)
  6128. movq %r9, 8(%rdi)
  6129. movq %r10, 16(%rdi)
  6130. movq %r11, 24(%rdi)
  6131. movq 8(%rsp), %rdi
  6132. movq 32(%rsp), %rsi
  6133. movq 40(%rsp), %rbx
  6134. # Add
  6135. movq (%rsi), %r8
  6136. movq 8(%rsi), %r9
  6137. addq (%rbx), %r8
  6138. movq 16(%rsi), %r10
  6139. adcq 8(%rbx), %r9
  6140. movq 24(%rsi), %rcx
  6141. adcq 16(%rbx), %r10
  6142. movq $-19, %rax
  6143. adcq 24(%rbx), %rcx
  6144. movq $0x7fffffffffffffff, %rdx
  6145. movq %rcx, %r11
  6146. sarq $63, %rcx
  6147. # Mask the modulus
  6148. andq %rcx, %rax
  6149. andq %rcx, %rdx
  6150. # Sub modulus (if overflow)
  6151. subq %rax, %r8
  6152. sbbq %rcx, %r9
  6153. sbbq %rcx, %r10
  6154. sbbq %rdx, %r11
  6155. movq %r8, (%rdi)
  6156. movq %r9, 8(%rdi)
  6157. movq %r10, 16(%rdi)
  6158. movq %r11, 24(%rdi)
  6159. leaq 48(%rsp), %rdi
  6160. movq 8(%rsp), %rsi
  6161. # Square
  6162. # A[0] * A[1]
  6163. movq (%rsi), %rax
  6164. mulq 8(%rsi)
  6165. movq %rax, %r9
  6166. movq %rdx, %r10
  6167. # A[0] * A[2]
  6168. movq (%rsi), %rax
  6169. mulq 16(%rsi)
  6170. xorq %r11, %r11
  6171. addq %rax, %r10
  6172. adcq %rdx, %r11
  6173. # A[0] * A[3]
  6174. movq (%rsi), %rax
  6175. mulq 24(%rsi)
  6176. xorq %r12, %r12
  6177. addq %rax, %r11
  6178. adcq %rdx, %r12
  6179. # A[1] * A[2]
  6180. movq 8(%rsi), %rax
  6181. mulq 16(%rsi)
  6182. xorq %r13, %r13
  6183. addq %rax, %r11
  6184. adcq %rdx, %r12
  6185. adcq $0x00, %r13
  6186. # A[1] * A[3]
  6187. movq 8(%rsi), %rax
  6188. mulq 24(%rsi)
  6189. addq %rax, %r12
  6190. adcq %rdx, %r13
  6191. # A[2] * A[3]
  6192. movq 16(%rsi), %rax
  6193. mulq 24(%rsi)
  6194. xorq %r14, %r14
  6195. addq %rax, %r13
  6196. adcq %rdx, %r14
  6197. # Double
  6198. xorq %r15, %r15
  6199. addq %r9, %r9
  6200. adcq %r10, %r10
  6201. adcq %r11, %r11
  6202. adcq %r12, %r12
  6203. adcq %r13, %r13
  6204. adcq %r14, %r14
  6205. adcq $0x00, %r15
  6206. # A[0] * A[0]
  6207. movq (%rsi), %rax
  6208. mulq %rax
  6209. movq %rax, %r8
  6210. movq %rdx, %rcx
  6211. # A[1] * A[1]
  6212. movq 8(%rsi), %rax
  6213. mulq %rax
  6214. addq %rcx, %r9
  6215. adcq %rax, %r10
  6216. adcq $0x00, %rdx
  6217. movq %rdx, %rcx
  6218. # A[2] * A[2]
  6219. movq 16(%rsi), %rax
  6220. mulq %rax
  6221. addq %rcx, %r11
  6222. adcq %rax, %r12
  6223. adcq $0x00, %rdx
  6224. movq %rdx, %rcx
  6225. # A[3] * A[3]
  6226. movq 24(%rsi), %rax
  6227. mulq %rax
  6228. addq %rax, %r14
  6229. adcq %rdx, %r15
  6230. addq %rcx, %r13
  6231. adcq $0x00, %r14
  6232. adcq $0x00, %r15
  6233. # Reduce
  6234. movq $0x7fffffffffffffff, %rcx
  6235. # Move top half into t4-t7 and remove top bit from t3
  6236. shldq $0x01, %r14, %r15
  6237. shldq $0x01, %r13, %r14
  6238. shldq $0x01, %r12, %r13
  6239. shldq $0x01, %r11, %r12
  6240. andq %rcx, %r11
  6241. # Multiply top half by 19
  6242. movq $19, %rax
  6243. mulq %r12
  6244. xorq %r12, %r12
  6245. addq %rax, %r8
  6246. movq $19, %rax
  6247. adcq %rdx, %r12
  6248. mulq %r13
  6249. xorq %r13, %r13
  6250. addq %rax, %r9
  6251. movq $19, %rax
  6252. adcq %rdx, %r13
  6253. mulq %r14
  6254. xorq %r14, %r14
  6255. addq %rax, %r10
  6256. movq $19, %rax
  6257. adcq %rdx, %r14
  6258. mulq %r15
  6259. # Add remaining product results in
  6260. addq %r12, %r9
  6261. adcq %r13, %r10
  6262. adcq %r14, %r11
  6263. adcq %rax, %r11
  6264. adcq $0x00, %rdx
  6265. # Overflow
  6266. shldq $0x01, %r11, %rdx
  6267. imulq $19, %rdx, %rax
  6268. andq %rcx, %r11
  6269. addq %rax, %r8
  6270. adcq $0x00, %r9
  6271. adcq $0x00, %r10
  6272. adcq $0x00, %r11
  6273. # Reduce if top bit set
  6274. movq %r11, %rdx
  6275. shrq $63, %rdx
  6276. imulq $19, %rdx, %rax
  6277. andq %rcx, %r11
  6278. addq %rax, %r8
  6279. adcq $0x00, %r9
  6280. adcq $0x00, %r10
  6281. adcq $0x00, %r11
  6282. # Store
  6283. movq %r8, (%rdi)
  6284. movq %r9, 8(%rdi)
  6285. movq %r10, 16(%rdi)
  6286. movq %r11, 24(%rdi)
  6287. movq 8(%rsp), %rdi
  6288. movq 16(%rsp), %rsi
  6289. movq (%rsp), %rbx
  6290. # Add
  6291. movq (%rsi), %r8
  6292. movq 8(%rsi), %r9
  6293. addq (%rbx), %r8
  6294. movq 16(%rsi), %r10
  6295. adcq 8(%rbx), %r9
  6296. movq 24(%rsi), %rcx
  6297. adcq 16(%rbx), %r10
  6298. movq $-19, %rax
  6299. adcq 24(%rbx), %rcx
  6300. movq $0x7fffffffffffffff, %rdx
  6301. movq %rcx, %r11
  6302. sarq $63, %rcx
  6303. # Mask the modulus
  6304. andq %rcx, %rax
  6305. andq %rcx, %rdx
  6306. # Sub modulus (if overflow)
  6307. subq %rax, %r8
  6308. sbbq %rcx, %r9
  6309. sbbq %rcx, %r10
  6310. sbbq %rdx, %r11
  6311. movq %r8, (%rdi)
  6312. movq %r9, 8(%rdi)
  6313. movq %r10, 16(%rdi)
  6314. movq %r11, 24(%rdi)
  6315. movq 16(%rsp), %rdi
  6316. movq 16(%rsp), %rsi
  6317. movq (%rsp), %rbx
  6318. # Sub
  6319. movq (%rsi), %r8
  6320. movq 8(%rsi), %r9
  6321. movq 16(%rsi), %r10
  6322. movq 24(%rsi), %r11
  6323. subq (%rbx), %r8
  6324. movq $0x00, %rcx
  6325. sbbq 8(%rbx), %r9
  6326. movq $-19, %rax
  6327. sbbq 16(%rbx), %r10
  6328. movq $0x7fffffffffffffff, %rdx
  6329. sbbq 24(%rbx), %r11
  6330. sbbq $0x00, %rcx
  6331. # Mask the modulus
  6332. andq %rcx, %rax
  6333. andq %rcx, %rdx
  6334. # Add modulus (if underflow)
  6335. addq %rax, %r8
  6336. adcq %rcx, %r9
  6337. adcq %rcx, %r10
  6338. adcq %rdx, %r11
  6339. movq %r8, (%rdi)
  6340. movq %r9, 8(%rdi)
  6341. movq %r10, 16(%rdi)
  6342. movq %r11, 24(%rdi)
  6343. movq (%rsp), %rdi
  6344. leaq 48(%rsp), %rsi
  6345. movq 8(%rsp), %rbx
  6346. # Sub
  6347. movq (%rsi), %r8
  6348. movq 8(%rsi), %r9
  6349. movq 16(%rsi), %r10
  6350. movq 24(%rsi), %r11
  6351. subq (%rbx), %r8
  6352. movq $0x00, %rcx
  6353. sbbq 8(%rbx), %r9
  6354. movq $-19, %rax
  6355. sbbq 16(%rbx), %r10
  6356. movq $0x7fffffffffffffff, %rdx
  6357. sbbq 24(%rbx), %r11
  6358. sbbq $0x00, %rcx
  6359. # Mask the modulus
  6360. andq %rcx, %rax
  6361. andq %rcx, %rdx
  6362. # Add modulus (if underflow)
  6363. addq %rax, %r8
  6364. adcq %rcx, %r9
  6365. adcq %rcx, %r10
  6366. adcq %rdx, %r11
  6367. movq %r8, (%rdi)
  6368. movq %r9, 8(%rdi)
  6369. movq %r10, 16(%rdi)
  6370. movq %r11, 24(%rdi)
  6371. movq 24(%rsp), %rdi
  6372. movq 24(%rsp), %rsi
  6373. movq 16(%rsp), %rbx
  6374. # Sub
  6375. movq (%rsi), %r8
  6376. movq 8(%rsi), %r9
  6377. movq 16(%rsi), %r10
  6378. movq 24(%rsi), %r11
  6379. subq (%rbx), %r8
  6380. movq $0x00, %rcx
  6381. sbbq 8(%rbx), %r9
  6382. movq $-19, %rax
  6383. sbbq 16(%rbx), %r10
  6384. movq $0x7fffffffffffffff, %rdx
  6385. sbbq 24(%rbx), %r11
  6386. sbbq $0x00, %rcx
  6387. # Mask the modulus
  6388. andq %rcx, %rax
  6389. andq %rcx, %rdx
  6390. # Add modulus (if underflow)
  6391. addq %rax, %r8
  6392. adcq %rcx, %r9
  6393. adcq %rcx, %r10
  6394. adcq %rdx, %r11
  6395. movq %r8, (%rdi)
  6396. movq %r9, 8(%rdi)
  6397. movq %r10, 16(%rdi)
  6398. movq %r11, 24(%rdi)
  6399. addq $0x50, %rsp
  6400. popq %r15
  6401. popq %r14
  6402. popq %r13
  6403. popq %r12
  6404. popq %rbx
  6405. repz retq
  6406. #ifndef __APPLE__
  6407. .size fe_ge_dbl_x64,.-fe_ge_dbl_x64
  6408. #endif /* __APPLE__ */
  6409. #ifndef __APPLE__
  6410. .text
  6411. .globl fe_ge_madd_x64
  6412. .type fe_ge_madd_x64,@function
  6413. .align 16
  6414. fe_ge_madd_x64:
  6415. #else
  6416. .section __TEXT,__text
  6417. .globl _fe_ge_madd_x64
  6418. .p2align 4
  6419. _fe_ge_madd_x64:
  6420. #endif /* __APPLE__ */
  6421. pushq %rbx
  6422. pushq %r12
  6423. pushq %r13
  6424. pushq %r14
  6425. pushq %r15
  6426. subq $0x50, %rsp
  6427. movq %rdi, (%rsp)
  6428. movq %rsi, 8(%rsp)
  6429. movq %rdx, 16(%rsp)
  6430. movq %rcx, 24(%rsp)
  6431. movq %r8, 32(%rsp)
  6432. movq %r9, 40(%rsp)
  6433. movq (%rsp), %rdi
  6434. movq 40(%rsp), %rsi
  6435. movq 32(%rsp), %rbx
  6436. # Add
  6437. movq (%rsi), %r8
  6438. movq 8(%rsi), %r9
  6439. addq (%rbx), %r8
  6440. movq 16(%rsi), %r10
  6441. adcq 8(%rbx), %r9
  6442. movq 24(%rsi), %rcx
  6443. adcq 16(%rbx), %r10
  6444. movq $-19, %rax
  6445. adcq 24(%rbx), %rcx
  6446. movq $0x7fffffffffffffff, %rdx
  6447. movq %rcx, %r11
  6448. sarq $63, %rcx
  6449. # Mask the modulus
  6450. andq %rcx, %rax
  6451. andq %rcx, %rdx
  6452. # Sub modulus (if overflow)
  6453. subq %rax, %r8
  6454. sbbq %rcx, %r9
  6455. sbbq %rcx, %r10
  6456. sbbq %rdx, %r11
  6457. movq %r8, (%rdi)
  6458. movq %r9, 8(%rdi)
  6459. movq %r10, 16(%rdi)
  6460. movq %r11, 24(%rdi)
  6461. movq 8(%rsp), %rdi
  6462. movq 40(%rsp), %rsi
  6463. movq 32(%rsp), %rbx
  6464. # Sub
  6465. movq (%rsi), %r8
  6466. movq 8(%rsi), %r9
  6467. movq 16(%rsi), %r10
  6468. movq 24(%rsi), %r11
  6469. subq (%rbx), %r8
  6470. movq $0x00, %rcx
  6471. sbbq 8(%rbx), %r9
  6472. movq $-19, %rax
  6473. sbbq 16(%rbx), %r10
  6474. movq $0x7fffffffffffffff, %rdx
  6475. sbbq 24(%rbx), %r11
  6476. sbbq $0x00, %rcx
  6477. # Mask the modulus
  6478. andq %rcx, %rax
  6479. andq %rcx, %rdx
  6480. # Add modulus (if underflow)
  6481. addq %rax, %r8
  6482. adcq %rcx, %r9
  6483. adcq %rcx, %r10
  6484. adcq %rdx, %r11
  6485. movq %r8, (%rdi)
  6486. movq %r9, 8(%rdi)
  6487. movq %r10, 16(%rdi)
  6488. movq %r11, 24(%rdi)
  6489. movq 16(%rsp), %rdi
  6490. movq (%rsp), %rsi
  6491. movq 152(%rsp), %rbx
  6492. # Multiply
  6493. # A[0] * B[0]
  6494. movq (%rbx), %rax
  6495. mulq (%rsi)
  6496. movq %rax, %r8
  6497. movq %rdx, %r9
  6498. # A[0] * B[1]
  6499. movq 8(%rbx), %rax
  6500. mulq (%rsi)
  6501. xorq %r10, %r10
  6502. addq %rax, %r9
  6503. adcq %rdx, %r10
  6504. # A[1] * B[0]
  6505. movq (%rbx), %rax
  6506. mulq 8(%rsi)
  6507. xorq %r11, %r11
  6508. addq %rax, %r9
  6509. adcq %rdx, %r10
  6510. adcq $0x00, %r11
  6511. # A[0] * B[2]
  6512. movq 16(%rbx), %rax
  6513. mulq (%rsi)
  6514. addq %rax, %r10
  6515. adcq %rdx, %r11
  6516. # A[1] * B[1]
  6517. movq 8(%rbx), %rax
  6518. mulq 8(%rsi)
  6519. xorq %r12, %r12
  6520. addq %rax, %r10
  6521. adcq %rdx, %r11
  6522. adcq $0x00, %r12
  6523. # A[2] * B[0]
  6524. movq (%rbx), %rax
  6525. mulq 16(%rsi)
  6526. addq %rax, %r10
  6527. adcq %rdx, %r11
  6528. adcq $0x00, %r12
  6529. # A[0] * B[3]
  6530. movq 24(%rbx), %rax
  6531. mulq (%rsi)
  6532. xorq %r13, %r13
  6533. addq %rax, %r11
  6534. adcq %rdx, %r12
  6535. adcq $0x00, %r13
  6536. # A[1] * B[2]
  6537. movq 16(%rbx), %rax
  6538. mulq 8(%rsi)
  6539. addq %rax, %r11
  6540. adcq %rdx, %r12
  6541. adcq $0x00, %r13
  6542. # A[2] * B[1]
  6543. movq 8(%rbx), %rax
  6544. mulq 16(%rsi)
  6545. addq %rax, %r11
  6546. adcq %rdx, %r12
  6547. adcq $0x00, %r13
  6548. # A[3] * B[0]
  6549. movq (%rbx), %rax
  6550. mulq 24(%rsi)
  6551. addq %rax, %r11
  6552. adcq %rdx, %r12
  6553. adcq $0x00, %r13
  6554. # A[1] * B[3]
  6555. movq 24(%rbx), %rax
  6556. mulq 8(%rsi)
  6557. xorq %r14, %r14
  6558. addq %rax, %r12
  6559. adcq %rdx, %r13
  6560. adcq $0x00, %r14
  6561. # A[2] * B[2]
  6562. movq 16(%rbx), %rax
  6563. mulq 16(%rsi)
  6564. addq %rax, %r12
  6565. adcq %rdx, %r13
  6566. adcq $0x00, %r14
  6567. # A[3] * B[1]
  6568. movq 8(%rbx), %rax
  6569. mulq 24(%rsi)
  6570. addq %rax, %r12
  6571. adcq %rdx, %r13
  6572. adcq $0x00, %r14
  6573. # A[2] * B[3]
  6574. movq 24(%rbx), %rax
  6575. mulq 16(%rsi)
  6576. xorq %r15, %r15
  6577. addq %rax, %r13
  6578. adcq %rdx, %r14
  6579. adcq $0x00, %r15
  6580. # A[3] * B[2]
  6581. movq 16(%rbx), %rax
  6582. mulq 24(%rsi)
  6583. addq %rax, %r13
  6584. adcq %rdx, %r14
  6585. adcq $0x00, %r15
  6586. # A[3] * B[3]
  6587. movq 24(%rbx), %rax
  6588. mulq 24(%rsi)
  6589. addq %rax, %r14
  6590. adcq %rdx, %r15
  6591. # Reduce
  6592. movq $0x7fffffffffffffff, %rcx
  6593. # Move top half into t4-t7 and remove top bit from t3
  6594. shldq $0x01, %r14, %r15
  6595. shldq $0x01, %r13, %r14
  6596. shldq $0x01, %r12, %r13
  6597. shldq $0x01, %r11, %r12
  6598. andq %rcx, %r11
  6599. # Multiply top half by 19
  6600. movq $19, %rax
  6601. mulq %r12
  6602. xorq %r12, %r12
  6603. addq %rax, %r8
  6604. movq $19, %rax
  6605. adcq %rdx, %r12
  6606. mulq %r13
  6607. xorq %r13, %r13
  6608. addq %rax, %r9
  6609. movq $19, %rax
  6610. adcq %rdx, %r13
  6611. mulq %r14
  6612. xorq %r14, %r14
  6613. addq %rax, %r10
  6614. movq $19, %rax
  6615. adcq %rdx, %r14
  6616. mulq %r15
  6617. # Add remaining product results in
  6618. addq %r12, %r9
  6619. adcq %r13, %r10
  6620. adcq %r14, %r11
  6621. adcq %rax, %r11
  6622. adcq $0x00, %rdx
  6623. # Overflow
  6624. shldq $0x01, %r11, %rdx
  6625. imulq $19, %rdx, %rax
  6626. andq %rcx, %r11
  6627. addq %rax, %r8
  6628. adcq $0x00, %r9
  6629. adcq $0x00, %r10
  6630. adcq $0x00, %r11
  6631. # Reduce if top bit set
  6632. movq %r11, %rdx
  6633. shrq $63, %rdx
  6634. imulq $19, %rdx, %rax
  6635. andq %rcx, %r11
  6636. addq %rax, %r8
  6637. adcq $0x00, %r9
  6638. adcq $0x00, %r10
  6639. adcq $0x00, %r11
  6640. # Store
  6641. movq %r8, (%rdi)
  6642. movq %r9, 8(%rdi)
  6643. movq %r10, 16(%rdi)
  6644. movq %r11, 24(%rdi)
  6645. movq 8(%rsp), %rdi
  6646. movq 8(%rsp), %rsi
  6647. movq 160(%rsp), %rbx
  6648. # Multiply
  6649. # A[0] * B[0]
  6650. movq (%rbx), %rax
  6651. mulq (%rsi)
  6652. movq %rax, %r8
  6653. movq %rdx, %r9
  6654. # A[0] * B[1]
  6655. movq 8(%rbx), %rax
  6656. mulq (%rsi)
  6657. xorq %r10, %r10
  6658. addq %rax, %r9
  6659. adcq %rdx, %r10
  6660. # A[1] * B[0]
  6661. movq (%rbx), %rax
  6662. mulq 8(%rsi)
  6663. xorq %r11, %r11
  6664. addq %rax, %r9
  6665. adcq %rdx, %r10
  6666. adcq $0x00, %r11
  6667. # A[0] * B[2]
  6668. movq 16(%rbx), %rax
  6669. mulq (%rsi)
  6670. addq %rax, %r10
  6671. adcq %rdx, %r11
  6672. # A[1] * B[1]
  6673. movq 8(%rbx), %rax
  6674. mulq 8(%rsi)
  6675. xorq %r12, %r12
  6676. addq %rax, %r10
  6677. adcq %rdx, %r11
  6678. adcq $0x00, %r12
  6679. # A[2] * B[0]
  6680. movq (%rbx), %rax
  6681. mulq 16(%rsi)
  6682. addq %rax, %r10
  6683. adcq %rdx, %r11
  6684. adcq $0x00, %r12
  6685. # A[0] * B[3]
  6686. movq 24(%rbx), %rax
  6687. mulq (%rsi)
  6688. xorq %r13, %r13
  6689. addq %rax, %r11
  6690. adcq %rdx, %r12
  6691. adcq $0x00, %r13
  6692. # A[1] * B[2]
  6693. movq 16(%rbx), %rax
  6694. mulq 8(%rsi)
  6695. addq %rax, %r11
  6696. adcq %rdx, %r12
  6697. adcq $0x00, %r13
  6698. # A[2] * B[1]
  6699. movq 8(%rbx), %rax
  6700. mulq 16(%rsi)
  6701. addq %rax, %r11
  6702. adcq %rdx, %r12
  6703. adcq $0x00, %r13
  6704. # A[3] * B[0]
  6705. movq (%rbx), %rax
  6706. mulq 24(%rsi)
  6707. addq %rax, %r11
  6708. adcq %rdx, %r12
  6709. adcq $0x00, %r13
  6710. # A[1] * B[3]
  6711. movq 24(%rbx), %rax
  6712. mulq 8(%rsi)
  6713. xorq %r14, %r14
  6714. addq %rax, %r12
  6715. adcq %rdx, %r13
  6716. adcq $0x00, %r14
  6717. # A[2] * B[2]
  6718. movq 16(%rbx), %rax
  6719. mulq 16(%rsi)
  6720. addq %rax, %r12
  6721. adcq %rdx, %r13
  6722. adcq $0x00, %r14
  6723. # A[3] * B[1]
  6724. movq 8(%rbx), %rax
  6725. mulq 24(%rsi)
  6726. addq %rax, %r12
  6727. adcq %rdx, %r13
  6728. adcq $0x00, %r14
  6729. # A[2] * B[3]
  6730. movq 24(%rbx), %rax
  6731. mulq 16(%rsi)
  6732. xorq %r15, %r15
  6733. addq %rax, %r13
  6734. adcq %rdx, %r14
  6735. adcq $0x00, %r15
  6736. # A[3] * B[2]
  6737. movq 16(%rbx), %rax
  6738. mulq 24(%rsi)
  6739. addq %rax, %r13
  6740. adcq %rdx, %r14
  6741. adcq $0x00, %r15
  6742. # A[3] * B[3]
  6743. movq 24(%rbx), %rax
  6744. mulq 24(%rsi)
  6745. addq %rax, %r14
  6746. adcq %rdx, %r15
  6747. # Reduce
  6748. movq $0x7fffffffffffffff, %rcx
  6749. # Move top half into t4-t7 and remove top bit from t3
  6750. shldq $0x01, %r14, %r15
  6751. shldq $0x01, %r13, %r14
  6752. shldq $0x01, %r12, %r13
  6753. shldq $0x01, %r11, %r12
  6754. andq %rcx, %r11
  6755. # Multiply top half by 19
  6756. movq $19, %rax
  6757. mulq %r12
  6758. xorq %r12, %r12
  6759. addq %rax, %r8
  6760. movq $19, %rax
  6761. adcq %rdx, %r12
  6762. mulq %r13
  6763. xorq %r13, %r13
  6764. addq %rax, %r9
  6765. movq $19, %rax
  6766. adcq %rdx, %r13
  6767. mulq %r14
  6768. xorq %r14, %r14
  6769. addq %rax, %r10
  6770. movq $19, %rax
  6771. adcq %rdx, %r14
  6772. mulq %r15
  6773. # Add remaining product results in
  6774. addq %r12, %r9
  6775. adcq %r13, %r10
  6776. adcq %r14, %r11
  6777. adcq %rax, %r11
  6778. adcq $0x00, %rdx
  6779. # Overflow
  6780. shldq $0x01, %r11, %rdx
  6781. imulq $19, %rdx, %rax
  6782. andq %rcx, %r11
  6783. addq %rax, %r8
  6784. adcq $0x00, %r9
  6785. adcq $0x00, %r10
  6786. adcq $0x00, %r11
  6787. # Reduce if top bit set
  6788. movq %r11, %rdx
  6789. shrq $63, %rdx
  6790. imulq $19, %rdx, %rax
  6791. andq %rcx, %r11
  6792. addq %rax, %r8
  6793. adcq $0x00, %r9
  6794. adcq $0x00, %r10
  6795. adcq $0x00, %r11
  6796. # Store
  6797. movq %r8, (%rdi)
  6798. movq %r9, 8(%rdi)
  6799. movq %r10, 16(%rdi)
  6800. movq %r11, 24(%rdi)
  6801. movq 24(%rsp), %rdi
  6802. movq 144(%rsp), %rsi
  6803. movq 136(%rsp), %rbx
  6804. # Multiply
  6805. # A[0] * B[0]
  6806. movq (%rbx), %rax
  6807. mulq (%rsi)
  6808. movq %rax, %r8
  6809. movq %rdx, %r9
  6810. # A[0] * B[1]
  6811. movq 8(%rbx), %rax
  6812. mulq (%rsi)
  6813. xorq %r10, %r10
  6814. addq %rax, %r9
  6815. adcq %rdx, %r10
  6816. # A[1] * B[0]
  6817. movq (%rbx), %rax
  6818. mulq 8(%rsi)
  6819. xorq %r11, %r11
  6820. addq %rax, %r9
  6821. adcq %rdx, %r10
  6822. adcq $0x00, %r11
  6823. # A[0] * B[2]
  6824. movq 16(%rbx), %rax
  6825. mulq (%rsi)
  6826. addq %rax, %r10
  6827. adcq %rdx, %r11
  6828. # A[1] * B[1]
  6829. movq 8(%rbx), %rax
  6830. mulq 8(%rsi)
  6831. xorq %r12, %r12
  6832. addq %rax, %r10
  6833. adcq %rdx, %r11
  6834. adcq $0x00, %r12
  6835. # A[2] * B[0]
  6836. movq (%rbx), %rax
  6837. mulq 16(%rsi)
  6838. addq %rax, %r10
  6839. adcq %rdx, %r11
  6840. adcq $0x00, %r12
  6841. # A[0] * B[3]
  6842. movq 24(%rbx), %rax
  6843. mulq (%rsi)
  6844. xorq %r13, %r13
  6845. addq %rax, %r11
  6846. adcq %rdx, %r12
  6847. adcq $0x00, %r13
  6848. # A[1] * B[2]
  6849. movq 16(%rbx), %rax
  6850. mulq 8(%rsi)
  6851. addq %rax, %r11
  6852. adcq %rdx, %r12
  6853. adcq $0x00, %r13
  6854. # A[2] * B[1]
  6855. movq 8(%rbx), %rax
  6856. mulq 16(%rsi)
  6857. addq %rax, %r11
  6858. adcq %rdx, %r12
  6859. adcq $0x00, %r13
  6860. # A[3] * B[0]
  6861. movq (%rbx), %rax
  6862. mulq 24(%rsi)
  6863. addq %rax, %r11
  6864. adcq %rdx, %r12
  6865. adcq $0x00, %r13
  6866. # A[1] * B[3]
  6867. movq 24(%rbx), %rax
  6868. mulq 8(%rsi)
  6869. xorq %r14, %r14
  6870. addq %rax, %r12
  6871. adcq %rdx, %r13
  6872. adcq $0x00, %r14
  6873. # A[2] * B[2]
  6874. movq 16(%rbx), %rax
  6875. mulq 16(%rsi)
  6876. addq %rax, %r12
  6877. adcq %rdx, %r13
  6878. adcq $0x00, %r14
  6879. # A[3] * B[1]
  6880. movq 8(%rbx), %rax
  6881. mulq 24(%rsi)
  6882. addq %rax, %r12
  6883. adcq %rdx, %r13
  6884. adcq $0x00, %r14
  6885. # A[2] * B[3]
  6886. movq 24(%rbx), %rax
  6887. mulq 16(%rsi)
  6888. xorq %r15, %r15
  6889. addq %rax, %r13
  6890. adcq %rdx, %r14
  6891. adcq $0x00, %r15
  6892. # A[3] * B[2]
  6893. movq 16(%rbx), %rax
  6894. mulq 24(%rsi)
  6895. addq %rax, %r13
  6896. adcq %rdx, %r14
  6897. adcq $0x00, %r15
  6898. # A[3] * B[3]
  6899. movq 24(%rbx), %rax
  6900. mulq 24(%rsi)
  6901. addq %rax, %r14
  6902. adcq %rdx, %r15
  6903. # Reduce
  6904. movq $0x7fffffffffffffff, %rcx
  6905. # Move top half into t4-t7 and remove top bit from t3
  6906. shldq $0x01, %r14, %r15
  6907. shldq $0x01, %r13, %r14
  6908. shldq $0x01, %r12, %r13
  6909. shldq $0x01, %r11, %r12
  6910. andq %rcx, %r11
  6911. # Multiply top half by 19
  6912. movq $19, %rax
  6913. mulq %r12
  6914. xorq %r12, %r12
  6915. addq %rax, %r8
  6916. movq $19, %rax
  6917. adcq %rdx, %r12
  6918. mulq %r13
  6919. xorq %r13, %r13
  6920. addq %rax, %r9
  6921. movq $19, %rax
  6922. adcq %rdx, %r13
  6923. mulq %r14
  6924. xorq %r14, %r14
  6925. addq %rax, %r10
  6926. movq $19, %rax
  6927. adcq %rdx, %r14
  6928. mulq %r15
  6929. # Add remaining product results in
  6930. addq %r12, %r9
  6931. adcq %r13, %r10
  6932. adcq %r14, %r11
  6933. adcq %rax, %r11
  6934. adcq $0x00, %rdx
  6935. # Overflow
  6936. shldq $0x01, %r11, %rdx
  6937. imulq $19, %rdx, %rax
  6938. andq %rcx, %r11
  6939. addq %rax, %r8
  6940. adcq $0x00, %r9
  6941. adcq $0x00, %r10
  6942. adcq $0x00, %r11
  6943. # Reduce if top bit set
  6944. movq %r11, %rdx
  6945. shrq $63, %rdx
  6946. imulq $19, %rdx, %rax
  6947. andq %rcx, %r11
  6948. addq %rax, %r8
  6949. adcq $0x00, %r9
  6950. adcq $0x00, %r10
  6951. adcq $0x00, %r11
  6952. # Store
  6953. movq %r8, (%rdi)
  6954. movq %r9, 8(%rdi)
  6955. movq %r10, 16(%rdi)
  6956. movq %r11, 24(%rdi)
  6957. leaq 48(%rsp), %rdi
  6958. movq 128(%rsp), %rsi
  6959. movq 128(%rsp), %rbx
  6960. # Add
  6961. movq (%rsi), %r8
  6962. movq 8(%rsi), %r9
  6963. addq (%rbx), %r8
  6964. movq 16(%rsi), %r10
  6965. adcq 8(%rbx), %r9
  6966. movq 24(%rsi), %rcx
  6967. adcq 16(%rbx), %r10
  6968. movq $-19, %rax
  6969. adcq 24(%rbx), %rcx
  6970. movq $0x7fffffffffffffff, %rdx
  6971. movq %rcx, %r11
  6972. sarq $63, %rcx
  6973. # Mask the modulus
  6974. andq %rcx, %rax
  6975. andq %rcx, %rdx
  6976. # Sub modulus (if overflow)
  6977. subq %rax, %r8
  6978. sbbq %rcx, %r9
  6979. sbbq %rcx, %r10
  6980. sbbq %rdx, %r11
  6981. movq %r8, (%rdi)
  6982. movq %r9, 8(%rdi)
  6983. movq %r10, 16(%rdi)
  6984. movq %r11, 24(%rdi)
  6985. movq (%rsp), %rdi
  6986. movq 16(%rsp), %rsi
  6987. movq 8(%rsp), %rbx
  6988. # Sub
  6989. movq (%rsi), %r8
  6990. movq 8(%rsi), %r9
  6991. movq 16(%rsi), %r10
  6992. movq 24(%rsi), %r11
  6993. subq (%rbx), %r8
  6994. movq $0x00, %rcx
  6995. sbbq 8(%rbx), %r9
  6996. movq $-19, %rax
  6997. sbbq 16(%rbx), %r10
  6998. movq $0x7fffffffffffffff, %rdx
  6999. sbbq 24(%rbx), %r11
  7000. sbbq $0x00, %rcx
  7001. # Mask the modulus
  7002. andq %rcx, %rax
  7003. andq %rcx, %rdx
  7004. # Add modulus (if underflow)
  7005. addq %rax, %r8
  7006. adcq %rcx, %r9
  7007. adcq %rcx, %r10
  7008. adcq %rdx, %r11
  7009. movq %r8, (%rdi)
  7010. movq %r9, 8(%rdi)
  7011. movq %r10, 16(%rdi)
  7012. movq %r11, 24(%rdi)
  7013. movq 8(%rsp), %rdi
  7014. movq 16(%rsp), %rsi
  7015. movq 8(%rsp), %rbx
  7016. # Add
  7017. movq (%rsi), %r8
  7018. movq 8(%rsi), %r9
  7019. addq (%rbx), %r8
  7020. movq 16(%rsi), %r10
  7021. adcq 8(%rbx), %r9
  7022. movq 24(%rsi), %rcx
  7023. adcq 16(%rbx), %r10
  7024. movq $-19, %rax
  7025. adcq 24(%rbx), %rcx
  7026. movq $0x7fffffffffffffff, %rdx
  7027. movq %rcx, %r11
  7028. sarq $63, %rcx
  7029. # Mask the modulus
  7030. andq %rcx, %rax
  7031. andq %rcx, %rdx
  7032. # Sub modulus (if overflow)
  7033. subq %rax, %r8
  7034. sbbq %rcx, %r9
  7035. sbbq %rcx, %r10
  7036. sbbq %rdx, %r11
  7037. movq %r8, (%rdi)
  7038. movq %r9, 8(%rdi)
  7039. movq %r10, 16(%rdi)
  7040. movq %r11, 24(%rdi)
  7041. movq 16(%rsp), %rdi
  7042. leaq 48(%rsp), %rsi
  7043. movq 24(%rsp), %rbx
  7044. # Add
  7045. movq (%rsi), %r8
  7046. movq 8(%rsi), %r9
  7047. addq (%rbx), %r8
  7048. movq 16(%rsi), %r10
  7049. adcq 8(%rbx), %r9
  7050. movq 24(%rsi), %rcx
  7051. adcq 16(%rbx), %r10
  7052. movq $-19, %rax
  7053. adcq 24(%rbx), %rcx
  7054. movq $0x7fffffffffffffff, %rdx
  7055. movq %rcx, %r11
  7056. sarq $63, %rcx
  7057. # Mask the modulus
  7058. andq %rcx, %rax
  7059. andq %rcx, %rdx
  7060. # Sub modulus (if overflow)
  7061. subq %rax, %r8
  7062. sbbq %rcx, %r9
  7063. sbbq %rcx, %r10
  7064. sbbq %rdx, %r11
  7065. movq %r8, (%rdi)
  7066. movq %r9, 8(%rdi)
  7067. movq %r10, 16(%rdi)
  7068. movq %r11, 24(%rdi)
  7069. movq 24(%rsp), %rdi
  7070. leaq 48(%rsp), %rsi
  7071. movq 24(%rsp), %rbx
  7072. # Sub
  7073. movq (%rsi), %r8
  7074. movq 8(%rsi), %r9
  7075. movq 16(%rsi), %r10
  7076. movq 24(%rsi), %r11
  7077. subq (%rbx), %r8
  7078. movq $0x00, %rcx
  7079. sbbq 8(%rbx), %r9
  7080. movq $-19, %rax
  7081. sbbq 16(%rbx), %r10
  7082. movq $0x7fffffffffffffff, %rdx
  7083. sbbq 24(%rbx), %r11
  7084. sbbq $0x00, %rcx
  7085. # Mask the modulus
  7086. andq %rcx, %rax
  7087. andq %rcx, %rdx
  7088. # Add modulus (if underflow)
  7089. addq %rax, %r8
  7090. adcq %rcx, %r9
  7091. adcq %rcx, %r10
  7092. adcq %rdx, %r11
  7093. movq %r8, (%rdi)
  7094. movq %r9, 8(%rdi)
  7095. movq %r10, 16(%rdi)
  7096. movq %r11, 24(%rdi)
  7097. addq $0x50, %rsp
  7098. popq %r15
  7099. popq %r14
  7100. popq %r13
  7101. popq %r12
  7102. popq %rbx
  7103. repz retq
  7104. #ifndef __APPLE__
  7105. .size fe_ge_madd_x64,.-fe_ge_madd_x64
  7106. #endif /* __APPLE__ */
  7107. #ifndef __APPLE__
  7108. .text
  7109. .globl fe_ge_msub_x64
  7110. .type fe_ge_msub_x64,@function
  7111. .align 16
  7112. fe_ge_msub_x64:
  7113. #else
  7114. .section __TEXT,__text
  7115. .globl _fe_ge_msub_x64
  7116. .p2align 4
  7117. _fe_ge_msub_x64:
  7118. #endif /* __APPLE__ */
  7119. pushq %rbx
  7120. pushq %r12
  7121. pushq %r13
  7122. pushq %r14
  7123. pushq %r15
  7124. subq $0x50, %rsp
  7125. movq %rdi, (%rsp)
  7126. movq %rsi, 8(%rsp)
  7127. movq %rdx, 16(%rsp)
  7128. movq %rcx, 24(%rsp)
  7129. movq %r8, 32(%rsp)
  7130. movq %r9, 40(%rsp)
  7131. movq (%rsp), %rdi
  7132. movq 40(%rsp), %rsi
  7133. movq 32(%rsp), %rbx
  7134. # Add
  7135. movq (%rsi), %r8
  7136. movq 8(%rsi), %r9
  7137. addq (%rbx), %r8
  7138. movq 16(%rsi), %r10
  7139. adcq 8(%rbx), %r9
  7140. movq 24(%rsi), %rcx
  7141. adcq 16(%rbx), %r10
  7142. movq $-19, %rax
  7143. adcq 24(%rbx), %rcx
  7144. movq $0x7fffffffffffffff, %rdx
  7145. movq %rcx, %r11
  7146. sarq $63, %rcx
  7147. # Mask the modulus
  7148. andq %rcx, %rax
  7149. andq %rcx, %rdx
  7150. # Sub modulus (if overflow)
  7151. subq %rax, %r8
  7152. sbbq %rcx, %r9
  7153. sbbq %rcx, %r10
  7154. sbbq %rdx, %r11
  7155. movq %r8, (%rdi)
  7156. movq %r9, 8(%rdi)
  7157. movq %r10, 16(%rdi)
  7158. movq %r11, 24(%rdi)
  7159. movq 8(%rsp), %rdi
  7160. movq 40(%rsp), %rsi
  7161. movq 32(%rsp), %rbx
  7162. # Sub
  7163. movq (%rsi), %r8
  7164. movq 8(%rsi), %r9
  7165. movq 16(%rsi), %r10
  7166. movq 24(%rsi), %r11
  7167. subq (%rbx), %r8
  7168. movq $0x00, %rcx
  7169. sbbq 8(%rbx), %r9
  7170. movq $-19, %rax
  7171. sbbq 16(%rbx), %r10
  7172. movq $0x7fffffffffffffff, %rdx
  7173. sbbq 24(%rbx), %r11
  7174. sbbq $0x00, %rcx
  7175. # Mask the modulus
  7176. andq %rcx, %rax
  7177. andq %rcx, %rdx
  7178. # Add modulus (if underflow)
  7179. addq %rax, %r8
  7180. adcq %rcx, %r9
  7181. adcq %rcx, %r10
  7182. adcq %rdx, %r11
  7183. movq %r8, (%rdi)
  7184. movq %r9, 8(%rdi)
  7185. movq %r10, 16(%rdi)
  7186. movq %r11, 24(%rdi)
  7187. movq 16(%rsp), %rdi
  7188. movq (%rsp), %rsi
  7189. movq 160(%rsp), %rbx
  7190. # Multiply
  7191. # A[0] * B[0]
  7192. movq (%rbx), %rax
  7193. mulq (%rsi)
  7194. movq %rax, %r8
  7195. movq %rdx, %r9
  7196. # A[0] * B[1]
  7197. movq 8(%rbx), %rax
  7198. mulq (%rsi)
  7199. xorq %r10, %r10
  7200. addq %rax, %r9
  7201. adcq %rdx, %r10
  7202. # A[1] * B[0]
  7203. movq (%rbx), %rax
  7204. mulq 8(%rsi)
  7205. xorq %r11, %r11
  7206. addq %rax, %r9
  7207. adcq %rdx, %r10
  7208. adcq $0x00, %r11
  7209. # A[0] * B[2]
  7210. movq 16(%rbx), %rax
  7211. mulq (%rsi)
  7212. addq %rax, %r10
  7213. adcq %rdx, %r11
  7214. # A[1] * B[1]
  7215. movq 8(%rbx), %rax
  7216. mulq 8(%rsi)
  7217. xorq %r12, %r12
  7218. addq %rax, %r10
  7219. adcq %rdx, %r11
  7220. adcq $0x00, %r12
  7221. # A[2] * B[0]
  7222. movq (%rbx), %rax
  7223. mulq 16(%rsi)
  7224. addq %rax, %r10
  7225. adcq %rdx, %r11
  7226. adcq $0x00, %r12
  7227. # A[0] * B[3]
  7228. movq 24(%rbx), %rax
  7229. mulq (%rsi)
  7230. xorq %r13, %r13
  7231. addq %rax, %r11
  7232. adcq %rdx, %r12
  7233. adcq $0x00, %r13
  7234. # A[1] * B[2]
  7235. movq 16(%rbx), %rax
  7236. mulq 8(%rsi)
  7237. addq %rax, %r11
  7238. adcq %rdx, %r12
  7239. adcq $0x00, %r13
  7240. # A[2] * B[1]
  7241. movq 8(%rbx), %rax
  7242. mulq 16(%rsi)
  7243. addq %rax, %r11
  7244. adcq %rdx, %r12
  7245. adcq $0x00, %r13
  7246. # A[3] * B[0]
  7247. movq (%rbx), %rax
  7248. mulq 24(%rsi)
  7249. addq %rax, %r11
  7250. adcq %rdx, %r12
  7251. adcq $0x00, %r13
  7252. # A[1] * B[3]
  7253. movq 24(%rbx), %rax
  7254. mulq 8(%rsi)
  7255. xorq %r14, %r14
  7256. addq %rax, %r12
  7257. adcq %rdx, %r13
  7258. adcq $0x00, %r14
  7259. # A[2] * B[2]
  7260. movq 16(%rbx), %rax
  7261. mulq 16(%rsi)
  7262. addq %rax, %r12
  7263. adcq %rdx, %r13
  7264. adcq $0x00, %r14
  7265. # A[3] * B[1]
  7266. movq 8(%rbx), %rax
  7267. mulq 24(%rsi)
  7268. addq %rax, %r12
  7269. adcq %rdx, %r13
  7270. adcq $0x00, %r14
  7271. # A[2] * B[3]
  7272. movq 24(%rbx), %rax
  7273. mulq 16(%rsi)
  7274. xorq %r15, %r15
  7275. addq %rax, %r13
  7276. adcq %rdx, %r14
  7277. adcq $0x00, %r15
  7278. # A[3] * B[2]
  7279. movq 16(%rbx), %rax
  7280. mulq 24(%rsi)
  7281. addq %rax, %r13
  7282. adcq %rdx, %r14
  7283. adcq $0x00, %r15
  7284. # A[3] * B[3]
  7285. movq 24(%rbx), %rax
  7286. mulq 24(%rsi)
  7287. addq %rax, %r14
  7288. adcq %rdx, %r15
  7289. # Reduce
  7290. movq $0x7fffffffffffffff, %rcx
  7291. # Move top half into t4-t7 and remove top bit from t3
  7292. shldq $0x01, %r14, %r15
  7293. shldq $0x01, %r13, %r14
  7294. shldq $0x01, %r12, %r13
  7295. shldq $0x01, %r11, %r12
  7296. andq %rcx, %r11
  7297. # Multiply top half by 19
  7298. movq $19, %rax
  7299. mulq %r12
  7300. xorq %r12, %r12
  7301. addq %rax, %r8
  7302. movq $19, %rax
  7303. adcq %rdx, %r12
  7304. mulq %r13
  7305. xorq %r13, %r13
  7306. addq %rax, %r9
  7307. movq $19, %rax
  7308. adcq %rdx, %r13
  7309. mulq %r14
  7310. xorq %r14, %r14
  7311. addq %rax, %r10
  7312. movq $19, %rax
  7313. adcq %rdx, %r14
  7314. mulq %r15
  7315. # Add remaining product results in
  7316. addq %r12, %r9
  7317. adcq %r13, %r10
  7318. adcq %r14, %r11
  7319. adcq %rax, %r11
  7320. adcq $0x00, %rdx
  7321. # Overflow
  7322. shldq $0x01, %r11, %rdx
  7323. imulq $19, %rdx, %rax
  7324. andq %rcx, %r11
  7325. addq %rax, %r8
  7326. adcq $0x00, %r9
  7327. adcq $0x00, %r10
  7328. adcq $0x00, %r11
  7329. # Reduce if top bit set
  7330. movq %r11, %rdx
  7331. shrq $63, %rdx
  7332. imulq $19, %rdx, %rax
  7333. andq %rcx, %r11
  7334. addq %rax, %r8
  7335. adcq $0x00, %r9
  7336. adcq $0x00, %r10
  7337. adcq $0x00, %r11
  7338. # Store
  7339. movq %r8, (%rdi)
  7340. movq %r9, 8(%rdi)
  7341. movq %r10, 16(%rdi)
  7342. movq %r11, 24(%rdi)
  7343. movq 8(%rsp), %rdi
  7344. movq 8(%rsp), %rsi
  7345. movq 152(%rsp), %rbx
  7346. # Multiply
  7347. # A[0] * B[0]
  7348. movq (%rbx), %rax
  7349. mulq (%rsi)
  7350. movq %rax, %r8
  7351. movq %rdx, %r9
  7352. # A[0] * B[1]
  7353. movq 8(%rbx), %rax
  7354. mulq (%rsi)
  7355. xorq %r10, %r10
  7356. addq %rax, %r9
  7357. adcq %rdx, %r10
  7358. # A[1] * B[0]
  7359. movq (%rbx), %rax
  7360. mulq 8(%rsi)
  7361. xorq %r11, %r11
  7362. addq %rax, %r9
  7363. adcq %rdx, %r10
  7364. adcq $0x00, %r11
  7365. # A[0] * B[2]
  7366. movq 16(%rbx), %rax
  7367. mulq (%rsi)
  7368. addq %rax, %r10
  7369. adcq %rdx, %r11
  7370. # A[1] * B[1]
  7371. movq 8(%rbx), %rax
  7372. mulq 8(%rsi)
  7373. xorq %r12, %r12
  7374. addq %rax, %r10
  7375. adcq %rdx, %r11
  7376. adcq $0x00, %r12
  7377. # A[2] * B[0]
  7378. movq (%rbx), %rax
  7379. mulq 16(%rsi)
  7380. addq %rax, %r10
  7381. adcq %rdx, %r11
  7382. adcq $0x00, %r12
  7383. # A[0] * B[3]
  7384. movq 24(%rbx), %rax
  7385. mulq (%rsi)
  7386. xorq %r13, %r13
  7387. addq %rax, %r11
  7388. adcq %rdx, %r12
  7389. adcq $0x00, %r13
  7390. # A[1] * B[2]
  7391. movq 16(%rbx), %rax
  7392. mulq 8(%rsi)
  7393. addq %rax, %r11
  7394. adcq %rdx, %r12
  7395. adcq $0x00, %r13
  7396. # A[2] * B[1]
  7397. movq 8(%rbx), %rax
  7398. mulq 16(%rsi)
  7399. addq %rax, %r11
  7400. adcq %rdx, %r12
  7401. adcq $0x00, %r13
  7402. # A[3] * B[0]
  7403. movq (%rbx), %rax
  7404. mulq 24(%rsi)
  7405. addq %rax, %r11
  7406. adcq %rdx, %r12
  7407. adcq $0x00, %r13
  7408. # A[1] * B[3]
  7409. movq 24(%rbx), %rax
  7410. mulq 8(%rsi)
  7411. xorq %r14, %r14
  7412. addq %rax, %r12
  7413. adcq %rdx, %r13
  7414. adcq $0x00, %r14
  7415. # A[2] * B[2]
  7416. movq 16(%rbx), %rax
  7417. mulq 16(%rsi)
  7418. addq %rax, %r12
  7419. adcq %rdx, %r13
  7420. adcq $0x00, %r14
  7421. # A[3] * B[1]
  7422. movq 8(%rbx), %rax
  7423. mulq 24(%rsi)
  7424. addq %rax, %r12
  7425. adcq %rdx, %r13
  7426. adcq $0x00, %r14
  7427. # A[2] * B[3]
  7428. movq 24(%rbx), %rax
  7429. mulq 16(%rsi)
  7430. xorq %r15, %r15
  7431. addq %rax, %r13
  7432. adcq %rdx, %r14
  7433. adcq $0x00, %r15
  7434. # A[3] * B[2]
  7435. movq 16(%rbx), %rax
  7436. mulq 24(%rsi)
  7437. addq %rax, %r13
  7438. adcq %rdx, %r14
  7439. adcq $0x00, %r15
  7440. # A[3] * B[3]
  7441. movq 24(%rbx), %rax
  7442. mulq 24(%rsi)
  7443. addq %rax, %r14
  7444. adcq %rdx, %r15
  7445. # Reduce
  7446. movq $0x7fffffffffffffff, %rcx
  7447. # Move top half into t4-t7 and remove top bit from t3
  7448. shldq $0x01, %r14, %r15
  7449. shldq $0x01, %r13, %r14
  7450. shldq $0x01, %r12, %r13
  7451. shldq $0x01, %r11, %r12
  7452. andq %rcx, %r11
  7453. # Multiply top half by 19
  7454. movq $19, %rax
  7455. mulq %r12
  7456. xorq %r12, %r12
  7457. addq %rax, %r8
  7458. movq $19, %rax
  7459. adcq %rdx, %r12
  7460. mulq %r13
  7461. xorq %r13, %r13
  7462. addq %rax, %r9
  7463. movq $19, %rax
  7464. adcq %rdx, %r13
  7465. mulq %r14
  7466. xorq %r14, %r14
  7467. addq %rax, %r10
  7468. movq $19, %rax
  7469. adcq %rdx, %r14
  7470. mulq %r15
  7471. # Add remaining product results in
  7472. addq %r12, %r9
  7473. adcq %r13, %r10
  7474. adcq %r14, %r11
  7475. adcq %rax, %r11
  7476. adcq $0x00, %rdx
  7477. # Overflow
  7478. shldq $0x01, %r11, %rdx
  7479. imulq $19, %rdx, %rax
  7480. andq %rcx, %r11
  7481. addq %rax, %r8
  7482. adcq $0x00, %r9
  7483. adcq $0x00, %r10
  7484. adcq $0x00, %r11
  7485. # Reduce if top bit set
  7486. movq %r11, %rdx
  7487. shrq $63, %rdx
  7488. imulq $19, %rdx, %rax
  7489. andq %rcx, %r11
  7490. addq %rax, %r8
  7491. adcq $0x00, %r9
  7492. adcq $0x00, %r10
  7493. adcq $0x00, %r11
  7494. # Store
  7495. movq %r8, (%rdi)
  7496. movq %r9, 8(%rdi)
  7497. movq %r10, 16(%rdi)
  7498. movq %r11, 24(%rdi)
  7499. movq 24(%rsp), %rdi
  7500. movq 144(%rsp), %rsi
  7501. movq 136(%rsp), %rbx
  7502. # Multiply
  7503. # A[0] * B[0]
  7504. movq (%rbx), %rax
  7505. mulq (%rsi)
  7506. movq %rax, %r8
  7507. movq %rdx, %r9
  7508. # A[0] * B[1]
  7509. movq 8(%rbx), %rax
  7510. mulq (%rsi)
  7511. xorq %r10, %r10
  7512. addq %rax, %r9
  7513. adcq %rdx, %r10
  7514. # A[1] * B[0]
  7515. movq (%rbx), %rax
  7516. mulq 8(%rsi)
  7517. xorq %r11, %r11
  7518. addq %rax, %r9
  7519. adcq %rdx, %r10
  7520. adcq $0x00, %r11
  7521. # A[0] * B[2]
  7522. movq 16(%rbx), %rax
  7523. mulq (%rsi)
  7524. addq %rax, %r10
  7525. adcq %rdx, %r11
  7526. # A[1] * B[1]
  7527. movq 8(%rbx), %rax
  7528. mulq 8(%rsi)
  7529. xorq %r12, %r12
  7530. addq %rax, %r10
  7531. adcq %rdx, %r11
  7532. adcq $0x00, %r12
  7533. # A[2] * B[0]
  7534. movq (%rbx), %rax
  7535. mulq 16(%rsi)
  7536. addq %rax, %r10
  7537. adcq %rdx, %r11
  7538. adcq $0x00, %r12
  7539. # A[0] * B[3]
  7540. movq 24(%rbx), %rax
  7541. mulq (%rsi)
  7542. xorq %r13, %r13
  7543. addq %rax, %r11
  7544. adcq %rdx, %r12
  7545. adcq $0x00, %r13
  7546. # A[1] * B[2]
  7547. movq 16(%rbx), %rax
  7548. mulq 8(%rsi)
  7549. addq %rax, %r11
  7550. adcq %rdx, %r12
  7551. adcq $0x00, %r13
  7552. # A[2] * B[1]
  7553. movq 8(%rbx), %rax
  7554. mulq 16(%rsi)
  7555. addq %rax, %r11
  7556. adcq %rdx, %r12
  7557. adcq $0x00, %r13
  7558. # A[3] * B[0]
  7559. movq (%rbx), %rax
  7560. mulq 24(%rsi)
  7561. addq %rax, %r11
  7562. adcq %rdx, %r12
  7563. adcq $0x00, %r13
  7564. # A[1] * B[3]
  7565. movq 24(%rbx), %rax
  7566. mulq 8(%rsi)
  7567. xorq %r14, %r14
  7568. addq %rax, %r12
  7569. adcq %rdx, %r13
  7570. adcq $0x00, %r14
  7571. # A[2] * B[2]
  7572. movq 16(%rbx), %rax
  7573. mulq 16(%rsi)
  7574. addq %rax, %r12
  7575. adcq %rdx, %r13
  7576. adcq $0x00, %r14
  7577. # A[3] * B[1]
  7578. movq 8(%rbx), %rax
  7579. mulq 24(%rsi)
  7580. addq %rax, %r12
  7581. adcq %rdx, %r13
  7582. adcq $0x00, %r14
  7583. # A[2] * B[3]
  7584. movq 24(%rbx), %rax
  7585. mulq 16(%rsi)
  7586. xorq %r15, %r15
  7587. addq %rax, %r13
  7588. adcq %rdx, %r14
  7589. adcq $0x00, %r15
  7590. # A[3] * B[2]
  7591. movq 16(%rbx), %rax
  7592. mulq 24(%rsi)
  7593. addq %rax, %r13
  7594. adcq %rdx, %r14
  7595. adcq $0x00, %r15
  7596. # A[3] * B[3]
  7597. movq 24(%rbx), %rax
  7598. mulq 24(%rsi)
  7599. addq %rax, %r14
  7600. adcq %rdx, %r15
  7601. # Reduce
  7602. movq $0x7fffffffffffffff, %rcx
  7603. # Move top half into t4-t7 and remove top bit from t3
  7604. shldq $0x01, %r14, %r15
  7605. shldq $0x01, %r13, %r14
  7606. shldq $0x01, %r12, %r13
  7607. shldq $0x01, %r11, %r12
  7608. andq %rcx, %r11
  7609. # Multiply top half by 19
  7610. movq $19, %rax
  7611. mulq %r12
  7612. xorq %r12, %r12
  7613. addq %rax, %r8
  7614. movq $19, %rax
  7615. adcq %rdx, %r12
  7616. mulq %r13
  7617. xorq %r13, %r13
  7618. addq %rax, %r9
  7619. movq $19, %rax
  7620. adcq %rdx, %r13
  7621. mulq %r14
  7622. xorq %r14, %r14
  7623. addq %rax, %r10
  7624. movq $19, %rax
  7625. adcq %rdx, %r14
  7626. mulq %r15
  7627. # Add remaining product results in
  7628. addq %r12, %r9
  7629. adcq %r13, %r10
  7630. adcq %r14, %r11
  7631. adcq %rax, %r11
  7632. adcq $0x00, %rdx
  7633. # Overflow
  7634. shldq $0x01, %r11, %rdx
  7635. imulq $19, %rdx, %rax
  7636. andq %rcx, %r11
  7637. addq %rax, %r8
  7638. adcq $0x00, %r9
  7639. adcq $0x00, %r10
  7640. adcq $0x00, %r11
  7641. # Reduce if top bit set
  7642. movq %r11, %rdx
  7643. shrq $63, %rdx
  7644. imulq $19, %rdx, %rax
  7645. andq %rcx, %r11
  7646. addq %rax, %r8
  7647. adcq $0x00, %r9
  7648. adcq $0x00, %r10
  7649. adcq $0x00, %r11
  7650. # Store
  7651. movq %r8, (%rdi)
  7652. movq %r9, 8(%rdi)
  7653. movq %r10, 16(%rdi)
  7654. movq %r11, 24(%rdi)
  7655. leaq 48(%rsp), %rdi
  7656. movq 128(%rsp), %rsi
  7657. movq 128(%rsp), %rbx
  7658. # Add
  7659. movq (%rsi), %r8
  7660. movq 8(%rsi), %r9
  7661. addq (%rbx), %r8
  7662. movq 16(%rsi), %r10
  7663. adcq 8(%rbx), %r9
  7664. movq 24(%rsi), %rcx
  7665. adcq 16(%rbx), %r10
  7666. movq $-19, %rax
  7667. adcq 24(%rbx), %rcx
  7668. movq $0x7fffffffffffffff, %rdx
  7669. movq %rcx, %r11
  7670. sarq $63, %rcx
  7671. # Mask the modulus
  7672. andq %rcx, %rax
  7673. andq %rcx, %rdx
  7674. # Sub modulus (if overflow)
  7675. subq %rax, %r8
  7676. sbbq %rcx, %r9
  7677. sbbq %rcx, %r10
  7678. sbbq %rdx, %r11
  7679. movq %r8, (%rdi)
  7680. movq %r9, 8(%rdi)
  7681. movq %r10, 16(%rdi)
  7682. movq %r11, 24(%rdi)
  7683. movq (%rsp), %rdi
  7684. movq 16(%rsp), %rsi
  7685. movq 8(%rsp), %rbx
  7686. # Sub
  7687. movq (%rsi), %r8
  7688. movq 8(%rsi), %r9
  7689. movq 16(%rsi), %r10
  7690. movq 24(%rsi), %r11
  7691. subq (%rbx), %r8
  7692. movq $0x00, %rcx
  7693. sbbq 8(%rbx), %r9
  7694. movq $-19, %rax
  7695. sbbq 16(%rbx), %r10
  7696. movq $0x7fffffffffffffff, %rdx
  7697. sbbq 24(%rbx), %r11
  7698. sbbq $0x00, %rcx
  7699. # Mask the modulus
  7700. andq %rcx, %rax
  7701. andq %rcx, %rdx
  7702. # Add modulus (if underflow)
  7703. addq %rax, %r8
  7704. adcq %rcx, %r9
  7705. adcq %rcx, %r10
  7706. adcq %rdx, %r11
  7707. movq %r8, (%rdi)
  7708. movq %r9, 8(%rdi)
  7709. movq %r10, 16(%rdi)
  7710. movq %r11, 24(%rdi)
  7711. movq 8(%rsp), %rdi
  7712. movq 16(%rsp), %rsi
  7713. movq 8(%rsp), %rbx
  7714. # Add
  7715. movq (%rsi), %r8
  7716. movq 8(%rsi), %r9
  7717. addq (%rbx), %r8
  7718. movq 16(%rsi), %r10
  7719. adcq 8(%rbx), %r9
  7720. movq 24(%rsi), %rcx
  7721. adcq 16(%rbx), %r10
  7722. movq $-19, %rax
  7723. adcq 24(%rbx), %rcx
  7724. movq $0x7fffffffffffffff, %rdx
  7725. movq %rcx, %r11
  7726. sarq $63, %rcx
  7727. # Mask the modulus
  7728. andq %rcx, %rax
  7729. andq %rcx, %rdx
  7730. # Sub modulus (if overflow)
  7731. subq %rax, %r8
  7732. sbbq %rcx, %r9
  7733. sbbq %rcx, %r10
  7734. sbbq %rdx, %r11
  7735. movq %r8, (%rdi)
  7736. movq %r9, 8(%rdi)
  7737. movq %r10, 16(%rdi)
  7738. movq %r11, 24(%rdi)
  7739. movq 16(%rsp), %rdi
  7740. leaq 48(%rsp), %rsi
  7741. movq 24(%rsp), %rbx
  7742. # Sub
  7743. movq (%rsi), %r8
  7744. movq 8(%rsi), %r9
  7745. movq 16(%rsi), %r10
  7746. movq 24(%rsi), %r11
  7747. subq (%rbx), %r8
  7748. movq $0x00, %rcx
  7749. sbbq 8(%rbx), %r9
  7750. movq $-19, %rax
  7751. sbbq 16(%rbx), %r10
  7752. movq $0x7fffffffffffffff, %rdx
  7753. sbbq 24(%rbx), %r11
  7754. sbbq $0x00, %rcx
  7755. # Mask the modulus
  7756. andq %rcx, %rax
  7757. andq %rcx, %rdx
  7758. # Add modulus (if underflow)
  7759. addq %rax, %r8
  7760. adcq %rcx, %r9
  7761. adcq %rcx, %r10
  7762. adcq %rdx, %r11
  7763. movq %r8, (%rdi)
  7764. movq %r9, 8(%rdi)
  7765. movq %r10, 16(%rdi)
  7766. movq %r11, 24(%rdi)
  7767. movq 24(%rsp), %rdi
  7768. leaq 48(%rsp), %rsi
  7769. movq 24(%rsp), %rbx
  7770. # Add
  7771. movq (%rsi), %r8
  7772. movq 8(%rsi), %r9
  7773. addq (%rbx), %r8
  7774. movq 16(%rsi), %r10
  7775. adcq 8(%rbx), %r9
  7776. movq 24(%rsi), %rcx
  7777. adcq 16(%rbx), %r10
  7778. movq $-19, %rax
  7779. adcq 24(%rbx), %rcx
  7780. movq $0x7fffffffffffffff, %rdx
  7781. movq %rcx, %r11
  7782. sarq $63, %rcx
  7783. # Mask the modulus
  7784. andq %rcx, %rax
  7785. andq %rcx, %rdx
  7786. # Sub modulus (if overflow)
  7787. subq %rax, %r8
  7788. sbbq %rcx, %r9
  7789. sbbq %rcx, %r10
  7790. sbbq %rdx, %r11
  7791. movq %r8, (%rdi)
  7792. movq %r9, 8(%rdi)
  7793. movq %r10, 16(%rdi)
  7794. movq %r11, 24(%rdi)
  7795. addq $0x50, %rsp
  7796. popq %r15
  7797. popq %r14
  7798. popq %r13
  7799. popq %r12
  7800. popq %rbx
  7801. repz retq
  7802. #ifndef __APPLE__
  7803. .size fe_ge_msub_x64,.-fe_ge_msub_x64
  7804. #endif /* __APPLE__ */
  7805. #ifndef __APPLE__
  7806. .text
  7807. .globl fe_ge_add_x64
  7808. .type fe_ge_add_x64,@function
  7809. .align 16
  7810. fe_ge_add_x64:
  7811. #else
  7812. .section __TEXT,__text
  7813. .globl _fe_ge_add_x64
  7814. .p2align 4
  7815. _fe_ge_add_x64:
  7816. #endif /* __APPLE__ */
  7817. pushq %rbx
  7818. pushq %r12
  7819. pushq %r13
  7820. pushq %r14
  7821. pushq %r15
  7822. subq $0x50, %rsp
  7823. movq %rdi, (%rsp)
  7824. movq %rsi, 8(%rsp)
  7825. movq %rdx, 16(%rsp)
  7826. movq %rcx, 24(%rsp)
  7827. movq %r8, 32(%rsp)
  7828. movq %r9, 40(%rsp)
  7829. movq (%rsp), %rdi
  7830. movq 40(%rsp), %rsi
  7831. movq 32(%rsp), %rbx
  7832. # Add
  7833. movq (%rsi), %r8
  7834. movq 8(%rsi), %r9
  7835. addq (%rbx), %r8
  7836. movq 16(%rsi), %r10
  7837. adcq 8(%rbx), %r9
  7838. movq 24(%rsi), %rcx
  7839. adcq 16(%rbx), %r10
  7840. movq $-19, %rax
  7841. adcq 24(%rbx), %rcx
  7842. movq $0x7fffffffffffffff, %rdx
  7843. movq %rcx, %r11
  7844. sarq $63, %rcx
  7845. # Mask the modulus
  7846. andq %rcx, %rax
  7847. andq %rcx, %rdx
  7848. # Sub modulus (if overflow)
  7849. subq %rax, %r8
  7850. sbbq %rcx, %r9
  7851. sbbq %rcx, %r10
  7852. sbbq %rdx, %r11
  7853. movq %r8, (%rdi)
  7854. movq %r9, 8(%rdi)
  7855. movq %r10, 16(%rdi)
  7856. movq %r11, 24(%rdi)
  7857. movq 8(%rsp), %rdi
  7858. movq 40(%rsp), %rsi
  7859. movq 32(%rsp), %rbx
  7860. # Sub
  7861. movq (%rsi), %r8
  7862. movq 8(%rsi), %r9
  7863. movq 16(%rsi), %r10
  7864. movq 24(%rsi), %r11
  7865. subq (%rbx), %r8
  7866. movq $0x00, %rcx
  7867. sbbq 8(%rbx), %r9
  7868. movq $-19, %rax
  7869. sbbq 16(%rbx), %r10
  7870. movq $0x7fffffffffffffff, %rdx
  7871. sbbq 24(%rbx), %r11
  7872. sbbq $0x00, %rcx
  7873. # Mask the modulus
  7874. andq %rcx, %rax
  7875. andq %rcx, %rdx
  7876. # Add modulus (if underflow)
  7877. addq %rax, %r8
  7878. adcq %rcx, %r9
  7879. adcq %rcx, %r10
  7880. adcq %rdx, %r11
  7881. movq %r8, (%rdi)
  7882. movq %r9, 8(%rdi)
  7883. movq %r10, 16(%rdi)
  7884. movq %r11, 24(%rdi)
  7885. movq 16(%rsp), %rdi
  7886. movq (%rsp), %rsi
  7887. movq 160(%rsp), %rbx
  7888. # Multiply
  7889. # A[0] * B[0]
  7890. movq (%rbx), %rax
  7891. mulq (%rsi)
  7892. movq %rax, %r8
  7893. movq %rdx, %r9
  7894. # A[0] * B[1]
  7895. movq 8(%rbx), %rax
  7896. mulq (%rsi)
  7897. xorq %r10, %r10
  7898. addq %rax, %r9
  7899. adcq %rdx, %r10
  7900. # A[1] * B[0]
  7901. movq (%rbx), %rax
  7902. mulq 8(%rsi)
  7903. xorq %r11, %r11
  7904. addq %rax, %r9
  7905. adcq %rdx, %r10
  7906. adcq $0x00, %r11
  7907. # A[0] * B[2]
  7908. movq 16(%rbx), %rax
  7909. mulq (%rsi)
  7910. addq %rax, %r10
  7911. adcq %rdx, %r11
  7912. # A[1] * B[1]
  7913. movq 8(%rbx), %rax
  7914. mulq 8(%rsi)
  7915. xorq %r12, %r12
  7916. addq %rax, %r10
  7917. adcq %rdx, %r11
  7918. adcq $0x00, %r12
  7919. # A[2] * B[0]
  7920. movq (%rbx), %rax
  7921. mulq 16(%rsi)
  7922. addq %rax, %r10
  7923. adcq %rdx, %r11
  7924. adcq $0x00, %r12
  7925. # A[0] * B[3]
  7926. movq 24(%rbx), %rax
  7927. mulq (%rsi)
  7928. xorq %r13, %r13
  7929. addq %rax, %r11
  7930. adcq %rdx, %r12
  7931. adcq $0x00, %r13
  7932. # A[1] * B[2]
  7933. movq 16(%rbx), %rax
  7934. mulq 8(%rsi)
  7935. addq %rax, %r11
  7936. adcq %rdx, %r12
  7937. adcq $0x00, %r13
  7938. # A[2] * B[1]
  7939. movq 8(%rbx), %rax
  7940. mulq 16(%rsi)
  7941. addq %rax, %r11
  7942. adcq %rdx, %r12
  7943. adcq $0x00, %r13
  7944. # A[3] * B[0]
  7945. movq (%rbx), %rax
  7946. mulq 24(%rsi)
  7947. addq %rax, %r11
  7948. adcq %rdx, %r12
  7949. adcq $0x00, %r13
  7950. # A[1] * B[3]
  7951. movq 24(%rbx), %rax
  7952. mulq 8(%rsi)
  7953. xorq %r14, %r14
  7954. addq %rax, %r12
  7955. adcq %rdx, %r13
  7956. adcq $0x00, %r14
  7957. # A[2] * B[2]
  7958. movq 16(%rbx), %rax
  7959. mulq 16(%rsi)
  7960. addq %rax, %r12
  7961. adcq %rdx, %r13
  7962. adcq $0x00, %r14
  7963. # A[3] * B[1]
  7964. movq 8(%rbx), %rax
  7965. mulq 24(%rsi)
  7966. addq %rax, %r12
  7967. adcq %rdx, %r13
  7968. adcq $0x00, %r14
  7969. # A[2] * B[3]
  7970. movq 24(%rbx), %rax
  7971. mulq 16(%rsi)
  7972. xorq %r15, %r15
  7973. addq %rax, %r13
  7974. adcq %rdx, %r14
  7975. adcq $0x00, %r15
  7976. # A[3] * B[2]
  7977. movq 16(%rbx), %rax
  7978. mulq 24(%rsi)
  7979. addq %rax, %r13
  7980. adcq %rdx, %r14
  7981. adcq $0x00, %r15
  7982. # A[3] * B[3]
  7983. movq 24(%rbx), %rax
  7984. mulq 24(%rsi)
  7985. addq %rax, %r14
  7986. adcq %rdx, %r15
  7987. # Reduce
  7988. movq $0x7fffffffffffffff, %rcx
  7989. # Move top half into t4-t7 and remove top bit from t3
  7990. shldq $0x01, %r14, %r15
  7991. shldq $0x01, %r13, %r14
  7992. shldq $0x01, %r12, %r13
  7993. shldq $0x01, %r11, %r12
  7994. andq %rcx, %r11
  7995. # Multiply top half by 19
  7996. movq $19, %rax
  7997. mulq %r12
  7998. xorq %r12, %r12
  7999. addq %rax, %r8
  8000. movq $19, %rax
  8001. adcq %rdx, %r12
  8002. mulq %r13
  8003. xorq %r13, %r13
  8004. addq %rax, %r9
  8005. movq $19, %rax
  8006. adcq %rdx, %r13
  8007. mulq %r14
  8008. xorq %r14, %r14
  8009. addq %rax, %r10
  8010. movq $19, %rax
  8011. adcq %rdx, %r14
  8012. mulq %r15
  8013. # Add remaining product results in
  8014. addq %r12, %r9
  8015. adcq %r13, %r10
  8016. adcq %r14, %r11
  8017. adcq %rax, %r11
  8018. adcq $0x00, %rdx
  8019. # Overflow
  8020. shldq $0x01, %r11, %rdx
  8021. imulq $19, %rdx, %rax
  8022. andq %rcx, %r11
  8023. addq %rax, %r8
  8024. adcq $0x00, %r9
  8025. adcq $0x00, %r10
  8026. adcq $0x00, %r11
  8027. # Reduce if top bit set
  8028. movq %r11, %rdx
  8029. shrq $63, %rdx
  8030. imulq $19, %rdx, %rax
  8031. andq %rcx, %r11
  8032. addq %rax, %r8
  8033. adcq $0x00, %r9
  8034. adcq $0x00, %r10
  8035. adcq $0x00, %r11
  8036. # Store
  8037. movq %r8, (%rdi)
  8038. movq %r9, 8(%rdi)
  8039. movq %r10, 16(%rdi)
  8040. movq %r11, 24(%rdi)
  8041. movq 8(%rsp), %rdi
  8042. movq 8(%rsp), %rsi
  8043. movq 168(%rsp), %rbx
  8044. # Multiply
  8045. # A[0] * B[0]
  8046. movq (%rbx), %rax
  8047. mulq (%rsi)
  8048. movq %rax, %r8
  8049. movq %rdx, %r9
  8050. # A[0] * B[1]
  8051. movq 8(%rbx), %rax
  8052. mulq (%rsi)
  8053. xorq %r10, %r10
  8054. addq %rax, %r9
  8055. adcq %rdx, %r10
  8056. # A[1] * B[0]
  8057. movq (%rbx), %rax
  8058. mulq 8(%rsi)
  8059. xorq %r11, %r11
  8060. addq %rax, %r9
  8061. adcq %rdx, %r10
  8062. adcq $0x00, %r11
  8063. # A[0] * B[2]
  8064. movq 16(%rbx), %rax
  8065. mulq (%rsi)
  8066. addq %rax, %r10
  8067. adcq %rdx, %r11
  8068. # A[1] * B[1]
  8069. movq 8(%rbx), %rax
  8070. mulq 8(%rsi)
  8071. xorq %r12, %r12
  8072. addq %rax, %r10
  8073. adcq %rdx, %r11
  8074. adcq $0x00, %r12
  8075. # A[2] * B[0]
  8076. movq (%rbx), %rax
  8077. mulq 16(%rsi)
  8078. addq %rax, %r10
  8079. adcq %rdx, %r11
  8080. adcq $0x00, %r12
  8081. # A[0] * B[3]
  8082. movq 24(%rbx), %rax
  8083. mulq (%rsi)
  8084. xorq %r13, %r13
  8085. addq %rax, %r11
  8086. adcq %rdx, %r12
  8087. adcq $0x00, %r13
  8088. # A[1] * B[2]
  8089. movq 16(%rbx), %rax
  8090. mulq 8(%rsi)
  8091. addq %rax, %r11
  8092. adcq %rdx, %r12
  8093. adcq $0x00, %r13
  8094. # A[2] * B[1]
  8095. movq 8(%rbx), %rax
  8096. mulq 16(%rsi)
  8097. addq %rax, %r11
  8098. adcq %rdx, %r12
  8099. adcq $0x00, %r13
  8100. # A[3] * B[0]
  8101. movq (%rbx), %rax
  8102. mulq 24(%rsi)
  8103. addq %rax, %r11
  8104. adcq %rdx, %r12
  8105. adcq $0x00, %r13
  8106. # A[1] * B[3]
  8107. movq 24(%rbx), %rax
  8108. mulq 8(%rsi)
  8109. xorq %r14, %r14
  8110. addq %rax, %r12
  8111. adcq %rdx, %r13
  8112. adcq $0x00, %r14
  8113. # A[2] * B[2]
  8114. movq 16(%rbx), %rax
  8115. mulq 16(%rsi)
  8116. addq %rax, %r12
  8117. adcq %rdx, %r13
  8118. adcq $0x00, %r14
  8119. # A[3] * B[1]
  8120. movq 8(%rbx), %rax
  8121. mulq 24(%rsi)
  8122. addq %rax, %r12
  8123. adcq %rdx, %r13
  8124. adcq $0x00, %r14
  8125. # A[2] * B[3]
  8126. movq 24(%rbx), %rax
  8127. mulq 16(%rsi)
  8128. xorq %r15, %r15
  8129. addq %rax, %r13
  8130. adcq %rdx, %r14
  8131. adcq $0x00, %r15
  8132. # A[3] * B[2]
  8133. movq 16(%rbx), %rax
  8134. mulq 24(%rsi)
  8135. addq %rax, %r13
  8136. adcq %rdx, %r14
  8137. adcq $0x00, %r15
  8138. # A[3] * B[3]
  8139. movq 24(%rbx), %rax
  8140. mulq 24(%rsi)
  8141. addq %rax, %r14
  8142. adcq %rdx, %r15
  8143. # Reduce
  8144. movq $0x7fffffffffffffff, %rcx
  8145. # Move top half into t4-t7 and remove top bit from t3
  8146. shldq $0x01, %r14, %r15
  8147. shldq $0x01, %r13, %r14
  8148. shldq $0x01, %r12, %r13
  8149. shldq $0x01, %r11, %r12
  8150. andq %rcx, %r11
  8151. # Multiply top half by 19
  8152. movq $19, %rax
  8153. mulq %r12
  8154. xorq %r12, %r12
  8155. addq %rax, %r8
  8156. movq $19, %rax
  8157. adcq %rdx, %r12
  8158. mulq %r13
  8159. xorq %r13, %r13
  8160. addq %rax, %r9
  8161. movq $19, %rax
  8162. adcq %rdx, %r13
  8163. mulq %r14
  8164. xorq %r14, %r14
  8165. addq %rax, %r10
  8166. movq $19, %rax
  8167. adcq %rdx, %r14
  8168. mulq %r15
  8169. # Add remaining product results in
  8170. addq %r12, %r9
  8171. adcq %r13, %r10
  8172. adcq %r14, %r11
  8173. adcq %rax, %r11
  8174. adcq $0x00, %rdx
  8175. # Overflow
  8176. shldq $0x01, %r11, %rdx
  8177. imulq $19, %rdx, %rax
  8178. andq %rcx, %r11
  8179. addq %rax, %r8
  8180. adcq $0x00, %r9
  8181. adcq $0x00, %r10
  8182. adcq $0x00, %r11
  8183. # Reduce if top bit set
  8184. movq %r11, %rdx
  8185. shrq $63, %rdx
  8186. imulq $19, %rdx, %rax
  8187. andq %rcx, %r11
  8188. addq %rax, %r8
  8189. adcq $0x00, %r9
  8190. adcq $0x00, %r10
  8191. adcq $0x00, %r11
  8192. # Store
  8193. movq %r8, (%rdi)
  8194. movq %r9, 8(%rdi)
  8195. movq %r10, 16(%rdi)
  8196. movq %r11, 24(%rdi)
  8197. movq 24(%rsp), %rdi
  8198. movq 152(%rsp), %rsi
  8199. movq 136(%rsp), %rbx
  8200. # Multiply
  8201. # A[0] * B[0]
  8202. movq (%rbx), %rax
  8203. mulq (%rsi)
  8204. movq %rax, %r8
  8205. movq %rdx, %r9
  8206. # A[0] * B[1]
  8207. movq 8(%rbx), %rax
  8208. mulq (%rsi)
  8209. xorq %r10, %r10
  8210. addq %rax, %r9
  8211. adcq %rdx, %r10
  8212. # A[1] * B[0]
  8213. movq (%rbx), %rax
  8214. mulq 8(%rsi)
  8215. xorq %r11, %r11
  8216. addq %rax, %r9
  8217. adcq %rdx, %r10
  8218. adcq $0x00, %r11
  8219. # A[0] * B[2]
  8220. movq 16(%rbx), %rax
  8221. mulq (%rsi)
  8222. addq %rax, %r10
  8223. adcq %rdx, %r11
  8224. # A[1] * B[1]
  8225. movq 8(%rbx), %rax
  8226. mulq 8(%rsi)
  8227. xorq %r12, %r12
  8228. addq %rax, %r10
  8229. adcq %rdx, %r11
  8230. adcq $0x00, %r12
  8231. # A[2] * B[0]
  8232. movq (%rbx), %rax
  8233. mulq 16(%rsi)
  8234. addq %rax, %r10
  8235. adcq %rdx, %r11
  8236. adcq $0x00, %r12
  8237. # A[0] * B[3]
  8238. movq 24(%rbx), %rax
  8239. mulq (%rsi)
  8240. xorq %r13, %r13
  8241. addq %rax, %r11
  8242. adcq %rdx, %r12
  8243. adcq $0x00, %r13
  8244. # A[1] * B[2]
  8245. movq 16(%rbx), %rax
  8246. mulq 8(%rsi)
  8247. addq %rax, %r11
  8248. adcq %rdx, %r12
  8249. adcq $0x00, %r13
  8250. # A[2] * B[1]
  8251. movq 8(%rbx), %rax
  8252. mulq 16(%rsi)
  8253. addq %rax, %r11
  8254. adcq %rdx, %r12
  8255. adcq $0x00, %r13
  8256. # A[3] * B[0]
  8257. movq (%rbx), %rax
  8258. mulq 24(%rsi)
  8259. addq %rax, %r11
  8260. adcq %rdx, %r12
  8261. adcq $0x00, %r13
  8262. # A[1] * B[3]
  8263. movq 24(%rbx), %rax
  8264. mulq 8(%rsi)
  8265. xorq %r14, %r14
  8266. addq %rax, %r12
  8267. adcq %rdx, %r13
  8268. adcq $0x00, %r14
  8269. # A[2] * B[2]
  8270. movq 16(%rbx), %rax
  8271. mulq 16(%rsi)
  8272. addq %rax, %r12
  8273. adcq %rdx, %r13
  8274. adcq $0x00, %r14
  8275. # A[3] * B[1]
  8276. movq 8(%rbx), %rax
  8277. mulq 24(%rsi)
  8278. addq %rax, %r12
  8279. adcq %rdx, %r13
  8280. adcq $0x00, %r14
  8281. # A[2] * B[3]
  8282. movq 24(%rbx), %rax
  8283. mulq 16(%rsi)
  8284. xorq %r15, %r15
  8285. addq %rax, %r13
  8286. adcq %rdx, %r14
  8287. adcq $0x00, %r15
  8288. # A[3] * B[2]
  8289. movq 16(%rbx), %rax
  8290. mulq 24(%rsi)
  8291. addq %rax, %r13
  8292. adcq %rdx, %r14
  8293. adcq $0x00, %r15
  8294. # A[3] * B[3]
  8295. movq 24(%rbx), %rax
  8296. mulq 24(%rsi)
  8297. addq %rax, %r14
  8298. adcq %rdx, %r15
  8299. # Reduce
  8300. movq $0x7fffffffffffffff, %rcx
  8301. # Move top half into t4-t7 and remove top bit from t3
  8302. shldq $0x01, %r14, %r15
  8303. shldq $0x01, %r13, %r14
  8304. shldq $0x01, %r12, %r13
  8305. shldq $0x01, %r11, %r12
  8306. andq %rcx, %r11
  8307. # Multiply top half by 19
  8308. movq $19, %rax
  8309. mulq %r12
  8310. xorq %r12, %r12
  8311. addq %rax, %r8
  8312. movq $19, %rax
  8313. adcq %rdx, %r12
  8314. mulq %r13
  8315. xorq %r13, %r13
  8316. addq %rax, %r9
  8317. movq $19, %rax
  8318. adcq %rdx, %r13
  8319. mulq %r14
  8320. xorq %r14, %r14
  8321. addq %rax, %r10
  8322. movq $19, %rax
  8323. adcq %rdx, %r14
  8324. mulq %r15
  8325. # Add remaining product results in
  8326. addq %r12, %r9
  8327. adcq %r13, %r10
  8328. adcq %r14, %r11
  8329. adcq %rax, %r11
  8330. adcq $0x00, %rdx
  8331. # Overflow
  8332. shldq $0x01, %r11, %rdx
  8333. imulq $19, %rdx, %rax
  8334. andq %rcx, %r11
  8335. addq %rax, %r8
  8336. adcq $0x00, %r9
  8337. adcq $0x00, %r10
  8338. adcq $0x00, %r11
  8339. # Reduce if top bit set
  8340. movq %r11, %rdx
  8341. shrq $63, %rdx
  8342. imulq $19, %rdx, %rax
  8343. andq %rcx, %r11
  8344. addq %rax, %r8
  8345. adcq $0x00, %r9
  8346. adcq $0x00, %r10
  8347. adcq $0x00, %r11
  8348. # Store
  8349. movq %r8, (%rdi)
  8350. movq %r9, 8(%rdi)
  8351. movq %r10, 16(%rdi)
  8352. movq %r11, 24(%rdi)
  8353. movq (%rsp), %rdi
  8354. movq 128(%rsp), %rsi
  8355. movq 144(%rsp), %rbx
  8356. # Multiply
  8357. # A[0] * B[0]
  8358. movq (%rbx), %rax
  8359. mulq (%rsi)
  8360. movq %rax, %r8
  8361. movq %rdx, %r9
  8362. # A[0] * B[1]
  8363. movq 8(%rbx), %rax
  8364. mulq (%rsi)
  8365. xorq %r10, %r10
  8366. addq %rax, %r9
  8367. adcq %rdx, %r10
  8368. # A[1] * B[0]
  8369. movq (%rbx), %rax
  8370. mulq 8(%rsi)
  8371. xorq %r11, %r11
  8372. addq %rax, %r9
  8373. adcq %rdx, %r10
  8374. adcq $0x00, %r11
  8375. # A[0] * B[2]
  8376. movq 16(%rbx), %rax
  8377. mulq (%rsi)
  8378. addq %rax, %r10
  8379. adcq %rdx, %r11
  8380. # A[1] * B[1]
  8381. movq 8(%rbx), %rax
  8382. mulq 8(%rsi)
  8383. xorq %r12, %r12
  8384. addq %rax, %r10
  8385. adcq %rdx, %r11
  8386. adcq $0x00, %r12
  8387. # A[2] * B[0]
  8388. movq (%rbx), %rax
  8389. mulq 16(%rsi)
  8390. addq %rax, %r10
  8391. adcq %rdx, %r11
  8392. adcq $0x00, %r12
  8393. # A[0] * B[3]
  8394. movq 24(%rbx), %rax
  8395. mulq (%rsi)
  8396. xorq %r13, %r13
  8397. addq %rax, %r11
  8398. adcq %rdx, %r12
  8399. adcq $0x00, %r13
  8400. # A[1] * B[2]
  8401. movq 16(%rbx), %rax
  8402. mulq 8(%rsi)
  8403. addq %rax, %r11
  8404. adcq %rdx, %r12
  8405. adcq $0x00, %r13
  8406. # A[2] * B[1]
  8407. movq 8(%rbx), %rax
  8408. mulq 16(%rsi)
  8409. addq %rax, %r11
  8410. adcq %rdx, %r12
  8411. adcq $0x00, %r13
  8412. # A[3] * B[0]
  8413. movq (%rbx), %rax
  8414. mulq 24(%rsi)
  8415. addq %rax, %r11
  8416. adcq %rdx, %r12
  8417. adcq $0x00, %r13
  8418. # A[1] * B[3]
  8419. movq 24(%rbx), %rax
  8420. mulq 8(%rsi)
  8421. xorq %r14, %r14
  8422. addq %rax, %r12
  8423. adcq %rdx, %r13
  8424. adcq $0x00, %r14
  8425. # A[2] * B[2]
  8426. movq 16(%rbx), %rax
  8427. mulq 16(%rsi)
  8428. addq %rax, %r12
  8429. adcq %rdx, %r13
  8430. adcq $0x00, %r14
  8431. # A[3] * B[1]
  8432. movq 8(%rbx), %rax
  8433. mulq 24(%rsi)
  8434. addq %rax, %r12
  8435. adcq %rdx, %r13
  8436. adcq $0x00, %r14
  8437. # A[2] * B[3]
  8438. movq 24(%rbx), %rax
  8439. mulq 16(%rsi)
  8440. xorq %r15, %r15
  8441. addq %rax, %r13
  8442. adcq %rdx, %r14
  8443. adcq $0x00, %r15
  8444. # A[3] * B[2]
  8445. movq 16(%rbx), %rax
  8446. mulq 24(%rsi)
  8447. addq %rax, %r13
  8448. adcq %rdx, %r14
  8449. adcq $0x00, %r15
  8450. # A[3] * B[3]
  8451. movq 24(%rbx), %rax
  8452. mulq 24(%rsi)
  8453. addq %rax, %r14
  8454. adcq %rdx, %r15
  8455. # Reduce
  8456. movq $0x7fffffffffffffff, %rcx
  8457. # Move top half into t4-t7 and remove top bit from t3
  8458. shldq $0x01, %r14, %r15
  8459. shldq $0x01, %r13, %r14
  8460. shldq $0x01, %r12, %r13
  8461. shldq $0x01, %r11, %r12
  8462. andq %rcx, %r11
  8463. # Multiply top half by 19
  8464. movq $19, %rax
  8465. mulq %r12
  8466. xorq %r12, %r12
  8467. addq %rax, %r8
  8468. movq $19, %rax
  8469. adcq %rdx, %r12
  8470. mulq %r13
  8471. xorq %r13, %r13
  8472. addq %rax, %r9
  8473. movq $19, %rax
  8474. adcq %rdx, %r13
  8475. mulq %r14
  8476. xorq %r14, %r14
  8477. addq %rax, %r10
  8478. movq $19, %rax
  8479. adcq %rdx, %r14
  8480. mulq %r15
  8481. # Add remaining product results in
  8482. addq %r12, %r9
  8483. adcq %r13, %r10
  8484. adcq %r14, %r11
  8485. adcq %rax, %r11
  8486. adcq $0x00, %rdx
  8487. # Overflow
  8488. shldq $0x01, %r11, %rdx
  8489. imulq $19, %rdx, %rax
  8490. andq %rcx, %r11
  8491. addq %rax, %r8
  8492. adcq $0x00, %r9
  8493. adcq $0x00, %r10
  8494. adcq $0x00, %r11
  8495. # Reduce if top bit set
  8496. movq %r11, %rdx
  8497. shrq $63, %rdx
  8498. imulq $19, %rdx, %rax
  8499. andq %rcx, %r11
  8500. addq %rax, %r8
  8501. adcq $0x00, %r9
  8502. adcq $0x00, %r10
  8503. adcq $0x00, %r11
  8504. # Store
  8505. movq %r8, (%rdi)
  8506. movq %r9, 8(%rdi)
  8507. movq %r10, 16(%rdi)
  8508. movq %r11, 24(%rdi)
  8509. leaq 48(%rsp), %rdi
  8510. movq (%rsp), %rsi
  8511. movq (%rsp), %rbx
  8512. # Add
  8513. movq (%rsi), %r8
  8514. movq 8(%rsi), %r9
  8515. addq (%rbx), %r8
  8516. movq 16(%rsi), %r10
  8517. adcq 8(%rbx), %r9
  8518. movq 24(%rsi), %rcx
  8519. adcq 16(%rbx), %r10
  8520. movq $-19, %rax
  8521. adcq 24(%rbx), %rcx
  8522. movq $0x7fffffffffffffff, %rdx
  8523. movq %rcx, %r11
  8524. sarq $63, %rcx
  8525. # Mask the modulus
  8526. andq %rcx, %rax
  8527. andq %rcx, %rdx
  8528. # Sub modulus (if overflow)
  8529. subq %rax, %r8
  8530. sbbq %rcx, %r9
  8531. sbbq %rcx, %r10
  8532. sbbq %rdx, %r11
  8533. movq %r8, (%rdi)
  8534. movq %r9, 8(%rdi)
  8535. movq %r10, 16(%rdi)
  8536. movq %r11, 24(%rdi)
  8537. movq (%rsp), %rdi
  8538. movq 16(%rsp), %rsi
  8539. movq 8(%rsp), %rbx
  8540. # Sub
  8541. movq (%rsi), %r8
  8542. movq 8(%rsi), %r9
  8543. movq 16(%rsi), %r10
  8544. movq 24(%rsi), %r11
  8545. subq (%rbx), %r8
  8546. movq $0x00, %rcx
  8547. sbbq 8(%rbx), %r9
  8548. movq $-19, %rax
  8549. sbbq 16(%rbx), %r10
  8550. movq $0x7fffffffffffffff, %rdx
  8551. sbbq 24(%rbx), %r11
  8552. sbbq $0x00, %rcx
  8553. # Mask the modulus
  8554. andq %rcx, %rax
  8555. andq %rcx, %rdx
  8556. # Add modulus (if underflow)
  8557. addq %rax, %r8
  8558. adcq %rcx, %r9
  8559. adcq %rcx, %r10
  8560. adcq %rdx, %r11
  8561. movq %r8, (%rdi)
  8562. movq %r9, 8(%rdi)
  8563. movq %r10, 16(%rdi)
  8564. movq %r11, 24(%rdi)
  8565. movq 8(%rsp), %rdi
  8566. movq 16(%rsp), %rsi
  8567. movq 8(%rsp), %rbx
  8568. # Add
  8569. movq (%rsi), %r8
  8570. movq 8(%rsi), %r9
  8571. addq (%rbx), %r8
  8572. movq 16(%rsi), %r10
  8573. adcq 8(%rbx), %r9
  8574. movq 24(%rsi), %rcx
  8575. adcq 16(%rbx), %r10
  8576. movq $-19, %rax
  8577. adcq 24(%rbx), %rcx
  8578. movq $0x7fffffffffffffff, %rdx
  8579. movq %rcx, %r11
  8580. sarq $63, %rcx
  8581. # Mask the modulus
  8582. andq %rcx, %rax
  8583. andq %rcx, %rdx
  8584. # Sub modulus (if overflow)
  8585. subq %rax, %r8
  8586. sbbq %rcx, %r9
  8587. sbbq %rcx, %r10
  8588. sbbq %rdx, %r11
  8589. movq %r8, (%rdi)
  8590. movq %r9, 8(%rdi)
  8591. movq %r10, 16(%rdi)
  8592. movq %r11, 24(%rdi)
  8593. movq 16(%rsp), %rdi
  8594. leaq 48(%rsp), %rsi
  8595. movq 24(%rsp), %rbx
  8596. # Add
  8597. movq (%rsi), %r8
  8598. movq 8(%rsi), %r9
  8599. addq (%rbx), %r8
  8600. movq 16(%rsi), %r10
  8601. adcq 8(%rbx), %r9
  8602. movq 24(%rsi), %rcx
  8603. adcq 16(%rbx), %r10
  8604. movq $-19, %rax
  8605. adcq 24(%rbx), %rcx
  8606. movq $0x7fffffffffffffff, %rdx
  8607. movq %rcx, %r11
  8608. sarq $63, %rcx
  8609. # Mask the modulus
  8610. andq %rcx, %rax
  8611. andq %rcx, %rdx
  8612. # Sub modulus (if overflow)
  8613. subq %rax, %r8
  8614. sbbq %rcx, %r9
  8615. sbbq %rcx, %r10
  8616. sbbq %rdx, %r11
  8617. movq %r8, (%rdi)
  8618. movq %r9, 8(%rdi)
  8619. movq %r10, 16(%rdi)
  8620. movq %r11, 24(%rdi)
  8621. movq 24(%rsp), %rdi
  8622. leaq 48(%rsp), %rsi
  8623. movq 24(%rsp), %rbx
  8624. # Sub
  8625. movq (%rsi), %r8
  8626. movq 8(%rsi), %r9
  8627. movq 16(%rsi), %r10
  8628. movq 24(%rsi), %r11
  8629. subq (%rbx), %r8
  8630. movq $0x00, %rcx
  8631. sbbq 8(%rbx), %r9
  8632. movq $-19, %rax
  8633. sbbq 16(%rbx), %r10
  8634. movq $0x7fffffffffffffff, %rdx
  8635. sbbq 24(%rbx), %r11
  8636. sbbq $0x00, %rcx
  8637. # Mask the modulus
  8638. andq %rcx, %rax
  8639. andq %rcx, %rdx
  8640. # Add modulus (if underflow)
  8641. addq %rax, %r8
  8642. adcq %rcx, %r9
  8643. adcq %rcx, %r10
  8644. adcq %rdx, %r11
  8645. movq %r8, (%rdi)
  8646. movq %r9, 8(%rdi)
  8647. movq %r10, 16(%rdi)
  8648. movq %r11, 24(%rdi)
  8649. addq $0x50, %rsp
  8650. popq %r15
  8651. popq %r14
  8652. popq %r13
  8653. popq %r12
  8654. popq %rbx
  8655. repz retq
  8656. #ifndef __APPLE__
  8657. .size fe_ge_add_x64,.-fe_ge_add_x64
  8658. #endif /* __APPLE__ */
  8659. #ifndef __APPLE__
  8660. .text
  8661. .globl fe_ge_sub_x64
  8662. .type fe_ge_sub_x64,@function
  8663. .align 16
  8664. fe_ge_sub_x64:
  8665. #else
  8666. .section __TEXT,__text
  8667. .globl _fe_ge_sub_x64
  8668. .p2align 4
  8669. _fe_ge_sub_x64:
  8670. #endif /* __APPLE__ */
  8671. pushq %rbx
  8672. pushq %r12
  8673. pushq %r13
  8674. pushq %r14
  8675. pushq %r15
  8676. subq $0x50, %rsp
  8677. movq %rdi, (%rsp)
  8678. movq %rsi, 8(%rsp)
  8679. movq %rdx, 16(%rsp)
  8680. movq %rcx, 24(%rsp)
  8681. movq %r8, 32(%rsp)
  8682. movq %r9, 40(%rsp)
  8683. movq (%rsp), %rdi
  8684. movq 40(%rsp), %rsi
  8685. movq 32(%rsp), %rbx
  8686. # Add
  8687. movq (%rsi), %r8
  8688. movq 8(%rsi), %r9
  8689. addq (%rbx), %r8
  8690. movq 16(%rsi), %r10
  8691. adcq 8(%rbx), %r9
  8692. movq 24(%rsi), %rcx
  8693. adcq 16(%rbx), %r10
  8694. movq $-19, %rax
  8695. adcq 24(%rbx), %rcx
  8696. movq $0x7fffffffffffffff, %rdx
  8697. movq %rcx, %r11
  8698. sarq $63, %rcx
  8699. # Mask the modulus
  8700. andq %rcx, %rax
  8701. andq %rcx, %rdx
  8702. # Sub modulus (if overflow)
  8703. subq %rax, %r8
  8704. sbbq %rcx, %r9
  8705. sbbq %rcx, %r10
  8706. sbbq %rdx, %r11
  8707. movq %r8, (%rdi)
  8708. movq %r9, 8(%rdi)
  8709. movq %r10, 16(%rdi)
  8710. movq %r11, 24(%rdi)
  8711. movq 8(%rsp), %rdi
  8712. movq 40(%rsp), %rsi
  8713. movq 32(%rsp), %rbx
  8714. # Sub
  8715. movq (%rsi), %r8
  8716. movq 8(%rsi), %r9
  8717. movq 16(%rsi), %r10
  8718. movq 24(%rsi), %r11
  8719. subq (%rbx), %r8
  8720. movq $0x00, %rcx
  8721. sbbq 8(%rbx), %r9
  8722. movq $-19, %rax
  8723. sbbq 16(%rbx), %r10
  8724. movq $0x7fffffffffffffff, %rdx
  8725. sbbq 24(%rbx), %r11
  8726. sbbq $0x00, %rcx
  8727. # Mask the modulus
  8728. andq %rcx, %rax
  8729. andq %rcx, %rdx
  8730. # Add modulus (if underflow)
  8731. addq %rax, %r8
  8732. adcq %rcx, %r9
  8733. adcq %rcx, %r10
  8734. adcq %rdx, %r11
  8735. movq %r8, (%rdi)
  8736. movq %r9, 8(%rdi)
  8737. movq %r10, 16(%rdi)
  8738. movq %r11, 24(%rdi)
  8739. movq 16(%rsp), %rdi
  8740. movq (%rsp), %rsi
  8741. movq 168(%rsp), %rbx
  8742. # Multiply
  8743. # A[0] * B[0]
  8744. movq (%rbx), %rax
  8745. mulq (%rsi)
  8746. movq %rax, %r8
  8747. movq %rdx, %r9
  8748. # A[0] * B[1]
  8749. movq 8(%rbx), %rax
  8750. mulq (%rsi)
  8751. xorq %r10, %r10
  8752. addq %rax, %r9
  8753. adcq %rdx, %r10
  8754. # A[1] * B[0]
  8755. movq (%rbx), %rax
  8756. mulq 8(%rsi)
  8757. xorq %r11, %r11
  8758. addq %rax, %r9
  8759. adcq %rdx, %r10
  8760. adcq $0x00, %r11
  8761. # A[0] * B[2]
  8762. movq 16(%rbx), %rax
  8763. mulq (%rsi)
  8764. addq %rax, %r10
  8765. adcq %rdx, %r11
  8766. # A[1] * B[1]
  8767. movq 8(%rbx), %rax
  8768. mulq 8(%rsi)
  8769. xorq %r12, %r12
  8770. addq %rax, %r10
  8771. adcq %rdx, %r11
  8772. adcq $0x00, %r12
  8773. # A[2] * B[0]
  8774. movq (%rbx), %rax
  8775. mulq 16(%rsi)
  8776. addq %rax, %r10
  8777. adcq %rdx, %r11
  8778. adcq $0x00, %r12
  8779. # A[0] * B[3]
  8780. movq 24(%rbx), %rax
  8781. mulq (%rsi)
  8782. xorq %r13, %r13
  8783. addq %rax, %r11
  8784. adcq %rdx, %r12
  8785. adcq $0x00, %r13
  8786. # A[1] * B[2]
  8787. movq 16(%rbx), %rax
  8788. mulq 8(%rsi)
  8789. addq %rax, %r11
  8790. adcq %rdx, %r12
  8791. adcq $0x00, %r13
  8792. # A[2] * B[1]
  8793. movq 8(%rbx), %rax
  8794. mulq 16(%rsi)
  8795. addq %rax, %r11
  8796. adcq %rdx, %r12
  8797. adcq $0x00, %r13
  8798. # A[3] * B[0]
  8799. movq (%rbx), %rax
  8800. mulq 24(%rsi)
  8801. addq %rax, %r11
  8802. adcq %rdx, %r12
  8803. adcq $0x00, %r13
  8804. # A[1] * B[3]
  8805. movq 24(%rbx), %rax
  8806. mulq 8(%rsi)
  8807. xorq %r14, %r14
  8808. addq %rax, %r12
  8809. adcq %rdx, %r13
  8810. adcq $0x00, %r14
  8811. # A[2] * B[2]
  8812. movq 16(%rbx), %rax
  8813. mulq 16(%rsi)
  8814. addq %rax, %r12
  8815. adcq %rdx, %r13
  8816. adcq $0x00, %r14
  8817. # A[3] * B[1]
  8818. movq 8(%rbx), %rax
  8819. mulq 24(%rsi)
  8820. addq %rax, %r12
  8821. adcq %rdx, %r13
  8822. adcq $0x00, %r14
  8823. # A[2] * B[3]
  8824. movq 24(%rbx), %rax
  8825. mulq 16(%rsi)
  8826. xorq %r15, %r15
  8827. addq %rax, %r13
  8828. adcq %rdx, %r14
  8829. adcq $0x00, %r15
  8830. # A[3] * B[2]
  8831. movq 16(%rbx), %rax
  8832. mulq 24(%rsi)
  8833. addq %rax, %r13
  8834. adcq %rdx, %r14
  8835. adcq $0x00, %r15
  8836. # A[3] * B[3]
  8837. movq 24(%rbx), %rax
  8838. mulq 24(%rsi)
  8839. addq %rax, %r14
  8840. adcq %rdx, %r15
  8841. # Reduce
  8842. movq $0x7fffffffffffffff, %rcx
  8843. # Move top half into t4-t7 and remove top bit from t3
  8844. shldq $0x01, %r14, %r15
  8845. shldq $0x01, %r13, %r14
  8846. shldq $0x01, %r12, %r13
  8847. shldq $0x01, %r11, %r12
  8848. andq %rcx, %r11
  8849. # Multiply top half by 19
  8850. movq $19, %rax
  8851. mulq %r12
  8852. xorq %r12, %r12
  8853. addq %rax, %r8
  8854. movq $19, %rax
  8855. adcq %rdx, %r12
  8856. mulq %r13
  8857. xorq %r13, %r13
  8858. addq %rax, %r9
  8859. movq $19, %rax
  8860. adcq %rdx, %r13
  8861. mulq %r14
  8862. xorq %r14, %r14
  8863. addq %rax, %r10
  8864. movq $19, %rax
  8865. adcq %rdx, %r14
  8866. mulq %r15
  8867. # Add remaining product results in
  8868. addq %r12, %r9
  8869. adcq %r13, %r10
  8870. adcq %r14, %r11
  8871. adcq %rax, %r11
  8872. adcq $0x00, %rdx
  8873. # Overflow
  8874. shldq $0x01, %r11, %rdx
  8875. imulq $19, %rdx, %rax
  8876. andq %rcx, %r11
  8877. addq %rax, %r8
  8878. adcq $0x00, %r9
  8879. adcq $0x00, %r10
  8880. adcq $0x00, %r11
  8881. # Reduce if top bit set
  8882. movq %r11, %rdx
  8883. shrq $63, %rdx
  8884. imulq $19, %rdx, %rax
  8885. andq %rcx, %r11
  8886. addq %rax, %r8
  8887. adcq $0x00, %r9
  8888. adcq $0x00, %r10
  8889. adcq $0x00, %r11
  8890. # Store
  8891. movq %r8, (%rdi)
  8892. movq %r9, 8(%rdi)
  8893. movq %r10, 16(%rdi)
  8894. movq %r11, 24(%rdi)
  8895. movq 8(%rsp), %rdi
  8896. movq 8(%rsp), %rsi
  8897. movq 160(%rsp), %rbx
  8898. # Multiply
  8899. # A[0] * B[0]
  8900. movq (%rbx), %rax
  8901. mulq (%rsi)
  8902. movq %rax, %r8
  8903. movq %rdx, %r9
  8904. # A[0] * B[1]
  8905. movq 8(%rbx), %rax
  8906. mulq (%rsi)
  8907. xorq %r10, %r10
  8908. addq %rax, %r9
  8909. adcq %rdx, %r10
  8910. # A[1] * B[0]
  8911. movq (%rbx), %rax
  8912. mulq 8(%rsi)
  8913. xorq %r11, %r11
  8914. addq %rax, %r9
  8915. adcq %rdx, %r10
  8916. adcq $0x00, %r11
  8917. # A[0] * B[2]
  8918. movq 16(%rbx), %rax
  8919. mulq (%rsi)
  8920. addq %rax, %r10
  8921. adcq %rdx, %r11
  8922. # A[1] * B[1]
  8923. movq 8(%rbx), %rax
  8924. mulq 8(%rsi)
  8925. xorq %r12, %r12
  8926. addq %rax, %r10
  8927. adcq %rdx, %r11
  8928. adcq $0x00, %r12
  8929. # A[2] * B[0]
  8930. movq (%rbx), %rax
  8931. mulq 16(%rsi)
  8932. addq %rax, %r10
  8933. adcq %rdx, %r11
  8934. adcq $0x00, %r12
  8935. # A[0] * B[3]
  8936. movq 24(%rbx), %rax
  8937. mulq (%rsi)
  8938. xorq %r13, %r13
  8939. addq %rax, %r11
  8940. adcq %rdx, %r12
  8941. adcq $0x00, %r13
  8942. # A[1] * B[2]
  8943. movq 16(%rbx), %rax
  8944. mulq 8(%rsi)
  8945. addq %rax, %r11
  8946. adcq %rdx, %r12
  8947. adcq $0x00, %r13
  8948. # A[2] * B[1]
  8949. movq 8(%rbx), %rax
  8950. mulq 16(%rsi)
  8951. addq %rax, %r11
  8952. adcq %rdx, %r12
  8953. adcq $0x00, %r13
  8954. # A[3] * B[0]
  8955. movq (%rbx), %rax
  8956. mulq 24(%rsi)
  8957. addq %rax, %r11
  8958. adcq %rdx, %r12
  8959. adcq $0x00, %r13
  8960. # A[1] * B[3]
  8961. movq 24(%rbx), %rax
  8962. mulq 8(%rsi)
  8963. xorq %r14, %r14
  8964. addq %rax, %r12
  8965. adcq %rdx, %r13
  8966. adcq $0x00, %r14
  8967. # A[2] * B[2]
  8968. movq 16(%rbx), %rax
  8969. mulq 16(%rsi)
  8970. addq %rax, %r12
  8971. adcq %rdx, %r13
  8972. adcq $0x00, %r14
  8973. # A[3] * B[1]
  8974. movq 8(%rbx), %rax
  8975. mulq 24(%rsi)
  8976. addq %rax, %r12
  8977. adcq %rdx, %r13
  8978. adcq $0x00, %r14
  8979. # A[2] * B[3]
  8980. movq 24(%rbx), %rax
  8981. mulq 16(%rsi)
  8982. xorq %r15, %r15
  8983. addq %rax, %r13
  8984. adcq %rdx, %r14
  8985. adcq $0x00, %r15
  8986. # A[3] * B[2]
  8987. movq 16(%rbx), %rax
  8988. mulq 24(%rsi)
  8989. addq %rax, %r13
  8990. adcq %rdx, %r14
  8991. adcq $0x00, %r15
  8992. # A[3] * B[3]
  8993. movq 24(%rbx), %rax
  8994. mulq 24(%rsi)
  8995. addq %rax, %r14
  8996. adcq %rdx, %r15
  8997. # Reduce
  8998. movq $0x7fffffffffffffff, %rcx
  8999. # Move top half into t4-t7 and remove top bit from t3
  9000. shldq $0x01, %r14, %r15
  9001. shldq $0x01, %r13, %r14
  9002. shldq $0x01, %r12, %r13
  9003. shldq $0x01, %r11, %r12
  9004. andq %rcx, %r11
  9005. # Multiply top half by 19
  9006. movq $19, %rax
  9007. mulq %r12
  9008. xorq %r12, %r12
  9009. addq %rax, %r8
  9010. movq $19, %rax
  9011. adcq %rdx, %r12
  9012. mulq %r13
  9013. xorq %r13, %r13
  9014. addq %rax, %r9
  9015. movq $19, %rax
  9016. adcq %rdx, %r13
  9017. mulq %r14
  9018. xorq %r14, %r14
  9019. addq %rax, %r10
  9020. movq $19, %rax
  9021. adcq %rdx, %r14
  9022. mulq %r15
  9023. # Add remaining product results in
  9024. addq %r12, %r9
  9025. adcq %r13, %r10
  9026. adcq %r14, %r11
  9027. adcq %rax, %r11
  9028. adcq $0x00, %rdx
  9029. # Overflow
  9030. shldq $0x01, %r11, %rdx
  9031. imulq $19, %rdx, %rax
  9032. andq %rcx, %r11
  9033. addq %rax, %r8
  9034. adcq $0x00, %r9
  9035. adcq $0x00, %r10
  9036. adcq $0x00, %r11
  9037. # Reduce if top bit set
  9038. movq %r11, %rdx
  9039. shrq $63, %rdx
  9040. imulq $19, %rdx, %rax
  9041. andq %rcx, %r11
  9042. addq %rax, %r8
  9043. adcq $0x00, %r9
  9044. adcq $0x00, %r10
  9045. adcq $0x00, %r11
  9046. # Store
  9047. movq %r8, (%rdi)
  9048. movq %r9, 8(%rdi)
  9049. movq %r10, 16(%rdi)
  9050. movq %r11, 24(%rdi)
  9051. movq 24(%rsp), %rdi
  9052. movq 152(%rsp), %rsi
  9053. movq 136(%rsp), %rbx
  9054. # Multiply
  9055. # A[0] * B[0]
  9056. movq (%rbx), %rax
  9057. mulq (%rsi)
  9058. movq %rax, %r8
  9059. movq %rdx, %r9
  9060. # A[0] * B[1]
  9061. movq 8(%rbx), %rax
  9062. mulq (%rsi)
  9063. xorq %r10, %r10
  9064. addq %rax, %r9
  9065. adcq %rdx, %r10
  9066. # A[1] * B[0]
  9067. movq (%rbx), %rax
  9068. mulq 8(%rsi)
  9069. xorq %r11, %r11
  9070. addq %rax, %r9
  9071. adcq %rdx, %r10
  9072. adcq $0x00, %r11
  9073. # A[0] * B[2]
  9074. movq 16(%rbx), %rax
  9075. mulq (%rsi)
  9076. addq %rax, %r10
  9077. adcq %rdx, %r11
  9078. # A[1] * B[1]
  9079. movq 8(%rbx), %rax
  9080. mulq 8(%rsi)
  9081. xorq %r12, %r12
  9082. addq %rax, %r10
  9083. adcq %rdx, %r11
  9084. adcq $0x00, %r12
  9085. # A[2] * B[0]
  9086. movq (%rbx), %rax
  9087. mulq 16(%rsi)
  9088. addq %rax, %r10
  9089. adcq %rdx, %r11
  9090. adcq $0x00, %r12
  9091. # A[0] * B[3]
  9092. movq 24(%rbx), %rax
  9093. mulq (%rsi)
  9094. xorq %r13, %r13
  9095. addq %rax, %r11
  9096. adcq %rdx, %r12
  9097. adcq $0x00, %r13
  9098. # A[1] * B[2]
  9099. movq 16(%rbx), %rax
  9100. mulq 8(%rsi)
  9101. addq %rax, %r11
  9102. adcq %rdx, %r12
  9103. adcq $0x00, %r13
  9104. # A[2] * B[1]
  9105. movq 8(%rbx), %rax
  9106. mulq 16(%rsi)
  9107. addq %rax, %r11
  9108. adcq %rdx, %r12
  9109. adcq $0x00, %r13
  9110. # A[3] * B[0]
  9111. movq (%rbx), %rax
  9112. mulq 24(%rsi)
  9113. addq %rax, %r11
  9114. adcq %rdx, %r12
  9115. adcq $0x00, %r13
  9116. # A[1] * B[3]
  9117. movq 24(%rbx), %rax
  9118. mulq 8(%rsi)
  9119. xorq %r14, %r14
  9120. addq %rax, %r12
  9121. adcq %rdx, %r13
  9122. adcq $0x00, %r14
  9123. # A[2] * B[2]
  9124. movq 16(%rbx), %rax
  9125. mulq 16(%rsi)
  9126. addq %rax, %r12
  9127. adcq %rdx, %r13
  9128. adcq $0x00, %r14
  9129. # A[3] * B[1]
  9130. movq 8(%rbx), %rax
  9131. mulq 24(%rsi)
  9132. addq %rax, %r12
  9133. adcq %rdx, %r13
  9134. adcq $0x00, %r14
  9135. # A[2] * B[3]
  9136. movq 24(%rbx), %rax
  9137. mulq 16(%rsi)
  9138. xorq %r15, %r15
  9139. addq %rax, %r13
  9140. adcq %rdx, %r14
  9141. adcq $0x00, %r15
  9142. # A[3] * B[2]
  9143. movq 16(%rbx), %rax
  9144. mulq 24(%rsi)
  9145. addq %rax, %r13
  9146. adcq %rdx, %r14
  9147. adcq $0x00, %r15
  9148. # A[3] * B[3]
  9149. movq 24(%rbx), %rax
  9150. mulq 24(%rsi)
  9151. addq %rax, %r14
  9152. adcq %rdx, %r15
  9153. # Reduce
  9154. movq $0x7fffffffffffffff, %rcx
  9155. # Move top half into t4-t7 and remove top bit from t3
  9156. shldq $0x01, %r14, %r15
  9157. shldq $0x01, %r13, %r14
  9158. shldq $0x01, %r12, %r13
  9159. shldq $0x01, %r11, %r12
  9160. andq %rcx, %r11
  9161. # Multiply top half by 19
  9162. movq $19, %rax
  9163. mulq %r12
  9164. xorq %r12, %r12
  9165. addq %rax, %r8
  9166. movq $19, %rax
  9167. adcq %rdx, %r12
  9168. mulq %r13
  9169. xorq %r13, %r13
  9170. addq %rax, %r9
  9171. movq $19, %rax
  9172. adcq %rdx, %r13
  9173. mulq %r14
  9174. xorq %r14, %r14
  9175. addq %rax, %r10
  9176. movq $19, %rax
  9177. adcq %rdx, %r14
  9178. mulq %r15
  9179. # Add remaining product results in
  9180. addq %r12, %r9
  9181. adcq %r13, %r10
  9182. adcq %r14, %r11
  9183. adcq %rax, %r11
  9184. adcq $0x00, %rdx
  9185. # Overflow
  9186. shldq $0x01, %r11, %rdx
  9187. imulq $19, %rdx, %rax
  9188. andq %rcx, %r11
  9189. addq %rax, %r8
  9190. adcq $0x00, %r9
  9191. adcq $0x00, %r10
  9192. adcq $0x00, %r11
  9193. # Reduce if top bit set
  9194. movq %r11, %rdx
  9195. shrq $63, %rdx
  9196. imulq $19, %rdx, %rax
  9197. andq %rcx, %r11
  9198. addq %rax, %r8
  9199. adcq $0x00, %r9
  9200. adcq $0x00, %r10
  9201. adcq $0x00, %r11
  9202. # Store
  9203. movq %r8, (%rdi)
  9204. movq %r9, 8(%rdi)
  9205. movq %r10, 16(%rdi)
  9206. movq %r11, 24(%rdi)
  9207. movq (%rsp), %rdi
  9208. movq 128(%rsp), %rsi
  9209. movq 144(%rsp), %rbx
  9210. # Multiply
  9211. # A[0] * B[0]
  9212. movq (%rbx), %rax
  9213. mulq (%rsi)
  9214. movq %rax, %r8
  9215. movq %rdx, %r9
  9216. # A[0] * B[1]
  9217. movq 8(%rbx), %rax
  9218. mulq (%rsi)
  9219. xorq %r10, %r10
  9220. addq %rax, %r9
  9221. adcq %rdx, %r10
  9222. # A[1] * B[0]
  9223. movq (%rbx), %rax
  9224. mulq 8(%rsi)
  9225. xorq %r11, %r11
  9226. addq %rax, %r9
  9227. adcq %rdx, %r10
  9228. adcq $0x00, %r11
  9229. # A[0] * B[2]
  9230. movq 16(%rbx), %rax
  9231. mulq (%rsi)
  9232. addq %rax, %r10
  9233. adcq %rdx, %r11
  9234. # A[1] * B[1]
  9235. movq 8(%rbx), %rax
  9236. mulq 8(%rsi)
  9237. xorq %r12, %r12
  9238. addq %rax, %r10
  9239. adcq %rdx, %r11
  9240. adcq $0x00, %r12
  9241. # A[2] * B[0]
  9242. movq (%rbx), %rax
  9243. mulq 16(%rsi)
  9244. addq %rax, %r10
  9245. adcq %rdx, %r11
  9246. adcq $0x00, %r12
  9247. # A[0] * B[3]
  9248. movq 24(%rbx), %rax
  9249. mulq (%rsi)
  9250. xorq %r13, %r13
  9251. addq %rax, %r11
  9252. adcq %rdx, %r12
  9253. adcq $0x00, %r13
  9254. # A[1] * B[2]
  9255. movq 16(%rbx), %rax
  9256. mulq 8(%rsi)
  9257. addq %rax, %r11
  9258. adcq %rdx, %r12
  9259. adcq $0x00, %r13
  9260. # A[2] * B[1]
  9261. movq 8(%rbx), %rax
  9262. mulq 16(%rsi)
  9263. addq %rax, %r11
  9264. adcq %rdx, %r12
  9265. adcq $0x00, %r13
  9266. # A[3] * B[0]
  9267. movq (%rbx), %rax
  9268. mulq 24(%rsi)
  9269. addq %rax, %r11
  9270. adcq %rdx, %r12
  9271. adcq $0x00, %r13
  9272. # A[1] * B[3]
  9273. movq 24(%rbx), %rax
  9274. mulq 8(%rsi)
  9275. xorq %r14, %r14
  9276. addq %rax, %r12
  9277. adcq %rdx, %r13
  9278. adcq $0x00, %r14
  9279. # A[2] * B[2]
  9280. movq 16(%rbx), %rax
  9281. mulq 16(%rsi)
  9282. addq %rax, %r12
  9283. adcq %rdx, %r13
  9284. adcq $0x00, %r14
  9285. # A[3] * B[1]
  9286. movq 8(%rbx), %rax
  9287. mulq 24(%rsi)
  9288. addq %rax, %r12
  9289. adcq %rdx, %r13
  9290. adcq $0x00, %r14
  9291. # A[2] * B[3]
  9292. movq 24(%rbx), %rax
  9293. mulq 16(%rsi)
  9294. xorq %r15, %r15
  9295. addq %rax, %r13
  9296. adcq %rdx, %r14
  9297. adcq $0x00, %r15
  9298. # A[3] * B[2]
  9299. movq 16(%rbx), %rax
  9300. mulq 24(%rsi)
  9301. addq %rax, %r13
  9302. adcq %rdx, %r14
  9303. adcq $0x00, %r15
  9304. # A[3] * B[3]
  9305. movq 24(%rbx), %rax
  9306. mulq 24(%rsi)
  9307. addq %rax, %r14
  9308. adcq %rdx, %r15
  9309. # Reduce
  9310. movq $0x7fffffffffffffff, %rcx
  9311. # Move top half into t4-t7 and remove top bit from t3
  9312. shldq $0x01, %r14, %r15
  9313. shldq $0x01, %r13, %r14
  9314. shldq $0x01, %r12, %r13
  9315. shldq $0x01, %r11, %r12
  9316. andq %rcx, %r11
  9317. # Multiply top half by 19
  9318. movq $19, %rax
  9319. mulq %r12
  9320. xorq %r12, %r12
  9321. addq %rax, %r8
  9322. movq $19, %rax
  9323. adcq %rdx, %r12
  9324. mulq %r13
  9325. xorq %r13, %r13
  9326. addq %rax, %r9
  9327. movq $19, %rax
  9328. adcq %rdx, %r13
  9329. mulq %r14
  9330. xorq %r14, %r14
  9331. addq %rax, %r10
  9332. movq $19, %rax
  9333. adcq %rdx, %r14
  9334. mulq %r15
  9335. # Add remaining product results in
  9336. addq %r12, %r9
  9337. adcq %r13, %r10
  9338. adcq %r14, %r11
  9339. adcq %rax, %r11
  9340. adcq $0x00, %rdx
  9341. # Overflow
  9342. shldq $0x01, %r11, %rdx
  9343. imulq $19, %rdx, %rax
  9344. andq %rcx, %r11
  9345. addq %rax, %r8
  9346. adcq $0x00, %r9
  9347. adcq $0x00, %r10
  9348. adcq $0x00, %r11
  9349. # Reduce if top bit set
  9350. movq %r11, %rdx
  9351. shrq $63, %rdx
  9352. imulq $19, %rdx, %rax
  9353. andq %rcx, %r11
  9354. addq %rax, %r8
  9355. adcq $0x00, %r9
  9356. adcq $0x00, %r10
  9357. adcq $0x00, %r11
  9358. # Store
  9359. movq %r8, (%rdi)
  9360. movq %r9, 8(%rdi)
  9361. movq %r10, 16(%rdi)
  9362. movq %r11, 24(%rdi)
  9363. leaq 48(%rsp), %rdi
  9364. movq (%rsp), %rsi
  9365. movq (%rsp), %rbx
  9366. # Add
  9367. movq (%rsi), %r8
  9368. movq 8(%rsi), %r9
  9369. addq (%rbx), %r8
  9370. movq 16(%rsi), %r10
  9371. adcq 8(%rbx), %r9
  9372. movq 24(%rsi), %rcx
  9373. adcq 16(%rbx), %r10
  9374. movq $-19, %rax
  9375. adcq 24(%rbx), %rcx
  9376. movq $0x7fffffffffffffff, %rdx
  9377. movq %rcx, %r11
  9378. sarq $63, %rcx
  9379. # Mask the modulus
  9380. andq %rcx, %rax
  9381. andq %rcx, %rdx
  9382. # Sub modulus (if overflow)
  9383. subq %rax, %r8
  9384. sbbq %rcx, %r9
  9385. sbbq %rcx, %r10
  9386. sbbq %rdx, %r11
  9387. movq %r8, (%rdi)
  9388. movq %r9, 8(%rdi)
  9389. movq %r10, 16(%rdi)
  9390. movq %r11, 24(%rdi)
  9391. movq (%rsp), %rdi
  9392. movq 16(%rsp), %rsi
  9393. movq 8(%rsp), %rbx
  9394. # Sub
  9395. movq (%rsi), %r8
  9396. movq 8(%rsi), %r9
  9397. movq 16(%rsi), %r10
  9398. movq 24(%rsi), %r11
  9399. subq (%rbx), %r8
  9400. movq $0x00, %rcx
  9401. sbbq 8(%rbx), %r9
  9402. movq $-19, %rax
  9403. sbbq 16(%rbx), %r10
  9404. movq $0x7fffffffffffffff, %rdx
  9405. sbbq 24(%rbx), %r11
  9406. sbbq $0x00, %rcx
  9407. # Mask the modulus
  9408. andq %rcx, %rax
  9409. andq %rcx, %rdx
  9410. # Add modulus (if underflow)
  9411. addq %rax, %r8
  9412. adcq %rcx, %r9
  9413. adcq %rcx, %r10
  9414. adcq %rdx, %r11
  9415. movq %r8, (%rdi)
  9416. movq %r9, 8(%rdi)
  9417. movq %r10, 16(%rdi)
  9418. movq %r11, 24(%rdi)
  9419. movq 8(%rsp), %rdi
  9420. movq 16(%rsp), %rsi
  9421. movq 8(%rsp), %rbx
  9422. # Add
  9423. movq (%rsi), %r8
  9424. movq 8(%rsi), %r9
  9425. addq (%rbx), %r8
  9426. movq 16(%rsi), %r10
  9427. adcq 8(%rbx), %r9
  9428. movq 24(%rsi), %rcx
  9429. adcq 16(%rbx), %r10
  9430. movq $-19, %rax
  9431. adcq 24(%rbx), %rcx
  9432. movq $0x7fffffffffffffff, %rdx
  9433. movq %rcx, %r11
  9434. sarq $63, %rcx
  9435. # Mask the modulus
  9436. andq %rcx, %rax
  9437. andq %rcx, %rdx
  9438. # Sub modulus (if overflow)
  9439. subq %rax, %r8
  9440. sbbq %rcx, %r9
  9441. sbbq %rcx, %r10
  9442. sbbq %rdx, %r11
  9443. movq %r8, (%rdi)
  9444. movq %r9, 8(%rdi)
  9445. movq %r10, 16(%rdi)
  9446. movq %r11, 24(%rdi)
  9447. movq 16(%rsp), %rdi
  9448. leaq 48(%rsp), %rsi
  9449. movq 24(%rsp), %rbx
  9450. # Sub
  9451. movq (%rsi), %r8
  9452. movq 8(%rsi), %r9
  9453. movq 16(%rsi), %r10
  9454. movq 24(%rsi), %r11
  9455. subq (%rbx), %r8
  9456. movq $0x00, %rcx
  9457. sbbq 8(%rbx), %r9
  9458. movq $-19, %rax
  9459. sbbq 16(%rbx), %r10
  9460. movq $0x7fffffffffffffff, %rdx
  9461. sbbq 24(%rbx), %r11
  9462. sbbq $0x00, %rcx
  9463. # Mask the modulus
  9464. andq %rcx, %rax
  9465. andq %rcx, %rdx
  9466. # Add modulus (if underflow)
  9467. addq %rax, %r8
  9468. adcq %rcx, %r9
  9469. adcq %rcx, %r10
  9470. adcq %rdx, %r11
  9471. movq %r8, (%rdi)
  9472. movq %r9, 8(%rdi)
  9473. movq %r10, 16(%rdi)
  9474. movq %r11, 24(%rdi)
  9475. movq 24(%rsp), %rdi
  9476. leaq 48(%rsp), %rsi
  9477. movq 24(%rsp), %rbx
  9478. # Add
  9479. movq (%rsi), %r8
  9480. movq 8(%rsi), %r9
  9481. addq (%rbx), %r8
  9482. movq 16(%rsi), %r10
  9483. adcq 8(%rbx), %r9
  9484. movq 24(%rsi), %rcx
  9485. adcq 16(%rbx), %r10
  9486. movq $-19, %rax
  9487. adcq 24(%rbx), %rcx
  9488. movq $0x7fffffffffffffff, %rdx
  9489. movq %rcx, %r11
  9490. sarq $63, %rcx
  9491. # Mask the modulus
  9492. andq %rcx, %rax
  9493. andq %rcx, %rdx
  9494. # Sub modulus (if overflow)
  9495. subq %rax, %r8
  9496. sbbq %rcx, %r9
  9497. sbbq %rcx, %r10
  9498. sbbq %rdx, %r11
  9499. movq %r8, (%rdi)
  9500. movq %r9, 8(%rdi)
  9501. movq %r10, 16(%rdi)
  9502. movq %r11, 24(%rdi)
  9503. addq $0x50, %rsp
  9504. popq %r15
  9505. popq %r14
  9506. popq %r13
  9507. popq %r12
  9508. popq %rbx
  9509. repz retq
  9510. #ifndef __APPLE__
  9511. .size fe_ge_sub_x64,.-fe_ge_sub_x64
  9512. #endif /* __APPLE__ */
  9513. #ifdef HAVE_INTEL_AVX2
  9514. #ifndef __APPLE__
  9515. .text
  9516. .globl fe_mul_avx2
  9517. .type fe_mul_avx2,@function
  9518. .align 16
  9519. fe_mul_avx2:
  9520. #else
  9521. .section __TEXT,__text
  9522. .globl _fe_mul_avx2
  9523. .p2align 4
  9524. _fe_mul_avx2:
  9525. #endif /* __APPLE__ */
  9526. pushq %r12
  9527. pushq %r13
  9528. pushq %r14
  9529. pushq %r15
  9530. pushq %rbx
  9531. movq %rdx, %rbx
  9532. # Multiply
  9533. # A[0] * B[0]
  9534. movq (%rbx), %rdx
  9535. mulxq (%rsi), %r8, %r9
  9536. # A[2] * B[0]
  9537. mulxq 16(%rsi), %r10, %r11
  9538. # A[1] * B[0]
  9539. mulxq 8(%rsi), %rax, %rcx
  9540. xorq %r15, %r15
  9541. adcxq %rax, %r9
  9542. # A[1] * B[3]
  9543. movq 24(%rbx), %rdx
  9544. mulxq 8(%rsi), %r12, %r13
  9545. adcxq %rcx, %r10
  9546. # A[0] * B[1]
  9547. movq 8(%rbx), %rdx
  9548. mulxq (%rsi), %rax, %rcx
  9549. adoxq %rax, %r9
  9550. # A[2] * B[1]
  9551. mulxq 16(%rsi), %rax, %r14
  9552. adoxq %rcx, %r10
  9553. adcxq %rax, %r11
  9554. # A[1] * B[2]
  9555. movq 16(%rbx), %rdx
  9556. mulxq 8(%rsi), %rax, %rcx
  9557. adcxq %r14, %r12
  9558. adoxq %rax, %r11
  9559. adcxq %r15, %r13
  9560. adoxq %rcx, %r12
  9561. # A[0] * B[2]
  9562. mulxq (%rsi), %rax, %rcx
  9563. adoxq %r15, %r13
  9564. xorq %r14, %r14
  9565. adcxq %rax, %r10
  9566. # A[1] * B[1]
  9567. movq 8(%rbx), %rdx
  9568. mulxq 8(%rsi), %rdx, %rax
  9569. adcxq %rcx, %r11
  9570. adoxq %rdx, %r10
  9571. # A[3] * B[1]
  9572. movq 8(%rbx), %rdx
  9573. adoxq %rax, %r11
  9574. mulxq 24(%rsi), %rax, %rcx
  9575. adcxq %rax, %r12
  9576. # A[2] * B[2]
  9577. movq 16(%rbx), %rdx
  9578. mulxq 16(%rsi), %rdx, %rax
  9579. adcxq %rcx, %r13
  9580. adoxq %rdx, %r12
  9581. # A[3] * B[3]
  9582. movq 24(%rbx), %rdx
  9583. adoxq %rax, %r13
  9584. mulxq 24(%rsi), %rax, %rcx
  9585. adoxq %r15, %r14
  9586. adcxq %rax, %r14
  9587. # A[0] * B[3]
  9588. mulxq (%rsi), %rdx, %rax
  9589. adcxq %rcx, %r15
  9590. xorq %rcx, %rcx
  9591. adcxq %rdx, %r11
  9592. # A[3] * B[0]
  9593. movq (%rbx), %rdx
  9594. adcxq %rax, %r12
  9595. mulxq 24(%rsi), %rdx, %rax
  9596. adoxq %rdx, %r11
  9597. adoxq %rax, %r12
  9598. # A[2] * B[3]
  9599. movq 24(%rbx), %rdx
  9600. mulxq 16(%rsi), %rdx, %rax
  9601. adcxq %rdx, %r13
  9602. # A[3] * B[2]
  9603. movq 16(%rbx), %rdx
  9604. adcxq %rax, %r14
  9605. mulxq 24(%rsi), %rax, %rdx
  9606. adcxq %rcx, %r15
  9607. adoxq %rax, %r13
  9608. adoxq %rdx, %r14
  9609. adoxq %rcx, %r15
  9610. # Reduce
  9611. movq $0x7fffffffffffffff, %rcx
  9612. # Move top half into t4-t7 and remove top bit from t3
  9613. shldq $0x01, %r14, %r15
  9614. shldq $0x01, %r13, %r14
  9615. shldq $0x01, %r12, %r13
  9616. shldq $0x01, %r11, %r12
  9617. andq %rcx, %r11
  9618. # Multiply top half by 19
  9619. movq $19, %rdx
  9620. xorq %rcx, %rcx
  9621. mulxq %r12, %rax, %r12
  9622. adcxq %rax, %r8
  9623. adoxq %r12, %r9
  9624. mulxq %r13, %rax, %r13
  9625. adcxq %rax, %r9
  9626. adoxq %r13, %r10
  9627. mulxq %r14, %rax, %r14
  9628. adcxq %rax, %r10
  9629. adoxq %r14, %r11
  9630. mulxq %r15, %r15, %rdx
  9631. adcxq %r15, %r11
  9632. adoxq %rcx, %rdx
  9633. adcxq %rcx, %rdx
  9634. # Overflow
  9635. shldq $0x01, %r11, %rdx
  9636. movq $0x7fffffffffffffff, %rcx
  9637. imulq $19, %rdx, %rax
  9638. andq %rcx, %r11
  9639. addq %rax, %r8
  9640. adcq $0x00, %r9
  9641. adcq $0x00, %r10
  9642. adcq $0x00, %r11
  9643. # Reduce if top bit set
  9644. movq %r11, %rdx
  9645. shrq $63, %rdx
  9646. imulq $19, %rdx, %rax
  9647. andq %rcx, %r11
  9648. addq %rax, %r8
  9649. adcq $0x00, %r9
  9650. adcq $0x00, %r10
  9651. adcq $0x00, %r11
  9652. # Store
  9653. movq %r8, (%rdi)
  9654. movq %r9, 8(%rdi)
  9655. movq %r10, 16(%rdi)
  9656. movq %r11, 24(%rdi)
  9657. popq %rbx
  9658. popq %r15
  9659. popq %r14
  9660. popq %r13
  9661. popq %r12
  9662. repz retq
  9663. #ifndef __APPLE__
  9664. .size fe_mul_avx2,.-fe_mul_avx2
  9665. #endif /* __APPLE__ */
  9666. #ifndef __APPLE__
  9667. .text
  9668. .globl fe_sq_avx2
  9669. .type fe_sq_avx2,@function
  9670. .align 16
  9671. fe_sq_avx2:
  9672. #else
  9673. .section __TEXT,__text
  9674. .globl _fe_sq_avx2
  9675. .p2align 4
  9676. _fe_sq_avx2:
  9677. #endif /* __APPLE__ */
  9678. pushq %rbx
  9679. pushq %r12
  9680. pushq %r13
  9681. pushq %r14
  9682. pushq %r15
  9683. # Square
  9684. # A[0] * A[1]
  9685. movq (%rsi), %rdx
  9686. mulxq 8(%rsi), %r9, %r10
  9687. # A[0] * A[3]
  9688. mulxq 24(%rsi), %r11, %r12
  9689. # A[2] * A[1]
  9690. movq 16(%rsi), %rdx
  9691. mulxq 8(%rsi), %rcx, %rbx
  9692. xorq %r15, %r15
  9693. adoxq %rcx, %r11
  9694. # A[2] * A[3]
  9695. mulxq 24(%rsi), %r13, %r14
  9696. adoxq %rbx, %r12
  9697. # A[2] * A[0]
  9698. mulxq (%rsi), %rcx, %rbx
  9699. adoxq %r15, %r13
  9700. adcxq %rcx, %r10
  9701. adoxq %r15, %r14
  9702. # A[1] * A[3]
  9703. movq 8(%rsi), %rdx
  9704. mulxq 24(%rsi), %rax, %r8
  9705. adcxq %rbx, %r11
  9706. adcxq %rax, %r12
  9707. adcxq %r8, %r13
  9708. adcxq %r15, %r14
  9709. # Double with Carry Flag
  9710. xorq %r15, %r15
  9711. # A[0] * A[0]
  9712. movq (%rsi), %rdx
  9713. mulxq %rdx, %r8, %rax
  9714. adcxq %r9, %r9
  9715. # A[1] * A[1]
  9716. movq 8(%rsi), %rdx
  9717. mulxq %rdx, %rcx, %rbx
  9718. adcxq %r10, %r10
  9719. adoxq %rax, %r9
  9720. adcxq %r11, %r11
  9721. adoxq %rcx, %r10
  9722. # A[2] * A[2]
  9723. movq 16(%rsi), %rdx
  9724. mulxq %rdx, %rax, %rcx
  9725. adcxq %r12, %r12
  9726. adoxq %rbx, %r11
  9727. adcxq %r13, %r13
  9728. adoxq %rax, %r12
  9729. # A[3] * A[3]
  9730. movq 24(%rsi), %rdx
  9731. mulxq %rdx, %rax, %rbx
  9732. adcxq %r14, %r14
  9733. adoxq %rcx, %r13
  9734. adcxq %r15, %r15
  9735. adoxq %rax, %r14
  9736. adoxq %rbx, %r15
  9737. # Reduce
  9738. movq $0x7fffffffffffffff, %rcx
  9739. # Move top half into t4-t7 and remove top bit from t3
  9740. shldq $0x01, %r14, %r15
  9741. shldq $0x01, %r13, %r14
  9742. shldq $0x01, %r12, %r13
  9743. shldq $0x01, %r11, %r12
  9744. andq %rcx, %r11
  9745. # Multiply top half by 19
  9746. movq $19, %rdx
  9747. xorq %rcx, %rcx
  9748. mulxq %r12, %rax, %r12
  9749. adcxq %rax, %r8
  9750. adoxq %r12, %r9
  9751. mulxq %r13, %rax, %r13
  9752. adcxq %rax, %r9
  9753. adoxq %r13, %r10
  9754. mulxq %r14, %rax, %r14
  9755. adcxq %rax, %r10
  9756. adoxq %r14, %r11
  9757. mulxq %r15, %r15, %rdx
  9758. adcxq %r15, %r11
  9759. adoxq %rcx, %rdx
  9760. adcxq %rcx, %rdx
  9761. # Overflow
  9762. shldq $0x01, %r11, %rdx
  9763. movq $0x7fffffffffffffff, %rcx
  9764. imulq $19, %rdx, %rax
  9765. andq %rcx, %r11
  9766. addq %rax, %r8
  9767. adcq $0x00, %r9
  9768. adcq $0x00, %r10
  9769. adcq $0x00, %r11
  9770. # Reduce if top bit set
  9771. movq %r11, %rdx
  9772. shrq $63, %rdx
  9773. imulq $19, %rdx, %rax
  9774. andq %rcx, %r11
  9775. addq %rax, %r8
  9776. adcq $0x00, %r9
  9777. adcq $0x00, %r10
  9778. adcq $0x00, %r11
  9779. # Store
  9780. movq %r8, (%rdi)
  9781. movq %r9, 8(%rdi)
  9782. movq %r10, 16(%rdi)
  9783. movq %r11, 24(%rdi)
  9784. popq %r15
  9785. popq %r14
  9786. popq %r13
  9787. popq %r12
  9788. popq %rbx
  9789. repz retq
  9790. #ifndef __APPLE__
  9791. .size fe_sq_avx2,.-fe_sq_avx2
  9792. #endif /* __APPLE__ */
  9793. #ifndef __APPLE__
  9794. .text
  9795. .globl fe_sq_n_avx2
  9796. .type fe_sq_n_avx2,@function
  9797. .align 16
  9798. fe_sq_n_avx2:
  9799. #else
  9800. .section __TEXT,__text
  9801. .globl _fe_sq_n_avx2
  9802. .p2align 4
  9803. _fe_sq_n_avx2:
  9804. #endif /* __APPLE__ */
  9805. pushq %rbx
  9806. pushq %r12
  9807. pushq %r13
  9808. pushq %r14
  9809. pushq %r15
  9810. pushq %rbp
  9811. movq %rdx, %rbp
  9812. L_fe_sq_n_avx2:
  9813. # Square
  9814. # A[0] * A[1]
  9815. movq (%rsi), %rdx
  9816. mulxq 8(%rsi), %r9, %r10
  9817. # A[0] * A[3]
  9818. mulxq 24(%rsi), %r11, %r12
  9819. # A[2] * A[1]
  9820. movq 16(%rsi), %rdx
  9821. mulxq 8(%rsi), %rcx, %rbx
  9822. xorq %r15, %r15
  9823. adoxq %rcx, %r11
  9824. # A[2] * A[3]
  9825. mulxq 24(%rsi), %r13, %r14
  9826. adoxq %rbx, %r12
  9827. # A[2] * A[0]
  9828. mulxq (%rsi), %rcx, %rbx
  9829. adoxq %r15, %r13
  9830. adcxq %rcx, %r10
  9831. adoxq %r15, %r14
  9832. # A[1] * A[3]
  9833. movq 8(%rsi), %rdx
  9834. mulxq 24(%rsi), %rax, %r8
  9835. adcxq %rbx, %r11
  9836. adcxq %rax, %r12
  9837. adcxq %r8, %r13
  9838. adcxq %r15, %r14
  9839. # Double with Carry Flag
  9840. xorq %r15, %r15
  9841. # A[0] * A[0]
  9842. movq (%rsi), %rdx
  9843. mulxq %rdx, %r8, %rax
  9844. adcxq %r9, %r9
  9845. # A[1] * A[1]
  9846. movq 8(%rsi), %rdx
  9847. mulxq %rdx, %rcx, %rbx
  9848. adcxq %r10, %r10
  9849. adoxq %rax, %r9
  9850. adcxq %r11, %r11
  9851. adoxq %rcx, %r10
  9852. # A[2] * A[2]
  9853. movq 16(%rsi), %rdx
  9854. mulxq %rdx, %rax, %rcx
  9855. adcxq %r12, %r12
  9856. adoxq %rbx, %r11
  9857. adcxq %r13, %r13
  9858. adoxq %rax, %r12
  9859. # A[3] * A[3]
  9860. movq 24(%rsi), %rdx
  9861. mulxq %rdx, %rax, %rbx
  9862. adcxq %r14, %r14
  9863. adoxq %rcx, %r13
  9864. adcxq %r15, %r15
  9865. adoxq %rax, %r14
  9866. adoxq %rbx, %r15
  9867. # Reduce
  9868. movq $0x7fffffffffffffff, %rcx
  9869. # Move top half into t4-t7 and remove top bit from t3
  9870. shldq $0x01, %r14, %r15
  9871. shldq $0x01, %r13, %r14
  9872. shldq $0x01, %r12, %r13
  9873. shldq $0x01, %r11, %r12
  9874. andq %rcx, %r11
  9875. # Multiply top half by 19
  9876. movq $19, %rdx
  9877. xorq %rcx, %rcx
  9878. mulxq %r12, %rax, %r12
  9879. adcxq %rax, %r8
  9880. adoxq %r12, %r9
  9881. mulxq %r13, %rax, %r13
  9882. adcxq %rax, %r9
  9883. adoxq %r13, %r10
  9884. mulxq %r14, %rax, %r14
  9885. adcxq %rax, %r10
  9886. adoxq %r14, %r11
  9887. mulxq %r15, %r15, %rdx
  9888. adcxq %r15, %r11
  9889. adoxq %rcx, %rdx
  9890. adcxq %rcx, %rdx
  9891. # Overflow
  9892. shldq $0x01, %r11, %rdx
  9893. movq $0x7fffffffffffffff, %rcx
  9894. imulq $19, %rdx, %rax
  9895. andq %rcx, %r11
  9896. addq %rax, %r8
  9897. adcq $0x00, %r9
  9898. adcq $0x00, %r10
  9899. adcq $0x00, %r11
  9900. # Reduce if top bit set
  9901. movq %r11, %rdx
  9902. shrq $63, %rdx
  9903. imulq $19, %rdx, %rax
  9904. andq %rcx, %r11
  9905. addq %rax, %r8
  9906. adcq $0x00, %r9
  9907. adcq $0x00, %r10
  9908. adcq $0x00, %r11
  9909. # Store
  9910. movq %r8, (%rdi)
  9911. movq %r9, 8(%rdi)
  9912. movq %r10, 16(%rdi)
  9913. movq %r11, 24(%rdi)
  9914. decb %bpl
  9915. jnz L_fe_sq_n_avx2
  9916. popq %rbp
  9917. popq %r15
  9918. popq %r14
  9919. popq %r13
  9920. popq %r12
  9921. popq %rbx
  9922. repz retq
  9923. #ifndef __APPLE__
  9924. .size fe_sq_n_avx2,.-fe_sq_n_avx2
  9925. #endif /* __APPLE__ */
  9926. #ifndef __APPLE__
  9927. .text
  9928. .globl fe_mul121666_avx2
  9929. .type fe_mul121666_avx2,@function
  9930. .align 16
  9931. fe_mul121666_avx2:
  9932. #else
  9933. .section __TEXT,__text
  9934. .globl _fe_mul121666_avx2
  9935. .p2align 4
  9936. _fe_mul121666_avx2:
  9937. #endif /* __APPLE__ */
  9938. pushq %r12
  9939. pushq %r13
  9940. movq $0x1db42, %rdx
  9941. mulxq (%rsi), %rax, %r13
  9942. mulxq 8(%rsi), %rcx, %r12
  9943. mulxq 16(%rsi), %r8, %r11
  9944. mulxq 24(%rsi), %r9, %r10
  9945. addq %r13, %rcx
  9946. adcq %r12, %r8
  9947. adcq %r11, %r9
  9948. adcq $0x00, %r10
  9949. movq $0x7fffffffffffffff, %r13
  9950. shldq $0x01, %r9, %r10
  9951. andq %r13, %r9
  9952. imulq $19, %r10, %r10
  9953. addq %r10, %rax
  9954. adcq $0x00, %rcx
  9955. adcq $0x00, %r8
  9956. adcq $0x00, %r9
  9957. movq %rax, (%rdi)
  9958. movq %rcx, 8(%rdi)
  9959. movq %r8, 16(%rdi)
  9960. movq %r9, 24(%rdi)
  9961. popq %r13
  9962. popq %r12
  9963. repz retq
  9964. #ifndef __APPLE__
  9965. .size fe_mul121666_avx2,.-fe_mul121666_avx2
  9966. #endif /* __APPLE__ */
  9967. #ifndef __APPLE__
  9968. .text
  9969. .globl fe_sq2_avx2
  9970. .type fe_sq2_avx2,@function
  9971. .align 16
  9972. fe_sq2_avx2:
  9973. #else
  9974. .section __TEXT,__text
  9975. .globl _fe_sq2_avx2
  9976. .p2align 4
  9977. _fe_sq2_avx2:
  9978. #endif /* __APPLE__ */
  9979. pushq %rbx
  9980. pushq %r12
  9981. pushq %r13
  9982. pushq %r14
  9983. pushq %r15
  9984. # Square * 2
  9985. # A[0] * A[1]
  9986. movq (%rsi), %rdx
  9987. mulxq 8(%rsi), %r9, %r10
  9988. # A[0] * A[3]
  9989. mulxq 24(%rsi), %r11, %r12
  9990. # A[2] * A[1]
  9991. movq 16(%rsi), %rdx
  9992. mulxq 8(%rsi), %rcx, %rbx
  9993. xorq %r15, %r15
  9994. adoxq %rcx, %r11
  9995. # A[2] * A[3]
  9996. mulxq 24(%rsi), %r13, %r14
  9997. adoxq %rbx, %r12
  9998. # A[2] * A[0]
  9999. mulxq (%rsi), %rcx, %rbx
  10000. adoxq %r15, %r13
  10001. adcxq %rcx, %r10
  10002. adoxq %r15, %r14
  10003. # A[1] * A[3]
  10004. movq 8(%rsi), %rdx
  10005. mulxq 24(%rsi), %rax, %r8
  10006. adcxq %rbx, %r11
  10007. adcxq %rax, %r12
  10008. adcxq %r8, %r13
  10009. adcxq %r15, %r14
  10010. # Double with Carry Flag
  10011. xorq %r15, %r15
  10012. # A[0] * A[0]
  10013. movq (%rsi), %rdx
  10014. mulxq %rdx, %r8, %rax
  10015. adcxq %r9, %r9
  10016. # A[1] * A[1]
  10017. movq 8(%rsi), %rdx
  10018. mulxq %rdx, %rcx, %rbx
  10019. adcxq %r10, %r10
  10020. adoxq %rax, %r9
  10021. adcxq %r11, %r11
  10022. adoxq %rcx, %r10
  10023. # A[2] * A[2]
  10024. movq 16(%rsi), %rdx
  10025. mulxq %rdx, %rax, %rcx
  10026. adcxq %r12, %r12
  10027. adoxq %rbx, %r11
  10028. adcxq %r13, %r13
  10029. adoxq %rax, %r12
  10030. # A[3] * A[3]
  10031. movq 24(%rsi), %rdx
  10032. mulxq %rdx, %rax, %rbx
  10033. adcxq %r14, %r14
  10034. adoxq %rcx, %r13
  10035. adcxq %r15, %r15
  10036. adoxq %rax, %r14
  10037. adoxq %rbx, %r15
  10038. # Reduce
  10039. movq $0x7fffffffffffffff, %rbx
  10040. xorq %rax, %rax
  10041. # Move top half into t4-t7 and remove top bit from t3 and double
  10042. shldq $3, %r15, %rax
  10043. shldq $2, %r14, %r15
  10044. shldq $2, %r13, %r14
  10045. shldq $2, %r12, %r13
  10046. shldq $2, %r11, %r12
  10047. shldq $0x01, %r10, %r11
  10048. shldq $0x01, %r9, %r10
  10049. shldq $0x01, %r8, %r9
  10050. shlq $0x01, %r8
  10051. andq %rbx, %r11
  10052. # Two out left, one in right
  10053. andq %rbx, %r15
  10054. # Multiply top bits by 19*19
  10055. imulq $0x169, %rax, %rcx
  10056. xorq %rbx, %rbx
  10057. # Multiply top half by 19
  10058. movq $19, %rdx
  10059. adoxq %rcx, %r8
  10060. mulxq %r12, %rax, %r12
  10061. adcxq %rax, %r8
  10062. adoxq %r12, %r9
  10063. mulxq %r13, %rax, %r13
  10064. adcxq %rax, %r9
  10065. adoxq %r13, %r10
  10066. mulxq %r14, %rax, %r14
  10067. adcxq %rax, %r10
  10068. adoxq %r14, %r11
  10069. mulxq %r15, %r15, %rdx
  10070. adcxq %r15, %r11
  10071. adoxq %rbx, %rdx
  10072. adcxq %rbx, %rdx
  10073. # Overflow
  10074. shldq $0x01, %r11, %rdx
  10075. movq $0x7fffffffffffffff, %rbx
  10076. imulq $19, %rdx, %rax
  10077. andq %rbx, %r11
  10078. addq %rax, %r8
  10079. adcq $0x00, %r9
  10080. adcq $0x00, %r10
  10081. adcq $0x00, %r11
  10082. # Reduce if top bit set
  10083. movq %r11, %rdx
  10084. shrq $63, %rdx
  10085. imulq $19, %rdx, %rax
  10086. andq %rbx, %r11
  10087. addq %rax, %r8
  10088. adcq $0x00, %r9
  10089. adcq $0x00, %r10
  10090. adcq $0x00, %r11
  10091. # Store
  10092. movq %r8, (%rdi)
  10093. movq %r9, 8(%rdi)
  10094. movq %r10, 16(%rdi)
  10095. movq %r11, 24(%rdi)
  10096. popq %r15
  10097. popq %r14
  10098. popq %r13
  10099. popq %r12
  10100. popq %rbx
  10101. repz retq
  10102. #ifndef __APPLE__
  10103. .size fe_sq2_avx2,.-fe_sq2_avx2
  10104. #endif /* __APPLE__ */
  10105. #ifndef __APPLE__
  10106. .text
  10107. .globl fe_invert_avx2
  10108. .type fe_invert_avx2,@function
  10109. .align 16
  10110. fe_invert_avx2:
  10111. #else
  10112. .section __TEXT,__text
  10113. .globl _fe_invert_avx2
  10114. .p2align 4
  10115. _fe_invert_avx2:
  10116. #endif /* __APPLE__ */
  10117. subq $0x90, %rsp
  10118. # Invert
  10119. movq %rdi, 128(%rsp)
  10120. movq %rsi, 136(%rsp)
  10121. movq %rsp, %rdi
  10122. movq 136(%rsp), %rsi
  10123. #ifndef __APPLE__
  10124. callq fe_sq_avx2@plt
  10125. #else
  10126. callq _fe_sq_avx2
  10127. #endif /* __APPLE__ */
  10128. leaq 32(%rsp), %rdi
  10129. movq %rsp, %rsi
  10130. #ifndef __APPLE__
  10131. callq fe_sq_avx2@plt
  10132. #else
  10133. callq _fe_sq_avx2
  10134. #endif /* __APPLE__ */
  10135. leaq 32(%rsp), %rdi
  10136. leaq 32(%rsp), %rsi
  10137. #ifndef __APPLE__
  10138. callq fe_sq_avx2@plt
  10139. #else
  10140. callq _fe_sq_avx2
  10141. #endif /* __APPLE__ */
  10142. leaq 32(%rsp), %rdi
  10143. movq 136(%rsp), %rsi
  10144. leaq 32(%rsp), %rdx
  10145. #ifndef __APPLE__
  10146. callq fe_mul_avx2@plt
  10147. #else
  10148. callq _fe_mul_avx2
  10149. #endif /* __APPLE__ */
  10150. movq %rsp, %rdi
  10151. movq %rsp, %rsi
  10152. leaq 32(%rsp), %rdx
  10153. #ifndef __APPLE__
  10154. callq fe_mul_avx2@plt
  10155. #else
  10156. callq _fe_mul_avx2
  10157. #endif /* __APPLE__ */
  10158. leaq 64(%rsp), %rdi
  10159. movq %rsp, %rsi
  10160. #ifndef __APPLE__
  10161. callq fe_sq_avx2@plt
  10162. #else
  10163. callq _fe_sq_avx2
  10164. #endif /* __APPLE__ */
  10165. leaq 32(%rsp), %rdi
  10166. leaq 32(%rsp), %rsi
  10167. leaq 64(%rsp), %rdx
  10168. #ifndef __APPLE__
  10169. callq fe_mul_avx2@plt
  10170. #else
  10171. callq _fe_mul_avx2
  10172. #endif /* __APPLE__ */
  10173. leaq 64(%rsp), %rdi
  10174. leaq 32(%rsp), %rsi
  10175. #ifndef __APPLE__
  10176. callq fe_sq_avx2@plt
  10177. #else
  10178. callq _fe_sq_avx2
  10179. #endif /* __APPLE__ */
  10180. leaq 64(%rsp), %rdi
  10181. leaq 64(%rsp), %rsi
  10182. movq $4, %rdx
  10183. #ifndef __APPLE__
  10184. callq fe_sq_n_avx2@plt
  10185. #else
  10186. callq _fe_sq_n_avx2
  10187. #endif /* __APPLE__ */
  10188. leaq 32(%rsp), %rdi
  10189. leaq 64(%rsp), %rsi
  10190. leaq 32(%rsp), %rdx
  10191. #ifndef __APPLE__
  10192. callq fe_mul_avx2@plt
  10193. #else
  10194. callq _fe_mul_avx2
  10195. #endif /* __APPLE__ */
  10196. leaq 64(%rsp), %rdi
  10197. leaq 32(%rsp), %rsi
  10198. #ifndef __APPLE__
  10199. callq fe_sq_avx2@plt
  10200. #else
  10201. callq _fe_sq_avx2
  10202. #endif /* __APPLE__ */
  10203. leaq 64(%rsp), %rdi
  10204. leaq 64(%rsp), %rsi
  10205. movq $9, %rdx
  10206. #ifndef __APPLE__
  10207. callq fe_sq_n_avx2@plt
  10208. #else
  10209. callq _fe_sq_n_avx2
  10210. #endif /* __APPLE__ */
  10211. leaq 64(%rsp), %rdi
  10212. leaq 64(%rsp), %rsi
  10213. leaq 32(%rsp), %rdx
  10214. #ifndef __APPLE__
  10215. callq fe_mul_avx2@plt
  10216. #else
  10217. callq _fe_mul_avx2
  10218. #endif /* __APPLE__ */
  10219. leaq 96(%rsp), %rdi
  10220. leaq 64(%rsp), %rsi
  10221. #ifndef __APPLE__
  10222. callq fe_sq_avx2@plt
  10223. #else
  10224. callq _fe_sq_avx2
  10225. #endif /* __APPLE__ */
  10226. leaq 96(%rsp), %rdi
  10227. leaq 96(%rsp), %rsi
  10228. movq $19, %rdx
  10229. #ifndef __APPLE__
  10230. callq fe_sq_n_avx2@plt
  10231. #else
  10232. callq _fe_sq_n_avx2
  10233. #endif /* __APPLE__ */
  10234. leaq 64(%rsp), %rdi
  10235. leaq 96(%rsp), %rsi
  10236. leaq 64(%rsp), %rdx
  10237. #ifndef __APPLE__
  10238. callq fe_mul_avx2@plt
  10239. #else
  10240. callq _fe_mul_avx2
  10241. #endif /* __APPLE__ */
  10242. leaq 64(%rsp), %rdi
  10243. leaq 64(%rsp), %rsi
  10244. #ifndef __APPLE__
  10245. callq fe_sq_avx2@plt
  10246. #else
  10247. callq _fe_sq_avx2
  10248. #endif /* __APPLE__ */
  10249. leaq 64(%rsp), %rdi
  10250. leaq 64(%rsp), %rsi
  10251. movq $9, %rdx
  10252. #ifndef __APPLE__
  10253. callq fe_sq_n_avx2@plt
  10254. #else
  10255. callq _fe_sq_n_avx2
  10256. #endif /* __APPLE__ */
  10257. leaq 32(%rsp), %rdi
  10258. leaq 64(%rsp), %rsi
  10259. leaq 32(%rsp), %rdx
  10260. #ifndef __APPLE__
  10261. callq fe_mul_avx2@plt
  10262. #else
  10263. callq _fe_mul_avx2
  10264. #endif /* __APPLE__ */
  10265. leaq 64(%rsp), %rdi
  10266. leaq 32(%rsp), %rsi
  10267. #ifndef __APPLE__
  10268. callq fe_sq_avx2@plt
  10269. #else
  10270. callq _fe_sq_avx2
  10271. #endif /* __APPLE__ */
  10272. leaq 64(%rsp), %rdi
  10273. leaq 64(%rsp), %rsi
  10274. movq $49, %rdx
  10275. #ifndef __APPLE__
  10276. callq fe_sq_n_avx2@plt
  10277. #else
  10278. callq _fe_sq_n_avx2
  10279. #endif /* __APPLE__ */
  10280. leaq 64(%rsp), %rdi
  10281. leaq 64(%rsp), %rsi
  10282. leaq 32(%rsp), %rdx
  10283. #ifndef __APPLE__
  10284. callq fe_mul_avx2@plt
  10285. #else
  10286. callq _fe_mul_avx2
  10287. #endif /* __APPLE__ */
  10288. leaq 96(%rsp), %rdi
  10289. leaq 64(%rsp), %rsi
  10290. #ifndef __APPLE__
  10291. callq fe_sq_avx2@plt
  10292. #else
  10293. callq _fe_sq_avx2
  10294. #endif /* __APPLE__ */
  10295. leaq 96(%rsp), %rdi
  10296. leaq 96(%rsp), %rsi
  10297. movq $0x63, %rdx
  10298. #ifndef __APPLE__
  10299. callq fe_sq_n_avx2@plt
  10300. #else
  10301. callq _fe_sq_n_avx2
  10302. #endif /* __APPLE__ */
  10303. leaq 64(%rsp), %rdi
  10304. leaq 96(%rsp), %rsi
  10305. leaq 64(%rsp), %rdx
  10306. #ifndef __APPLE__
  10307. callq fe_mul_avx2@plt
  10308. #else
  10309. callq _fe_mul_avx2
  10310. #endif /* __APPLE__ */
  10311. leaq 64(%rsp), %rdi
  10312. leaq 64(%rsp), %rsi
  10313. #ifndef __APPLE__
  10314. callq fe_sq_avx2@plt
  10315. #else
  10316. callq _fe_sq_avx2
  10317. #endif /* __APPLE__ */
  10318. leaq 64(%rsp), %rdi
  10319. leaq 64(%rsp), %rsi
  10320. movq $49, %rdx
  10321. #ifndef __APPLE__
  10322. callq fe_sq_n_avx2@plt
  10323. #else
  10324. callq _fe_sq_n_avx2
  10325. #endif /* __APPLE__ */
  10326. leaq 32(%rsp), %rdi
  10327. leaq 64(%rsp), %rsi
  10328. leaq 32(%rsp), %rdx
  10329. #ifndef __APPLE__
  10330. callq fe_mul_avx2@plt
  10331. #else
  10332. callq _fe_mul_avx2
  10333. #endif /* __APPLE__ */
  10334. leaq 32(%rsp), %rdi
  10335. leaq 32(%rsp), %rsi
  10336. #ifndef __APPLE__
  10337. callq fe_sq_avx2@plt
  10338. #else
  10339. callq _fe_sq_avx2
  10340. #endif /* __APPLE__ */
  10341. leaq 32(%rsp), %rdi
  10342. leaq 32(%rsp), %rsi
  10343. movq $4, %rdx
  10344. #ifndef __APPLE__
  10345. callq fe_sq_n_avx2@plt
  10346. #else
  10347. callq _fe_sq_n_avx2
  10348. #endif /* __APPLE__ */
  10349. movq 128(%rsp), %rdi
  10350. leaq 32(%rsp), %rsi
  10351. movq %rsp, %rdx
  10352. #ifndef __APPLE__
  10353. callq fe_mul_avx2@plt
  10354. #else
  10355. callq _fe_mul_avx2
  10356. #endif /* __APPLE__ */
  10357. movq 136(%rsp), %rsi
  10358. movq 128(%rsp), %rdi
  10359. addq $0x90, %rsp
  10360. repz retq
  10361. #ifndef __APPLE__
  10362. .text
  10363. .globl curve25519_avx2
  10364. .type curve25519_avx2,@function
  10365. .align 16
  10366. curve25519_avx2:
  10367. #else
  10368. .section __TEXT,__text
  10369. .globl _curve25519_avx2
  10370. .p2align 4
  10371. _curve25519_avx2:
  10372. #endif /* __APPLE__ */
  10373. pushq %rbx
  10374. pushq %r12
  10375. pushq %r13
  10376. pushq %r14
  10377. pushq %r15
  10378. pushq %rbp
  10379. movq %rdx, %r8
  10380. subq $0xc0, %rsp
  10381. movq $0x00, 184(%rsp)
  10382. movq %rdi, 176(%rsp)
  10383. # Set one
  10384. movq $0x01, (%rdi)
  10385. movq $0x00, 8(%rdi)
  10386. movq $0x00, 16(%rdi)
  10387. movq $0x00, 24(%rdi)
  10388. # Set zero
  10389. movq $0x00, (%rsp)
  10390. movq $0x00, 8(%rsp)
  10391. movq $0x00, 16(%rsp)
  10392. movq $0x00, 24(%rsp)
  10393. # Set one
  10394. movq $0x01, 32(%rsp)
  10395. movq $0x00, 40(%rsp)
  10396. movq $0x00, 48(%rsp)
  10397. movq $0x00, 56(%rsp)
  10398. # Copy
  10399. movq (%r8), %r9
  10400. movq 8(%r8), %r10
  10401. movq 16(%r8), %r11
  10402. movq 24(%r8), %r12
  10403. movq %r9, 64(%rsp)
  10404. movq %r10, 72(%rsp)
  10405. movq %r11, 80(%rsp)
  10406. movq %r12, 88(%rsp)
  10407. movb $62, 168(%rsp)
  10408. movq $3, 160(%rsp)
  10409. L_curve25519_avx2_words:
  10410. L_curve25519_avx2_bits:
  10411. movq 184(%rsp), %rbx
  10412. movq 160(%rsp), %r9
  10413. movb 168(%rsp), %cl
  10414. movq (%rsi,%r9,8), %rax
  10415. shrq %cl, %rax
  10416. andq $0x01, %rax
  10417. xorq %rax, %rbx
  10418. negq %rbx
  10419. # Conditional Swap
  10420. movq (%rdi), %r9
  10421. movq 8(%rdi), %r10
  10422. movq 16(%rdi), %r11
  10423. movq 24(%rdi), %r12
  10424. xorq 64(%rsp), %r9
  10425. xorq 72(%rsp), %r10
  10426. xorq 80(%rsp), %r11
  10427. xorq 88(%rsp), %r12
  10428. andq %rbx, %r9
  10429. andq %rbx, %r10
  10430. andq %rbx, %r11
  10431. andq %rbx, %r12
  10432. xorq %r9, (%rdi)
  10433. xorq %r10, 8(%rdi)
  10434. xorq %r11, 16(%rdi)
  10435. xorq %r12, 24(%rdi)
  10436. xorq %r9, 64(%rsp)
  10437. xorq %r10, 72(%rsp)
  10438. xorq %r11, 80(%rsp)
  10439. xorq %r12, 88(%rsp)
  10440. # Conditional Swap
  10441. movq (%rsp), %r9
  10442. movq 8(%rsp), %r10
  10443. movq 16(%rsp), %r11
  10444. movq 24(%rsp), %r12
  10445. xorq 32(%rsp), %r9
  10446. xorq 40(%rsp), %r10
  10447. xorq 48(%rsp), %r11
  10448. xorq 56(%rsp), %r12
  10449. andq %rbx, %r9
  10450. andq %rbx, %r10
  10451. andq %rbx, %r11
  10452. andq %rbx, %r12
  10453. xorq %r9, (%rsp)
  10454. xorq %r10, 8(%rsp)
  10455. xorq %r11, 16(%rsp)
  10456. xorq %r12, 24(%rsp)
  10457. xorq %r9, 32(%rsp)
  10458. xorq %r10, 40(%rsp)
  10459. xorq %r11, 48(%rsp)
  10460. xorq %r12, 56(%rsp)
  10461. movq %rax, 184(%rsp)
  10462. # Add
  10463. movq (%rdi), %r9
  10464. movq 8(%rdi), %r10
  10465. movq 16(%rdi), %r11
  10466. movq 24(%rdi), %rax
  10467. movq %r9, %r13
  10468. addq (%rsp), %r9
  10469. movq %r10, %r14
  10470. adcq 8(%rsp), %r10
  10471. movq %r11, %r15
  10472. adcq 16(%rsp), %r11
  10473. movq %rax, %rbp
  10474. adcq 24(%rsp), %rax
  10475. movq $-19, %rcx
  10476. movq %rax, %r12
  10477. movq $0x7fffffffffffffff, %rbx
  10478. sarq $63, %rax
  10479. # Mask the modulus
  10480. andq %rax, %rcx
  10481. andq %rax, %rbx
  10482. # Sub modulus (if overflow)
  10483. subq %rcx, %r9
  10484. sbbq %rax, %r10
  10485. sbbq %rax, %r11
  10486. sbbq %rbx, %r12
  10487. # Sub
  10488. subq (%rsp), %r13
  10489. movq $0x00, %rax
  10490. sbbq 8(%rsp), %r14
  10491. movq $-19, %rcx
  10492. sbbq 16(%rsp), %r15
  10493. movq $0x7fffffffffffffff, %rbx
  10494. sbbq 24(%rsp), %rbp
  10495. sbbq $0x00, %rax
  10496. # Mask the modulus
  10497. andq %rax, %rcx
  10498. andq %rax, %rbx
  10499. # Add modulus (if underflow)
  10500. addq %rcx, %r13
  10501. adcq %rax, %r14
  10502. adcq %rax, %r15
  10503. adcq %rbx, %rbp
  10504. movq %r9, (%rdi)
  10505. movq %r10, 8(%rdi)
  10506. movq %r11, 16(%rdi)
  10507. movq %r12, 24(%rdi)
  10508. movq %r13, 128(%rsp)
  10509. movq %r14, 136(%rsp)
  10510. movq %r15, 144(%rsp)
  10511. movq %rbp, 152(%rsp)
  10512. # Add
  10513. movq 64(%rsp), %r9
  10514. movq 72(%rsp), %r10
  10515. movq 80(%rsp), %r11
  10516. movq 88(%rsp), %rax
  10517. movq %r9, %r13
  10518. addq 32(%rsp), %r9
  10519. movq %r10, %r14
  10520. adcq 40(%rsp), %r10
  10521. movq %r11, %r15
  10522. adcq 48(%rsp), %r11
  10523. movq %rax, %rbp
  10524. adcq 56(%rsp), %rax
  10525. movq $-19, %rcx
  10526. movq %rax, %r12
  10527. movq $0x7fffffffffffffff, %rbx
  10528. sarq $63, %rax
  10529. # Mask the modulus
  10530. andq %rax, %rcx
  10531. andq %rax, %rbx
  10532. # Sub modulus (if overflow)
  10533. subq %rcx, %r9
  10534. sbbq %rax, %r10
  10535. sbbq %rax, %r11
  10536. sbbq %rbx, %r12
  10537. # Sub
  10538. subq 32(%rsp), %r13
  10539. movq $0x00, %rax
  10540. sbbq 40(%rsp), %r14
  10541. movq $-19, %rcx
  10542. sbbq 48(%rsp), %r15
  10543. movq $0x7fffffffffffffff, %rbx
  10544. sbbq 56(%rsp), %rbp
  10545. sbbq $0x00, %rax
  10546. # Mask the modulus
  10547. andq %rax, %rcx
  10548. andq %rax, %rbx
  10549. # Add modulus (if underflow)
  10550. addq %rcx, %r13
  10551. adcq %rax, %r14
  10552. adcq %rax, %r15
  10553. adcq %rbx, %rbp
  10554. movq %r9, (%rsp)
  10555. movq %r10, 8(%rsp)
  10556. movq %r11, 16(%rsp)
  10557. movq %r12, 24(%rsp)
  10558. movq %r13, 96(%rsp)
  10559. movq %r14, 104(%rsp)
  10560. movq %r15, 112(%rsp)
  10561. movq %rbp, 120(%rsp)
  10562. # Multiply
  10563. # A[0] * B[0]
  10564. movq (%rdi), %rdx
  10565. mulxq 96(%rsp), %r9, %r10
  10566. # A[2] * B[0]
  10567. mulxq 112(%rsp), %r11, %r12
  10568. # A[1] * B[0]
  10569. mulxq 104(%rsp), %rcx, %rbx
  10570. xorq %rbp, %rbp
  10571. adcxq %rcx, %r10
  10572. # A[1] * B[3]
  10573. movq 24(%rdi), %rdx
  10574. mulxq 104(%rsp), %r13, %r14
  10575. adcxq %rbx, %r11
  10576. # A[0] * B[1]
  10577. movq 8(%rdi), %rdx
  10578. mulxq 96(%rsp), %rcx, %rbx
  10579. adoxq %rcx, %r10
  10580. # A[2] * B[1]
  10581. mulxq 112(%rsp), %rcx, %r15
  10582. adoxq %rbx, %r11
  10583. adcxq %rcx, %r12
  10584. # A[1] * B[2]
  10585. movq 16(%rdi), %rdx
  10586. mulxq 104(%rsp), %rcx, %rbx
  10587. adcxq %r15, %r13
  10588. adoxq %rcx, %r12
  10589. adcxq %rbp, %r14
  10590. adoxq %rbx, %r13
  10591. # A[0] * B[2]
  10592. mulxq 96(%rsp), %rcx, %rbx
  10593. adoxq %rbp, %r14
  10594. xorq %r15, %r15
  10595. adcxq %rcx, %r11
  10596. # A[1] * B[1]
  10597. movq 8(%rdi), %rdx
  10598. mulxq 104(%rsp), %rdx, %rcx
  10599. adcxq %rbx, %r12
  10600. adoxq %rdx, %r11
  10601. # A[3] * B[1]
  10602. movq 8(%rdi), %rdx
  10603. adoxq %rcx, %r12
  10604. mulxq 120(%rsp), %rcx, %rbx
  10605. adcxq %rcx, %r13
  10606. # A[2] * B[2]
  10607. movq 16(%rdi), %rdx
  10608. mulxq 112(%rsp), %rdx, %rcx
  10609. adcxq %rbx, %r14
  10610. adoxq %rdx, %r13
  10611. # A[3] * B[3]
  10612. movq 24(%rdi), %rdx
  10613. adoxq %rcx, %r14
  10614. mulxq 120(%rsp), %rcx, %rbx
  10615. adoxq %rbp, %r15
  10616. adcxq %rcx, %r15
  10617. # A[0] * B[3]
  10618. mulxq 96(%rsp), %rdx, %rcx
  10619. adcxq %rbx, %rbp
  10620. xorq %rbx, %rbx
  10621. adcxq %rdx, %r12
  10622. # A[3] * B[0]
  10623. movq (%rdi), %rdx
  10624. adcxq %rcx, %r13
  10625. mulxq 120(%rsp), %rdx, %rcx
  10626. adoxq %rdx, %r12
  10627. adoxq %rcx, %r13
  10628. # A[2] * B[3]
  10629. movq 24(%rdi), %rdx
  10630. mulxq 112(%rsp), %rdx, %rcx
  10631. adcxq %rdx, %r14
  10632. # A[3] * B[2]
  10633. movq 16(%rdi), %rdx
  10634. adcxq %rcx, %r15
  10635. mulxq 120(%rsp), %rcx, %rdx
  10636. adcxq %rbx, %rbp
  10637. adoxq %rcx, %r14
  10638. adoxq %rdx, %r15
  10639. adoxq %rbx, %rbp
  10640. # Reduce
  10641. movq $0x7fffffffffffffff, %rbx
  10642. # Move top half into t4-t7 and remove top bit from t3
  10643. shldq $0x01, %r15, %rbp
  10644. shldq $0x01, %r14, %r15
  10645. shldq $0x01, %r13, %r14
  10646. shldq $0x01, %r12, %r13
  10647. andq %rbx, %r12
  10648. # Multiply top half by 19
  10649. movq $19, %rdx
  10650. xorq %rbx, %rbx
  10651. mulxq %r13, %rcx, %r13
  10652. adcxq %rcx, %r9
  10653. adoxq %r13, %r10
  10654. mulxq %r14, %rcx, %r14
  10655. adcxq %rcx, %r10
  10656. adoxq %r14, %r11
  10657. mulxq %r15, %rcx, %r15
  10658. adcxq %rcx, %r11
  10659. adoxq %r15, %r12
  10660. mulxq %rbp, %rbp, %rdx
  10661. adcxq %rbp, %r12
  10662. adoxq %rbx, %rdx
  10663. adcxq %rbx, %rdx
  10664. # Overflow
  10665. shldq $0x01, %r12, %rdx
  10666. movq $0x7fffffffffffffff, %rbx
  10667. imulq $19, %rdx, %rcx
  10668. andq %rbx, %r12
  10669. addq %rcx, %r9
  10670. adcq $0x00, %r10
  10671. adcq $0x00, %r11
  10672. adcq $0x00, %r12
  10673. # Reduce if top bit set
  10674. movq %r12, %rdx
  10675. shrq $63, %rdx
  10676. imulq $19, %rdx, %rcx
  10677. andq %rbx, %r12
  10678. addq %rcx, %r9
  10679. adcq $0x00, %r10
  10680. adcq $0x00, %r11
  10681. adcq $0x00, %r12
  10682. # Store
  10683. movq %r9, 32(%rsp)
  10684. movq %r10, 40(%rsp)
  10685. movq %r11, 48(%rsp)
  10686. movq %r12, 56(%rsp)
  10687. # Multiply
  10688. # A[0] * B[0]
  10689. movq 128(%rsp), %rdx
  10690. mulxq (%rsp), %r9, %r10
  10691. # A[2] * B[0]
  10692. mulxq 16(%rsp), %r11, %r12
  10693. # A[1] * B[0]
  10694. mulxq 8(%rsp), %rcx, %rbx
  10695. xorq %rbp, %rbp
  10696. adcxq %rcx, %r10
  10697. # A[1] * B[3]
  10698. movq 152(%rsp), %rdx
  10699. mulxq 8(%rsp), %r13, %r14
  10700. adcxq %rbx, %r11
  10701. # A[0] * B[1]
  10702. movq 136(%rsp), %rdx
  10703. mulxq (%rsp), %rcx, %rbx
  10704. adoxq %rcx, %r10
  10705. # A[2] * B[1]
  10706. mulxq 16(%rsp), %rcx, %r15
  10707. adoxq %rbx, %r11
  10708. adcxq %rcx, %r12
  10709. # A[1] * B[2]
  10710. movq 144(%rsp), %rdx
  10711. mulxq 8(%rsp), %rcx, %rbx
  10712. adcxq %r15, %r13
  10713. adoxq %rcx, %r12
  10714. adcxq %rbp, %r14
  10715. adoxq %rbx, %r13
  10716. # A[0] * B[2]
  10717. mulxq (%rsp), %rcx, %rbx
  10718. adoxq %rbp, %r14
  10719. xorq %r15, %r15
  10720. adcxq %rcx, %r11
  10721. # A[1] * B[1]
  10722. movq 136(%rsp), %rdx
  10723. mulxq 8(%rsp), %rdx, %rcx
  10724. adcxq %rbx, %r12
  10725. adoxq %rdx, %r11
  10726. # A[3] * B[1]
  10727. movq 136(%rsp), %rdx
  10728. adoxq %rcx, %r12
  10729. mulxq 24(%rsp), %rcx, %rbx
  10730. adcxq %rcx, %r13
  10731. # A[2] * B[2]
  10732. movq 144(%rsp), %rdx
  10733. mulxq 16(%rsp), %rdx, %rcx
  10734. adcxq %rbx, %r14
  10735. adoxq %rdx, %r13
  10736. # A[3] * B[3]
  10737. movq 152(%rsp), %rdx
  10738. adoxq %rcx, %r14
  10739. mulxq 24(%rsp), %rcx, %rbx
  10740. adoxq %rbp, %r15
  10741. adcxq %rcx, %r15
  10742. # A[0] * B[3]
  10743. mulxq (%rsp), %rdx, %rcx
  10744. adcxq %rbx, %rbp
  10745. xorq %rbx, %rbx
  10746. adcxq %rdx, %r12
  10747. # A[3] * B[0]
  10748. movq 128(%rsp), %rdx
  10749. adcxq %rcx, %r13
  10750. mulxq 24(%rsp), %rdx, %rcx
  10751. adoxq %rdx, %r12
  10752. adoxq %rcx, %r13
  10753. # A[2] * B[3]
  10754. movq 152(%rsp), %rdx
  10755. mulxq 16(%rsp), %rdx, %rcx
  10756. adcxq %rdx, %r14
  10757. # A[3] * B[2]
  10758. movq 144(%rsp), %rdx
  10759. adcxq %rcx, %r15
  10760. mulxq 24(%rsp), %rcx, %rdx
  10761. adcxq %rbx, %rbp
  10762. adoxq %rcx, %r14
  10763. adoxq %rdx, %r15
  10764. adoxq %rbx, %rbp
  10765. # Reduce
  10766. movq $0x7fffffffffffffff, %rbx
  10767. # Move top half into t4-t7 and remove top bit from t3
  10768. shldq $0x01, %r15, %rbp
  10769. shldq $0x01, %r14, %r15
  10770. shldq $0x01, %r13, %r14
  10771. shldq $0x01, %r12, %r13
  10772. andq %rbx, %r12
  10773. # Multiply top half by 19
  10774. movq $19, %rdx
  10775. xorq %rbx, %rbx
  10776. mulxq %r13, %rcx, %r13
  10777. adcxq %rcx, %r9
  10778. adoxq %r13, %r10
  10779. mulxq %r14, %rcx, %r14
  10780. adcxq %rcx, %r10
  10781. adoxq %r14, %r11
  10782. mulxq %r15, %rcx, %r15
  10783. adcxq %rcx, %r11
  10784. adoxq %r15, %r12
  10785. mulxq %rbp, %rbp, %rdx
  10786. adcxq %rbp, %r12
  10787. adoxq %rbx, %rdx
  10788. adcxq %rbx, %rdx
  10789. # Overflow
  10790. shldq $0x01, %r12, %rdx
  10791. movq $0x7fffffffffffffff, %rbx
  10792. imulq $19, %rdx, %rcx
  10793. andq %rbx, %r12
  10794. addq %rcx, %r9
  10795. adcq $0x00, %r10
  10796. adcq $0x00, %r11
  10797. adcq $0x00, %r12
  10798. # Reduce if top bit set
  10799. movq %r12, %rdx
  10800. shrq $63, %rdx
  10801. imulq $19, %rdx, %rcx
  10802. andq %rbx, %r12
  10803. addq %rcx, %r9
  10804. adcq $0x00, %r10
  10805. adcq $0x00, %r11
  10806. adcq $0x00, %r12
  10807. # Store
  10808. movq %r9, (%rsp)
  10809. movq %r10, 8(%rsp)
  10810. movq %r11, 16(%rsp)
  10811. movq %r12, 24(%rsp)
  10812. # Square
  10813. # A[0] * A[1]
  10814. movq 128(%rsp), %rdx
  10815. mulxq 136(%rsp), %r10, %r11
  10816. # A[0] * A[3]
  10817. mulxq 152(%rsp), %r12, %r13
  10818. # A[2] * A[1]
  10819. movq 144(%rsp), %rdx
  10820. mulxq 136(%rsp), %rcx, %rbx
  10821. xorq %rbp, %rbp
  10822. adoxq %rcx, %r12
  10823. # A[2] * A[3]
  10824. mulxq 152(%rsp), %r14, %r15
  10825. adoxq %rbx, %r13
  10826. # A[2] * A[0]
  10827. mulxq 128(%rsp), %rcx, %rbx
  10828. adoxq %rbp, %r14
  10829. adcxq %rcx, %r11
  10830. adoxq %rbp, %r15
  10831. # A[1] * A[3]
  10832. movq 136(%rsp), %rdx
  10833. mulxq 152(%rsp), %rax, %r9
  10834. adcxq %rbx, %r12
  10835. adcxq %rax, %r13
  10836. adcxq %r9, %r14
  10837. adcxq %rbp, %r15
  10838. # Double with Carry Flag
  10839. xorq %rbp, %rbp
  10840. # A[0] * A[0]
  10841. movq 128(%rsp), %rdx
  10842. mulxq %rdx, %r9, %rax
  10843. adcxq %r10, %r10
  10844. # A[1] * A[1]
  10845. movq 136(%rsp), %rdx
  10846. mulxq %rdx, %rcx, %rbx
  10847. adcxq %r11, %r11
  10848. adoxq %rax, %r10
  10849. adcxq %r12, %r12
  10850. adoxq %rcx, %r11
  10851. # A[2] * A[2]
  10852. movq 144(%rsp), %rdx
  10853. mulxq %rdx, %rax, %rcx
  10854. adcxq %r13, %r13
  10855. adoxq %rbx, %r12
  10856. adcxq %r14, %r14
  10857. adoxq %rax, %r13
  10858. # A[3] * A[3]
  10859. movq 152(%rsp), %rdx
  10860. mulxq %rdx, %rax, %rbx
  10861. adcxq %r15, %r15
  10862. adoxq %rcx, %r14
  10863. adcxq %rbp, %rbp
  10864. adoxq %rax, %r15
  10865. adoxq %rbx, %rbp
  10866. # Reduce
  10867. movq $0x7fffffffffffffff, %rcx
  10868. # Move top half into t4-t7 and remove top bit from t3
  10869. shldq $0x01, %r15, %rbp
  10870. shldq $0x01, %r14, %r15
  10871. shldq $0x01, %r13, %r14
  10872. shldq $0x01, %r12, %r13
  10873. andq %rcx, %r12
  10874. # Multiply top half by 19
  10875. movq $19, %rdx
  10876. xorq %rcx, %rcx
  10877. mulxq %r13, %rax, %r13
  10878. adcxq %rax, %r9
  10879. adoxq %r13, %r10
  10880. mulxq %r14, %rax, %r14
  10881. adcxq %rax, %r10
  10882. adoxq %r14, %r11
  10883. mulxq %r15, %rax, %r15
  10884. adcxq %rax, %r11
  10885. adoxq %r15, %r12
  10886. mulxq %rbp, %rbp, %rdx
  10887. adcxq %rbp, %r12
  10888. adoxq %rcx, %rdx
  10889. adcxq %rcx, %rdx
  10890. # Overflow
  10891. shldq $0x01, %r12, %rdx
  10892. movq $0x7fffffffffffffff, %rcx
  10893. imulq $19, %rdx, %rax
  10894. andq %rcx, %r12
  10895. addq %rax, %r9
  10896. adcq $0x00, %r10
  10897. adcq $0x00, %r11
  10898. adcq $0x00, %r12
  10899. # Reduce if top bit set
  10900. movq %r12, %rdx
  10901. shrq $63, %rdx
  10902. imulq $19, %rdx, %rax
  10903. andq %rcx, %r12
  10904. addq %rax, %r9
  10905. adcq $0x00, %r10
  10906. adcq $0x00, %r11
  10907. adcq $0x00, %r12
  10908. # Store
  10909. movq %r9, 96(%rsp)
  10910. movq %r10, 104(%rsp)
  10911. movq %r11, 112(%rsp)
  10912. movq %r12, 120(%rsp)
  10913. # Square
  10914. # A[0] * A[1]
  10915. movq (%rdi), %rdx
  10916. mulxq 8(%rdi), %r10, %r11
  10917. # A[0] * A[3]
  10918. mulxq 24(%rdi), %r12, %r13
  10919. # A[2] * A[1]
  10920. movq 16(%rdi), %rdx
  10921. mulxq 8(%rdi), %rcx, %rbx
  10922. xorq %rbp, %rbp
  10923. adoxq %rcx, %r12
  10924. # A[2] * A[3]
  10925. mulxq 24(%rdi), %r14, %r15
  10926. adoxq %rbx, %r13
  10927. # A[2] * A[0]
  10928. mulxq (%rdi), %rcx, %rbx
  10929. adoxq %rbp, %r14
  10930. adcxq %rcx, %r11
  10931. adoxq %rbp, %r15
  10932. # A[1] * A[3]
  10933. movq 8(%rdi), %rdx
  10934. mulxq 24(%rdi), %rax, %r9
  10935. adcxq %rbx, %r12
  10936. adcxq %rax, %r13
  10937. adcxq %r9, %r14
  10938. adcxq %rbp, %r15
  10939. # Double with Carry Flag
  10940. xorq %rbp, %rbp
  10941. # A[0] * A[0]
  10942. movq (%rdi), %rdx
  10943. mulxq %rdx, %r9, %rax
  10944. adcxq %r10, %r10
  10945. # A[1] * A[1]
  10946. movq 8(%rdi), %rdx
  10947. mulxq %rdx, %rcx, %rbx
  10948. adcxq %r11, %r11
  10949. adoxq %rax, %r10
  10950. adcxq %r12, %r12
  10951. adoxq %rcx, %r11
  10952. # A[2] * A[2]
  10953. movq 16(%rdi), %rdx
  10954. mulxq %rdx, %rax, %rcx
  10955. adcxq %r13, %r13
  10956. adoxq %rbx, %r12
  10957. adcxq %r14, %r14
  10958. adoxq %rax, %r13
  10959. # A[3] * A[3]
  10960. movq 24(%rdi), %rdx
  10961. mulxq %rdx, %rax, %rbx
  10962. adcxq %r15, %r15
  10963. adoxq %rcx, %r14
  10964. adcxq %rbp, %rbp
  10965. adoxq %rax, %r15
  10966. adoxq %rbx, %rbp
  10967. # Reduce
  10968. movq $0x7fffffffffffffff, %rcx
  10969. # Move top half into t4-t7 and remove top bit from t3
  10970. shldq $0x01, %r15, %rbp
  10971. shldq $0x01, %r14, %r15
  10972. shldq $0x01, %r13, %r14
  10973. shldq $0x01, %r12, %r13
  10974. andq %rcx, %r12
  10975. # Multiply top half by 19
  10976. movq $19, %rdx
  10977. xorq %rcx, %rcx
  10978. mulxq %r13, %rax, %r13
  10979. adcxq %rax, %r9
  10980. adoxq %r13, %r10
  10981. mulxq %r14, %rax, %r14
  10982. adcxq %rax, %r10
  10983. adoxq %r14, %r11
  10984. mulxq %r15, %rax, %r15
  10985. adcxq %rax, %r11
  10986. adoxq %r15, %r12
  10987. mulxq %rbp, %rbp, %rdx
  10988. adcxq %rbp, %r12
  10989. adoxq %rcx, %rdx
  10990. adcxq %rcx, %rdx
  10991. # Overflow
  10992. shldq $0x01, %r12, %rdx
  10993. movq $0x7fffffffffffffff, %rcx
  10994. imulq $19, %rdx, %rax
  10995. andq %rcx, %r12
  10996. addq %rax, %r9
  10997. adcq $0x00, %r10
  10998. adcq $0x00, %r11
  10999. adcq $0x00, %r12
  11000. # Reduce if top bit set
  11001. movq %r12, %rdx
  11002. shrq $63, %rdx
  11003. imulq $19, %rdx, %rax
  11004. andq %rcx, %r12
  11005. addq %rax, %r9
  11006. adcq $0x00, %r10
  11007. adcq $0x00, %r11
  11008. adcq $0x00, %r12
  11009. # Store
  11010. movq %r9, 128(%rsp)
  11011. movq %r10, 136(%rsp)
  11012. movq %r11, 144(%rsp)
  11013. movq %r12, 152(%rsp)
  11014. # Add
  11015. movq 32(%rsp), %r9
  11016. movq 40(%rsp), %r10
  11017. movq 48(%rsp), %r11
  11018. movq 56(%rsp), %rax
  11019. movq %r9, %r13
  11020. addq (%rsp), %r9
  11021. movq %r10, %r14
  11022. adcq 8(%rsp), %r10
  11023. movq %r11, %r15
  11024. adcq 16(%rsp), %r11
  11025. movq %rax, %rbp
  11026. adcq 24(%rsp), %rax
  11027. movq $-19, %rcx
  11028. movq %rax, %r12
  11029. movq $0x7fffffffffffffff, %rbx
  11030. sarq $63, %rax
  11031. # Mask the modulus
  11032. andq %rax, %rcx
  11033. andq %rax, %rbx
  11034. # Sub modulus (if overflow)
  11035. subq %rcx, %r9
  11036. sbbq %rax, %r10
  11037. sbbq %rax, %r11
  11038. sbbq %rbx, %r12
  11039. # Sub
  11040. subq (%rsp), %r13
  11041. movq $0x00, %rax
  11042. sbbq 8(%rsp), %r14
  11043. movq $-19, %rcx
  11044. sbbq 16(%rsp), %r15
  11045. movq $0x7fffffffffffffff, %rbx
  11046. sbbq 24(%rsp), %rbp
  11047. sbbq $0x00, %rax
  11048. # Mask the modulus
  11049. andq %rax, %rcx
  11050. andq %rax, %rbx
  11051. # Add modulus (if underflow)
  11052. addq %rcx, %r13
  11053. adcq %rax, %r14
  11054. adcq %rax, %r15
  11055. adcq %rbx, %rbp
  11056. movq %r9, 64(%rsp)
  11057. movq %r10, 72(%rsp)
  11058. movq %r11, 80(%rsp)
  11059. movq %r12, 88(%rsp)
  11060. movq %r13, (%rsp)
  11061. movq %r14, 8(%rsp)
  11062. movq %r15, 16(%rsp)
  11063. movq %rbp, 24(%rsp)
  11064. # Multiply
  11065. # A[0] * B[0]
  11066. movq 96(%rsp), %rdx
  11067. mulxq 128(%rsp), %r9, %r10
  11068. # A[2] * B[0]
  11069. mulxq 144(%rsp), %r11, %r12
  11070. # A[1] * B[0]
  11071. mulxq 136(%rsp), %rcx, %rbx
  11072. xorq %rbp, %rbp
  11073. adcxq %rcx, %r10
  11074. # A[1] * B[3]
  11075. movq 120(%rsp), %rdx
  11076. mulxq 136(%rsp), %r13, %r14
  11077. adcxq %rbx, %r11
  11078. # A[0] * B[1]
  11079. movq 104(%rsp), %rdx
  11080. mulxq 128(%rsp), %rcx, %rbx
  11081. adoxq %rcx, %r10
  11082. # A[2] * B[1]
  11083. mulxq 144(%rsp), %rcx, %r15
  11084. adoxq %rbx, %r11
  11085. adcxq %rcx, %r12
  11086. # A[1] * B[2]
  11087. movq 112(%rsp), %rdx
  11088. mulxq 136(%rsp), %rcx, %rbx
  11089. adcxq %r15, %r13
  11090. adoxq %rcx, %r12
  11091. adcxq %rbp, %r14
  11092. adoxq %rbx, %r13
  11093. # A[0] * B[2]
  11094. mulxq 128(%rsp), %rcx, %rbx
  11095. adoxq %rbp, %r14
  11096. xorq %r15, %r15
  11097. adcxq %rcx, %r11
  11098. # A[1] * B[1]
  11099. movq 104(%rsp), %rdx
  11100. mulxq 136(%rsp), %rdx, %rcx
  11101. adcxq %rbx, %r12
  11102. adoxq %rdx, %r11
  11103. # A[3] * B[1]
  11104. movq 104(%rsp), %rdx
  11105. adoxq %rcx, %r12
  11106. mulxq 152(%rsp), %rcx, %rbx
  11107. adcxq %rcx, %r13
  11108. # A[2] * B[2]
  11109. movq 112(%rsp), %rdx
  11110. mulxq 144(%rsp), %rdx, %rcx
  11111. adcxq %rbx, %r14
  11112. adoxq %rdx, %r13
  11113. # A[3] * B[3]
  11114. movq 120(%rsp), %rdx
  11115. adoxq %rcx, %r14
  11116. mulxq 152(%rsp), %rcx, %rbx
  11117. adoxq %rbp, %r15
  11118. adcxq %rcx, %r15
  11119. # A[0] * B[3]
  11120. mulxq 128(%rsp), %rdx, %rcx
  11121. adcxq %rbx, %rbp
  11122. xorq %rbx, %rbx
  11123. adcxq %rdx, %r12
  11124. # A[3] * B[0]
  11125. movq 96(%rsp), %rdx
  11126. adcxq %rcx, %r13
  11127. mulxq 152(%rsp), %rdx, %rcx
  11128. adoxq %rdx, %r12
  11129. adoxq %rcx, %r13
  11130. # A[2] * B[3]
  11131. movq 120(%rsp), %rdx
  11132. mulxq 144(%rsp), %rdx, %rcx
  11133. adcxq %rdx, %r14
  11134. # A[3] * B[2]
  11135. movq 112(%rsp), %rdx
  11136. adcxq %rcx, %r15
  11137. mulxq 152(%rsp), %rcx, %rdx
  11138. adcxq %rbx, %rbp
  11139. adoxq %rcx, %r14
  11140. adoxq %rdx, %r15
  11141. adoxq %rbx, %rbp
  11142. # Reduce
  11143. movq $0x7fffffffffffffff, %rbx
  11144. # Move top half into t4-t7 and remove top bit from t3
  11145. shldq $0x01, %r15, %rbp
  11146. shldq $0x01, %r14, %r15
  11147. shldq $0x01, %r13, %r14
  11148. shldq $0x01, %r12, %r13
  11149. andq %rbx, %r12
  11150. # Multiply top half by 19
  11151. movq $19, %rdx
  11152. xorq %rbx, %rbx
  11153. mulxq %r13, %rcx, %r13
  11154. adcxq %rcx, %r9
  11155. adoxq %r13, %r10
  11156. mulxq %r14, %rcx, %r14
  11157. adcxq %rcx, %r10
  11158. adoxq %r14, %r11
  11159. mulxq %r15, %rcx, %r15
  11160. adcxq %rcx, %r11
  11161. adoxq %r15, %r12
  11162. mulxq %rbp, %rbp, %rdx
  11163. adcxq %rbp, %r12
  11164. adoxq %rbx, %rdx
  11165. adcxq %rbx, %rdx
  11166. # Overflow
  11167. shldq $0x01, %r12, %rdx
  11168. movq $0x7fffffffffffffff, %rbx
  11169. imulq $19, %rdx, %rcx
  11170. andq %rbx, %r12
  11171. addq %rcx, %r9
  11172. adcq $0x00, %r10
  11173. adcq $0x00, %r11
  11174. adcq $0x00, %r12
  11175. # Reduce if top bit set
  11176. movq %r12, %rdx
  11177. shrq $63, %rdx
  11178. imulq $19, %rdx, %rcx
  11179. andq %rbx, %r12
  11180. addq %rcx, %r9
  11181. adcq $0x00, %r10
  11182. adcq $0x00, %r11
  11183. adcq $0x00, %r12
  11184. # Store
  11185. movq %r9, (%rdi)
  11186. movq %r10, 8(%rdi)
  11187. movq %r11, 16(%rdi)
  11188. movq %r12, 24(%rdi)
  11189. # Sub
  11190. movq 128(%rsp), %r9
  11191. movq 136(%rsp), %r10
  11192. movq 144(%rsp), %r11
  11193. movq 152(%rsp), %r12
  11194. subq 96(%rsp), %r9
  11195. movq $0x00, %rax
  11196. sbbq 104(%rsp), %r10
  11197. movq $-19, %rcx
  11198. sbbq 112(%rsp), %r11
  11199. movq $0x7fffffffffffffff, %rbx
  11200. sbbq 120(%rsp), %r12
  11201. sbbq $0x00, %rax
  11202. # Mask the modulus
  11203. andq %rax, %rcx
  11204. andq %rax, %rbx
  11205. # Add modulus (if underflow)
  11206. addq %rcx, %r9
  11207. adcq %rax, %r10
  11208. adcq %rax, %r11
  11209. adcq %rbx, %r12
  11210. movq %r9, 128(%rsp)
  11211. movq %r10, 136(%rsp)
  11212. movq %r11, 144(%rsp)
  11213. movq %r12, 152(%rsp)
  11214. # Square
  11215. # A[0] * A[1]
  11216. movq (%rsp), %rdx
  11217. mulxq 8(%rsp), %r10, %r11
  11218. # A[0] * A[3]
  11219. mulxq 24(%rsp), %r12, %r13
  11220. # A[2] * A[1]
  11221. movq 16(%rsp), %rdx
  11222. mulxq 8(%rsp), %rcx, %rbx
  11223. xorq %rbp, %rbp
  11224. adoxq %rcx, %r12
  11225. # A[2] * A[3]
  11226. mulxq 24(%rsp), %r14, %r15
  11227. adoxq %rbx, %r13
  11228. # A[2] * A[0]
  11229. mulxq (%rsp), %rcx, %rbx
  11230. adoxq %rbp, %r14
  11231. adcxq %rcx, %r11
  11232. adoxq %rbp, %r15
  11233. # A[1] * A[3]
  11234. movq 8(%rsp), %rdx
  11235. mulxq 24(%rsp), %rax, %r9
  11236. adcxq %rbx, %r12
  11237. adcxq %rax, %r13
  11238. adcxq %r9, %r14
  11239. adcxq %rbp, %r15
  11240. # Double with Carry Flag
  11241. xorq %rbp, %rbp
  11242. # A[0] * A[0]
  11243. movq (%rsp), %rdx
  11244. mulxq %rdx, %r9, %rax
  11245. adcxq %r10, %r10
  11246. # A[1] * A[1]
  11247. movq 8(%rsp), %rdx
  11248. mulxq %rdx, %rcx, %rbx
  11249. adcxq %r11, %r11
  11250. adoxq %rax, %r10
  11251. adcxq %r12, %r12
  11252. adoxq %rcx, %r11
  11253. # A[2] * A[2]
  11254. movq 16(%rsp), %rdx
  11255. mulxq %rdx, %rax, %rcx
  11256. adcxq %r13, %r13
  11257. adoxq %rbx, %r12
  11258. adcxq %r14, %r14
  11259. adoxq %rax, %r13
  11260. # A[3] * A[3]
  11261. movq 24(%rsp), %rdx
  11262. mulxq %rdx, %rax, %rbx
  11263. adcxq %r15, %r15
  11264. adoxq %rcx, %r14
  11265. adcxq %rbp, %rbp
  11266. adoxq %rax, %r15
  11267. adoxq %rbx, %rbp
  11268. # Reduce
  11269. movq $0x7fffffffffffffff, %rcx
  11270. # Move top half into t4-t7 and remove top bit from t3
  11271. shldq $0x01, %r15, %rbp
  11272. shldq $0x01, %r14, %r15
  11273. shldq $0x01, %r13, %r14
  11274. shldq $0x01, %r12, %r13
  11275. andq %rcx, %r12
  11276. # Multiply top half by 19
  11277. movq $19, %rdx
  11278. xorq %rcx, %rcx
  11279. mulxq %r13, %rax, %r13
  11280. adcxq %rax, %r9
  11281. adoxq %r13, %r10
  11282. mulxq %r14, %rax, %r14
  11283. adcxq %rax, %r10
  11284. adoxq %r14, %r11
  11285. mulxq %r15, %rax, %r15
  11286. adcxq %rax, %r11
  11287. adoxq %r15, %r12
  11288. mulxq %rbp, %rbp, %rdx
  11289. adcxq %rbp, %r12
  11290. adoxq %rcx, %rdx
  11291. adcxq %rcx, %rdx
  11292. # Overflow
  11293. shldq $0x01, %r12, %rdx
  11294. movq $0x7fffffffffffffff, %rcx
  11295. imulq $19, %rdx, %rax
  11296. andq %rcx, %r12
  11297. addq %rax, %r9
  11298. adcq $0x00, %r10
  11299. adcq $0x00, %r11
  11300. adcq $0x00, %r12
  11301. # Reduce if top bit set
  11302. movq %r12, %rdx
  11303. shrq $63, %rdx
  11304. imulq $19, %rdx, %rax
  11305. andq %rcx, %r12
  11306. addq %rax, %r9
  11307. adcq $0x00, %r10
  11308. adcq $0x00, %r11
  11309. adcq $0x00, %r12
  11310. # Store
  11311. movq %r9, (%rsp)
  11312. movq %r10, 8(%rsp)
  11313. movq %r11, 16(%rsp)
  11314. movq %r12, 24(%rsp)
  11315. movq $0x1db42, %rdx
  11316. mulxq 128(%rsp), %r9, %rbp
  11317. mulxq 136(%rsp), %r10, %r15
  11318. mulxq 144(%rsp), %r11, %r14
  11319. mulxq 152(%rsp), %r12, %r13
  11320. addq %rbp, %r10
  11321. adcq %r15, %r11
  11322. adcq %r14, %r12
  11323. adcq $0x00, %r13
  11324. movq $0x7fffffffffffffff, %rbp
  11325. shldq $0x01, %r12, %r13
  11326. andq %rbp, %r12
  11327. imulq $19, %r13, %r13
  11328. addq %r13, %r9
  11329. adcq $0x00, %r10
  11330. adcq $0x00, %r11
  11331. adcq $0x00, %r12
  11332. movq %r9, 32(%rsp)
  11333. movq %r10, 40(%rsp)
  11334. movq %r11, 48(%rsp)
  11335. movq %r12, 56(%rsp)
  11336. # Square
  11337. # A[0] * A[1]
  11338. movq 64(%rsp), %rdx
  11339. mulxq 72(%rsp), %r10, %r11
  11340. # A[0] * A[3]
  11341. mulxq 88(%rsp), %r12, %r13
  11342. # A[2] * A[1]
  11343. movq 80(%rsp), %rdx
  11344. mulxq 72(%rsp), %rcx, %rbx
  11345. xorq %rbp, %rbp
  11346. adoxq %rcx, %r12
  11347. # A[2] * A[3]
  11348. mulxq 88(%rsp), %r14, %r15
  11349. adoxq %rbx, %r13
  11350. # A[2] * A[0]
  11351. mulxq 64(%rsp), %rcx, %rbx
  11352. adoxq %rbp, %r14
  11353. adcxq %rcx, %r11
  11354. adoxq %rbp, %r15
  11355. # A[1] * A[3]
  11356. movq 72(%rsp), %rdx
  11357. mulxq 88(%rsp), %rax, %r9
  11358. adcxq %rbx, %r12
  11359. adcxq %rax, %r13
  11360. adcxq %r9, %r14
  11361. adcxq %rbp, %r15
  11362. # Double with Carry Flag
  11363. xorq %rbp, %rbp
  11364. # A[0] * A[0]
  11365. movq 64(%rsp), %rdx
  11366. mulxq %rdx, %r9, %rax
  11367. adcxq %r10, %r10
  11368. # A[1] * A[1]
  11369. movq 72(%rsp), %rdx
  11370. mulxq %rdx, %rcx, %rbx
  11371. adcxq %r11, %r11
  11372. adoxq %rax, %r10
  11373. adcxq %r12, %r12
  11374. adoxq %rcx, %r11
  11375. # A[2] * A[2]
  11376. movq 80(%rsp), %rdx
  11377. mulxq %rdx, %rax, %rcx
  11378. adcxq %r13, %r13
  11379. adoxq %rbx, %r12
  11380. adcxq %r14, %r14
  11381. adoxq %rax, %r13
  11382. # A[3] * A[3]
  11383. movq 88(%rsp), %rdx
  11384. mulxq %rdx, %rax, %rbx
  11385. adcxq %r15, %r15
  11386. adoxq %rcx, %r14
  11387. adcxq %rbp, %rbp
  11388. adoxq %rax, %r15
  11389. adoxq %rbx, %rbp
  11390. # Reduce
  11391. movq $0x7fffffffffffffff, %rcx
  11392. # Move top half into t4-t7 and remove top bit from t3
  11393. shldq $0x01, %r15, %rbp
  11394. shldq $0x01, %r14, %r15
  11395. shldq $0x01, %r13, %r14
  11396. shldq $0x01, %r12, %r13
  11397. andq %rcx, %r12
  11398. # Multiply top half by 19
  11399. movq $19, %rdx
  11400. xorq %rcx, %rcx
  11401. mulxq %r13, %rax, %r13
  11402. adcxq %rax, %r9
  11403. adoxq %r13, %r10
  11404. mulxq %r14, %rax, %r14
  11405. adcxq %rax, %r10
  11406. adoxq %r14, %r11
  11407. mulxq %r15, %rax, %r15
  11408. adcxq %rax, %r11
  11409. adoxq %r15, %r12
  11410. mulxq %rbp, %rbp, %rdx
  11411. adcxq %rbp, %r12
  11412. adoxq %rcx, %rdx
  11413. adcxq %rcx, %rdx
  11414. # Overflow
  11415. shldq $0x01, %r12, %rdx
  11416. movq $0x7fffffffffffffff, %rcx
  11417. imulq $19, %rdx, %rax
  11418. andq %rcx, %r12
  11419. addq %rax, %r9
  11420. adcq $0x00, %r10
  11421. adcq $0x00, %r11
  11422. adcq $0x00, %r12
  11423. # Reduce if top bit set
  11424. movq %r12, %rdx
  11425. shrq $63, %rdx
  11426. imulq $19, %rdx, %rax
  11427. andq %rcx, %r12
  11428. addq %rax, %r9
  11429. adcq $0x00, %r10
  11430. adcq $0x00, %r11
  11431. adcq $0x00, %r12
  11432. # Store
  11433. movq %r9, 64(%rsp)
  11434. movq %r10, 72(%rsp)
  11435. movq %r11, 80(%rsp)
  11436. movq %r12, 88(%rsp)
  11437. # Add
  11438. movq 96(%rsp), %r9
  11439. movq 104(%rsp), %r10
  11440. addq 32(%rsp), %r9
  11441. movq 112(%rsp), %r11
  11442. adcq 40(%rsp), %r10
  11443. movq 120(%rsp), %rax
  11444. adcq 48(%rsp), %r11
  11445. movq $-19, %rcx
  11446. adcq 56(%rsp), %rax
  11447. movq $0x7fffffffffffffff, %rbx
  11448. movq %rax, %r12
  11449. sarq $63, %rax
  11450. # Mask the modulus
  11451. andq %rax, %rcx
  11452. andq %rax, %rbx
  11453. # Sub modulus (if overflow)
  11454. subq %rcx, %r9
  11455. sbbq %rax, %r10
  11456. sbbq %rax, %r11
  11457. sbbq %rbx, %r12
  11458. movq %r9, 96(%rsp)
  11459. movq %r10, 104(%rsp)
  11460. movq %r11, 112(%rsp)
  11461. movq %r12, 120(%rsp)
  11462. # Multiply
  11463. # A[0] * B[0]
  11464. movq (%rsp), %rdx
  11465. mulxq (%r8), %r9, %r10
  11466. # A[2] * B[0]
  11467. mulxq 16(%r8), %r11, %r12
  11468. # A[1] * B[0]
  11469. mulxq 8(%r8), %rcx, %rbx
  11470. xorq %rbp, %rbp
  11471. adcxq %rcx, %r10
  11472. # A[1] * B[3]
  11473. movq 24(%rsp), %rdx
  11474. mulxq 8(%r8), %r13, %r14
  11475. adcxq %rbx, %r11
  11476. # A[0] * B[1]
  11477. movq 8(%rsp), %rdx
  11478. mulxq (%r8), %rcx, %rbx
  11479. adoxq %rcx, %r10
  11480. # A[2] * B[1]
  11481. mulxq 16(%r8), %rcx, %r15
  11482. adoxq %rbx, %r11
  11483. adcxq %rcx, %r12
  11484. # A[1] * B[2]
  11485. movq 16(%rsp), %rdx
  11486. mulxq 8(%r8), %rcx, %rbx
  11487. adcxq %r15, %r13
  11488. adoxq %rcx, %r12
  11489. adcxq %rbp, %r14
  11490. adoxq %rbx, %r13
  11491. # A[0] * B[2]
  11492. mulxq (%r8), %rcx, %rbx
  11493. adoxq %rbp, %r14
  11494. xorq %r15, %r15
  11495. adcxq %rcx, %r11
  11496. # A[1] * B[1]
  11497. movq 8(%rsp), %rdx
  11498. mulxq 8(%r8), %rdx, %rcx
  11499. adcxq %rbx, %r12
  11500. adoxq %rdx, %r11
  11501. # A[3] * B[1]
  11502. movq 8(%rsp), %rdx
  11503. adoxq %rcx, %r12
  11504. mulxq 24(%r8), %rcx, %rbx
  11505. adcxq %rcx, %r13
  11506. # A[2] * B[2]
  11507. movq 16(%rsp), %rdx
  11508. mulxq 16(%r8), %rdx, %rcx
  11509. adcxq %rbx, %r14
  11510. adoxq %rdx, %r13
  11511. # A[3] * B[3]
  11512. movq 24(%rsp), %rdx
  11513. adoxq %rcx, %r14
  11514. mulxq 24(%r8), %rcx, %rbx
  11515. adoxq %rbp, %r15
  11516. adcxq %rcx, %r15
  11517. # A[0] * B[3]
  11518. mulxq (%r8), %rdx, %rcx
  11519. adcxq %rbx, %rbp
  11520. xorq %rbx, %rbx
  11521. adcxq %rdx, %r12
  11522. # A[3] * B[0]
  11523. movq (%rsp), %rdx
  11524. adcxq %rcx, %r13
  11525. mulxq 24(%r8), %rdx, %rcx
  11526. adoxq %rdx, %r12
  11527. adoxq %rcx, %r13
  11528. # A[2] * B[3]
  11529. movq 24(%rsp), %rdx
  11530. mulxq 16(%r8), %rdx, %rcx
  11531. adcxq %rdx, %r14
  11532. # A[3] * B[2]
  11533. movq 16(%rsp), %rdx
  11534. adcxq %rcx, %r15
  11535. mulxq 24(%r8), %rcx, %rdx
  11536. adcxq %rbx, %rbp
  11537. adoxq %rcx, %r14
  11538. adoxq %rdx, %r15
  11539. adoxq %rbx, %rbp
  11540. # Reduce
  11541. movq $0x7fffffffffffffff, %rbx
  11542. # Move top half into t4-t7 and remove top bit from t3
  11543. shldq $0x01, %r15, %rbp
  11544. shldq $0x01, %r14, %r15
  11545. shldq $0x01, %r13, %r14
  11546. shldq $0x01, %r12, %r13
  11547. andq %rbx, %r12
  11548. # Multiply top half by 19
  11549. movq $19, %rdx
  11550. xorq %rbx, %rbx
  11551. mulxq %r13, %rcx, %r13
  11552. adcxq %rcx, %r9
  11553. adoxq %r13, %r10
  11554. mulxq %r14, %rcx, %r14
  11555. adcxq %rcx, %r10
  11556. adoxq %r14, %r11
  11557. mulxq %r15, %rcx, %r15
  11558. adcxq %rcx, %r11
  11559. adoxq %r15, %r12
  11560. mulxq %rbp, %rbp, %rdx
  11561. adcxq %rbp, %r12
  11562. adoxq %rbx, %rdx
  11563. adcxq %rbx, %rdx
  11564. # Overflow
  11565. shldq $0x01, %r12, %rdx
  11566. movq $0x7fffffffffffffff, %rbx
  11567. imulq $19, %rdx, %rcx
  11568. andq %rbx, %r12
  11569. addq %rcx, %r9
  11570. adcq $0x00, %r10
  11571. adcq $0x00, %r11
  11572. adcq $0x00, %r12
  11573. # Reduce if top bit set
  11574. movq %r12, %rdx
  11575. shrq $63, %rdx
  11576. imulq $19, %rdx, %rcx
  11577. andq %rbx, %r12
  11578. addq %rcx, %r9
  11579. adcq $0x00, %r10
  11580. adcq $0x00, %r11
  11581. adcq $0x00, %r12
  11582. # Store
  11583. movq %r9, 32(%rsp)
  11584. movq %r10, 40(%rsp)
  11585. movq %r11, 48(%rsp)
  11586. movq %r12, 56(%rsp)
  11587. # Multiply
  11588. # A[0] * B[0]
  11589. movq 96(%rsp), %rdx
  11590. mulxq 128(%rsp), %r9, %r10
  11591. # A[2] * B[0]
  11592. mulxq 144(%rsp), %r11, %r12
  11593. # A[1] * B[0]
  11594. mulxq 136(%rsp), %rcx, %rbx
  11595. xorq %rbp, %rbp
  11596. adcxq %rcx, %r10
  11597. # A[1] * B[3]
  11598. movq 120(%rsp), %rdx
  11599. mulxq 136(%rsp), %r13, %r14
  11600. adcxq %rbx, %r11
  11601. # A[0] * B[1]
  11602. movq 104(%rsp), %rdx
  11603. mulxq 128(%rsp), %rcx, %rbx
  11604. adoxq %rcx, %r10
  11605. # A[2] * B[1]
  11606. mulxq 144(%rsp), %rcx, %r15
  11607. adoxq %rbx, %r11
  11608. adcxq %rcx, %r12
  11609. # A[1] * B[2]
  11610. movq 112(%rsp), %rdx
  11611. mulxq 136(%rsp), %rcx, %rbx
  11612. adcxq %r15, %r13
  11613. adoxq %rcx, %r12
  11614. adcxq %rbp, %r14
  11615. adoxq %rbx, %r13
  11616. # A[0] * B[2]
  11617. mulxq 128(%rsp), %rcx, %rbx
  11618. adoxq %rbp, %r14
  11619. xorq %r15, %r15
  11620. adcxq %rcx, %r11
  11621. # A[1] * B[1]
  11622. movq 104(%rsp), %rdx
  11623. mulxq 136(%rsp), %rdx, %rcx
  11624. adcxq %rbx, %r12
  11625. adoxq %rdx, %r11
  11626. # A[3] * B[1]
  11627. movq 104(%rsp), %rdx
  11628. adoxq %rcx, %r12
  11629. mulxq 152(%rsp), %rcx, %rbx
  11630. adcxq %rcx, %r13
  11631. # A[2] * B[2]
  11632. movq 112(%rsp), %rdx
  11633. mulxq 144(%rsp), %rdx, %rcx
  11634. adcxq %rbx, %r14
  11635. adoxq %rdx, %r13
  11636. # A[3] * B[3]
  11637. movq 120(%rsp), %rdx
  11638. adoxq %rcx, %r14
  11639. mulxq 152(%rsp), %rcx, %rbx
  11640. adoxq %rbp, %r15
  11641. adcxq %rcx, %r15
  11642. # A[0] * B[3]
  11643. mulxq 128(%rsp), %rdx, %rcx
  11644. adcxq %rbx, %rbp
  11645. xorq %rbx, %rbx
  11646. adcxq %rdx, %r12
  11647. # A[3] * B[0]
  11648. movq 96(%rsp), %rdx
  11649. adcxq %rcx, %r13
  11650. mulxq 152(%rsp), %rdx, %rcx
  11651. adoxq %rdx, %r12
  11652. adoxq %rcx, %r13
  11653. # A[2] * B[3]
  11654. movq 120(%rsp), %rdx
  11655. mulxq 144(%rsp), %rdx, %rcx
  11656. adcxq %rdx, %r14
  11657. # A[3] * B[2]
  11658. movq 112(%rsp), %rdx
  11659. adcxq %rcx, %r15
  11660. mulxq 152(%rsp), %rcx, %rdx
  11661. adcxq %rbx, %rbp
  11662. adoxq %rcx, %r14
  11663. adoxq %rdx, %r15
  11664. adoxq %rbx, %rbp
  11665. # Reduce
  11666. movq $0x7fffffffffffffff, %rbx
  11667. # Move top half into t4-t7 and remove top bit from t3
  11668. shldq $0x01, %r15, %rbp
  11669. shldq $0x01, %r14, %r15
  11670. shldq $0x01, %r13, %r14
  11671. shldq $0x01, %r12, %r13
  11672. andq %rbx, %r12
  11673. # Multiply top half by 19
  11674. movq $19, %rdx
  11675. xorq %rbx, %rbx
  11676. mulxq %r13, %rcx, %r13
  11677. adcxq %rcx, %r9
  11678. adoxq %r13, %r10
  11679. mulxq %r14, %rcx, %r14
  11680. adcxq %rcx, %r10
  11681. adoxq %r14, %r11
  11682. mulxq %r15, %rcx, %r15
  11683. adcxq %rcx, %r11
  11684. adoxq %r15, %r12
  11685. mulxq %rbp, %rbp, %rdx
  11686. adcxq %rbp, %r12
  11687. adoxq %rbx, %rdx
  11688. adcxq %rbx, %rdx
  11689. # Overflow
  11690. shldq $0x01, %r12, %rdx
  11691. movq $0x7fffffffffffffff, %rbx
  11692. imulq $19, %rdx, %rcx
  11693. andq %rbx, %r12
  11694. addq %rcx, %r9
  11695. adcq $0x00, %r10
  11696. adcq $0x00, %r11
  11697. adcq $0x00, %r12
  11698. # Reduce if top bit set
  11699. movq %r12, %rdx
  11700. shrq $63, %rdx
  11701. imulq $19, %rdx, %rcx
  11702. andq %rbx, %r12
  11703. addq %rcx, %r9
  11704. adcq $0x00, %r10
  11705. adcq $0x00, %r11
  11706. adcq $0x00, %r12
  11707. # Store
  11708. movq %r9, (%rsp)
  11709. movq %r10, 8(%rsp)
  11710. movq %r11, 16(%rsp)
  11711. movq %r12, 24(%rsp)
  11712. decb 168(%rsp)
  11713. jge L_curve25519_avx2_bits
  11714. movq $63, 168(%rsp)
  11715. decb 160(%rsp)
  11716. jge L_curve25519_avx2_words
  11717. # Invert
  11718. leaq 32(%rsp), %rdi
  11719. movq %rsp, %rsi
  11720. #ifndef __APPLE__
  11721. callq fe_sq_avx2@plt
  11722. #else
  11723. callq _fe_sq_avx2
  11724. #endif /* __APPLE__ */
  11725. leaq 64(%rsp), %rdi
  11726. leaq 32(%rsp), %rsi
  11727. #ifndef __APPLE__
  11728. callq fe_sq_avx2@plt
  11729. #else
  11730. callq _fe_sq_avx2
  11731. #endif /* __APPLE__ */
  11732. leaq 64(%rsp), %rdi
  11733. leaq 64(%rsp), %rsi
  11734. #ifndef __APPLE__
  11735. callq fe_sq_avx2@plt
  11736. #else
  11737. callq _fe_sq_avx2
  11738. #endif /* __APPLE__ */
  11739. leaq 64(%rsp), %rdi
  11740. movq %rsp, %rsi
  11741. leaq 64(%rsp), %rdx
  11742. #ifndef __APPLE__
  11743. callq fe_mul_avx2@plt
  11744. #else
  11745. callq _fe_mul_avx2
  11746. #endif /* __APPLE__ */
  11747. leaq 32(%rsp), %rdi
  11748. leaq 32(%rsp), %rsi
  11749. leaq 64(%rsp), %rdx
  11750. #ifndef __APPLE__
  11751. callq fe_mul_avx2@plt
  11752. #else
  11753. callq _fe_mul_avx2
  11754. #endif /* __APPLE__ */
  11755. leaq 96(%rsp), %rdi
  11756. leaq 32(%rsp), %rsi
  11757. #ifndef __APPLE__
  11758. callq fe_sq_avx2@plt
  11759. #else
  11760. callq _fe_sq_avx2
  11761. #endif /* __APPLE__ */
  11762. leaq 64(%rsp), %rdi
  11763. leaq 64(%rsp), %rsi
  11764. leaq 96(%rsp), %rdx
  11765. #ifndef __APPLE__
  11766. callq fe_mul_avx2@plt
  11767. #else
  11768. callq _fe_mul_avx2
  11769. #endif /* __APPLE__ */
  11770. leaq 96(%rsp), %rdi
  11771. leaq 64(%rsp), %rsi
  11772. #ifndef __APPLE__
  11773. callq fe_sq_avx2@plt
  11774. #else
  11775. callq _fe_sq_avx2
  11776. #endif /* __APPLE__ */
  11777. leaq 96(%rsp), %rdi
  11778. leaq 96(%rsp), %rsi
  11779. movq $4, %rdx
  11780. #ifndef __APPLE__
  11781. callq fe_sq_n_avx2@plt
  11782. #else
  11783. callq _fe_sq_n_avx2
  11784. #endif /* __APPLE__ */
  11785. leaq 64(%rsp), %rdi
  11786. leaq 96(%rsp), %rsi
  11787. leaq 64(%rsp), %rdx
  11788. #ifndef __APPLE__
  11789. callq fe_mul_avx2@plt
  11790. #else
  11791. callq _fe_mul_avx2
  11792. #endif /* __APPLE__ */
  11793. leaq 96(%rsp), %rdi
  11794. leaq 64(%rsp), %rsi
  11795. #ifndef __APPLE__
  11796. callq fe_sq_avx2@plt
  11797. #else
  11798. callq _fe_sq_avx2
  11799. #endif /* __APPLE__ */
  11800. leaq 96(%rsp), %rdi
  11801. leaq 96(%rsp), %rsi
  11802. movq $9, %rdx
  11803. #ifndef __APPLE__
  11804. callq fe_sq_n_avx2@plt
  11805. #else
  11806. callq _fe_sq_n_avx2
  11807. #endif /* __APPLE__ */
  11808. leaq 96(%rsp), %rdi
  11809. leaq 96(%rsp), %rsi
  11810. leaq 64(%rsp), %rdx
  11811. #ifndef __APPLE__
  11812. callq fe_mul_avx2@plt
  11813. #else
  11814. callq _fe_mul_avx2
  11815. #endif /* __APPLE__ */
  11816. leaq 128(%rsp), %rdi
  11817. leaq 96(%rsp), %rsi
  11818. #ifndef __APPLE__
  11819. callq fe_sq_avx2@plt
  11820. #else
  11821. callq _fe_sq_avx2
  11822. #endif /* __APPLE__ */
  11823. leaq 128(%rsp), %rdi
  11824. leaq 128(%rsp), %rsi
  11825. movq $19, %rdx
  11826. #ifndef __APPLE__
  11827. callq fe_sq_n_avx2@plt
  11828. #else
  11829. callq _fe_sq_n_avx2
  11830. #endif /* __APPLE__ */
  11831. leaq 96(%rsp), %rdi
  11832. leaq 128(%rsp), %rsi
  11833. leaq 96(%rsp), %rdx
  11834. #ifndef __APPLE__
  11835. callq fe_mul_avx2@plt
  11836. #else
  11837. callq _fe_mul_avx2
  11838. #endif /* __APPLE__ */
  11839. leaq 96(%rsp), %rdi
  11840. leaq 96(%rsp), %rsi
  11841. #ifndef __APPLE__
  11842. callq fe_sq_avx2@plt
  11843. #else
  11844. callq _fe_sq_avx2
  11845. #endif /* __APPLE__ */
  11846. leaq 96(%rsp), %rdi
  11847. leaq 96(%rsp), %rsi
  11848. movq $9, %rdx
  11849. #ifndef __APPLE__
  11850. callq fe_sq_n_avx2@plt
  11851. #else
  11852. callq _fe_sq_n_avx2
  11853. #endif /* __APPLE__ */
  11854. leaq 64(%rsp), %rdi
  11855. leaq 96(%rsp), %rsi
  11856. leaq 64(%rsp), %rdx
  11857. #ifndef __APPLE__
  11858. callq fe_mul_avx2@plt
  11859. #else
  11860. callq _fe_mul_avx2
  11861. #endif /* __APPLE__ */
  11862. leaq 96(%rsp), %rdi
  11863. leaq 64(%rsp), %rsi
  11864. #ifndef __APPLE__
  11865. callq fe_sq_avx2@plt
  11866. #else
  11867. callq _fe_sq_avx2
  11868. #endif /* __APPLE__ */
  11869. leaq 96(%rsp), %rdi
  11870. leaq 96(%rsp), %rsi
  11871. movq $49, %rdx
  11872. #ifndef __APPLE__
  11873. callq fe_sq_n_avx2@plt
  11874. #else
  11875. callq _fe_sq_n_avx2
  11876. #endif /* __APPLE__ */
  11877. leaq 96(%rsp), %rdi
  11878. leaq 96(%rsp), %rsi
  11879. leaq 64(%rsp), %rdx
  11880. #ifndef __APPLE__
  11881. callq fe_mul_avx2@plt
  11882. #else
  11883. callq _fe_mul_avx2
  11884. #endif /* __APPLE__ */
  11885. leaq 128(%rsp), %rdi
  11886. leaq 96(%rsp), %rsi
  11887. #ifndef __APPLE__
  11888. callq fe_sq_avx2@plt
  11889. #else
  11890. callq _fe_sq_avx2
  11891. #endif /* __APPLE__ */
  11892. leaq 128(%rsp), %rdi
  11893. leaq 128(%rsp), %rsi
  11894. movq $0x63, %rdx
  11895. #ifndef __APPLE__
  11896. callq fe_sq_n_avx2@plt
  11897. #else
  11898. callq _fe_sq_n_avx2
  11899. #endif /* __APPLE__ */
  11900. leaq 96(%rsp), %rdi
  11901. leaq 128(%rsp), %rsi
  11902. leaq 96(%rsp), %rdx
  11903. #ifndef __APPLE__
  11904. callq fe_mul_avx2@plt
  11905. #else
  11906. callq _fe_mul_avx2
  11907. #endif /* __APPLE__ */
  11908. leaq 96(%rsp), %rdi
  11909. leaq 96(%rsp), %rsi
  11910. #ifndef __APPLE__
  11911. callq fe_sq_avx2@plt
  11912. #else
  11913. callq _fe_sq_avx2
  11914. #endif /* __APPLE__ */
  11915. leaq 96(%rsp), %rdi
  11916. leaq 96(%rsp), %rsi
  11917. movq $49, %rdx
  11918. #ifndef __APPLE__
  11919. callq fe_sq_n_avx2@plt
  11920. #else
  11921. callq _fe_sq_n_avx2
  11922. #endif /* __APPLE__ */
  11923. leaq 64(%rsp), %rdi
  11924. leaq 96(%rsp), %rsi
  11925. leaq 64(%rsp), %rdx
  11926. #ifndef __APPLE__
  11927. callq fe_mul_avx2@plt
  11928. #else
  11929. callq _fe_mul_avx2
  11930. #endif /* __APPLE__ */
  11931. leaq 64(%rsp), %rdi
  11932. leaq 64(%rsp), %rsi
  11933. #ifndef __APPLE__
  11934. callq fe_sq_avx2@plt
  11935. #else
  11936. callq _fe_sq_avx2
  11937. #endif /* __APPLE__ */
  11938. leaq 64(%rsp), %rdi
  11939. leaq 64(%rsp), %rsi
  11940. movq $4, %rdx
  11941. #ifndef __APPLE__
  11942. callq fe_sq_n_avx2@plt
  11943. #else
  11944. callq _fe_sq_n_avx2
  11945. #endif /* __APPLE__ */
  11946. movq %rsp, %rdi
  11947. leaq 64(%rsp), %rsi
  11948. leaq 32(%rsp), %rdx
  11949. #ifndef __APPLE__
  11950. callq fe_mul_avx2@plt
  11951. #else
  11952. callq _fe_mul_avx2
  11953. #endif /* __APPLE__ */
  11954. movq 176(%rsp), %rdi
  11955. # Multiply
  11956. # A[0] * B[0]
  11957. movq (%rsp), %rdx
  11958. mulxq (%rdi), %r9, %r10
  11959. # A[2] * B[0]
  11960. mulxq 16(%rdi), %r11, %r12
  11961. # A[1] * B[0]
  11962. mulxq 8(%rdi), %rcx, %rbx
  11963. xorq %rbp, %rbp
  11964. adcxq %rcx, %r10
  11965. # A[1] * B[3]
  11966. movq 24(%rsp), %rdx
  11967. mulxq 8(%rdi), %r13, %r14
  11968. adcxq %rbx, %r11
  11969. # A[0] * B[1]
  11970. movq 8(%rsp), %rdx
  11971. mulxq (%rdi), %rcx, %rbx
  11972. adoxq %rcx, %r10
  11973. # A[2] * B[1]
  11974. mulxq 16(%rdi), %rcx, %r15
  11975. adoxq %rbx, %r11
  11976. adcxq %rcx, %r12
  11977. # A[1] * B[2]
  11978. movq 16(%rsp), %rdx
  11979. mulxq 8(%rdi), %rcx, %rbx
  11980. adcxq %r15, %r13
  11981. adoxq %rcx, %r12
  11982. adcxq %rbp, %r14
  11983. adoxq %rbx, %r13
  11984. # A[0] * B[2]
  11985. mulxq (%rdi), %rcx, %rbx
  11986. adoxq %rbp, %r14
  11987. xorq %r15, %r15
  11988. adcxq %rcx, %r11
  11989. # A[1] * B[1]
  11990. movq 8(%rsp), %rdx
  11991. mulxq 8(%rdi), %rdx, %rcx
  11992. adcxq %rbx, %r12
  11993. adoxq %rdx, %r11
  11994. # A[3] * B[1]
  11995. movq 8(%rsp), %rdx
  11996. adoxq %rcx, %r12
  11997. mulxq 24(%rdi), %rcx, %rbx
  11998. adcxq %rcx, %r13
  11999. # A[2] * B[2]
  12000. movq 16(%rsp), %rdx
  12001. mulxq 16(%rdi), %rdx, %rcx
  12002. adcxq %rbx, %r14
  12003. adoxq %rdx, %r13
  12004. # A[3] * B[3]
  12005. movq 24(%rsp), %rdx
  12006. adoxq %rcx, %r14
  12007. mulxq 24(%rdi), %rcx, %rbx
  12008. adoxq %rbp, %r15
  12009. adcxq %rcx, %r15
  12010. # A[0] * B[3]
  12011. mulxq (%rdi), %rdx, %rcx
  12012. adcxq %rbx, %rbp
  12013. xorq %rbx, %rbx
  12014. adcxq %rdx, %r12
  12015. # A[3] * B[0]
  12016. movq (%rsp), %rdx
  12017. adcxq %rcx, %r13
  12018. mulxq 24(%rdi), %rdx, %rcx
  12019. adoxq %rdx, %r12
  12020. adoxq %rcx, %r13
  12021. # A[2] * B[3]
  12022. movq 24(%rsp), %rdx
  12023. mulxq 16(%rdi), %rdx, %rcx
  12024. adcxq %rdx, %r14
  12025. # A[3] * B[2]
  12026. movq 16(%rsp), %rdx
  12027. adcxq %rcx, %r15
  12028. mulxq 24(%rdi), %rcx, %rdx
  12029. adcxq %rbx, %rbp
  12030. adoxq %rcx, %r14
  12031. adoxq %rdx, %r15
  12032. adoxq %rbx, %rbp
  12033. # Reduce
  12034. movq $0x7fffffffffffffff, %rbx
  12035. # Move top half into t4-t7 and remove top bit from t3
  12036. shldq $0x01, %r15, %rbp
  12037. shldq $0x01, %r14, %r15
  12038. shldq $0x01, %r13, %r14
  12039. shldq $0x01, %r12, %r13
  12040. andq %rbx, %r12
  12041. # Multiply top half by 19
  12042. movq $19, %rdx
  12043. xorq %rbx, %rbx
  12044. mulxq %r13, %rcx, %r13
  12045. adcxq %rcx, %r9
  12046. adoxq %r13, %r10
  12047. mulxq %r14, %rcx, %r14
  12048. adcxq %rcx, %r10
  12049. adoxq %r14, %r11
  12050. mulxq %r15, %rcx, %r15
  12051. adcxq %rcx, %r11
  12052. adoxq %r15, %r12
  12053. mulxq %rbp, %rbp, %rdx
  12054. adcxq %rbp, %r12
  12055. adoxq %rbx, %rdx
  12056. adcxq %rbx, %rdx
  12057. # Overflow
  12058. shldq $0x01, %r12, %rdx
  12059. movq $0x7fffffffffffffff, %rbx
  12060. imulq $19, %rdx, %rcx
  12061. andq %rbx, %r12
  12062. addq %rcx, %r9
  12063. adcq $0x00, %r10
  12064. adcq $0x00, %r11
  12065. adcq $0x00, %r12
  12066. # Reduce if top bit set
  12067. movq %r12, %rdx
  12068. shrq $63, %rdx
  12069. imulq $19, %rdx, %rcx
  12070. andq %rbx, %r12
  12071. addq %rcx, %r9
  12072. adcq $0x00, %r10
  12073. adcq $0x00, %r11
  12074. adcq $0x00, %r12
  12075. # Store
  12076. movq %r9, (%rdi)
  12077. movq %r10, 8(%rdi)
  12078. movq %r11, 16(%rdi)
  12079. movq %r12, 24(%rdi)
  12080. xorq %rax, %rax
  12081. addq $0xc0, %rsp
  12082. popq %rbp
  12083. popq %r15
  12084. popq %r14
  12085. popq %r13
  12086. popq %r12
  12087. popq %rbx
  12088. repz retq
  12089. #ifndef __APPLE__
  12090. .size curve25519_avx2,.-curve25519_avx2
  12091. #endif /* __APPLE__ */
  12092. #ifndef __APPLE__
  12093. .text
  12094. .globl fe_pow22523_avx2
  12095. .type fe_pow22523_avx2,@function
  12096. .align 16
  12097. fe_pow22523_avx2:
  12098. #else
  12099. .section __TEXT,__text
  12100. .globl _fe_pow22523_avx2
  12101. .p2align 4
  12102. _fe_pow22523_avx2:
  12103. #endif /* __APPLE__ */
  12104. subq $0x70, %rsp
  12105. # pow22523
  12106. movq %rdi, 96(%rsp)
  12107. movq %rsi, 104(%rsp)
  12108. movq %rsp, %rdi
  12109. movq 104(%rsp), %rsi
  12110. #ifndef __APPLE__
  12111. callq fe_sq_avx2@plt
  12112. #else
  12113. callq _fe_sq_avx2
  12114. #endif /* __APPLE__ */
  12115. leaq 32(%rsp), %rdi
  12116. movq %rsp, %rsi
  12117. #ifndef __APPLE__
  12118. callq fe_sq_avx2@plt
  12119. #else
  12120. callq _fe_sq_avx2
  12121. #endif /* __APPLE__ */
  12122. leaq 32(%rsp), %rdi
  12123. leaq 32(%rsp), %rsi
  12124. #ifndef __APPLE__
  12125. callq fe_sq_avx2@plt
  12126. #else
  12127. callq _fe_sq_avx2
  12128. #endif /* __APPLE__ */
  12129. leaq 32(%rsp), %rdi
  12130. movq 104(%rsp), %rsi
  12131. leaq 32(%rsp), %rdx
  12132. #ifndef __APPLE__
  12133. callq fe_mul_avx2@plt
  12134. #else
  12135. callq _fe_mul_avx2
  12136. #endif /* __APPLE__ */
  12137. movq %rsp, %rdi
  12138. movq %rsp, %rsi
  12139. leaq 32(%rsp), %rdx
  12140. #ifndef __APPLE__
  12141. callq fe_mul_avx2@plt
  12142. #else
  12143. callq _fe_mul_avx2
  12144. #endif /* __APPLE__ */
  12145. movq %rsp, %rdi
  12146. movq %rsp, %rsi
  12147. #ifndef __APPLE__
  12148. callq fe_sq_avx2@plt
  12149. #else
  12150. callq _fe_sq_avx2
  12151. #endif /* __APPLE__ */
  12152. movq %rsp, %rdi
  12153. leaq 32(%rsp), %rsi
  12154. movq %rsp, %rdx
  12155. #ifndef __APPLE__
  12156. callq fe_mul_avx2@plt
  12157. #else
  12158. callq _fe_mul_avx2
  12159. #endif /* __APPLE__ */
  12160. leaq 32(%rsp), %rdi
  12161. movq %rsp, %rsi
  12162. #ifndef __APPLE__
  12163. callq fe_sq_avx2@plt
  12164. #else
  12165. callq _fe_sq_avx2
  12166. #endif /* __APPLE__ */
  12167. leaq 32(%rsp), %rdi
  12168. leaq 32(%rsp), %rsi
  12169. movb $4, %dl
  12170. #ifndef __APPLE__
  12171. callq fe_sq_n_avx2@plt
  12172. #else
  12173. callq _fe_sq_n_avx2
  12174. #endif /* __APPLE__ */
  12175. movq %rsp, %rdi
  12176. leaq 32(%rsp), %rsi
  12177. movq %rsp, %rdx
  12178. #ifndef __APPLE__
  12179. callq fe_mul_avx2@plt
  12180. #else
  12181. callq _fe_mul_avx2
  12182. #endif /* __APPLE__ */
  12183. leaq 32(%rsp), %rdi
  12184. movq %rsp, %rsi
  12185. #ifndef __APPLE__
  12186. callq fe_sq_avx2@plt
  12187. #else
  12188. callq _fe_sq_avx2
  12189. #endif /* __APPLE__ */
  12190. leaq 32(%rsp), %rdi
  12191. leaq 32(%rsp), %rsi
  12192. movb $9, %dl
  12193. #ifndef __APPLE__
  12194. callq fe_sq_n_avx2@plt
  12195. #else
  12196. callq _fe_sq_n_avx2
  12197. #endif /* __APPLE__ */
  12198. leaq 32(%rsp), %rdi
  12199. leaq 32(%rsp), %rsi
  12200. movq %rsp, %rdx
  12201. #ifndef __APPLE__
  12202. callq fe_mul_avx2@plt
  12203. #else
  12204. callq _fe_mul_avx2
  12205. #endif /* __APPLE__ */
  12206. leaq 64(%rsp), %rdi
  12207. leaq 32(%rsp), %rsi
  12208. #ifndef __APPLE__
  12209. callq fe_sq_avx2@plt
  12210. #else
  12211. callq _fe_sq_avx2
  12212. #endif /* __APPLE__ */
  12213. leaq 64(%rsp), %rdi
  12214. leaq 64(%rsp), %rsi
  12215. movb $19, %dl
  12216. #ifndef __APPLE__
  12217. callq fe_sq_n_avx2@plt
  12218. #else
  12219. callq _fe_sq_n_avx2
  12220. #endif /* __APPLE__ */
  12221. leaq 32(%rsp), %rdi
  12222. leaq 64(%rsp), %rsi
  12223. leaq 32(%rsp), %rdx
  12224. #ifndef __APPLE__
  12225. callq fe_mul_avx2@plt
  12226. #else
  12227. callq _fe_mul_avx2
  12228. #endif /* __APPLE__ */
  12229. leaq 32(%rsp), %rdi
  12230. leaq 32(%rsp), %rsi
  12231. #ifndef __APPLE__
  12232. callq fe_sq_avx2@plt
  12233. #else
  12234. callq _fe_sq_avx2
  12235. #endif /* __APPLE__ */
  12236. leaq 32(%rsp), %rdi
  12237. leaq 32(%rsp), %rsi
  12238. movb $9, %dl
  12239. #ifndef __APPLE__
  12240. callq fe_sq_n_avx2@plt
  12241. #else
  12242. callq _fe_sq_n_avx2
  12243. #endif /* __APPLE__ */
  12244. movq %rsp, %rdi
  12245. leaq 32(%rsp), %rsi
  12246. movq %rsp, %rdx
  12247. #ifndef __APPLE__
  12248. callq fe_mul_avx2@plt
  12249. #else
  12250. callq _fe_mul_avx2
  12251. #endif /* __APPLE__ */
  12252. leaq 32(%rsp), %rdi
  12253. movq %rsp, %rsi
  12254. #ifndef __APPLE__
  12255. callq fe_sq_avx2@plt
  12256. #else
  12257. callq _fe_sq_avx2
  12258. #endif /* __APPLE__ */
  12259. leaq 32(%rsp), %rdi
  12260. leaq 32(%rsp), %rsi
  12261. movb $49, %dl
  12262. #ifndef __APPLE__
  12263. callq fe_sq_n_avx2@plt
  12264. #else
  12265. callq _fe_sq_n_avx2
  12266. #endif /* __APPLE__ */
  12267. leaq 32(%rsp), %rdi
  12268. leaq 32(%rsp), %rsi
  12269. movq %rsp, %rdx
  12270. #ifndef __APPLE__
  12271. callq fe_mul_avx2@plt
  12272. #else
  12273. callq _fe_mul_avx2
  12274. #endif /* __APPLE__ */
  12275. leaq 64(%rsp), %rdi
  12276. leaq 32(%rsp), %rsi
  12277. #ifndef __APPLE__
  12278. callq fe_sq_avx2@plt
  12279. #else
  12280. callq _fe_sq_avx2
  12281. #endif /* __APPLE__ */
  12282. leaq 64(%rsp), %rdi
  12283. leaq 64(%rsp), %rsi
  12284. movb $0x63, %dl
  12285. #ifndef __APPLE__
  12286. callq fe_sq_n_avx2@plt
  12287. #else
  12288. callq _fe_sq_n_avx2
  12289. #endif /* __APPLE__ */
  12290. leaq 32(%rsp), %rdi
  12291. leaq 64(%rsp), %rsi
  12292. leaq 32(%rsp), %rdx
  12293. #ifndef __APPLE__
  12294. callq fe_mul_avx2@plt
  12295. #else
  12296. callq _fe_mul_avx2
  12297. #endif /* __APPLE__ */
  12298. leaq 32(%rsp), %rdi
  12299. leaq 32(%rsp), %rsi
  12300. #ifndef __APPLE__
  12301. callq fe_sq_avx2@plt
  12302. #else
  12303. callq _fe_sq_avx2
  12304. #endif /* __APPLE__ */
  12305. leaq 32(%rsp), %rdi
  12306. leaq 32(%rsp), %rsi
  12307. movb $49, %dl
  12308. #ifndef __APPLE__
  12309. callq fe_sq_n_avx2@plt
  12310. #else
  12311. callq _fe_sq_n_avx2
  12312. #endif /* __APPLE__ */
  12313. movq %rsp, %rdi
  12314. leaq 32(%rsp), %rsi
  12315. movq %rsp, %rdx
  12316. #ifndef __APPLE__
  12317. callq fe_mul_avx2@plt
  12318. #else
  12319. callq _fe_mul_avx2
  12320. #endif /* __APPLE__ */
  12321. movq %rsp, %rdi
  12322. movq %rsp, %rsi
  12323. #ifndef __APPLE__
  12324. callq fe_sq_avx2@plt
  12325. #else
  12326. callq _fe_sq_avx2
  12327. #endif /* __APPLE__ */
  12328. movq %rsp, %rdi
  12329. movq %rsp, %rsi
  12330. #ifndef __APPLE__
  12331. callq fe_sq_avx2@plt
  12332. #else
  12333. callq _fe_sq_avx2
  12334. #endif /* __APPLE__ */
  12335. movq 96(%rsp), %rdi
  12336. movq %rsp, %rsi
  12337. movq 104(%rsp), %rdx
  12338. #ifndef __APPLE__
  12339. callq fe_mul_avx2@plt
  12340. #else
  12341. callq _fe_mul_avx2
  12342. #endif /* __APPLE__ */
  12343. movq 104(%rsp), %rsi
  12344. movq 96(%rsp), %rdi
  12345. addq $0x70, %rsp
  12346. repz retq
  12347. #ifndef __APPLE__
  12348. .text
  12349. .globl fe_ge_to_p2_avx2
  12350. .type fe_ge_to_p2_avx2,@function
  12351. .align 16
  12352. fe_ge_to_p2_avx2:
  12353. #else
  12354. .section __TEXT,__text
  12355. .globl _fe_ge_to_p2_avx2
  12356. .p2align 4
  12357. _fe_ge_to_p2_avx2:
  12358. #endif /* __APPLE__ */
  12359. pushq %rbx
  12360. pushq %r12
  12361. pushq %r13
  12362. pushq %r14
  12363. pushq %r15
  12364. subq $40, %rsp
  12365. movq %rsi, (%rsp)
  12366. movq %rdx, 8(%rsp)
  12367. movq %rcx, 16(%rsp)
  12368. movq %r8, 24(%rsp)
  12369. movq %r9, 32(%rsp)
  12370. movq 16(%rsp), %rsi
  12371. movq 88(%rsp), %rbx
  12372. # Multiply
  12373. # A[0] * B[0]
  12374. movq (%rbx), %rdx
  12375. mulxq (%rsi), %r8, %r9
  12376. # A[2] * B[0]
  12377. mulxq 16(%rsi), %r10, %r11
  12378. # A[1] * B[0]
  12379. mulxq 8(%rsi), %rcx, %rax
  12380. xorq %r15, %r15
  12381. adcxq %rcx, %r9
  12382. # A[1] * B[3]
  12383. movq 24(%rbx), %rdx
  12384. mulxq 8(%rsi), %r12, %r13
  12385. adcxq %rax, %r10
  12386. # A[0] * B[1]
  12387. movq 8(%rbx), %rdx
  12388. mulxq (%rsi), %rcx, %rax
  12389. adoxq %rcx, %r9
  12390. # A[2] * B[1]
  12391. mulxq 16(%rsi), %rcx, %r14
  12392. adoxq %rax, %r10
  12393. adcxq %rcx, %r11
  12394. # A[1] * B[2]
  12395. movq 16(%rbx), %rdx
  12396. mulxq 8(%rsi), %rcx, %rax
  12397. adcxq %r14, %r12
  12398. adoxq %rcx, %r11
  12399. adcxq %r15, %r13
  12400. adoxq %rax, %r12
  12401. # A[0] * B[2]
  12402. mulxq (%rsi), %rcx, %rax
  12403. adoxq %r15, %r13
  12404. xorq %r14, %r14
  12405. adcxq %rcx, %r10
  12406. # A[1] * B[1]
  12407. movq 8(%rbx), %rdx
  12408. mulxq 8(%rsi), %rdx, %rcx
  12409. adcxq %rax, %r11
  12410. adoxq %rdx, %r10
  12411. # A[3] * B[1]
  12412. movq 8(%rbx), %rdx
  12413. adoxq %rcx, %r11
  12414. mulxq 24(%rsi), %rcx, %rax
  12415. adcxq %rcx, %r12
  12416. # A[2] * B[2]
  12417. movq 16(%rbx), %rdx
  12418. mulxq 16(%rsi), %rdx, %rcx
  12419. adcxq %rax, %r13
  12420. adoxq %rdx, %r12
  12421. # A[3] * B[3]
  12422. movq 24(%rbx), %rdx
  12423. adoxq %rcx, %r13
  12424. mulxq 24(%rsi), %rcx, %rax
  12425. adoxq %r15, %r14
  12426. adcxq %rcx, %r14
  12427. # A[0] * B[3]
  12428. mulxq (%rsi), %rdx, %rcx
  12429. adcxq %rax, %r15
  12430. xorq %rax, %rax
  12431. adcxq %rdx, %r11
  12432. # A[3] * B[0]
  12433. movq (%rbx), %rdx
  12434. adcxq %rcx, %r12
  12435. mulxq 24(%rsi), %rdx, %rcx
  12436. adoxq %rdx, %r11
  12437. adoxq %rcx, %r12
  12438. # A[2] * B[3]
  12439. movq 24(%rbx), %rdx
  12440. mulxq 16(%rsi), %rdx, %rcx
  12441. adcxq %rdx, %r13
  12442. # A[3] * B[2]
  12443. movq 16(%rbx), %rdx
  12444. adcxq %rcx, %r14
  12445. mulxq 24(%rsi), %rcx, %rdx
  12446. adcxq %rax, %r15
  12447. adoxq %rcx, %r13
  12448. adoxq %rdx, %r14
  12449. adoxq %rax, %r15
  12450. # Reduce
  12451. movq $0x7fffffffffffffff, %rax
  12452. # Move top half into t4-t7 and remove top bit from t3
  12453. shldq $0x01, %r14, %r15
  12454. shldq $0x01, %r13, %r14
  12455. shldq $0x01, %r12, %r13
  12456. shldq $0x01, %r11, %r12
  12457. andq %rax, %r11
  12458. # Multiply top half by 19
  12459. movq $19, %rdx
  12460. xorq %rax, %rax
  12461. mulxq %r12, %rcx, %r12
  12462. adcxq %rcx, %r8
  12463. adoxq %r12, %r9
  12464. mulxq %r13, %rcx, %r13
  12465. adcxq %rcx, %r9
  12466. adoxq %r13, %r10
  12467. mulxq %r14, %rcx, %r14
  12468. adcxq %rcx, %r10
  12469. adoxq %r14, %r11
  12470. mulxq %r15, %r15, %rdx
  12471. adcxq %r15, %r11
  12472. adoxq %rax, %rdx
  12473. adcxq %rax, %rdx
  12474. # Overflow
  12475. shldq $0x01, %r11, %rdx
  12476. movq $0x7fffffffffffffff, %rax
  12477. imulq $19, %rdx, %rcx
  12478. andq %rax, %r11
  12479. addq %rcx, %r8
  12480. adcq $0x00, %r9
  12481. adcq $0x00, %r10
  12482. adcq $0x00, %r11
  12483. # Reduce if top bit set
  12484. movq %r11, %rdx
  12485. shrq $63, %rdx
  12486. imulq $19, %rdx, %rcx
  12487. andq %rax, %r11
  12488. addq %rcx, %r8
  12489. adcq $0x00, %r9
  12490. adcq $0x00, %r10
  12491. adcq $0x00, %r11
  12492. # Store
  12493. movq %r8, (%rdi)
  12494. movq %r9, 8(%rdi)
  12495. movq %r10, 16(%rdi)
  12496. movq %r11, 24(%rdi)
  12497. movq (%rsp), %rdi
  12498. movq 24(%rsp), %rsi
  12499. movq 32(%rsp), %rbx
  12500. # Multiply
  12501. # A[0] * B[0]
  12502. movq (%rbx), %rdx
  12503. mulxq (%rsi), %r8, %r9
  12504. # A[2] * B[0]
  12505. mulxq 16(%rsi), %r10, %r11
  12506. # A[1] * B[0]
  12507. mulxq 8(%rsi), %rcx, %rax
  12508. xorq %r15, %r15
  12509. adcxq %rcx, %r9
  12510. # A[1] * B[3]
  12511. movq 24(%rbx), %rdx
  12512. mulxq 8(%rsi), %r12, %r13
  12513. adcxq %rax, %r10
  12514. # A[0] * B[1]
  12515. movq 8(%rbx), %rdx
  12516. mulxq (%rsi), %rcx, %rax
  12517. adoxq %rcx, %r9
  12518. # A[2] * B[1]
  12519. mulxq 16(%rsi), %rcx, %r14
  12520. adoxq %rax, %r10
  12521. adcxq %rcx, %r11
  12522. # A[1] * B[2]
  12523. movq 16(%rbx), %rdx
  12524. mulxq 8(%rsi), %rcx, %rax
  12525. adcxq %r14, %r12
  12526. adoxq %rcx, %r11
  12527. adcxq %r15, %r13
  12528. adoxq %rax, %r12
  12529. # A[0] * B[2]
  12530. mulxq (%rsi), %rcx, %rax
  12531. adoxq %r15, %r13
  12532. xorq %r14, %r14
  12533. adcxq %rcx, %r10
  12534. # A[1] * B[1]
  12535. movq 8(%rbx), %rdx
  12536. mulxq 8(%rsi), %rdx, %rcx
  12537. adcxq %rax, %r11
  12538. adoxq %rdx, %r10
  12539. # A[3] * B[1]
  12540. movq 8(%rbx), %rdx
  12541. adoxq %rcx, %r11
  12542. mulxq 24(%rsi), %rcx, %rax
  12543. adcxq %rcx, %r12
  12544. # A[2] * B[2]
  12545. movq 16(%rbx), %rdx
  12546. mulxq 16(%rsi), %rdx, %rcx
  12547. adcxq %rax, %r13
  12548. adoxq %rdx, %r12
  12549. # A[3] * B[3]
  12550. movq 24(%rbx), %rdx
  12551. adoxq %rcx, %r13
  12552. mulxq 24(%rsi), %rcx, %rax
  12553. adoxq %r15, %r14
  12554. adcxq %rcx, %r14
  12555. # A[0] * B[3]
  12556. mulxq (%rsi), %rdx, %rcx
  12557. adcxq %rax, %r15
  12558. xorq %rax, %rax
  12559. adcxq %rdx, %r11
  12560. # A[3] * B[0]
  12561. movq (%rbx), %rdx
  12562. adcxq %rcx, %r12
  12563. mulxq 24(%rsi), %rdx, %rcx
  12564. adoxq %rdx, %r11
  12565. adoxq %rcx, %r12
  12566. # A[2] * B[3]
  12567. movq 24(%rbx), %rdx
  12568. mulxq 16(%rsi), %rdx, %rcx
  12569. adcxq %rdx, %r13
  12570. # A[3] * B[2]
  12571. movq 16(%rbx), %rdx
  12572. adcxq %rcx, %r14
  12573. mulxq 24(%rsi), %rcx, %rdx
  12574. adcxq %rax, %r15
  12575. adoxq %rcx, %r13
  12576. adoxq %rdx, %r14
  12577. adoxq %rax, %r15
  12578. # Reduce
  12579. movq $0x7fffffffffffffff, %rax
  12580. # Move top half into t4-t7 and remove top bit from t3
  12581. shldq $0x01, %r14, %r15
  12582. shldq $0x01, %r13, %r14
  12583. shldq $0x01, %r12, %r13
  12584. shldq $0x01, %r11, %r12
  12585. andq %rax, %r11
  12586. # Multiply top half by 19
  12587. movq $19, %rdx
  12588. xorq %rax, %rax
  12589. mulxq %r12, %rcx, %r12
  12590. adcxq %rcx, %r8
  12591. adoxq %r12, %r9
  12592. mulxq %r13, %rcx, %r13
  12593. adcxq %rcx, %r9
  12594. adoxq %r13, %r10
  12595. mulxq %r14, %rcx, %r14
  12596. adcxq %rcx, %r10
  12597. adoxq %r14, %r11
  12598. mulxq %r15, %r15, %rdx
  12599. adcxq %r15, %r11
  12600. adoxq %rax, %rdx
  12601. adcxq %rax, %rdx
  12602. # Overflow
  12603. shldq $0x01, %r11, %rdx
  12604. movq $0x7fffffffffffffff, %rax
  12605. imulq $19, %rdx, %rcx
  12606. andq %rax, %r11
  12607. addq %rcx, %r8
  12608. adcq $0x00, %r9
  12609. adcq $0x00, %r10
  12610. adcq $0x00, %r11
  12611. # Reduce if top bit set
  12612. movq %r11, %rdx
  12613. shrq $63, %rdx
  12614. imulq $19, %rdx, %rcx
  12615. andq %rax, %r11
  12616. addq %rcx, %r8
  12617. adcq $0x00, %r9
  12618. adcq $0x00, %r10
  12619. adcq $0x00, %r11
  12620. # Store
  12621. movq %r8, (%rdi)
  12622. movq %r9, 8(%rdi)
  12623. movq %r10, 16(%rdi)
  12624. movq %r11, 24(%rdi)
  12625. movq 8(%rsp), %rdi
  12626. movq 88(%rsp), %rsi
  12627. # Multiply
  12628. # A[0] * B[0]
  12629. movq (%rsi), %rdx
  12630. mulxq (%rbx), %r8, %r9
  12631. # A[2] * B[0]
  12632. mulxq 16(%rbx), %r10, %r11
  12633. # A[1] * B[0]
  12634. mulxq 8(%rbx), %rcx, %rax
  12635. xorq %r15, %r15
  12636. adcxq %rcx, %r9
  12637. # A[1] * B[3]
  12638. movq 24(%rsi), %rdx
  12639. mulxq 8(%rbx), %r12, %r13
  12640. adcxq %rax, %r10
  12641. # A[0] * B[1]
  12642. movq 8(%rsi), %rdx
  12643. mulxq (%rbx), %rcx, %rax
  12644. adoxq %rcx, %r9
  12645. # A[2] * B[1]
  12646. mulxq 16(%rbx), %rcx, %r14
  12647. adoxq %rax, %r10
  12648. adcxq %rcx, %r11
  12649. # A[1] * B[2]
  12650. movq 16(%rsi), %rdx
  12651. mulxq 8(%rbx), %rcx, %rax
  12652. adcxq %r14, %r12
  12653. adoxq %rcx, %r11
  12654. adcxq %r15, %r13
  12655. adoxq %rax, %r12
  12656. # A[0] * B[2]
  12657. mulxq (%rbx), %rcx, %rax
  12658. adoxq %r15, %r13
  12659. xorq %r14, %r14
  12660. adcxq %rcx, %r10
  12661. # A[1] * B[1]
  12662. movq 8(%rsi), %rdx
  12663. mulxq 8(%rbx), %rdx, %rcx
  12664. adcxq %rax, %r11
  12665. adoxq %rdx, %r10
  12666. # A[3] * B[1]
  12667. movq 8(%rsi), %rdx
  12668. adoxq %rcx, %r11
  12669. mulxq 24(%rbx), %rcx, %rax
  12670. adcxq %rcx, %r12
  12671. # A[2] * B[2]
  12672. movq 16(%rsi), %rdx
  12673. mulxq 16(%rbx), %rdx, %rcx
  12674. adcxq %rax, %r13
  12675. adoxq %rdx, %r12
  12676. # A[3] * B[3]
  12677. movq 24(%rsi), %rdx
  12678. adoxq %rcx, %r13
  12679. mulxq 24(%rbx), %rcx, %rax
  12680. adoxq %r15, %r14
  12681. adcxq %rcx, %r14
  12682. # A[0] * B[3]
  12683. mulxq (%rbx), %rdx, %rcx
  12684. adcxq %rax, %r15
  12685. xorq %rax, %rax
  12686. adcxq %rdx, %r11
  12687. # A[3] * B[0]
  12688. movq (%rsi), %rdx
  12689. adcxq %rcx, %r12
  12690. mulxq 24(%rbx), %rdx, %rcx
  12691. adoxq %rdx, %r11
  12692. adoxq %rcx, %r12
  12693. # A[2] * B[3]
  12694. movq 24(%rsi), %rdx
  12695. mulxq 16(%rbx), %rdx, %rcx
  12696. adcxq %rdx, %r13
  12697. # A[3] * B[2]
  12698. movq 16(%rsi), %rdx
  12699. adcxq %rcx, %r14
  12700. mulxq 24(%rbx), %rcx, %rdx
  12701. adcxq %rax, %r15
  12702. adoxq %rcx, %r13
  12703. adoxq %rdx, %r14
  12704. adoxq %rax, %r15
  12705. # Reduce
  12706. movq $0x7fffffffffffffff, %rax
  12707. # Move top half into t4-t7 and remove top bit from t3
  12708. shldq $0x01, %r14, %r15
  12709. shldq $0x01, %r13, %r14
  12710. shldq $0x01, %r12, %r13
  12711. shldq $0x01, %r11, %r12
  12712. andq %rax, %r11
  12713. # Multiply top half by 19
  12714. movq $19, %rdx
  12715. xorq %rax, %rax
  12716. mulxq %r12, %rcx, %r12
  12717. adcxq %rcx, %r8
  12718. adoxq %r12, %r9
  12719. mulxq %r13, %rcx, %r13
  12720. adcxq %rcx, %r9
  12721. adoxq %r13, %r10
  12722. mulxq %r14, %rcx, %r14
  12723. adcxq %rcx, %r10
  12724. adoxq %r14, %r11
  12725. mulxq %r15, %r15, %rdx
  12726. adcxq %r15, %r11
  12727. adoxq %rax, %rdx
  12728. adcxq %rax, %rdx
  12729. # Overflow
  12730. shldq $0x01, %r11, %rdx
  12731. movq $0x7fffffffffffffff, %rax
  12732. imulq $19, %rdx, %rcx
  12733. andq %rax, %r11
  12734. addq %rcx, %r8
  12735. adcq $0x00, %r9
  12736. adcq $0x00, %r10
  12737. adcq $0x00, %r11
  12738. # Reduce if top bit set
  12739. movq %r11, %rdx
  12740. shrq $63, %rdx
  12741. imulq $19, %rdx, %rcx
  12742. andq %rax, %r11
  12743. addq %rcx, %r8
  12744. adcq $0x00, %r9
  12745. adcq $0x00, %r10
  12746. adcq $0x00, %r11
  12747. # Store
  12748. movq %r8, (%rdi)
  12749. movq %r9, 8(%rdi)
  12750. movq %r10, 16(%rdi)
  12751. movq %r11, 24(%rdi)
  12752. addq $40, %rsp
  12753. popq %r15
  12754. popq %r14
  12755. popq %r13
  12756. popq %r12
  12757. popq %rbx
  12758. repz retq
  12759. #ifndef __APPLE__
  12760. .size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
  12761. #endif /* __APPLE__ */
  12762. #ifndef __APPLE__
  12763. .text
  12764. .globl fe_ge_to_p3_avx2
  12765. .type fe_ge_to_p3_avx2,@function
  12766. .align 16
  12767. fe_ge_to_p3_avx2:
  12768. #else
  12769. .section __TEXT,__text
  12770. .globl _fe_ge_to_p3_avx2
  12771. .p2align 4
  12772. _fe_ge_to_p3_avx2:
  12773. #endif /* __APPLE__ */
  12774. pushq %rbx
  12775. pushq %r12
  12776. pushq %r13
  12777. pushq %r14
  12778. pushq %r15
  12779. subq $40, %rsp
  12780. movq %rsi, (%rsp)
  12781. movq %rdx, 8(%rsp)
  12782. movq %rcx, 16(%rsp)
  12783. movq %r8, 24(%rsp)
  12784. movq %r9, 32(%rsp)
  12785. movq 24(%rsp), %rsi
  12786. movq 96(%rsp), %rbx
  12787. # Multiply
  12788. # A[0] * B[0]
  12789. movq (%rbx), %rdx
  12790. mulxq (%rsi), %r8, %r9
  12791. # A[2] * B[0]
  12792. mulxq 16(%rsi), %r10, %r11
  12793. # A[1] * B[0]
  12794. mulxq 8(%rsi), %rcx, %rax
  12795. xorq %r15, %r15
  12796. adcxq %rcx, %r9
  12797. # A[1] * B[3]
  12798. movq 24(%rbx), %rdx
  12799. mulxq 8(%rsi), %r12, %r13
  12800. adcxq %rax, %r10
  12801. # A[0] * B[1]
  12802. movq 8(%rbx), %rdx
  12803. mulxq (%rsi), %rcx, %rax
  12804. adoxq %rcx, %r9
  12805. # A[2] * B[1]
  12806. mulxq 16(%rsi), %rcx, %r14
  12807. adoxq %rax, %r10
  12808. adcxq %rcx, %r11
  12809. # A[1] * B[2]
  12810. movq 16(%rbx), %rdx
  12811. mulxq 8(%rsi), %rcx, %rax
  12812. adcxq %r14, %r12
  12813. adoxq %rcx, %r11
  12814. adcxq %r15, %r13
  12815. adoxq %rax, %r12
  12816. # A[0] * B[2]
  12817. mulxq (%rsi), %rcx, %rax
  12818. adoxq %r15, %r13
  12819. xorq %r14, %r14
  12820. adcxq %rcx, %r10
  12821. # A[1] * B[1]
  12822. movq 8(%rbx), %rdx
  12823. mulxq 8(%rsi), %rdx, %rcx
  12824. adcxq %rax, %r11
  12825. adoxq %rdx, %r10
  12826. # A[3] * B[1]
  12827. movq 8(%rbx), %rdx
  12828. adoxq %rcx, %r11
  12829. mulxq 24(%rsi), %rcx, %rax
  12830. adcxq %rcx, %r12
  12831. # A[2] * B[2]
  12832. movq 16(%rbx), %rdx
  12833. mulxq 16(%rsi), %rdx, %rcx
  12834. adcxq %rax, %r13
  12835. adoxq %rdx, %r12
  12836. # A[3] * B[3]
  12837. movq 24(%rbx), %rdx
  12838. adoxq %rcx, %r13
  12839. mulxq 24(%rsi), %rcx, %rax
  12840. adoxq %r15, %r14
  12841. adcxq %rcx, %r14
  12842. # A[0] * B[3]
  12843. mulxq (%rsi), %rdx, %rcx
  12844. adcxq %rax, %r15
  12845. xorq %rax, %rax
  12846. adcxq %rdx, %r11
  12847. # A[3] * B[0]
  12848. movq (%rbx), %rdx
  12849. adcxq %rcx, %r12
  12850. mulxq 24(%rsi), %rdx, %rcx
  12851. adoxq %rdx, %r11
  12852. adoxq %rcx, %r12
  12853. # A[2] * B[3]
  12854. movq 24(%rbx), %rdx
  12855. mulxq 16(%rsi), %rdx, %rcx
  12856. adcxq %rdx, %r13
  12857. # A[3] * B[2]
  12858. movq 16(%rbx), %rdx
  12859. adcxq %rcx, %r14
  12860. mulxq 24(%rsi), %rcx, %rdx
  12861. adcxq %rax, %r15
  12862. adoxq %rcx, %r13
  12863. adoxq %rdx, %r14
  12864. adoxq %rax, %r15
  12865. # Reduce
  12866. movq $0x7fffffffffffffff, %rax
  12867. # Move top half into t4-t7 and remove top bit from t3
  12868. shldq $0x01, %r14, %r15
  12869. shldq $0x01, %r13, %r14
  12870. shldq $0x01, %r12, %r13
  12871. shldq $0x01, %r11, %r12
  12872. andq %rax, %r11
  12873. # Multiply top half by 19
  12874. movq $19, %rdx
  12875. xorq %rax, %rax
  12876. mulxq %r12, %rcx, %r12
  12877. adcxq %rcx, %r8
  12878. adoxq %r12, %r9
  12879. mulxq %r13, %rcx, %r13
  12880. adcxq %rcx, %r9
  12881. adoxq %r13, %r10
  12882. mulxq %r14, %rcx, %r14
  12883. adcxq %rcx, %r10
  12884. adoxq %r14, %r11
  12885. mulxq %r15, %r15, %rdx
  12886. adcxq %r15, %r11
  12887. adoxq %rax, %rdx
  12888. adcxq %rax, %rdx
  12889. # Overflow
  12890. shldq $0x01, %r11, %rdx
  12891. movq $0x7fffffffffffffff, %rax
  12892. imulq $19, %rdx, %rcx
  12893. andq %rax, %r11
  12894. addq %rcx, %r8
  12895. adcq $0x00, %r9
  12896. adcq $0x00, %r10
  12897. adcq $0x00, %r11
  12898. # Reduce if top bit set
  12899. movq %r11, %rdx
  12900. shrq $63, %rdx
  12901. imulq $19, %rdx, %rcx
  12902. andq %rax, %r11
  12903. addq %rcx, %r8
  12904. adcq $0x00, %r9
  12905. adcq $0x00, %r10
  12906. adcq $0x00, %r11
  12907. # Store
  12908. movq %r8, (%rdi)
  12909. movq %r9, 8(%rdi)
  12910. movq %r10, 16(%rdi)
  12911. movq %r11, 24(%rdi)
  12912. movq (%rsp), %rdi
  12913. movq 32(%rsp), %rsi
  12914. movq 88(%rsp), %rbx
  12915. # Multiply
  12916. # A[0] * B[0]
  12917. movq (%rbx), %rdx
  12918. mulxq (%rsi), %r8, %r9
  12919. # A[2] * B[0]
  12920. mulxq 16(%rsi), %r10, %r11
  12921. # A[1] * B[0]
  12922. mulxq 8(%rsi), %rcx, %rax
  12923. xorq %r15, %r15
  12924. adcxq %rcx, %r9
  12925. # A[1] * B[3]
  12926. movq 24(%rbx), %rdx
  12927. mulxq 8(%rsi), %r12, %r13
  12928. adcxq %rax, %r10
  12929. # A[0] * B[1]
  12930. movq 8(%rbx), %rdx
  12931. mulxq (%rsi), %rcx, %rax
  12932. adoxq %rcx, %r9
  12933. # A[2] * B[1]
  12934. mulxq 16(%rsi), %rcx, %r14
  12935. adoxq %rax, %r10
  12936. adcxq %rcx, %r11
  12937. # A[1] * B[2]
  12938. movq 16(%rbx), %rdx
  12939. mulxq 8(%rsi), %rcx, %rax
  12940. adcxq %r14, %r12
  12941. adoxq %rcx, %r11
  12942. adcxq %r15, %r13
  12943. adoxq %rax, %r12
  12944. # A[0] * B[2]
  12945. mulxq (%rsi), %rcx, %rax
  12946. adoxq %r15, %r13
  12947. xorq %r14, %r14
  12948. adcxq %rcx, %r10
  12949. # A[1] * B[1]
  12950. movq 8(%rbx), %rdx
  12951. mulxq 8(%rsi), %rdx, %rcx
  12952. adcxq %rax, %r11
  12953. adoxq %rdx, %r10
  12954. # A[3] * B[1]
  12955. movq 8(%rbx), %rdx
  12956. adoxq %rcx, %r11
  12957. mulxq 24(%rsi), %rcx, %rax
  12958. adcxq %rcx, %r12
  12959. # A[2] * B[2]
  12960. movq 16(%rbx), %rdx
  12961. mulxq 16(%rsi), %rdx, %rcx
  12962. adcxq %rax, %r13
  12963. adoxq %rdx, %r12
  12964. # A[3] * B[3]
  12965. movq 24(%rbx), %rdx
  12966. adoxq %rcx, %r13
  12967. mulxq 24(%rsi), %rcx, %rax
  12968. adoxq %r15, %r14
  12969. adcxq %rcx, %r14
  12970. # A[0] * B[3]
  12971. mulxq (%rsi), %rdx, %rcx
  12972. adcxq %rax, %r15
  12973. xorq %rax, %rax
  12974. adcxq %rdx, %r11
  12975. # A[3] * B[0]
  12976. movq (%rbx), %rdx
  12977. adcxq %rcx, %r12
  12978. mulxq 24(%rsi), %rdx, %rcx
  12979. adoxq %rdx, %r11
  12980. adoxq %rcx, %r12
  12981. # A[2] * B[3]
  12982. movq 24(%rbx), %rdx
  12983. mulxq 16(%rsi), %rdx, %rcx
  12984. adcxq %rdx, %r13
  12985. # A[3] * B[2]
  12986. movq 16(%rbx), %rdx
  12987. adcxq %rcx, %r14
  12988. mulxq 24(%rsi), %rcx, %rdx
  12989. adcxq %rax, %r15
  12990. adoxq %rcx, %r13
  12991. adoxq %rdx, %r14
  12992. adoxq %rax, %r15
  12993. # Reduce
  12994. movq $0x7fffffffffffffff, %rax
  12995. # Move top half into t4-t7 and remove top bit from t3
  12996. shldq $0x01, %r14, %r15
  12997. shldq $0x01, %r13, %r14
  12998. shldq $0x01, %r12, %r13
  12999. shldq $0x01, %r11, %r12
  13000. andq %rax, %r11
  13001. # Multiply top half by 19
  13002. movq $19, %rdx
  13003. xorq %rax, %rax
  13004. mulxq %r12, %rcx, %r12
  13005. adcxq %rcx, %r8
  13006. adoxq %r12, %r9
  13007. mulxq %r13, %rcx, %r13
  13008. adcxq %rcx, %r9
  13009. adoxq %r13, %r10
  13010. mulxq %r14, %rcx, %r14
  13011. adcxq %rcx, %r10
  13012. adoxq %r14, %r11
  13013. mulxq %r15, %r15, %rdx
  13014. adcxq %r15, %r11
  13015. adoxq %rax, %rdx
  13016. adcxq %rax, %rdx
  13017. # Overflow
  13018. shldq $0x01, %r11, %rdx
  13019. movq $0x7fffffffffffffff, %rax
  13020. imulq $19, %rdx, %rcx
  13021. andq %rax, %r11
  13022. addq %rcx, %r8
  13023. adcq $0x00, %r9
  13024. adcq $0x00, %r10
  13025. adcq $0x00, %r11
  13026. # Reduce if top bit set
  13027. movq %r11, %rdx
  13028. shrq $63, %rdx
  13029. imulq $19, %rdx, %rcx
  13030. andq %rax, %r11
  13031. addq %rcx, %r8
  13032. adcq $0x00, %r9
  13033. adcq $0x00, %r10
  13034. adcq $0x00, %r11
  13035. # Store
  13036. movq %r8, (%rdi)
  13037. movq %r9, 8(%rdi)
  13038. movq %r10, 16(%rdi)
  13039. movq %r11, 24(%rdi)
  13040. movq 8(%rsp), %rdi
  13041. movq 96(%rsp), %rsi
  13042. # Multiply
  13043. # A[0] * B[0]
  13044. movq (%rsi), %rdx
  13045. mulxq (%rbx), %r8, %r9
  13046. # A[2] * B[0]
  13047. mulxq 16(%rbx), %r10, %r11
  13048. # A[1] * B[0]
  13049. mulxq 8(%rbx), %rcx, %rax
  13050. xorq %r15, %r15
  13051. adcxq %rcx, %r9
  13052. # A[1] * B[3]
  13053. movq 24(%rsi), %rdx
  13054. mulxq 8(%rbx), %r12, %r13
  13055. adcxq %rax, %r10
  13056. # A[0] * B[1]
  13057. movq 8(%rsi), %rdx
  13058. mulxq (%rbx), %rcx, %rax
  13059. adoxq %rcx, %r9
  13060. # A[2] * B[1]
  13061. mulxq 16(%rbx), %rcx, %r14
  13062. adoxq %rax, %r10
  13063. adcxq %rcx, %r11
  13064. # A[1] * B[2]
  13065. movq 16(%rsi), %rdx
  13066. mulxq 8(%rbx), %rcx, %rax
  13067. adcxq %r14, %r12
  13068. adoxq %rcx, %r11
  13069. adcxq %r15, %r13
  13070. adoxq %rax, %r12
  13071. # A[0] * B[2]
  13072. mulxq (%rbx), %rcx, %rax
  13073. adoxq %r15, %r13
  13074. xorq %r14, %r14
  13075. adcxq %rcx, %r10
  13076. # A[1] * B[1]
  13077. movq 8(%rsi), %rdx
  13078. mulxq 8(%rbx), %rdx, %rcx
  13079. adcxq %rax, %r11
  13080. adoxq %rdx, %r10
  13081. # A[3] * B[1]
  13082. movq 8(%rsi), %rdx
  13083. adoxq %rcx, %r11
  13084. mulxq 24(%rbx), %rcx, %rax
  13085. adcxq %rcx, %r12
  13086. # A[2] * B[2]
  13087. movq 16(%rsi), %rdx
  13088. mulxq 16(%rbx), %rdx, %rcx
  13089. adcxq %rax, %r13
  13090. adoxq %rdx, %r12
  13091. # A[3] * B[3]
  13092. movq 24(%rsi), %rdx
  13093. adoxq %rcx, %r13
  13094. mulxq 24(%rbx), %rcx, %rax
  13095. adoxq %r15, %r14
  13096. adcxq %rcx, %r14
  13097. # A[0] * B[3]
  13098. mulxq (%rbx), %rdx, %rcx
  13099. adcxq %rax, %r15
  13100. xorq %rax, %rax
  13101. adcxq %rdx, %r11
  13102. # A[3] * B[0]
  13103. movq (%rsi), %rdx
  13104. adcxq %rcx, %r12
  13105. mulxq 24(%rbx), %rdx, %rcx
  13106. adoxq %rdx, %r11
  13107. adoxq %rcx, %r12
  13108. # A[2] * B[3]
  13109. movq 24(%rsi), %rdx
  13110. mulxq 16(%rbx), %rdx, %rcx
  13111. adcxq %rdx, %r13
  13112. # A[3] * B[2]
  13113. movq 16(%rsi), %rdx
  13114. adcxq %rcx, %r14
  13115. mulxq 24(%rbx), %rcx, %rdx
  13116. adcxq %rax, %r15
  13117. adoxq %rcx, %r13
  13118. adoxq %rdx, %r14
  13119. adoxq %rax, %r15
  13120. # Reduce
  13121. movq $0x7fffffffffffffff, %rax
  13122. # Move top half into t4-t7 and remove top bit from t3
  13123. shldq $0x01, %r14, %r15
  13124. shldq $0x01, %r13, %r14
  13125. shldq $0x01, %r12, %r13
  13126. shldq $0x01, %r11, %r12
  13127. andq %rax, %r11
  13128. # Multiply top half by 19
  13129. movq $19, %rdx
  13130. xorq %rax, %rax
  13131. mulxq %r12, %rcx, %r12
  13132. adcxq %rcx, %r8
  13133. adoxq %r12, %r9
  13134. mulxq %r13, %rcx, %r13
  13135. adcxq %rcx, %r9
  13136. adoxq %r13, %r10
  13137. mulxq %r14, %rcx, %r14
  13138. adcxq %rcx, %r10
  13139. adoxq %r14, %r11
  13140. mulxq %r15, %r15, %rdx
  13141. adcxq %r15, %r11
  13142. adoxq %rax, %rdx
  13143. adcxq %rax, %rdx
  13144. # Overflow
  13145. shldq $0x01, %r11, %rdx
  13146. movq $0x7fffffffffffffff, %rax
  13147. imulq $19, %rdx, %rcx
  13148. andq %rax, %r11
  13149. addq %rcx, %r8
  13150. adcq $0x00, %r9
  13151. adcq $0x00, %r10
  13152. adcq $0x00, %r11
  13153. # Reduce if top bit set
  13154. movq %r11, %rdx
  13155. shrq $63, %rdx
  13156. imulq $19, %rdx, %rcx
  13157. andq %rax, %r11
  13158. addq %rcx, %r8
  13159. adcq $0x00, %r9
  13160. adcq $0x00, %r10
  13161. adcq $0x00, %r11
  13162. # Store
  13163. movq %r8, (%rdi)
  13164. movq %r9, 8(%rdi)
  13165. movq %r10, 16(%rdi)
  13166. movq %r11, 24(%rdi)
  13167. movq 16(%rsp), %rdi
  13168. movq 24(%rsp), %rsi
  13169. movq 32(%rsp), %rbx
  13170. # Multiply
  13171. # A[0] * B[0]
  13172. movq (%rbx), %rdx
  13173. mulxq (%rsi), %r8, %r9
  13174. # A[2] * B[0]
  13175. mulxq 16(%rsi), %r10, %r11
  13176. # A[1] * B[0]
  13177. mulxq 8(%rsi), %rcx, %rax
  13178. xorq %r15, %r15
  13179. adcxq %rcx, %r9
  13180. # A[1] * B[3]
  13181. movq 24(%rbx), %rdx
  13182. mulxq 8(%rsi), %r12, %r13
  13183. adcxq %rax, %r10
  13184. # A[0] * B[1]
  13185. movq 8(%rbx), %rdx
  13186. mulxq (%rsi), %rcx, %rax
  13187. adoxq %rcx, %r9
  13188. # A[2] * B[1]
  13189. mulxq 16(%rsi), %rcx, %r14
  13190. adoxq %rax, %r10
  13191. adcxq %rcx, %r11
  13192. # A[1] * B[2]
  13193. movq 16(%rbx), %rdx
  13194. mulxq 8(%rsi), %rcx, %rax
  13195. adcxq %r14, %r12
  13196. adoxq %rcx, %r11
  13197. adcxq %r15, %r13
  13198. adoxq %rax, %r12
  13199. # A[0] * B[2]
  13200. mulxq (%rsi), %rcx, %rax
  13201. adoxq %r15, %r13
  13202. xorq %r14, %r14
  13203. adcxq %rcx, %r10
  13204. # A[1] * B[1]
  13205. movq 8(%rbx), %rdx
  13206. mulxq 8(%rsi), %rdx, %rcx
  13207. adcxq %rax, %r11
  13208. adoxq %rdx, %r10
  13209. # A[3] * B[1]
  13210. movq 8(%rbx), %rdx
  13211. adoxq %rcx, %r11
  13212. mulxq 24(%rsi), %rcx, %rax
  13213. adcxq %rcx, %r12
  13214. # A[2] * B[2]
  13215. movq 16(%rbx), %rdx
  13216. mulxq 16(%rsi), %rdx, %rcx
  13217. adcxq %rax, %r13
  13218. adoxq %rdx, %r12
  13219. # A[3] * B[3]
  13220. movq 24(%rbx), %rdx
  13221. adoxq %rcx, %r13
  13222. mulxq 24(%rsi), %rcx, %rax
  13223. adoxq %r15, %r14
  13224. adcxq %rcx, %r14
  13225. # A[0] * B[3]
  13226. mulxq (%rsi), %rdx, %rcx
  13227. adcxq %rax, %r15
  13228. xorq %rax, %rax
  13229. adcxq %rdx, %r11
  13230. # A[3] * B[0]
  13231. movq (%rbx), %rdx
  13232. adcxq %rcx, %r12
  13233. mulxq 24(%rsi), %rdx, %rcx
  13234. adoxq %rdx, %r11
  13235. adoxq %rcx, %r12
  13236. # A[2] * B[3]
  13237. movq 24(%rbx), %rdx
  13238. mulxq 16(%rsi), %rdx, %rcx
  13239. adcxq %rdx, %r13
  13240. # A[3] * B[2]
  13241. movq 16(%rbx), %rdx
  13242. adcxq %rcx, %r14
  13243. mulxq 24(%rsi), %rcx, %rdx
  13244. adcxq %rax, %r15
  13245. adoxq %rcx, %r13
  13246. adoxq %rdx, %r14
  13247. adoxq %rax, %r15
  13248. # Reduce
  13249. movq $0x7fffffffffffffff, %rax
  13250. # Move top half into t4-t7 and remove top bit from t3
  13251. shldq $0x01, %r14, %r15
  13252. shldq $0x01, %r13, %r14
  13253. shldq $0x01, %r12, %r13
  13254. shldq $0x01, %r11, %r12
  13255. andq %rax, %r11
  13256. # Multiply top half by 19
  13257. movq $19, %rdx
  13258. xorq %rax, %rax
  13259. mulxq %r12, %rcx, %r12
  13260. adcxq %rcx, %r8
  13261. adoxq %r12, %r9
  13262. mulxq %r13, %rcx, %r13
  13263. adcxq %rcx, %r9
  13264. adoxq %r13, %r10
  13265. mulxq %r14, %rcx, %r14
  13266. adcxq %rcx, %r10
  13267. adoxq %r14, %r11
  13268. mulxq %r15, %r15, %rdx
  13269. adcxq %r15, %r11
  13270. adoxq %rax, %rdx
  13271. adcxq %rax, %rdx
  13272. # Overflow
  13273. shldq $0x01, %r11, %rdx
  13274. movq $0x7fffffffffffffff, %rax
  13275. imulq $19, %rdx, %rcx
  13276. andq %rax, %r11
  13277. addq %rcx, %r8
  13278. adcq $0x00, %r9
  13279. adcq $0x00, %r10
  13280. adcq $0x00, %r11
  13281. # Reduce if top bit set
  13282. movq %r11, %rdx
  13283. shrq $63, %rdx
  13284. imulq $19, %rdx, %rcx
  13285. andq %rax, %r11
  13286. addq %rcx, %r8
  13287. adcq $0x00, %r9
  13288. adcq $0x00, %r10
  13289. adcq $0x00, %r11
  13290. # Store
  13291. movq %r8, (%rdi)
  13292. movq %r9, 8(%rdi)
  13293. movq %r10, 16(%rdi)
  13294. movq %r11, 24(%rdi)
  13295. addq $40, %rsp
  13296. popq %r15
  13297. popq %r14
  13298. popq %r13
  13299. popq %r12
  13300. popq %rbx
  13301. repz retq
  13302. #ifndef __APPLE__
  13303. .size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
  13304. #endif /* __APPLE__ */
  13305. #ifndef __APPLE__
  13306. .text
  13307. .globl fe_ge_dbl_avx2
  13308. .type fe_ge_dbl_avx2,@function
  13309. .align 16
  13310. fe_ge_dbl_avx2:
  13311. #else
  13312. .section __TEXT,__text
  13313. .globl _fe_ge_dbl_avx2
  13314. .p2align 4
  13315. _fe_ge_dbl_avx2:
  13316. #endif /* __APPLE__ */
  13317. pushq %rbp
  13318. pushq %rbx
  13319. pushq %r12
  13320. pushq %r13
  13321. pushq %r14
  13322. pushq %r15
  13323. subq $48, %rsp
  13324. movq %rdi, (%rsp)
  13325. movq %rsi, 8(%rsp)
  13326. movq %rdx, 16(%rsp)
  13327. movq %rcx, 24(%rsp)
  13328. movq %r8, 32(%rsp)
  13329. movq %r9, 40(%rsp)
  13330. movq 32(%rsp), %rsi
  13331. # Square
  13332. # A[0] * A[1]
  13333. movq (%rsi), %rdx
  13334. mulxq 8(%rsi), %r9, %r10
  13335. # A[0] * A[3]
  13336. mulxq 24(%rsi), %r11, %r12
  13337. # A[2] * A[1]
  13338. movq 16(%rsi), %rdx
  13339. mulxq 8(%rsi), %rcx, %rax
  13340. xorq %r15, %r15
  13341. adoxq %rcx, %r11
  13342. # A[2] * A[3]
  13343. mulxq 24(%rsi), %r13, %r14
  13344. adoxq %rax, %r12
  13345. # A[2] * A[0]
  13346. mulxq (%rsi), %rcx, %rax
  13347. adoxq %r15, %r13
  13348. adcxq %rcx, %r10
  13349. adoxq %r15, %r14
  13350. # A[1] * A[3]
  13351. movq 8(%rsi), %rdx
  13352. mulxq 24(%rsi), %rbp, %r8
  13353. adcxq %rax, %r11
  13354. adcxq %rbp, %r12
  13355. adcxq %r8, %r13
  13356. adcxq %r15, %r14
  13357. # Double with Carry Flag
  13358. xorq %r15, %r15
  13359. # A[0] * A[0]
  13360. movq (%rsi), %rdx
  13361. mulxq %rdx, %r8, %rbp
  13362. adcxq %r9, %r9
  13363. # A[1] * A[1]
  13364. movq 8(%rsi), %rdx
  13365. mulxq %rdx, %rcx, %rax
  13366. adcxq %r10, %r10
  13367. adoxq %rbp, %r9
  13368. adcxq %r11, %r11
  13369. adoxq %rcx, %r10
  13370. # A[2] * A[2]
  13371. movq 16(%rsi), %rdx
  13372. mulxq %rdx, %rbp, %rcx
  13373. adcxq %r12, %r12
  13374. adoxq %rax, %r11
  13375. adcxq %r13, %r13
  13376. adoxq %rbp, %r12
  13377. # A[3] * A[3]
  13378. movq 24(%rsi), %rdx
  13379. mulxq %rdx, %rbp, %rax
  13380. adcxq %r14, %r14
  13381. adoxq %rcx, %r13
  13382. adcxq %r15, %r15
  13383. adoxq %rbp, %r14
  13384. adoxq %rax, %r15
  13385. # Reduce
  13386. movq $0x7fffffffffffffff, %rcx
  13387. # Move top half into t4-t7 and remove top bit from t3
  13388. shldq $0x01, %r14, %r15
  13389. shldq $0x01, %r13, %r14
  13390. shldq $0x01, %r12, %r13
  13391. shldq $0x01, %r11, %r12
  13392. andq %rcx, %r11
  13393. # Multiply top half by 19
  13394. movq $19, %rdx
  13395. xorq %rcx, %rcx
  13396. mulxq %r12, %rbp, %r12
  13397. adcxq %rbp, %r8
  13398. adoxq %r12, %r9
  13399. mulxq %r13, %rbp, %r13
  13400. adcxq %rbp, %r9
  13401. adoxq %r13, %r10
  13402. mulxq %r14, %rbp, %r14
  13403. adcxq %rbp, %r10
  13404. adoxq %r14, %r11
  13405. mulxq %r15, %r15, %rdx
  13406. adcxq %r15, %r11
  13407. adoxq %rcx, %rdx
  13408. adcxq %rcx, %rdx
  13409. # Overflow
  13410. shldq $0x01, %r11, %rdx
  13411. movq $0x7fffffffffffffff, %rcx
  13412. imulq $19, %rdx, %rbp
  13413. andq %rcx, %r11
  13414. addq %rbp, %r8
  13415. adcq $0x00, %r9
  13416. adcq $0x00, %r10
  13417. adcq $0x00, %r11
  13418. # Reduce if top bit set
  13419. movq %r11, %rdx
  13420. shrq $63, %rdx
  13421. imulq $19, %rdx, %rbp
  13422. andq %rcx, %r11
  13423. addq %rbp, %r8
  13424. adcq $0x00, %r9
  13425. adcq $0x00, %r10
  13426. adcq $0x00, %r11
  13427. # Store
  13428. movq %r8, (%rdi)
  13429. movq %r9, 8(%rdi)
  13430. movq %r10, 16(%rdi)
  13431. movq %r11, 24(%rdi)
  13432. movq 16(%rsp), %rdi
  13433. movq 40(%rsp), %rbx
  13434. # Square
  13435. # A[0] * A[1]
  13436. movq (%rbx), %rdx
  13437. mulxq 8(%rbx), %r9, %r10
  13438. # A[0] * A[3]
  13439. mulxq 24(%rbx), %r11, %r12
  13440. # A[2] * A[1]
  13441. movq 16(%rbx), %rdx
  13442. mulxq 8(%rbx), %rcx, %rax
  13443. xorq %r15, %r15
  13444. adoxq %rcx, %r11
  13445. # A[2] * A[3]
  13446. mulxq 24(%rbx), %r13, %r14
  13447. adoxq %rax, %r12
  13448. # A[2] * A[0]
  13449. mulxq (%rbx), %rcx, %rax
  13450. adoxq %r15, %r13
  13451. adcxq %rcx, %r10
  13452. adoxq %r15, %r14
  13453. # A[1] * A[3]
  13454. movq 8(%rbx), %rdx
  13455. mulxq 24(%rbx), %rbp, %r8
  13456. adcxq %rax, %r11
  13457. adcxq %rbp, %r12
  13458. adcxq %r8, %r13
  13459. adcxq %r15, %r14
  13460. # Double with Carry Flag
  13461. xorq %r15, %r15
  13462. # A[0] * A[0]
  13463. movq (%rbx), %rdx
  13464. mulxq %rdx, %r8, %rbp
  13465. adcxq %r9, %r9
  13466. # A[1] * A[1]
  13467. movq 8(%rbx), %rdx
  13468. mulxq %rdx, %rcx, %rax
  13469. adcxq %r10, %r10
  13470. adoxq %rbp, %r9
  13471. adcxq %r11, %r11
  13472. adoxq %rcx, %r10
  13473. # A[2] * A[2]
  13474. movq 16(%rbx), %rdx
  13475. mulxq %rdx, %rbp, %rcx
  13476. adcxq %r12, %r12
  13477. adoxq %rax, %r11
  13478. adcxq %r13, %r13
  13479. adoxq %rbp, %r12
  13480. # A[3] * A[3]
  13481. movq 24(%rbx), %rdx
  13482. mulxq %rdx, %rbp, %rax
  13483. adcxq %r14, %r14
  13484. adoxq %rcx, %r13
  13485. adcxq %r15, %r15
  13486. adoxq %rbp, %r14
  13487. adoxq %rax, %r15
  13488. # Reduce
  13489. movq $0x7fffffffffffffff, %rcx
  13490. # Move top half into t4-t7 and remove top bit from t3
  13491. shldq $0x01, %r14, %r15
  13492. shldq $0x01, %r13, %r14
  13493. shldq $0x01, %r12, %r13
  13494. shldq $0x01, %r11, %r12
  13495. andq %rcx, %r11
  13496. # Multiply top half by 19
  13497. movq $19, %rdx
  13498. xorq %rcx, %rcx
  13499. mulxq %r12, %rbp, %r12
  13500. adcxq %rbp, %r8
  13501. adoxq %r12, %r9
  13502. mulxq %r13, %rbp, %r13
  13503. adcxq %rbp, %r9
  13504. adoxq %r13, %r10
  13505. mulxq %r14, %rbp, %r14
  13506. adcxq %rbp, %r10
  13507. adoxq %r14, %r11
  13508. mulxq %r15, %r15, %rdx
  13509. adcxq %r15, %r11
  13510. adoxq %rcx, %rdx
  13511. adcxq %rcx, %rdx
  13512. # Overflow
  13513. shldq $0x01, %r11, %rdx
  13514. movq $0x7fffffffffffffff, %rcx
  13515. imulq $19, %rdx, %rbp
  13516. andq %rcx, %r11
  13517. addq %rbp, %r8
  13518. adcq $0x00, %r9
  13519. adcq $0x00, %r10
  13520. adcq $0x00, %r11
  13521. # Reduce if top bit set
  13522. movq %r11, %rdx
  13523. shrq $63, %rdx
  13524. imulq $19, %rdx, %rbp
  13525. andq %rcx, %r11
  13526. addq %rbp, %r8
  13527. adcq $0x00, %r9
  13528. adcq $0x00, %r10
  13529. adcq $0x00, %r11
  13530. # Store
  13531. movq %r8, (%rdi)
  13532. movq %r9, 8(%rdi)
  13533. movq %r10, 16(%rdi)
  13534. movq %r11, 24(%rdi)
  13535. movq 8(%rsp), %rdi
  13536. # Add
  13537. movq (%rsi), %r8
  13538. movq 8(%rsi), %r9
  13539. addq (%rbx), %r8
  13540. movq 16(%rsi), %r10
  13541. adcq 8(%rbx), %r9
  13542. movq 24(%rsi), %rdx
  13543. adcq 16(%rbx), %r10
  13544. movq $-19, %rcx
  13545. adcq 24(%rbx), %rdx
  13546. movq $0x7fffffffffffffff, %rax
  13547. movq %rdx, %r11
  13548. sarq $63, %rdx
  13549. # Mask the modulus
  13550. andq %rdx, %rcx
  13551. andq %rdx, %rax
  13552. # Sub modulus (if overflow)
  13553. subq %rcx, %r8
  13554. sbbq %rdx, %r9
  13555. sbbq %rdx, %r10
  13556. sbbq %rax, %r11
  13557. movq %r8, (%rdi)
  13558. movq %r9, 8(%rdi)
  13559. movq %r10, 16(%rdi)
  13560. movq %r11, 24(%rdi)
  13561. movq 24(%rsp), %rsi
  13562. # Square
  13563. # A[0] * A[1]
  13564. movq (%rdi), %rdx
  13565. mulxq 8(%rdi), %r9, %r10
  13566. # A[0] * A[3]
  13567. mulxq 24(%rdi), %r11, %r12
  13568. # A[2] * A[1]
  13569. movq 16(%rdi), %rdx
  13570. mulxq 8(%rdi), %rcx, %rax
  13571. xorq %r15, %r15
  13572. adoxq %rcx, %r11
  13573. # A[2] * A[3]
  13574. mulxq 24(%rdi), %r13, %r14
  13575. adoxq %rax, %r12
  13576. # A[2] * A[0]
  13577. mulxq (%rdi), %rcx, %rax
  13578. adoxq %r15, %r13
  13579. adcxq %rcx, %r10
  13580. adoxq %r15, %r14
  13581. # A[1] * A[3]
  13582. movq 8(%rdi), %rdx
  13583. mulxq 24(%rdi), %rbp, %r8
  13584. adcxq %rax, %r11
  13585. adcxq %rbp, %r12
  13586. adcxq %r8, %r13
  13587. adcxq %r15, %r14
  13588. # Double with Carry Flag
  13589. xorq %r15, %r15
  13590. # A[0] * A[0]
  13591. movq (%rdi), %rdx
  13592. mulxq %rdx, %r8, %rbp
  13593. adcxq %r9, %r9
  13594. # A[1] * A[1]
  13595. movq 8(%rdi), %rdx
  13596. mulxq %rdx, %rcx, %rax
  13597. adcxq %r10, %r10
  13598. adoxq %rbp, %r9
  13599. adcxq %r11, %r11
  13600. adoxq %rcx, %r10
  13601. # A[2] * A[2]
  13602. movq 16(%rdi), %rdx
  13603. mulxq %rdx, %rbp, %rcx
  13604. adcxq %r12, %r12
  13605. adoxq %rax, %r11
  13606. adcxq %r13, %r13
  13607. adoxq %rbp, %r12
  13608. # A[3] * A[3]
  13609. movq 24(%rdi), %rdx
  13610. mulxq %rdx, %rbp, %rax
  13611. adcxq %r14, %r14
  13612. adoxq %rcx, %r13
  13613. adcxq %r15, %r15
  13614. adoxq %rbp, %r14
  13615. adoxq %rax, %r15
  13616. # Reduce
  13617. movq $0x7fffffffffffffff, %rcx
  13618. # Move top half into t4-t7 and remove top bit from t3
  13619. shldq $0x01, %r14, %r15
  13620. shldq $0x01, %r13, %r14
  13621. shldq $0x01, %r12, %r13
  13622. shldq $0x01, %r11, %r12
  13623. andq %rcx, %r11
  13624. # Multiply top half by 19
  13625. movq $19, %rdx
  13626. xorq %rcx, %rcx
  13627. mulxq %r12, %rbp, %r12
  13628. adcxq %rbp, %r8
  13629. adoxq %r12, %r9
  13630. mulxq %r13, %rbp, %r13
  13631. adcxq %rbp, %r9
  13632. adoxq %r13, %r10
  13633. mulxq %r14, %rbp, %r14
  13634. adcxq %rbp, %r10
  13635. adoxq %r14, %r11
  13636. mulxq %r15, %r15, %rdx
  13637. adcxq %r15, %r11
  13638. adoxq %rcx, %rdx
  13639. adcxq %rcx, %rdx
  13640. # Overflow
  13641. shldq $0x01, %r11, %rdx
  13642. movq $0x7fffffffffffffff, %rcx
  13643. imulq $19, %rdx, %rbp
  13644. andq %rcx, %r11
  13645. addq %rbp, %r8
  13646. adcq $0x00, %r9
  13647. adcq $0x00, %r10
  13648. adcq $0x00, %r11
  13649. # Reduce if top bit set
  13650. movq %r11, %rdx
  13651. shrq $63, %rdx
  13652. imulq $19, %rdx, %rbp
  13653. andq %rcx, %r11
  13654. addq %rbp, %r8
  13655. adcq $0x00, %r9
  13656. adcq $0x00, %r10
  13657. adcq $0x00, %r11
  13658. # Store
  13659. movq %r8, (%rsi)
  13660. movq %r9, 8(%rsi)
  13661. movq %r10, 16(%rsi)
  13662. movq %r11, 24(%rsi)
  13663. movq 16(%rsp), %rsi
  13664. movq (%rsp), %rbx
  13665. # Add
  13666. movq (%rsi), %r8
  13667. movq 8(%rsi), %r9
  13668. movq 16(%rsi), %r10
  13669. movq 24(%rsi), %rdx
  13670. movq %r8, %r12
  13671. addq (%rbx), %r8
  13672. movq %r9, %r13
  13673. adcq 8(%rbx), %r9
  13674. movq %r10, %r14
  13675. adcq 16(%rbx), %r10
  13676. movq %rdx, %r15
  13677. adcq 24(%rbx), %rdx
  13678. movq $-19, %rcx
  13679. movq %rdx, %r11
  13680. movq $0x7fffffffffffffff, %rax
  13681. sarq $63, %rdx
  13682. # Mask the modulus
  13683. andq %rdx, %rcx
  13684. andq %rdx, %rax
  13685. # Sub modulus (if overflow)
  13686. subq %rcx, %r8
  13687. sbbq %rdx, %r9
  13688. sbbq %rdx, %r10
  13689. sbbq %rax, %r11
  13690. # Sub
  13691. subq (%rbx), %r12
  13692. movq $0x00, %rdx
  13693. sbbq 8(%rbx), %r13
  13694. movq $-19, %rcx
  13695. sbbq 16(%rbx), %r14
  13696. movq $0x7fffffffffffffff, %rax
  13697. sbbq 24(%rbx), %r15
  13698. sbbq $0x00, %rdx
  13699. # Mask the modulus
  13700. andq %rdx, %rcx
  13701. andq %rdx, %rax
  13702. # Add modulus (if underflow)
  13703. addq %rcx, %r12
  13704. adcq %rdx, %r13
  13705. adcq %rdx, %r14
  13706. adcq %rax, %r15
  13707. movq %r8, (%rdi)
  13708. movq %r9, 8(%rdi)
  13709. movq %r10, 16(%rdi)
  13710. movq %r11, 24(%rdi)
  13711. movq %r12, (%rsi)
  13712. movq %r13, 8(%rsi)
  13713. movq %r14, 16(%rsi)
  13714. movq %r15, 24(%rsi)
  13715. movq 24(%rsp), %rsi
  13716. # Sub
  13717. movq (%rsi), %r8
  13718. movq 8(%rsi), %r9
  13719. movq 16(%rsi), %r10
  13720. movq 24(%rsi), %r11
  13721. subq (%rdi), %r8
  13722. movq $0x00, %rdx
  13723. sbbq 8(%rdi), %r9
  13724. movq $-19, %rcx
  13725. sbbq 16(%rdi), %r10
  13726. movq $0x7fffffffffffffff, %rax
  13727. sbbq 24(%rdi), %r11
  13728. sbbq $0x00, %rdx
  13729. # Mask the modulus
  13730. andq %rdx, %rcx
  13731. andq %rdx, %rax
  13732. # Add modulus (if underflow)
  13733. addq %rcx, %r8
  13734. adcq %rdx, %r9
  13735. adcq %rdx, %r10
  13736. adcq %rax, %r11
  13737. movq %r8, (%rbx)
  13738. movq %r9, 8(%rbx)
  13739. movq %r10, 16(%rbx)
  13740. movq %r11, 24(%rbx)
  13741. movq 104(%rsp), %rdi
  13742. # Square * 2
  13743. # A[0] * A[1]
  13744. movq (%rdi), %rdx
  13745. mulxq 8(%rdi), %r9, %r10
  13746. # A[0] * A[3]
  13747. mulxq 24(%rdi), %r11, %r12
  13748. # A[2] * A[1]
  13749. movq 16(%rdi), %rdx
  13750. mulxq 8(%rdi), %rcx, %rax
  13751. xorq %r15, %r15
  13752. adoxq %rcx, %r11
  13753. # A[2] * A[3]
  13754. mulxq 24(%rdi), %r13, %r14
  13755. adoxq %rax, %r12
  13756. # A[2] * A[0]
  13757. mulxq (%rdi), %rcx, %rax
  13758. adoxq %r15, %r13
  13759. adcxq %rcx, %r10
  13760. adoxq %r15, %r14
  13761. # A[1] * A[3]
  13762. movq 8(%rdi), %rdx
  13763. mulxq 24(%rdi), %rbp, %r8
  13764. adcxq %rax, %r11
  13765. adcxq %rbp, %r12
  13766. adcxq %r8, %r13
  13767. adcxq %r15, %r14
  13768. # Double with Carry Flag
  13769. xorq %r15, %r15
  13770. # A[0] * A[0]
  13771. movq (%rdi), %rdx
  13772. mulxq %rdx, %r8, %rbp
  13773. adcxq %r9, %r9
  13774. # A[1] * A[1]
  13775. movq 8(%rdi), %rdx
  13776. mulxq %rdx, %rcx, %rax
  13777. adcxq %r10, %r10
  13778. adoxq %rbp, %r9
  13779. adcxq %r11, %r11
  13780. adoxq %rcx, %r10
  13781. # A[2] * A[2]
  13782. movq 16(%rdi), %rdx
  13783. mulxq %rdx, %rbp, %rcx
  13784. adcxq %r12, %r12
  13785. adoxq %rax, %r11
  13786. adcxq %r13, %r13
  13787. adoxq %rbp, %r12
  13788. # A[3] * A[3]
  13789. movq 24(%rdi), %rdx
  13790. mulxq %rdx, %rbp, %rax
  13791. adcxq %r14, %r14
  13792. adoxq %rcx, %r13
  13793. adcxq %r15, %r15
  13794. adoxq %rbp, %r14
  13795. adoxq %rax, %r15
  13796. # Reduce
  13797. movq $0x7fffffffffffffff, %rax
  13798. xorq %rbp, %rbp
  13799. # Move top half into t4-t7 and remove top bit from t3 and double
  13800. shldq $3, %r15, %rbp
  13801. shldq $2, %r14, %r15
  13802. shldq $2, %r13, %r14
  13803. shldq $2, %r12, %r13
  13804. shldq $2, %r11, %r12
  13805. shldq $0x01, %r10, %r11
  13806. shldq $0x01, %r9, %r10
  13807. shldq $0x01, %r8, %r9
  13808. shlq $0x01, %r8
  13809. andq %rax, %r11
  13810. # Two out left, one in right
  13811. andq %rax, %r15
  13812. # Multiply top bits by 19*19
  13813. imulq $0x169, %rbp, %rcx
  13814. xorq %rax, %rax
  13815. # Multiply top half by 19
  13816. movq $19, %rdx
  13817. adoxq %rcx, %r8
  13818. mulxq %r12, %rbp, %r12
  13819. adcxq %rbp, %r8
  13820. adoxq %r12, %r9
  13821. mulxq %r13, %rbp, %r13
  13822. adcxq %rbp, %r9
  13823. adoxq %r13, %r10
  13824. mulxq %r14, %rbp, %r14
  13825. adcxq %rbp, %r10
  13826. adoxq %r14, %r11
  13827. mulxq %r15, %r15, %rdx
  13828. adcxq %r15, %r11
  13829. adoxq %rax, %rdx
  13830. adcxq %rax, %rdx
  13831. # Overflow
  13832. shldq $0x01, %r11, %rdx
  13833. movq $0x7fffffffffffffff, %rax
  13834. imulq $19, %rdx, %rbp
  13835. andq %rax, %r11
  13836. addq %rbp, %r8
  13837. adcq $0x00, %r9
  13838. adcq $0x00, %r10
  13839. adcq $0x00, %r11
  13840. # Reduce if top bit set
  13841. movq %r11, %rdx
  13842. shrq $63, %rdx
  13843. imulq $19, %rdx, %rbp
  13844. andq %rax, %r11
  13845. addq %rbp, %r8
  13846. adcq $0x00, %r9
  13847. adcq $0x00, %r10
  13848. adcq $0x00, %r11
  13849. # Store
  13850. movq %r8, (%rsi)
  13851. movq %r9, 8(%rsi)
  13852. movq %r10, 16(%rsi)
  13853. movq %r11, 24(%rsi)
  13854. movq 16(%rsp), %rdi
  13855. # Sub
  13856. movq (%rsi), %r8
  13857. movq 8(%rsi), %r9
  13858. movq 16(%rsi), %r10
  13859. movq 24(%rsi), %r11
  13860. subq (%rdi), %r8
  13861. movq $0x00, %rdx
  13862. sbbq 8(%rdi), %r9
  13863. movq $-19, %rcx
  13864. sbbq 16(%rdi), %r10
  13865. movq $0x7fffffffffffffff, %rax
  13866. sbbq 24(%rdi), %r11
  13867. sbbq $0x00, %rdx
  13868. # Mask the modulus
  13869. andq %rdx, %rcx
  13870. andq %rdx, %rax
  13871. # Add modulus (if underflow)
  13872. addq %rcx, %r8
  13873. adcq %rdx, %r9
  13874. adcq %rdx, %r10
  13875. adcq %rax, %r11
  13876. movq %r8, (%rsi)
  13877. movq %r9, 8(%rsi)
  13878. movq %r10, 16(%rsi)
  13879. movq %r11, 24(%rsi)
  13880. addq $48, %rsp
  13881. popq %r15
  13882. popq %r14
  13883. popq %r13
  13884. popq %r12
  13885. popq %rbx
  13886. popq %rbp
  13887. repz retq
  13888. #ifndef __APPLE__
  13889. .size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
  13890. #endif /* __APPLE__ */
  13891. #ifndef __APPLE__
  13892. .text
  13893. .globl fe_ge_madd_avx2
  13894. .type fe_ge_madd_avx2,@function
  13895. .align 16
  13896. fe_ge_madd_avx2:
  13897. #else
  13898. .section __TEXT,__text
  13899. .globl _fe_ge_madd_avx2
  13900. .p2align 4
  13901. _fe_ge_madd_avx2:
  13902. #endif /* __APPLE__ */
  13903. pushq %rbp
  13904. pushq %rbx
  13905. pushq %r12
  13906. pushq %r13
  13907. pushq %r14
  13908. pushq %r15
  13909. subq $48, %rsp
  13910. movq %rdi, (%rsp)
  13911. movq %rsi, 8(%rsp)
  13912. movq %rdx, 16(%rsp)
  13913. movq %rcx, 24(%rsp)
  13914. movq %r8, 32(%rsp)
  13915. movq %r9, 40(%rsp)
  13916. movq 8(%rsp), %rsi
  13917. movq 40(%rsp), %rbx
  13918. movq 32(%rsp), %rbp
  13919. # Add
  13920. movq (%rbx), %r8
  13921. movq 8(%rbx), %r9
  13922. movq 16(%rbx), %r10
  13923. movq 24(%rbx), %rdx
  13924. movq %r8, %r12
  13925. addq (%rbp), %r8
  13926. movq %r9, %r13
  13927. adcq 8(%rbp), %r9
  13928. movq %r10, %r14
  13929. adcq 16(%rbp), %r10
  13930. movq %rdx, %r15
  13931. adcq 24(%rbp), %rdx
  13932. movq $-19, %rcx
  13933. movq %rdx, %r11
  13934. movq $0x7fffffffffffffff, %rax
  13935. sarq $63, %rdx
  13936. # Mask the modulus
  13937. andq %rdx, %rcx
  13938. andq %rdx, %rax
  13939. # Sub modulus (if overflow)
  13940. subq %rcx, %r8
  13941. sbbq %rdx, %r9
  13942. sbbq %rdx, %r10
  13943. sbbq %rax, %r11
  13944. # Sub
  13945. subq (%rbp), %r12
  13946. movq $0x00, %rdx
  13947. sbbq 8(%rbp), %r13
  13948. movq $-19, %rcx
  13949. sbbq 16(%rbp), %r14
  13950. movq $0x7fffffffffffffff, %rax
  13951. sbbq 24(%rbp), %r15
  13952. sbbq $0x00, %rdx
  13953. # Mask the modulus
  13954. andq %rdx, %rcx
  13955. andq %rdx, %rax
  13956. # Add modulus (if underflow)
  13957. addq %rcx, %r12
  13958. adcq %rdx, %r13
  13959. adcq %rdx, %r14
  13960. adcq %rax, %r15
  13961. movq %r8, (%rdi)
  13962. movq %r9, 8(%rdi)
  13963. movq %r10, 16(%rdi)
  13964. movq %r11, 24(%rdi)
  13965. movq %r12, (%rsi)
  13966. movq %r13, 8(%rsi)
  13967. movq %r14, 16(%rsi)
  13968. movq %r15, 24(%rsi)
  13969. movq 16(%rsp), %rbx
  13970. movq 128(%rsp), %rbp
  13971. # Multiply
  13972. # A[0] * B[0]
  13973. movq (%rbp), %rdx
  13974. mulxq (%rdi), %r8, %r9
  13975. # A[2] * B[0]
  13976. mulxq 16(%rdi), %r10, %r11
  13977. # A[1] * B[0]
  13978. mulxq 8(%rdi), %rcx, %rax
  13979. xorq %r15, %r15
  13980. adcxq %rcx, %r9
  13981. # A[1] * B[3]
  13982. movq 24(%rbp), %rdx
  13983. mulxq 8(%rdi), %r12, %r13
  13984. adcxq %rax, %r10
  13985. # A[0] * B[1]
  13986. movq 8(%rbp), %rdx
  13987. mulxq (%rdi), %rcx, %rax
  13988. adoxq %rcx, %r9
  13989. # A[2] * B[1]
  13990. mulxq 16(%rdi), %rcx, %r14
  13991. adoxq %rax, %r10
  13992. adcxq %rcx, %r11
  13993. # A[1] * B[2]
  13994. movq 16(%rbp), %rdx
  13995. mulxq 8(%rdi), %rcx, %rax
  13996. adcxq %r14, %r12
  13997. adoxq %rcx, %r11
  13998. adcxq %r15, %r13
  13999. adoxq %rax, %r12
  14000. # A[0] * B[2]
  14001. mulxq (%rdi), %rcx, %rax
  14002. adoxq %r15, %r13
  14003. xorq %r14, %r14
  14004. adcxq %rcx, %r10
  14005. # A[1] * B[1]
  14006. movq 8(%rbp), %rdx
  14007. mulxq 8(%rdi), %rdx, %rcx
  14008. adcxq %rax, %r11
  14009. adoxq %rdx, %r10
  14010. # A[3] * B[1]
  14011. movq 8(%rbp), %rdx
  14012. adoxq %rcx, %r11
  14013. mulxq 24(%rdi), %rcx, %rax
  14014. adcxq %rcx, %r12
  14015. # A[2] * B[2]
  14016. movq 16(%rbp), %rdx
  14017. mulxq 16(%rdi), %rdx, %rcx
  14018. adcxq %rax, %r13
  14019. adoxq %rdx, %r12
  14020. # A[3] * B[3]
  14021. movq 24(%rbp), %rdx
  14022. adoxq %rcx, %r13
  14023. mulxq 24(%rdi), %rcx, %rax
  14024. adoxq %r15, %r14
  14025. adcxq %rcx, %r14
  14026. # A[0] * B[3]
  14027. mulxq (%rdi), %rdx, %rcx
  14028. adcxq %rax, %r15
  14029. xorq %rax, %rax
  14030. adcxq %rdx, %r11
  14031. # A[3] * B[0]
  14032. movq (%rbp), %rdx
  14033. adcxq %rcx, %r12
  14034. mulxq 24(%rdi), %rdx, %rcx
  14035. adoxq %rdx, %r11
  14036. adoxq %rcx, %r12
  14037. # A[2] * B[3]
  14038. movq 24(%rbp), %rdx
  14039. mulxq 16(%rdi), %rdx, %rcx
  14040. adcxq %rdx, %r13
  14041. # A[3] * B[2]
  14042. movq 16(%rbp), %rdx
  14043. adcxq %rcx, %r14
  14044. mulxq 24(%rdi), %rcx, %rdx
  14045. adcxq %rax, %r15
  14046. adoxq %rcx, %r13
  14047. adoxq %rdx, %r14
  14048. adoxq %rax, %r15
  14049. # Reduce
  14050. movq $0x7fffffffffffffff, %rax
  14051. # Move top half into t4-t7 and remove top bit from t3
  14052. shldq $0x01, %r14, %r15
  14053. shldq $0x01, %r13, %r14
  14054. shldq $0x01, %r12, %r13
  14055. shldq $0x01, %r11, %r12
  14056. andq %rax, %r11
  14057. # Multiply top half by 19
  14058. movq $19, %rdx
  14059. xorq %rax, %rax
  14060. mulxq %r12, %rcx, %r12
  14061. adcxq %rcx, %r8
  14062. adoxq %r12, %r9
  14063. mulxq %r13, %rcx, %r13
  14064. adcxq %rcx, %r9
  14065. adoxq %r13, %r10
  14066. mulxq %r14, %rcx, %r14
  14067. adcxq %rcx, %r10
  14068. adoxq %r14, %r11
  14069. mulxq %r15, %r15, %rdx
  14070. adcxq %r15, %r11
  14071. adoxq %rax, %rdx
  14072. adcxq %rax, %rdx
  14073. # Overflow
  14074. shldq $0x01, %r11, %rdx
  14075. movq $0x7fffffffffffffff, %rax
  14076. imulq $19, %rdx, %rcx
  14077. andq %rax, %r11
  14078. addq %rcx, %r8
  14079. adcq $0x00, %r9
  14080. adcq $0x00, %r10
  14081. adcq $0x00, %r11
  14082. # Reduce if top bit set
  14083. movq %r11, %rdx
  14084. shrq $63, %rdx
  14085. imulq $19, %rdx, %rcx
  14086. andq %rax, %r11
  14087. addq %rcx, %r8
  14088. adcq $0x00, %r9
  14089. adcq $0x00, %r10
  14090. adcq $0x00, %r11
  14091. # Store
  14092. movq %r8, (%rbx)
  14093. movq %r9, 8(%rbx)
  14094. movq %r10, 16(%rbx)
  14095. movq %r11, 24(%rbx)
  14096. movq 136(%rsp), %rdi
  14097. # Multiply
  14098. # A[0] * B[0]
  14099. movq (%rdi), %rdx
  14100. mulxq (%rsi), %r8, %r9
  14101. # A[2] * B[0]
  14102. mulxq 16(%rsi), %r10, %r11
  14103. # A[1] * B[0]
  14104. mulxq 8(%rsi), %rcx, %rax
  14105. xorq %r15, %r15
  14106. adcxq %rcx, %r9
  14107. # A[1] * B[3]
  14108. movq 24(%rdi), %rdx
  14109. mulxq 8(%rsi), %r12, %r13
  14110. adcxq %rax, %r10
  14111. # A[0] * B[1]
  14112. movq 8(%rdi), %rdx
  14113. mulxq (%rsi), %rcx, %rax
  14114. adoxq %rcx, %r9
  14115. # A[2] * B[1]
  14116. mulxq 16(%rsi), %rcx, %r14
  14117. adoxq %rax, %r10
  14118. adcxq %rcx, %r11
  14119. # A[1] * B[2]
  14120. movq 16(%rdi), %rdx
  14121. mulxq 8(%rsi), %rcx, %rax
  14122. adcxq %r14, %r12
  14123. adoxq %rcx, %r11
  14124. adcxq %r15, %r13
  14125. adoxq %rax, %r12
  14126. # A[0] * B[2]
  14127. mulxq (%rsi), %rcx, %rax
  14128. adoxq %r15, %r13
  14129. xorq %r14, %r14
  14130. adcxq %rcx, %r10
  14131. # A[1] * B[1]
  14132. movq 8(%rdi), %rdx
  14133. mulxq 8(%rsi), %rdx, %rcx
  14134. adcxq %rax, %r11
  14135. adoxq %rdx, %r10
  14136. # A[3] * B[1]
  14137. movq 8(%rdi), %rdx
  14138. adoxq %rcx, %r11
  14139. mulxq 24(%rsi), %rcx, %rax
  14140. adcxq %rcx, %r12
  14141. # A[2] * B[2]
  14142. movq 16(%rdi), %rdx
  14143. mulxq 16(%rsi), %rdx, %rcx
  14144. adcxq %rax, %r13
  14145. adoxq %rdx, %r12
  14146. # A[3] * B[3]
  14147. movq 24(%rdi), %rdx
  14148. adoxq %rcx, %r13
  14149. mulxq 24(%rsi), %rcx, %rax
  14150. adoxq %r15, %r14
  14151. adcxq %rcx, %r14
  14152. # A[0] * B[3]
  14153. mulxq (%rsi), %rdx, %rcx
  14154. adcxq %rax, %r15
  14155. xorq %rax, %rax
  14156. adcxq %rdx, %r11
  14157. # A[3] * B[0]
  14158. movq (%rdi), %rdx
  14159. adcxq %rcx, %r12
  14160. mulxq 24(%rsi), %rdx, %rcx
  14161. adoxq %rdx, %r11
  14162. adoxq %rcx, %r12
  14163. # A[2] * B[3]
  14164. movq 24(%rdi), %rdx
  14165. mulxq 16(%rsi), %rdx, %rcx
  14166. adcxq %rdx, %r13
  14167. # A[3] * B[2]
  14168. movq 16(%rdi), %rdx
  14169. adcxq %rcx, %r14
  14170. mulxq 24(%rsi), %rcx, %rdx
  14171. adcxq %rax, %r15
  14172. adoxq %rcx, %r13
  14173. adoxq %rdx, %r14
  14174. adoxq %rax, %r15
  14175. # Reduce
  14176. movq $0x7fffffffffffffff, %rax
  14177. # Move top half into t4-t7 and remove top bit from t3
  14178. shldq $0x01, %r14, %r15
  14179. shldq $0x01, %r13, %r14
  14180. shldq $0x01, %r12, %r13
  14181. shldq $0x01, %r11, %r12
  14182. andq %rax, %r11
  14183. # Multiply top half by 19
  14184. movq $19, %rdx
  14185. xorq %rax, %rax
  14186. mulxq %r12, %rcx, %r12
  14187. adcxq %rcx, %r8
  14188. adoxq %r12, %r9
  14189. mulxq %r13, %rcx, %r13
  14190. adcxq %rcx, %r9
  14191. adoxq %r13, %r10
  14192. mulxq %r14, %rcx, %r14
  14193. adcxq %rcx, %r10
  14194. adoxq %r14, %r11
  14195. mulxq %r15, %r15, %rdx
  14196. adcxq %r15, %r11
  14197. adoxq %rax, %rdx
  14198. adcxq %rax, %rdx
  14199. # Overflow
  14200. shldq $0x01, %r11, %rdx
  14201. movq $0x7fffffffffffffff, %rax
  14202. imulq $19, %rdx, %rcx
  14203. andq %rax, %r11
  14204. addq %rcx, %r8
  14205. adcq $0x00, %r9
  14206. adcq $0x00, %r10
  14207. adcq $0x00, %r11
  14208. # Reduce if top bit set
  14209. movq %r11, %rdx
  14210. shrq $63, %rdx
  14211. imulq $19, %rdx, %rcx
  14212. andq %rax, %r11
  14213. addq %rcx, %r8
  14214. adcq $0x00, %r9
  14215. adcq $0x00, %r10
  14216. adcq $0x00, %r11
  14217. # Store
  14218. movq %r8, (%rsi)
  14219. movq %r9, 8(%rsi)
  14220. movq %r10, 16(%rsi)
  14221. movq %r11, 24(%rsi)
  14222. movq 24(%rsp), %rdi
  14223. movq 120(%rsp), %rsi
  14224. movq 112(%rsp), %rbp
  14225. # Multiply
  14226. # A[0] * B[0]
  14227. movq (%rbp), %rdx
  14228. mulxq (%rsi), %r8, %r9
  14229. # A[2] * B[0]
  14230. mulxq 16(%rsi), %r10, %r11
  14231. # A[1] * B[0]
  14232. mulxq 8(%rsi), %rcx, %rax
  14233. xorq %r15, %r15
  14234. adcxq %rcx, %r9
  14235. # A[1] * B[3]
  14236. movq 24(%rbp), %rdx
  14237. mulxq 8(%rsi), %r12, %r13
  14238. adcxq %rax, %r10
  14239. # A[0] * B[1]
  14240. movq 8(%rbp), %rdx
  14241. mulxq (%rsi), %rcx, %rax
  14242. adoxq %rcx, %r9
  14243. # A[2] * B[1]
  14244. mulxq 16(%rsi), %rcx, %r14
  14245. adoxq %rax, %r10
  14246. adcxq %rcx, %r11
  14247. # A[1] * B[2]
  14248. movq 16(%rbp), %rdx
  14249. mulxq 8(%rsi), %rcx, %rax
  14250. adcxq %r14, %r12
  14251. adoxq %rcx, %r11
  14252. adcxq %r15, %r13
  14253. adoxq %rax, %r12
  14254. # A[0] * B[2]
  14255. mulxq (%rsi), %rcx, %rax
  14256. adoxq %r15, %r13
  14257. xorq %r14, %r14
  14258. adcxq %rcx, %r10
  14259. # A[1] * B[1]
  14260. movq 8(%rbp), %rdx
  14261. mulxq 8(%rsi), %rdx, %rcx
  14262. adcxq %rax, %r11
  14263. adoxq %rdx, %r10
  14264. # A[3] * B[1]
  14265. movq 8(%rbp), %rdx
  14266. adoxq %rcx, %r11
  14267. mulxq 24(%rsi), %rcx, %rax
  14268. adcxq %rcx, %r12
  14269. # A[2] * B[2]
  14270. movq 16(%rbp), %rdx
  14271. mulxq 16(%rsi), %rdx, %rcx
  14272. adcxq %rax, %r13
  14273. adoxq %rdx, %r12
  14274. # A[3] * B[3]
  14275. movq 24(%rbp), %rdx
  14276. adoxq %rcx, %r13
  14277. mulxq 24(%rsi), %rcx, %rax
  14278. adoxq %r15, %r14
  14279. adcxq %rcx, %r14
  14280. # A[0] * B[3]
  14281. mulxq (%rsi), %rdx, %rcx
  14282. adcxq %rax, %r15
  14283. xorq %rax, %rax
  14284. adcxq %rdx, %r11
  14285. # A[3] * B[0]
  14286. movq (%rbp), %rdx
  14287. adcxq %rcx, %r12
  14288. mulxq 24(%rsi), %rdx, %rcx
  14289. adoxq %rdx, %r11
  14290. adoxq %rcx, %r12
  14291. # A[2] * B[3]
  14292. movq 24(%rbp), %rdx
  14293. mulxq 16(%rsi), %rdx, %rcx
  14294. adcxq %rdx, %r13
  14295. # A[3] * B[2]
  14296. movq 16(%rbp), %rdx
  14297. adcxq %rcx, %r14
  14298. mulxq 24(%rsi), %rcx, %rdx
  14299. adcxq %rax, %r15
  14300. adoxq %rcx, %r13
  14301. adoxq %rdx, %r14
  14302. adoxq %rax, %r15
  14303. # Reduce
  14304. movq $0x7fffffffffffffff, %rax
  14305. # Move top half into t4-t7 and remove top bit from t3
  14306. shldq $0x01, %r14, %r15
  14307. shldq $0x01, %r13, %r14
  14308. shldq $0x01, %r12, %r13
  14309. shldq $0x01, %r11, %r12
  14310. andq %rax, %r11
  14311. # Multiply top half by 19
  14312. movq $19, %rdx
  14313. xorq %rax, %rax
  14314. mulxq %r12, %rcx, %r12
  14315. adcxq %rcx, %r8
  14316. adoxq %r12, %r9
  14317. mulxq %r13, %rcx, %r13
  14318. adcxq %rcx, %r9
  14319. adoxq %r13, %r10
  14320. mulxq %r14, %rcx, %r14
  14321. adcxq %rcx, %r10
  14322. adoxq %r14, %r11
  14323. mulxq %r15, %r15, %rdx
  14324. adcxq %r15, %r11
  14325. adoxq %rax, %rdx
  14326. adcxq %rax, %rdx
  14327. # Overflow
  14328. shldq $0x01, %r11, %rdx
  14329. movq $0x7fffffffffffffff, %rax
  14330. imulq $19, %rdx, %rcx
  14331. andq %rax, %r11
  14332. addq %rcx, %r8
  14333. adcq $0x00, %r9
  14334. adcq $0x00, %r10
  14335. adcq $0x00, %r11
  14336. # Reduce if top bit set
  14337. movq %r11, %rdx
  14338. shrq $63, %rdx
  14339. imulq $19, %rdx, %rcx
  14340. andq %rax, %r11
  14341. addq %rcx, %r8
  14342. adcq $0x00, %r9
  14343. adcq $0x00, %r10
  14344. adcq $0x00, %r11
  14345. # Store
  14346. movq %r8, (%rdi)
  14347. movq %r9, 8(%rdi)
  14348. movq %r10, 16(%rdi)
  14349. movq %r11, 24(%rdi)
  14350. movq 8(%rsp), %rdi
  14351. movq (%rsp), %rsi
  14352. # Add
  14353. movq (%rbx), %r8
  14354. movq 8(%rbx), %r9
  14355. movq 16(%rbx), %r10
  14356. movq 24(%rbx), %rdx
  14357. movq %r8, %r12
  14358. addq (%rdi), %r8
  14359. movq %r9, %r13
  14360. adcq 8(%rdi), %r9
  14361. movq %r10, %r14
  14362. adcq 16(%rdi), %r10
  14363. movq %rdx, %r15
  14364. adcq 24(%rdi), %rdx
  14365. movq $-19, %rcx
  14366. movq %rdx, %r11
  14367. movq $0x7fffffffffffffff, %rax
  14368. sarq $63, %rdx
  14369. # Mask the modulus
  14370. andq %rdx, %rcx
  14371. andq %rdx, %rax
  14372. # Sub modulus (if overflow)
  14373. subq %rcx, %r8
  14374. sbbq %rdx, %r9
  14375. sbbq %rdx, %r10
  14376. sbbq %rax, %r11
  14377. # Sub
  14378. subq (%rdi), %r12
  14379. movq $0x00, %rdx
  14380. sbbq 8(%rdi), %r13
  14381. movq $-19, %rcx
  14382. sbbq 16(%rdi), %r14
  14383. movq $0x7fffffffffffffff, %rax
  14384. sbbq 24(%rdi), %r15
  14385. sbbq $0x00, %rdx
  14386. # Mask the modulus
  14387. andq %rdx, %rcx
  14388. andq %rdx, %rax
  14389. # Add modulus (if underflow)
  14390. addq %rcx, %r12
  14391. adcq %rdx, %r13
  14392. adcq %rdx, %r14
  14393. adcq %rax, %r15
  14394. movq %r8, (%rdi)
  14395. movq %r9, 8(%rdi)
  14396. movq %r10, 16(%rdi)
  14397. movq %r11, 24(%rdi)
  14398. movq %r12, (%rsi)
  14399. movq %r13, 8(%rsi)
  14400. movq %r14, 16(%rsi)
  14401. movq %r15, 24(%rsi)
  14402. movq 104(%rsp), %rdi
  14403. # Double
  14404. movq (%rdi), %r8
  14405. movq 8(%rdi), %r9
  14406. addq %r8, %r8
  14407. movq 16(%rdi), %r10
  14408. adcq %r9, %r9
  14409. movq 24(%rdi), %rdx
  14410. adcq %r10, %r10
  14411. movq $-19, %rcx
  14412. adcq %rdx, %rdx
  14413. movq $0x7fffffffffffffff, %rax
  14414. movq %rdx, %r11
  14415. sarq $63, %rdx
  14416. # Mask the modulus
  14417. andq %rdx, %rcx
  14418. andq %rdx, %rax
  14419. # Sub modulus (if overflow)
  14420. subq %rcx, %r8
  14421. sbbq %rdx, %r9
  14422. sbbq %rdx, %r10
  14423. sbbq %rax, %r11
  14424. movq %r8, (%rbx)
  14425. movq %r9, 8(%rbx)
  14426. movq %r10, 16(%rbx)
  14427. movq %r11, 24(%rbx)
  14428. movq 24(%rsp), %rdi
  14429. # Add
  14430. movq (%rbx), %r8
  14431. movq 8(%rbx), %r9
  14432. movq 16(%rbx), %r10
  14433. movq 24(%rbx), %rdx
  14434. movq %r8, %r12
  14435. addq (%rdi), %r8
  14436. movq %r9, %r13
  14437. adcq 8(%rdi), %r9
  14438. movq %r10, %r14
  14439. adcq 16(%rdi), %r10
  14440. movq %rdx, %r15
  14441. adcq 24(%rdi), %rdx
  14442. movq $-19, %rcx
  14443. movq %rdx, %r11
  14444. movq $0x7fffffffffffffff, %rax
  14445. sarq $63, %rdx
  14446. # Mask the modulus
  14447. andq %rdx, %rcx
  14448. andq %rdx, %rax
  14449. # Sub modulus (if overflow)
  14450. subq %rcx, %r8
  14451. sbbq %rdx, %r9
  14452. sbbq %rdx, %r10
  14453. sbbq %rax, %r11
  14454. # Sub
  14455. subq (%rdi), %r12
  14456. movq $0x00, %rdx
  14457. sbbq 8(%rdi), %r13
  14458. movq $-19, %rcx
  14459. sbbq 16(%rdi), %r14
  14460. movq $0x7fffffffffffffff, %rax
  14461. sbbq 24(%rdi), %r15
  14462. sbbq $0x00, %rdx
  14463. # Mask the modulus
  14464. andq %rdx, %rcx
  14465. andq %rdx, %rax
  14466. # Add modulus (if underflow)
  14467. addq %rcx, %r12
  14468. adcq %rdx, %r13
  14469. adcq %rdx, %r14
  14470. adcq %rax, %r15
  14471. movq %r8, (%rbx)
  14472. movq %r9, 8(%rbx)
  14473. movq %r10, 16(%rbx)
  14474. movq %r11, 24(%rbx)
  14475. movq %r12, (%rdi)
  14476. movq %r13, 8(%rdi)
  14477. movq %r14, 16(%rdi)
  14478. movq %r15, 24(%rdi)
  14479. addq $48, %rsp
  14480. popq %r15
  14481. popq %r14
  14482. popq %r13
  14483. popq %r12
  14484. popq %rbx
  14485. popq %rbp
  14486. repz retq
  14487. #ifndef __APPLE__
  14488. .size fe_ge_madd_avx2,.-fe_ge_madd_avx2
  14489. #endif /* __APPLE__ */
  14490. #ifndef __APPLE__
  14491. .text
  14492. .globl fe_ge_msub_avx2
  14493. .type fe_ge_msub_avx2,@function
  14494. .align 16
  14495. fe_ge_msub_avx2:
  14496. #else
  14497. .section __TEXT,__text
  14498. .globl _fe_ge_msub_avx2
  14499. .p2align 4
  14500. _fe_ge_msub_avx2:
  14501. #endif /* __APPLE__ */
  14502. pushq %rbp
  14503. pushq %rbx
  14504. pushq %r12
  14505. pushq %r13
  14506. pushq %r14
  14507. pushq %r15
  14508. subq $48, %rsp
  14509. movq %rdi, (%rsp)
  14510. movq %rsi, 8(%rsp)
  14511. movq %rdx, 16(%rsp)
  14512. movq %rcx, 24(%rsp)
  14513. movq %r8, 32(%rsp)
  14514. movq %r9, 40(%rsp)
  14515. movq 8(%rsp), %rsi
  14516. movq 40(%rsp), %rbx
  14517. movq 32(%rsp), %rbp
  14518. # Add
  14519. movq (%rbx), %r8
  14520. movq 8(%rbx), %r9
  14521. movq 16(%rbx), %r10
  14522. movq 24(%rbx), %rdx
  14523. movq %r8, %r12
  14524. addq (%rbp), %r8
  14525. movq %r9, %r13
  14526. adcq 8(%rbp), %r9
  14527. movq %r10, %r14
  14528. adcq 16(%rbp), %r10
  14529. movq %rdx, %r15
  14530. adcq 24(%rbp), %rdx
  14531. movq $-19, %rcx
  14532. movq %rdx, %r11
  14533. movq $0x7fffffffffffffff, %rax
  14534. sarq $63, %rdx
  14535. # Mask the modulus
  14536. andq %rdx, %rcx
  14537. andq %rdx, %rax
  14538. # Sub modulus (if overflow)
  14539. subq %rcx, %r8
  14540. sbbq %rdx, %r9
  14541. sbbq %rdx, %r10
  14542. sbbq %rax, %r11
  14543. # Sub
  14544. subq (%rbp), %r12
  14545. movq $0x00, %rdx
  14546. sbbq 8(%rbp), %r13
  14547. movq $-19, %rcx
  14548. sbbq 16(%rbp), %r14
  14549. movq $0x7fffffffffffffff, %rax
  14550. sbbq 24(%rbp), %r15
  14551. sbbq $0x00, %rdx
  14552. # Mask the modulus
  14553. andq %rdx, %rcx
  14554. andq %rdx, %rax
  14555. # Add modulus (if underflow)
  14556. addq %rcx, %r12
  14557. adcq %rdx, %r13
  14558. adcq %rdx, %r14
  14559. adcq %rax, %r15
  14560. movq %r8, (%rdi)
  14561. movq %r9, 8(%rdi)
  14562. movq %r10, 16(%rdi)
  14563. movq %r11, 24(%rdi)
  14564. movq %r12, (%rsi)
  14565. movq %r13, 8(%rsi)
  14566. movq %r14, 16(%rsi)
  14567. movq %r15, 24(%rsi)
  14568. movq 16(%rsp), %rbx
  14569. movq 136(%rsp), %rbp
  14570. # Multiply
  14571. # A[0] * B[0]
  14572. movq (%rbp), %rdx
  14573. mulxq (%rdi), %r8, %r9
  14574. # A[2] * B[0]
  14575. mulxq 16(%rdi), %r10, %r11
  14576. # A[1] * B[0]
  14577. mulxq 8(%rdi), %rcx, %rax
  14578. xorq %r15, %r15
  14579. adcxq %rcx, %r9
  14580. # A[1] * B[3]
  14581. movq 24(%rbp), %rdx
  14582. mulxq 8(%rdi), %r12, %r13
  14583. adcxq %rax, %r10
  14584. # A[0] * B[1]
  14585. movq 8(%rbp), %rdx
  14586. mulxq (%rdi), %rcx, %rax
  14587. adoxq %rcx, %r9
  14588. # A[2] * B[1]
  14589. mulxq 16(%rdi), %rcx, %r14
  14590. adoxq %rax, %r10
  14591. adcxq %rcx, %r11
  14592. # A[1] * B[2]
  14593. movq 16(%rbp), %rdx
  14594. mulxq 8(%rdi), %rcx, %rax
  14595. adcxq %r14, %r12
  14596. adoxq %rcx, %r11
  14597. adcxq %r15, %r13
  14598. adoxq %rax, %r12
  14599. # A[0] * B[2]
  14600. mulxq (%rdi), %rcx, %rax
  14601. adoxq %r15, %r13
  14602. xorq %r14, %r14
  14603. adcxq %rcx, %r10
  14604. # A[1] * B[1]
  14605. movq 8(%rbp), %rdx
  14606. mulxq 8(%rdi), %rdx, %rcx
  14607. adcxq %rax, %r11
  14608. adoxq %rdx, %r10
  14609. # A[3] * B[1]
  14610. movq 8(%rbp), %rdx
  14611. adoxq %rcx, %r11
  14612. mulxq 24(%rdi), %rcx, %rax
  14613. adcxq %rcx, %r12
  14614. # A[2] * B[2]
  14615. movq 16(%rbp), %rdx
  14616. mulxq 16(%rdi), %rdx, %rcx
  14617. adcxq %rax, %r13
  14618. adoxq %rdx, %r12
  14619. # A[3] * B[3]
  14620. movq 24(%rbp), %rdx
  14621. adoxq %rcx, %r13
  14622. mulxq 24(%rdi), %rcx, %rax
  14623. adoxq %r15, %r14
  14624. adcxq %rcx, %r14
  14625. # A[0] * B[3]
  14626. mulxq (%rdi), %rdx, %rcx
  14627. adcxq %rax, %r15
  14628. xorq %rax, %rax
  14629. adcxq %rdx, %r11
  14630. # A[3] * B[0]
  14631. movq (%rbp), %rdx
  14632. adcxq %rcx, %r12
  14633. mulxq 24(%rdi), %rdx, %rcx
  14634. adoxq %rdx, %r11
  14635. adoxq %rcx, %r12
  14636. # A[2] * B[3]
  14637. movq 24(%rbp), %rdx
  14638. mulxq 16(%rdi), %rdx, %rcx
  14639. adcxq %rdx, %r13
  14640. # A[3] * B[2]
  14641. movq 16(%rbp), %rdx
  14642. adcxq %rcx, %r14
  14643. mulxq 24(%rdi), %rcx, %rdx
  14644. adcxq %rax, %r15
  14645. adoxq %rcx, %r13
  14646. adoxq %rdx, %r14
  14647. adoxq %rax, %r15
  14648. # Reduce
  14649. movq $0x7fffffffffffffff, %rax
  14650. # Move top half into t4-t7 and remove top bit from t3
  14651. shldq $0x01, %r14, %r15
  14652. shldq $0x01, %r13, %r14
  14653. shldq $0x01, %r12, %r13
  14654. shldq $0x01, %r11, %r12
  14655. andq %rax, %r11
  14656. # Multiply top half by 19
  14657. movq $19, %rdx
  14658. xorq %rax, %rax
  14659. mulxq %r12, %rcx, %r12
  14660. adcxq %rcx, %r8
  14661. adoxq %r12, %r9
  14662. mulxq %r13, %rcx, %r13
  14663. adcxq %rcx, %r9
  14664. adoxq %r13, %r10
  14665. mulxq %r14, %rcx, %r14
  14666. adcxq %rcx, %r10
  14667. adoxq %r14, %r11
  14668. mulxq %r15, %r15, %rdx
  14669. adcxq %r15, %r11
  14670. adoxq %rax, %rdx
  14671. adcxq %rax, %rdx
  14672. # Overflow
  14673. shldq $0x01, %r11, %rdx
  14674. movq $0x7fffffffffffffff, %rax
  14675. imulq $19, %rdx, %rcx
  14676. andq %rax, %r11
  14677. addq %rcx, %r8
  14678. adcq $0x00, %r9
  14679. adcq $0x00, %r10
  14680. adcq $0x00, %r11
  14681. # Reduce if top bit set
  14682. movq %r11, %rdx
  14683. shrq $63, %rdx
  14684. imulq $19, %rdx, %rcx
  14685. andq %rax, %r11
  14686. addq %rcx, %r8
  14687. adcq $0x00, %r9
  14688. adcq $0x00, %r10
  14689. adcq $0x00, %r11
  14690. # Store
  14691. movq %r8, (%rbx)
  14692. movq %r9, 8(%rbx)
  14693. movq %r10, 16(%rbx)
  14694. movq %r11, 24(%rbx)
  14695. movq 128(%rsp), %rdi
  14696. # Multiply
  14697. # A[0] * B[0]
  14698. movq (%rdi), %rdx
  14699. mulxq (%rsi), %r8, %r9
  14700. # A[2] * B[0]
  14701. mulxq 16(%rsi), %r10, %r11
  14702. # A[1] * B[0]
  14703. mulxq 8(%rsi), %rcx, %rax
  14704. xorq %r15, %r15
  14705. adcxq %rcx, %r9
  14706. # A[1] * B[3]
  14707. movq 24(%rdi), %rdx
  14708. mulxq 8(%rsi), %r12, %r13
  14709. adcxq %rax, %r10
  14710. # A[0] * B[1]
  14711. movq 8(%rdi), %rdx
  14712. mulxq (%rsi), %rcx, %rax
  14713. adoxq %rcx, %r9
  14714. # A[2] * B[1]
  14715. mulxq 16(%rsi), %rcx, %r14
  14716. adoxq %rax, %r10
  14717. adcxq %rcx, %r11
  14718. # A[1] * B[2]
  14719. movq 16(%rdi), %rdx
  14720. mulxq 8(%rsi), %rcx, %rax
  14721. adcxq %r14, %r12
  14722. adoxq %rcx, %r11
  14723. adcxq %r15, %r13
  14724. adoxq %rax, %r12
  14725. # A[0] * B[2]
  14726. mulxq (%rsi), %rcx, %rax
  14727. adoxq %r15, %r13
  14728. xorq %r14, %r14
  14729. adcxq %rcx, %r10
  14730. # A[1] * B[1]
  14731. movq 8(%rdi), %rdx
  14732. mulxq 8(%rsi), %rdx, %rcx
  14733. adcxq %rax, %r11
  14734. adoxq %rdx, %r10
  14735. # A[3] * B[1]
  14736. movq 8(%rdi), %rdx
  14737. adoxq %rcx, %r11
  14738. mulxq 24(%rsi), %rcx, %rax
  14739. adcxq %rcx, %r12
  14740. # A[2] * B[2]
  14741. movq 16(%rdi), %rdx
  14742. mulxq 16(%rsi), %rdx, %rcx
  14743. adcxq %rax, %r13
  14744. adoxq %rdx, %r12
  14745. # A[3] * B[3]
  14746. movq 24(%rdi), %rdx
  14747. adoxq %rcx, %r13
  14748. mulxq 24(%rsi), %rcx, %rax
  14749. adoxq %r15, %r14
  14750. adcxq %rcx, %r14
  14751. # A[0] * B[3]
  14752. mulxq (%rsi), %rdx, %rcx
  14753. adcxq %rax, %r15
  14754. xorq %rax, %rax
  14755. adcxq %rdx, %r11
  14756. # A[3] * B[0]
  14757. movq (%rdi), %rdx
  14758. adcxq %rcx, %r12
  14759. mulxq 24(%rsi), %rdx, %rcx
  14760. adoxq %rdx, %r11
  14761. adoxq %rcx, %r12
  14762. # A[2] * B[3]
  14763. movq 24(%rdi), %rdx
  14764. mulxq 16(%rsi), %rdx, %rcx
  14765. adcxq %rdx, %r13
  14766. # A[3] * B[2]
  14767. movq 16(%rdi), %rdx
  14768. adcxq %rcx, %r14
  14769. mulxq 24(%rsi), %rcx, %rdx
  14770. adcxq %rax, %r15
  14771. adoxq %rcx, %r13
  14772. adoxq %rdx, %r14
  14773. adoxq %rax, %r15
  14774. # Reduce
  14775. movq $0x7fffffffffffffff, %rax
  14776. # Move top half into t4-t7 and remove top bit from t3
  14777. shldq $0x01, %r14, %r15
  14778. shldq $0x01, %r13, %r14
  14779. shldq $0x01, %r12, %r13
  14780. shldq $0x01, %r11, %r12
  14781. andq %rax, %r11
  14782. # Multiply top half by 19
  14783. movq $19, %rdx
  14784. xorq %rax, %rax
  14785. mulxq %r12, %rcx, %r12
  14786. adcxq %rcx, %r8
  14787. adoxq %r12, %r9
  14788. mulxq %r13, %rcx, %r13
  14789. adcxq %rcx, %r9
  14790. adoxq %r13, %r10
  14791. mulxq %r14, %rcx, %r14
  14792. adcxq %rcx, %r10
  14793. adoxq %r14, %r11
  14794. mulxq %r15, %r15, %rdx
  14795. adcxq %r15, %r11
  14796. adoxq %rax, %rdx
  14797. adcxq %rax, %rdx
  14798. # Overflow
  14799. shldq $0x01, %r11, %rdx
  14800. movq $0x7fffffffffffffff, %rax
  14801. imulq $19, %rdx, %rcx
  14802. andq %rax, %r11
  14803. addq %rcx, %r8
  14804. adcq $0x00, %r9
  14805. adcq $0x00, %r10
  14806. adcq $0x00, %r11
  14807. # Reduce if top bit set
  14808. movq %r11, %rdx
  14809. shrq $63, %rdx
  14810. imulq $19, %rdx, %rcx
  14811. andq %rax, %r11
  14812. addq %rcx, %r8
  14813. adcq $0x00, %r9
  14814. adcq $0x00, %r10
  14815. adcq $0x00, %r11
  14816. # Store
  14817. movq %r8, (%rsi)
  14818. movq %r9, 8(%rsi)
  14819. movq %r10, 16(%rsi)
  14820. movq %r11, 24(%rsi)
  14821. movq 24(%rsp), %rdi
  14822. movq 120(%rsp), %rsi
  14823. movq 112(%rsp), %rbp
  14824. # Multiply
  14825. # A[0] * B[0]
  14826. movq (%rbp), %rdx
  14827. mulxq (%rsi), %r8, %r9
  14828. # A[2] * B[0]
  14829. mulxq 16(%rsi), %r10, %r11
  14830. # A[1] * B[0]
  14831. mulxq 8(%rsi), %rcx, %rax
  14832. xorq %r15, %r15
  14833. adcxq %rcx, %r9
  14834. # A[1] * B[3]
  14835. movq 24(%rbp), %rdx
  14836. mulxq 8(%rsi), %r12, %r13
  14837. adcxq %rax, %r10
  14838. # A[0] * B[1]
  14839. movq 8(%rbp), %rdx
  14840. mulxq (%rsi), %rcx, %rax
  14841. adoxq %rcx, %r9
  14842. # A[2] * B[1]
  14843. mulxq 16(%rsi), %rcx, %r14
  14844. adoxq %rax, %r10
  14845. adcxq %rcx, %r11
  14846. # A[1] * B[2]
  14847. movq 16(%rbp), %rdx
  14848. mulxq 8(%rsi), %rcx, %rax
  14849. adcxq %r14, %r12
  14850. adoxq %rcx, %r11
  14851. adcxq %r15, %r13
  14852. adoxq %rax, %r12
  14853. # A[0] * B[2]
  14854. mulxq (%rsi), %rcx, %rax
  14855. adoxq %r15, %r13
  14856. xorq %r14, %r14
  14857. adcxq %rcx, %r10
  14858. # A[1] * B[1]
  14859. movq 8(%rbp), %rdx
  14860. mulxq 8(%rsi), %rdx, %rcx
  14861. adcxq %rax, %r11
  14862. adoxq %rdx, %r10
  14863. # A[3] * B[1]
  14864. movq 8(%rbp), %rdx
  14865. adoxq %rcx, %r11
  14866. mulxq 24(%rsi), %rcx, %rax
  14867. adcxq %rcx, %r12
  14868. # A[2] * B[2]
  14869. movq 16(%rbp), %rdx
  14870. mulxq 16(%rsi), %rdx, %rcx
  14871. adcxq %rax, %r13
  14872. adoxq %rdx, %r12
  14873. # A[3] * B[3]
  14874. movq 24(%rbp), %rdx
  14875. adoxq %rcx, %r13
  14876. mulxq 24(%rsi), %rcx, %rax
  14877. adoxq %r15, %r14
  14878. adcxq %rcx, %r14
  14879. # A[0] * B[3]
  14880. mulxq (%rsi), %rdx, %rcx
  14881. adcxq %rax, %r15
  14882. xorq %rax, %rax
  14883. adcxq %rdx, %r11
  14884. # A[3] * B[0]
  14885. movq (%rbp), %rdx
  14886. adcxq %rcx, %r12
  14887. mulxq 24(%rsi), %rdx, %rcx
  14888. adoxq %rdx, %r11
  14889. adoxq %rcx, %r12
  14890. # A[2] * B[3]
  14891. movq 24(%rbp), %rdx
  14892. mulxq 16(%rsi), %rdx, %rcx
  14893. adcxq %rdx, %r13
  14894. # A[3] * B[2]
  14895. movq 16(%rbp), %rdx
  14896. adcxq %rcx, %r14
  14897. mulxq 24(%rsi), %rcx, %rdx
  14898. adcxq %rax, %r15
  14899. adoxq %rcx, %r13
  14900. adoxq %rdx, %r14
  14901. adoxq %rax, %r15
  14902. # Reduce
  14903. movq $0x7fffffffffffffff, %rax
  14904. # Move top half into t4-t7 and remove top bit from t3
  14905. shldq $0x01, %r14, %r15
  14906. shldq $0x01, %r13, %r14
  14907. shldq $0x01, %r12, %r13
  14908. shldq $0x01, %r11, %r12
  14909. andq %rax, %r11
  14910. # Multiply top half by 19
  14911. movq $19, %rdx
  14912. xorq %rax, %rax
  14913. mulxq %r12, %rcx, %r12
  14914. adcxq %rcx, %r8
  14915. adoxq %r12, %r9
  14916. mulxq %r13, %rcx, %r13
  14917. adcxq %rcx, %r9
  14918. adoxq %r13, %r10
  14919. mulxq %r14, %rcx, %r14
  14920. adcxq %rcx, %r10
  14921. adoxq %r14, %r11
  14922. mulxq %r15, %r15, %rdx
  14923. adcxq %r15, %r11
  14924. adoxq %rax, %rdx
  14925. adcxq %rax, %rdx
  14926. # Overflow
  14927. shldq $0x01, %r11, %rdx
  14928. movq $0x7fffffffffffffff, %rax
  14929. imulq $19, %rdx, %rcx
  14930. andq %rax, %r11
  14931. addq %rcx, %r8
  14932. adcq $0x00, %r9
  14933. adcq $0x00, %r10
  14934. adcq $0x00, %r11
  14935. # Reduce if top bit set
  14936. movq %r11, %rdx
  14937. shrq $63, %rdx
  14938. imulq $19, %rdx, %rcx
  14939. andq %rax, %r11
  14940. addq %rcx, %r8
  14941. adcq $0x00, %r9
  14942. adcq $0x00, %r10
  14943. adcq $0x00, %r11
  14944. # Store
  14945. movq %r8, (%rdi)
  14946. movq %r9, 8(%rdi)
  14947. movq %r10, 16(%rdi)
  14948. movq %r11, 24(%rdi)
  14949. movq 8(%rsp), %rsi
  14950. movq (%rsp), %rbp
  14951. # Add
  14952. movq (%rbx), %r8
  14953. movq 8(%rbx), %r9
  14954. movq 16(%rbx), %r10
  14955. movq 24(%rbx), %rdx
  14956. movq %r8, %r12
  14957. addq (%rsi), %r8
  14958. movq %r9, %r13
  14959. adcq 8(%rsi), %r9
  14960. movq %r10, %r14
  14961. adcq 16(%rsi), %r10
  14962. movq %rdx, %r15
  14963. adcq 24(%rsi), %rdx
  14964. movq $-19, %rcx
  14965. movq %rdx, %r11
  14966. movq $0x7fffffffffffffff, %rax
  14967. sarq $63, %rdx
  14968. # Mask the modulus
  14969. andq %rdx, %rcx
  14970. andq %rdx, %rax
  14971. # Sub modulus (if overflow)
  14972. subq %rcx, %r8
  14973. sbbq %rdx, %r9
  14974. sbbq %rdx, %r10
  14975. sbbq %rax, %r11
  14976. # Sub
  14977. subq (%rsi), %r12
  14978. movq $0x00, %rdx
  14979. sbbq 8(%rsi), %r13
  14980. movq $-19, %rcx
  14981. sbbq 16(%rsi), %r14
  14982. movq $0x7fffffffffffffff, %rax
  14983. sbbq 24(%rsi), %r15
  14984. sbbq $0x00, %rdx
  14985. # Mask the modulus
  14986. andq %rdx, %rcx
  14987. andq %rdx, %rax
  14988. # Add modulus (if underflow)
  14989. addq %rcx, %r12
  14990. adcq %rdx, %r13
  14991. adcq %rdx, %r14
  14992. adcq %rax, %r15
  14993. movq %r8, (%rsi)
  14994. movq %r9, 8(%rsi)
  14995. movq %r10, 16(%rsi)
  14996. movq %r11, 24(%rsi)
  14997. movq %r12, (%rbp)
  14998. movq %r13, 8(%rbp)
  14999. movq %r14, 16(%rbp)
  15000. movq %r15, 24(%rbp)
  15001. movq 104(%rsp), %rsi
  15002. # Double
  15003. movq (%rsi), %r8
  15004. movq 8(%rsi), %r9
  15005. addq %r8, %r8
  15006. movq 16(%rsi), %r10
  15007. adcq %r9, %r9
  15008. movq 24(%rsi), %rdx
  15009. adcq %r10, %r10
  15010. movq $-19, %rcx
  15011. adcq %rdx, %rdx
  15012. movq $0x7fffffffffffffff, %rax
  15013. movq %rdx, %r11
  15014. sarq $63, %rdx
  15015. # Mask the modulus
  15016. andq %rdx, %rcx
  15017. andq %rdx, %rax
  15018. # Sub modulus (if overflow)
  15019. subq %rcx, %r8
  15020. sbbq %rdx, %r9
  15021. sbbq %rdx, %r10
  15022. sbbq %rax, %r11
  15023. movq %r8, (%rbx)
  15024. movq %r9, 8(%rbx)
  15025. movq %r10, 16(%rbx)
  15026. movq %r11, 24(%rbx)
  15027. # Add
  15028. movq (%rbx), %r8
  15029. movq 8(%rbx), %r9
  15030. movq 16(%rbx), %r10
  15031. movq 24(%rbx), %rdx
  15032. movq %r8, %r12
  15033. addq (%rdi), %r8
  15034. movq %r9, %r13
  15035. adcq 8(%rdi), %r9
  15036. movq %r10, %r14
  15037. adcq 16(%rdi), %r10
  15038. movq %rdx, %r15
  15039. adcq 24(%rdi), %rdx
  15040. movq $-19, %rcx
  15041. movq %rdx, %r11
  15042. movq $0x7fffffffffffffff, %rax
  15043. sarq $63, %rdx
  15044. # Mask the modulus
  15045. andq %rdx, %rcx
  15046. andq %rdx, %rax
  15047. # Sub modulus (if overflow)
  15048. subq %rcx, %r8
  15049. sbbq %rdx, %r9
  15050. sbbq %rdx, %r10
  15051. sbbq %rax, %r11
  15052. # Sub
  15053. subq (%rdi), %r12
  15054. movq $0x00, %rdx
  15055. sbbq 8(%rdi), %r13
  15056. movq $-19, %rcx
  15057. sbbq 16(%rdi), %r14
  15058. movq $0x7fffffffffffffff, %rax
  15059. sbbq 24(%rdi), %r15
  15060. sbbq $0x00, %rdx
  15061. # Mask the modulus
  15062. andq %rdx, %rcx
  15063. andq %rdx, %rax
  15064. # Add modulus (if underflow)
  15065. addq %rcx, %r12
  15066. adcq %rdx, %r13
  15067. adcq %rdx, %r14
  15068. adcq %rax, %r15
  15069. movq %r8, (%rdi)
  15070. movq %r9, 8(%rdi)
  15071. movq %r10, 16(%rdi)
  15072. movq %r11, 24(%rdi)
  15073. movq %r12, (%rbx)
  15074. movq %r13, 8(%rbx)
  15075. movq %r14, 16(%rbx)
  15076. movq %r15, 24(%rbx)
  15077. addq $48, %rsp
  15078. popq %r15
  15079. popq %r14
  15080. popq %r13
  15081. popq %r12
  15082. popq %rbx
  15083. popq %rbp
  15084. repz retq
  15085. #ifndef __APPLE__
  15086. .size fe_ge_msub_avx2,.-fe_ge_msub_avx2
  15087. #endif /* __APPLE__ */
  15088. #ifndef __APPLE__
  15089. .text
  15090. .globl fe_ge_add_avx2
  15091. .type fe_ge_add_avx2,@function
  15092. .align 16
  15093. fe_ge_add_avx2:
  15094. #else
  15095. .section __TEXT,__text
  15096. .globl _fe_ge_add_avx2
  15097. .p2align 4
  15098. _fe_ge_add_avx2:
  15099. #endif /* __APPLE__ */
  15100. pushq %rbx
  15101. pushq %rbp
  15102. pushq %r12
  15103. pushq %r13
  15104. pushq %r14
  15105. pushq %r15
  15106. subq $0x50, %rsp
  15107. movq %rdi, (%rsp)
  15108. movq %rsi, 8(%rsp)
  15109. movq %rdx, 16(%rsp)
  15110. movq %rcx, 24(%rsp)
  15111. movq %r8, 32(%rsp)
  15112. movq %r9, 40(%rsp)
  15113. movq 8(%rsp), %rsi
  15114. movq 40(%rsp), %rbx
  15115. movq 32(%rsp), %rbp
  15116. # Add
  15117. movq (%rbx), %r8
  15118. movq 8(%rbx), %r9
  15119. movq 16(%rbx), %r10
  15120. movq 24(%rbx), %rdx
  15121. movq %r8, %r12
  15122. addq (%rbp), %r8
  15123. movq %r9, %r13
  15124. adcq 8(%rbp), %r9
  15125. movq %r10, %r14
  15126. adcq 16(%rbp), %r10
  15127. movq %rdx, %r15
  15128. adcq 24(%rbp), %rdx
  15129. movq $-19, %rcx
  15130. movq %rdx, %r11
  15131. movq $0x7fffffffffffffff, %rax
  15132. sarq $63, %rdx
  15133. # Mask the modulus
  15134. andq %rdx, %rcx
  15135. andq %rdx, %rax
  15136. # Sub modulus (if overflow)
  15137. subq %rcx, %r8
  15138. sbbq %rdx, %r9
  15139. sbbq %rdx, %r10
  15140. sbbq %rax, %r11
  15141. # Sub
  15142. subq (%rbp), %r12
  15143. movq $0x00, %rdx
  15144. sbbq 8(%rbp), %r13
  15145. movq $-19, %rcx
  15146. sbbq 16(%rbp), %r14
  15147. movq $0x7fffffffffffffff, %rax
  15148. sbbq 24(%rbp), %r15
  15149. sbbq $0x00, %rdx
  15150. # Mask the modulus
  15151. andq %rdx, %rcx
  15152. andq %rdx, %rax
  15153. # Add modulus (if underflow)
  15154. addq %rcx, %r12
  15155. adcq %rdx, %r13
  15156. adcq %rdx, %r14
  15157. adcq %rax, %r15
  15158. movq %r8, (%rdi)
  15159. movq %r9, 8(%rdi)
  15160. movq %r10, 16(%rdi)
  15161. movq %r11, 24(%rdi)
  15162. movq %r12, (%rsi)
  15163. movq %r13, 8(%rsi)
  15164. movq %r14, 16(%rsi)
  15165. movq %r15, 24(%rsi)
  15166. movq 16(%rsp), %rbx
  15167. movq 168(%rsp), %rbp
  15168. # Multiply
  15169. # A[0] * B[0]
  15170. movq (%rbp), %rdx
  15171. mulxq (%rdi), %r8, %r9
  15172. # A[2] * B[0]
  15173. mulxq 16(%rdi), %r10, %r11
  15174. # A[1] * B[0]
  15175. mulxq 8(%rdi), %rcx, %rax
  15176. xorq %r15, %r15
  15177. adcxq %rcx, %r9
  15178. # A[1] * B[3]
  15179. movq 24(%rbp), %rdx
  15180. mulxq 8(%rdi), %r12, %r13
  15181. adcxq %rax, %r10
  15182. # A[0] * B[1]
  15183. movq 8(%rbp), %rdx
  15184. mulxq (%rdi), %rcx, %rax
  15185. adoxq %rcx, %r9
  15186. # A[2] * B[1]
  15187. mulxq 16(%rdi), %rcx, %r14
  15188. adoxq %rax, %r10
  15189. adcxq %rcx, %r11
  15190. # A[1] * B[2]
  15191. movq 16(%rbp), %rdx
  15192. mulxq 8(%rdi), %rcx, %rax
  15193. adcxq %r14, %r12
  15194. adoxq %rcx, %r11
  15195. adcxq %r15, %r13
  15196. adoxq %rax, %r12
  15197. # A[0] * B[2]
  15198. mulxq (%rdi), %rcx, %rax
  15199. adoxq %r15, %r13
  15200. xorq %r14, %r14
  15201. adcxq %rcx, %r10
  15202. # A[1] * B[1]
  15203. movq 8(%rbp), %rdx
  15204. mulxq 8(%rdi), %rdx, %rcx
  15205. adcxq %rax, %r11
  15206. adoxq %rdx, %r10
  15207. # A[3] * B[1]
  15208. movq 8(%rbp), %rdx
  15209. adoxq %rcx, %r11
  15210. mulxq 24(%rdi), %rcx, %rax
  15211. adcxq %rcx, %r12
  15212. # A[2] * B[2]
  15213. movq 16(%rbp), %rdx
  15214. mulxq 16(%rdi), %rdx, %rcx
  15215. adcxq %rax, %r13
  15216. adoxq %rdx, %r12
  15217. # A[3] * B[3]
  15218. movq 24(%rbp), %rdx
  15219. adoxq %rcx, %r13
  15220. mulxq 24(%rdi), %rcx, %rax
  15221. adoxq %r15, %r14
  15222. adcxq %rcx, %r14
  15223. # A[0] * B[3]
  15224. mulxq (%rdi), %rdx, %rcx
  15225. adcxq %rax, %r15
  15226. xorq %rax, %rax
  15227. adcxq %rdx, %r11
  15228. # A[3] * B[0]
  15229. movq (%rbp), %rdx
  15230. adcxq %rcx, %r12
  15231. mulxq 24(%rdi), %rdx, %rcx
  15232. adoxq %rdx, %r11
  15233. adoxq %rcx, %r12
  15234. # A[2] * B[3]
  15235. movq 24(%rbp), %rdx
  15236. mulxq 16(%rdi), %rdx, %rcx
  15237. adcxq %rdx, %r13
  15238. # A[3] * B[2]
  15239. movq 16(%rbp), %rdx
  15240. adcxq %rcx, %r14
  15241. mulxq 24(%rdi), %rcx, %rdx
  15242. adcxq %rax, %r15
  15243. adoxq %rcx, %r13
  15244. adoxq %rdx, %r14
  15245. adoxq %rax, %r15
  15246. # Reduce
  15247. movq $0x7fffffffffffffff, %rax
  15248. # Move top half into t4-t7 and remove top bit from t3
  15249. shldq $0x01, %r14, %r15
  15250. shldq $0x01, %r13, %r14
  15251. shldq $0x01, %r12, %r13
  15252. shldq $0x01, %r11, %r12
  15253. andq %rax, %r11
  15254. # Multiply top half by 19
  15255. movq $19, %rdx
  15256. xorq %rax, %rax
  15257. mulxq %r12, %rcx, %r12
  15258. adcxq %rcx, %r8
  15259. adoxq %r12, %r9
  15260. mulxq %r13, %rcx, %r13
  15261. adcxq %rcx, %r9
  15262. adoxq %r13, %r10
  15263. mulxq %r14, %rcx, %r14
  15264. adcxq %rcx, %r10
  15265. adoxq %r14, %r11
  15266. mulxq %r15, %r15, %rdx
  15267. adcxq %r15, %r11
  15268. adoxq %rax, %rdx
  15269. adcxq %rax, %rdx
  15270. # Overflow
  15271. shldq $0x01, %r11, %rdx
  15272. movq $0x7fffffffffffffff, %rax
  15273. imulq $19, %rdx, %rcx
  15274. andq %rax, %r11
  15275. addq %rcx, %r8
  15276. adcq $0x00, %r9
  15277. adcq $0x00, %r10
  15278. adcq $0x00, %r11
  15279. # Reduce if top bit set
  15280. movq %r11, %rdx
  15281. shrq $63, %rdx
  15282. imulq $19, %rdx, %rcx
  15283. andq %rax, %r11
  15284. addq %rcx, %r8
  15285. adcq $0x00, %r9
  15286. adcq $0x00, %r10
  15287. adcq $0x00, %r11
  15288. # Store
  15289. movq %r8, (%rbx)
  15290. movq %r9, 8(%rbx)
  15291. movq %r10, 16(%rbx)
  15292. movq %r11, 24(%rbx)
  15293. movq 176(%rsp), %rbx
  15294. # Multiply
  15295. # A[0] * B[0]
  15296. movq (%rbx), %rdx
  15297. mulxq (%rsi), %r8, %r9
  15298. # A[2] * B[0]
  15299. mulxq 16(%rsi), %r10, %r11
  15300. # A[1] * B[0]
  15301. mulxq 8(%rsi), %rcx, %rax
  15302. xorq %r15, %r15
  15303. adcxq %rcx, %r9
  15304. # A[1] * B[3]
  15305. movq 24(%rbx), %rdx
  15306. mulxq 8(%rsi), %r12, %r13
  15307. adcxq %rax, %r10
  15308. # A[0] * B[1]
  15309. movq 8(%rbx), %rdx
  15310. mulxq (%rsi), %rcx, %rax
  15311. adoxq %rcx, %r9
  15312. # A[2] * B[1]
  15313. mulxq 16(%rsi), %rcx, %r14
  15314. adoxq %rax, %r10
  15315. adcxq %rcx, %r11
  15316. # A[1] * B[2]
  15317. movq 16(%rbx), %rdx
  15318. mulxq 8(%rsi), %rcx, %rax
  15319. adcxq %r14, %r12
  15320. adoxq %rcx, %r11
  15321. adcxq %r15, %r13
  15322. adoxq %rax, %r12
  15323. # A[0] * B[2]
  15324. mulxq (%rsi), %rcx, %rax
  15325. adoxq %r15, %r13
  15326. xorq %r14, %r14
  15327. adcxq %rcx, %r10
  15328. # A[1] * B[1]
  15329. movq 8(%rbx), %rdx
  15330. mulxq 8(%rsi), %rdx, %rcx
  15331. adcxq %rax, %r11
  15332. adoxq %rdx, %r10
  15333. # A[3] * B[1]
  15334. movq 8(%rbx), %rdx
  15335. adoxq %rcx, %r11
  15336. mulxq 24(%rsi), %rcx, %rax
  15337. adcxq %rcx, %r12
  15338. # A[2] * B[2]
  15339. movq 16(%rbx), %rdx
  15340. mulxq 16(%rsi), %rdx, %rcx
  15341. adcxq %rax, %r13
  15342. adoxq %rdx, %r12
  15343. # A[3] * B[3]
  15344. movq 24(%rbx), %rdx
  15345. adoxq %rcx, %r13
  15346. mulxq 24(%rsi), %rcx, %rax
  15347. adoxq %r15, %r14
  15348. adcxq %rcx, %r14
  15349. # A[0] * B[3]
  15350. mulxq (%rsi), %rdx, %rcx
  15351. adcxq %rax, %r15
  15352. xorq %rax, %rax
  15353. adcxq %rdx, %r11
  15354. # A[3] * B[0]
  15355. movq (%rbx), %rdx
  15356. adcxq %rcx, %r12
  15357. mulxq 24(%rsi), %rdx, %rcx
  15358. adoxq %rdx, %r11
  15359. adoxq %rcx, %r12
  15360. # A[2] * B[3]
  15361. movq 24(%rbx), %rdx
  15362. mulxq 16(%rsi), %rdx, %rcx
  15363. adcxq %rdx, %r13
  15364. # A[3] * B[2]
  15365. movq 16(%rbx), %rdx
  15366. adcxq %rcx, %r14
  15367. mulxq 24(%rsi), %rcx, %rdx
  15368. adcxq %rax, %r15
  15369. adoxq %rcx, %r13
  15370. adoxq %rdx, %r14
  15371. adoxq %rax, %r15
  15372. # Reduce
  15373. movq $0x7fffffffffffffff, %rax
  15374. # Move top half into t4-t7 and remove top bit from t3
  15375. shldq $0x01, %r14, %r15
  15376. shldq $0x01, %r13, %r14
  15377. shldq $0x01, %r12, %r13
  15378. shldq $0x01, %r11, %r12
  15379. andq %rax, %r11
  15380. # Multiply top half by 19
  15381. movq $19, %rdx
  15382. xorq %rax, %rax
  15383. mulxq %r12, %rcx, %r12
  15384. adcxq %rcx, %r8
  15385. adoxq %r12, %r9
  15386. mulxq %r13, %rcx, %r13
  15387. adcxq %rcx, %r9
  15388. adoxq %r13, %r10
  15389. mulxq %r14, %rcx, %r14
  15390. adcxq %rcx, %r10
  15391. adoxq %r14, %r11
  15392. mulxq %r15, %r15, %rdx
  15393. adcxq %r15, %r11
  15394. adoxq %rax, %rdx
  15395. adcxq %rax, %rdx
  15396. # Overflow
  15397. shldq $0x01, %r11, %rdx
  15398. movq $0x7fffffffffffffff, %rax
  15399. imulq $19, %rdx, %rcx
  15400. andq %rax, %r11
  15401. addq %rcx, %r8
  15402. adcq $0x00, %r9
  15403. adcq $0x00, %r10
  15404. adcq $0x00, %r11
  15405. # Reduce if top bit set
  15406. movq %r11, %rdx
  15407. shrq $63, %rdx
  15408. imulq $19, %rdx, %rcx
  15409. andq %rax, %r11
  15410. addq %rcx, %r8
  15411. adcq $0x00, %r9
  15412. adcq $0x00, %r10
  15413. adcq $0x00, %r11
  15414. # Store
  15415. movq %r8, (%rsi)
  15416. movq %r9, 8(%rsi)
  15417. movq %r10, 16(%rsi)
  15418. movq %r11, 24(%rsi)
  15419. movq 24(%rsp), %rsi
  15420. movq 160(%rsp), %rbx
  15421. movq 144(%rsp), %rbp
  15422. # Multiply
  15423. # A[0] * B[0]
  15424. movq (%rbp), %rdx
  15425. mulxq (%rbx), %r8, %r9
  15426. # A[2] * B[0]
  15427. mulxq 16(%rbx), %r10, %r11
  15428. # A[1] * B[0]
  15429. mulxq 8(%rbx), %rcx, %rax
  15430. xorq %r15, %r15
  15431. adcxq %rcx, %r9
  15432. # A[1] * B[3]
  15433. movq 24(%rbp), %rdx
  15434. mulxq 8(%rbx), %r12, %r13
  15435. adcxq %rax, %r10
  15436. # A[0] * B[1]
  15437. movq 8(%rbp), %rdx
  15438. mulxq (%rbx), %rcx, %rax
  15439. adoxq %rcx, %r9
  15440. # A[2] * B[1]
  15441. mulxq 16(%rbx), %rcx, %r14
  15442. adoxq %rax, %r10
  15443. adcxq %rcx, %r11
  15444. # A[1] * B[2]
  15445. movq 16(%rbp), %rdx
  15446. mulxq 8(%rbx), %rcx, %rax
  15447. adcxq %r14, %r12
  15448. adoxq %rcx, %r11
  15449. adcxq %r15, %r13
  15450. adoxq %rax, %r12
  15451. # A[0] * B[2]
  15452. mulxq (%rbx), %rcx, %rax
  15453. adoxq %r15, %r13
  15454. xorq %r14, %r14
  15455. adcxq %rcx, %r10
  15456. # A[1] * B[1]
  15457. movq 8(%rbp), %rdx
  15458. mulxq 8(%rbx), %rdx, %rcx
  15459. adcxq %rax, %r11
  15460. adoxq %rdx, %r10
  15461. # A[3] * B[1]
  15462. movq 8(%rbp), %rdx
  15463. adoxq %rcx, %r11
  15464. mulxq 24(%rbx), %rcx, %rax
  15465. adcxq %rcx, %r12
  15466. # A[2] * B[2]
  15467. movq 16(%rbp), %rdx
  15468. mulxq 16(%rbx), %rdx, %rcx
  15469. adcxq %rax, %r13
  15470. adoxq %rdx, %r12
  15471. # A[3] * B[3]
  15472. movq 24(%rbp), %rdx
  15473. adoxq %rcx, %r13
  15474. mulxq 24(%rbx), %rcx, %rax
  15475. adoxq %r15, %r14
  15476. adcxq %rcx, %r14
  15477. # A[0] * B[3]
  15478. mulxq (%rbx), %rdx, %rcx
  15479. adcxq %rax, %r15
  15480. xorq %rax, %rax
  15481. adcxq %rdx, %r11
  15482. # A[3] * B[0]
  15483. movq (%rbp), %rdx
  15484. adcxq %rcx, %r12
  15485. mulxq 24(%rbx), %rdx, %rcx
  15486. adoxq %rdx, %r11
  15487. adoxq %rcx, %r12
  15488. # A[2] * B[3]
  15489. movq 24(%rbp), %rdx
  15490. mulxq 16(%rbx), %rdx, %rcx
  15491. adcxq %rdx, %r13
  15492. # A[3] * B[2]
  15493. movq 16(%rbp), %rdx
  15494. adcxq %rcx, %r14
  15495. mulxq 24(%rbx), %rcx, %rdx
  15496. adcxq %rax, %r15
  15497. adoxq %rcx, %r13
  15498. adoxq %rdx, %r14
  15499. adoxq %rax, %r15
  15500. # Reduce
  15501. movq $0x7fffffffffffffff, %rax
  15502. # Move top half into t4-t7 and remove top bit from t3
  15503. shldq $0x01, %r14, %r15
  15504. shldq $0x01, %r13, %r14
  15505. shldq $0x01, %r12, %r13
  15506. shldq $0x01, %r11, %r12
  15507. andq %rax, %r11
  15508. # Multiply top half by 19
  15509. movq $19, %rdx
  15510. xorq %rax, %rax
  15511. mulxq %r12, %rcx, %r12
  15512. adcxq %rcx, %r8
  15513. adoxq %r12, %r9
  15514. mulxq %r13, %rcx, %r13
  15515. adcxq %rcx, %r9
  15516. adoxq %r13, %r10
  15517. mulxq %r14, %rcx, %r14
  15518. adcxq %rcx, %r10
  15519. adoxq %r14, %r11
  15520. mulxq %r15, %r15, %rdx
  15521. adcxq %r15, %r11
  15522. adoxq %rax, %rdx
  15523. adcxq %rax, %rdx
  15524. # Overflow
  15525. shldq $0x01, %r11, %rdx
  15526. movq $0x7fffffffffffffff, %rax
  15527. imulq $19, %rdx, %rcx
  15528. andq %rax, %r11
  15529. addq %rcx, %r8
  15530. adcq $0x00, %r9
  15531. adcq $0x00, %r10
  15532. adcq $0x00, %r11
  15533. # Reduce if top bit set
  15534. movq %r11, %rdx
  15535. shrq $63, %rdx
  15536. imulq $19, %rdx, %rcx
  15537. andq %rax, %r11
  15538. addq %rcx, %r8
  15539. adcq $0x00, %r9
  15540. adcq $0x00, %r10
  15541. adcq $0x00, %r11
  15542. # Store
  15543. movq %r8, (%rsi)
  15544. movq %r9, 8(%rsi)
  15545. movq %r10, 16(%rsi)
  15546. movq %r11, 24(%rsi)
  15547. movq 136(%rsp), %rsi
  15548. movq 152(%rsp), %rbx
  15549. # Multiply
  15550. # A[0] * B[0]
  15551. movq (%rbx), %rdx
  15552. mulxq (%rsi), %r8, %r9
  15553. # A[2] * B[0]
  15554. mulxq 16(%rsi), %r10, %r11
  15555. # A[1] * B[0]
  15556. mulxq 8(%rsi), %rcx, %rax
  15557. xorq %r15, %r15
  15558. adcxq %rcx, %r9
  15559. # A[1] * B[3]
  15560. movq 24(%rbx), %rdx
  15561. mulxq 8(%rsi), %r12, %r13
  15562. adcxq %rax, %r10
  15563. # A[0] * B[1]
  15564. movq 8(%rbx), %rdx
  15565. mulxq (%rsi), %rcx, %rax
  15566. adoxq %rcx, %r9
  15567. # A[2] * B[1]
  15568. mulxq 16(%rsi), %rcx, %r14
  15569. adoxq %rax, %r10
  15570. adcxq %rcx, %r11
  15571. # A[1] * B[2]
  15572. movq 16(%rbx), %rdx
  15573. mulxq 8(%rsi), %rcx, %rax
  15574. adcxq %r14, %r12
  15575. adoxq %rcx, %r11
  15576. adcxq %r15, %r13
  15577. adoxq %rax, %r12
  15578. # A[0] * B[2]
  15579. mulxq (%rsi), %rcx, %rax
  15580. adoxq %r15, %r13
  15581. xorq %r14, %r14
  15582. adcxq %rcx, %r10
  15583. # A[1] * B[1]
  15584. movq 8(%rbx), %rdx
  15585. mulxq 8(%rsi), %rdx, %rcx
  15586. adcxq %rax, %r11
  15587. adoxq %rdx, %r10
  15588. # A[3] * B[1]
  15589. movq 8(%rbx), %rdx
  15590. adoxq %rcx, %r11
  15591. mulxq 24(%rsi), %rcx, %rax
  15592. adcxq %rcx, %r12
  15593. # A[2] * B[2]
  15594. movq 16(%rbx), %rdx
  15595. mulxq 16(%rsi), %rdx, %rcx
  15596. adcxq %rax, %r13
  15597. adoxq %rdx, %r12
  15598. # A[3] * B[3]
  15599. movq 24(%rbx), %rdx
  15600. adoxq %rcx, %r13
  15601. mulxq 24(%rsi), %rcx, %rax
  15602. adoxq %r15, %r14
  15603. adcxq %rcx, %r14
  15604. # A[0] * B[3]
  15605. mulxq (%rsi), %rdx, %rcx
  15606. adcxq %rax, %r15
  15607. xorq %rax, %rax
  15608. adcxq %rdx, %r11
  15609. # A[3] * B[0]
  15610. movq (%rbx), %rdx
  15611. adcxq %rcx, %r12
  15612. mulxq 24(%rsi), %rdx, %rcx
  15613. adoxq %rdx, %r11
  15614. adoxq %rcx, %r12
  15615. # A[2] * B[3]
  15616. movq 24(%rbx), %rdx
  15617. mulxq 16(%rsi), %rdx, %rcx
  15618. adcxq %rdx, %r13
  15619. # A[3] * B[2]
  15620. movq 16(%rbx), %rdx
  15621. adcxq %rcx, %r14
  15622. mulxq 24(%rsi), %rcx, %rdx
  15623. adcxq %rax, %r15
  15624. adoxq %rcx, %r13
  15625. adoxq %rdx, %r14
  15626. adoxq %rax, %r15
  15627. # Reduce
  15628. movq $0x7fffffffffffffff, %rax
  15629. # Move top half into t4-t7 and remove top bit from t3
  15630. shldq $0x01, %r14, %r15
  15631. shldq $0x01, %r13, %r14
  15632. shldq $0x01, %r12, %r13
  15633. shldq $0x01, %r11, %r12
  15634. andq %rax, %r11
  15635. # Multiply top half by 19
  15636. movq $19, %rdx
  15637. xorq %rax, %rax
  15638. mulxq %r12, %rcx, %r12
  15639. adcxq %rcx, %r8
  15640. adoxq %r12, %r9
  15641. mulxq %r13, %rcx, %r13
  15642. adcxq %rcx, %r9
  15643. adoxq %r13, %r10
  15644. mulxq %r14, %rcx, %r14
  15645. adcxq %rcx, %r10
  15646. adoxq %r14, %r11
  15647. mulxq %r15, %r15, %rdx
  15648. adcxq %r15, %r11
  15649. adoxq %rax, %rdx
  15650. adcxq %rax, %rdx
  15651. # Overflow
  15652. shldq $0x01, %r11, %rdx
  15653. movq $0x7fffffffffffffff, %rax
  15654. imulq $19, %rdx, %rcx
  15655. andq %rax, %r11
  15656. addq %rcx, %r8
  15657. adcq $0x00, %r9
  15658. adcq $0x00, %r10
  15659. adcq $0x00, %r11
  15660. # Reduce if top bit set
  15661. movq %r11, %rdx
  15662. shrq $63, %rdx
  15663. imulq $19, %rdx, %rcx
  15664. andq %rax, %r11
  15665. addq %rcx, %r8
  15666. adcq $0x00, %r9
  15667. adcq $0x00, %r10
  15668. adcq $0x00, %r11
  15669. # Store
  15670. movq %r8, (%rdi)
  15671. movq %r9, 8(%rdi)
  15672. movq %r10, 16(%rdi)
  15673. movq %r11, 24(%rdi)
  15674. leaq 48(%rsp), %rsi
  15675. # Double
  15676. movq (%rdi), %r8
  15677. movq 8(%rdi), %r9
  15678. addq %r8, %r8
  15679. movq 16(%rdi), %r10
  15680. adcq %r9, %r9
  15681. movq 24(%rdi), %rdx
  15682. adcq %r10, %r10
  15683. movq $-19, %rcx
  15684. adcq %rdx, %rdx
  15685. movq $0x7fffffffffffffff, %rax
  15686. movq %rdx, %r11
  15687. sarq $63, %rdx
  15688. # Mask the modulus
  15689. andq %rdx, %rcx
  15690. andq %rdx, %rax
  15691. # Sub modulus (if overflow)
  15692. subq %rcx, %r8
  15693. sbbq %rdx, %r9
  15694. sbbq %rdx, %r10
  15695. sbbq %rax, %r11
  15696. movq %r8, (%rsi)
  15697. movq %r9, 8(%rsi)
  15698. movq %r10, 16(%rsi)
  15699. movq %r11, 24(%rsi)
  15700. movq 8(%rsp), %rbx
  15701. movq 16(%rsp), %rbp
  15702. # Add
  15703. movq (%rbp), %r8
  15704. movq 8(%rbp), %r9
  15705. movq 16(%rbp), %r10
  15706. movq 24(%rbp), %rdx
  15707. movq %r8, %r12
  15708. addq (%rbx), %r8
  15709. movq %r9, %r13
  15710. adcq 8(%rbx), %r9
  15711. movq %r10, %r14
  15712. adcq 16(%rbx), %r10
  15713. movq %rdx, %r15
  15714. adcq 24(%rbx), %rdx
  15715. movq $-19, %rcx
  15716. movq %rdx, %r11
  15717. movq $0x7fffffffffffffff, %rax
  15718. sarq $63, %rdx
  15719. # Mask the modulus
  15720. andq %rdx, %rcx
  15721. andq %rdx, %rax
  15722. # Sub modulus (if overflow)
  15723. subq %rcx, %r8
  15724. sbbq %rdx, %r9
  15725. sbbq %rdx, %r10
  15726. sbbq %rax, %r11
  15727. # Sub
  15728. subq (%rbx), %r12
  15729. movq $0x00, %rdx
  15730. sbbq 8(%rbx), %r13
  15731. movq $-19, %rcx
  15732. sbbq 16(%rbx), %r14
  15733. movq $0x7fffffffffffffff, %rax
  15734. sbbq 24(%rbx), %r15
  15735. sbbq $0x00, %rdx
  15736. # Mask the modulus
  15737. andq %rdx, %rcx
  15738. andq %rdx, %rax
  15739. # Add modulus (if underflow)
  15740. addq %rcx, %r12
  15741. adcq %rdx, %r13
  15742. adcq %rdx, %r14
  15743. adcq %rax, %r15
  15744. movq %r8, (%rbx)
  15745. movq %r9, 8(%rbx)
  15746. movq %r10, 16(%rbx)
  15747. movq %r11, 24(%rbx)
  15748. movq %r12, (%rdi)
  15749. movq %r13, 8(%rdi)
  15750. movq %r14, 16(%rdi)
  15751. movq %r15, 24(%rdi)
  15752. movq 24(%rsp), %rdi
  15753. # Add
  15754. movq (%rsi), %r8
  15755. movq 8(%rsi), %r9
  15756. movq 16(%rsi), %r10
  15757. movq 24(%rsi), %rdx
  15758. movq %r8, %r12
  15759. addq (%rdi), %r8
  15760. movq %r9, %r13
  15761. adcq 8(%rdi), %r9
  15762. movq %r10, %r14
  15763. adcq 16(%rdi), %r10
  15764. movq %rdx, %r15
  15765. adcq 24(%rdi), %rdx
  15766. movq $-19, %rcx
  15767. movq %rdx, %r11
  15768. movq $0x7fffffffffffffff, %rax
  15769. sarq $63, %rdx
  15770. # Mask the modulus
  15771. andq %rdx, %rcx
  15772. andq %rdx, %rax
  15773. # Sub modulus (if overflow)
  15774. subq %rcx, %r8
  15775. sbbq %rdx, %r9
  15776. sbbq %rdx, %r10
  15777. sbbq %rax, %r11
  15778. # Sub
  15779. subq (%rdi), %r12
  15780. movq $0x00, %rdx
  15781. sbbq 8(%rdi), %r13
  15782. movq $-19, %rcx
  15783. sbbq 16(%rdi), %r14
  15784. movq $0x7fffffffffffffff, %rax
  15785. sbbq 24(%rdi), %r15
  15786. sbbq $0x00, %rdx
  15787. # Mask the modulus
  15788. andq %rdx, %rcx
  15789. andq %rdx, %rax
  15790. # Add modulus (if underflow)
  15791. addq %rcx, %r12
  15792. adcq %rdx, %r13
  15793. adcq %rdx, %r14
  15794. adcq %rax, %r15
  15795. movq %r8, (%rbp)
  15796. movq %r9, 8(%rbp)
  15797. movq %r10, 16(%rbp)
  15798. movq %r11, 24(%rbp)
  15799. movq %r12, (%rdi)
  15800. movq %r13, 8(%rdi)
  15801. movq %r14, 16(%rdi)
  15802. movq %r15, 24(%rdi)
  15803. addq $0x50, %rsp
  15804. popq %r15
  15805. popq %r14
  15806. popq %r13
  15807. popq %r12
  15808. popq %rbp
  15809. popq %rbx
  15810. repz retq
  15811. #ifndef __APPLE__
  15812. .size fe_ge_add_avx2,.-fe_ge_add_avx2
  15813. #endif /* __APPLE__ */
  15814. #ifndef __APPLE__
  15815. .text
  15816. .globl fe_ge_sub_avx2
  15817. .type fe_ge_sub_avx2,@function
  15818. .align 16
  15819. fe_ge_sub_avx2:
  15820. #else
  15821. .section __TEXT,__text
  15822. .globl _fe_ge_sub_avx2
  15823. .p2align 4
  15824. _fe_ge_sub_avx2:
  15825. #endif /* __APPLE__ */
  15826. pushq %rbx
  15827. pushq %rbp
  15828. pushq %r12
  15829. pushq %r13
  15830. pushq %r14
  15831. pushq %r15
  15832. subq $0x50, %rsp
  15833. movq %rdi, (%rsp)
  15834. movq %rsi, 8(%rsp)
  15835. movq %rdx, 16(%rsp)
  15836. movq %rcx, 24(%rsp)
  15837. movq %r8, 32(%rsp)
  15838. movq %r9, 40(%rsp)
  15839. movq 8(%rsp), %rsi
  15840. movq 40(%rsp), %rbx
  15841. movq 32(%rsp), %rbp
  15842. # Add
  15843. movq (%rbx), %r8
  15844. movq 8(%rbx), %r9
  15845. movq 16(%rbx), %r10
  15846. movq 24(%rbx), %rdx
  15847. movq %r8, %r12
  15848. addq (%rbp), %r8
  15849. movq %r9, %r13
  15850. adcq 8(%rbp), %r9
  15851. movq %r10, %r14
  15852. adcq 16(%rbp), %r10
  15853. movq %rdx, %r15
  15854. adcq 24(%rbp), %rdx
  15855. movq $-19, %rcx
  15856. movq %rdx, %r11
  15857. movq $0x7fffffffffffffff, %rax
  15858. sarq $63, %rdx
  15859. # Mask the modulus
  15860. andq %rdx, %rcx
  15861. andq %rdx, %rax
  15862. # Sub modulus (if overflow)
  15863. subq %rcx, %r8
  15864. sbbq %rdx, %r9
  15865. sbbq %rdx, %r10
  15866. sbbq %rax, %r11
  15867. # Sub
  15868. subq (%rbp), %r12
  15869. movq $0x00, %rdx
  15870. sbbq 8(%rbp), %r13
  15871. movq $-19, %rcx
  15872. sbbq 16(%rbp), %r14
  15873. movq $0x7fffffffffffffff, %rax
  15874. sbbq 24(%rbp), %r15
  15875. sbbq $0x00, %rdx
  15876. # Mask the modulus
  15877. andq %rdx, %rcx
  15878. andq %rdx, %rax
  15879. # Add modulus (if underflow)
  15880. addq %rcx, %r12
  15881. adcq %rdx, %r13
  15882. adcq %rdx, %r14
  15883. adcq %rax, %r15
  15884. movq %r8, (%rdi)
  15885. movq %r9, 8(%rdi)
  15886. movq %r10, 16(%rdi)
  15887. movq %r11, 24(%rdi)
  15888. movq %r12, (%rsi)
  15889. movq %r13, 8(%rsi)
  15890. movq %r14, 16(%rsi)
  15891. movq %r15, 24(%rsi)
  15892. movq 16(%rsp), %rbx
  15893. movq 176(%rsp), %rbp
  15894. # Multiply
  15895. # A[0] * B[0]
  15896. movq (%rbp), %rdx
  15897. mulxq (%rdi), %r8, %r9
  15898. # A[2] * B[0]
  15899. mulxq 16(%rdi), %r10, %r11
  15900. # A[1] * B[0]
  15901. mulxq 8(%rdi), %rcx, %rax
  15902. xorq %r15, %r15
  15903. adcxq %rcx, %r9
  15904. # A[1] * B[3]
  15905. movq 24(%rbp), %rdx
  15906. mulxq 8(%rdi), %r12, %r13
  15907. adcxq %rax, %r10
  15908. # A[0] * B[1]
  15909. movq 8(%rbp), %rdx
  15910. mulxq (%rdi), %rcx, %rax
  15911. adoxq %rcx, %r9
  15912. # A[2] * B[1]
  15913. mulxq 16(%rdi), %rcx, %r14
  15914. adoxq %rax, %r10
  15915. adcxq %rcx, %r11
  15916. # A[1] * B[2]
  15917. movq 16(%rbp), %rdx
  15918. mulxq 8(%rdi), %rcx, %rax
  15919. adcxq %r14, %r12
  15920. adoxq %rcx, %r11
  15921. adcxq %r15, %r13
  15922. adoxq %rax, %r12
  15923. # A[0] * B[2]
  15924. mulxq (%rdi), %rcx, %rax
  15925. adoxq %r15, %r13
  15926. xorq %r14, %r14
  15927. adcxq %rcx, %r10
  15928. # A[1] * B[1]
  15929. movq 8(%rbp), %rdx
  15930. mulxq 8(%rdi), %rdx, %rcx
  15931. adcxq %rax, %r11
  15932. adoxq %rdx, %r10
  15933. # A[3] * B[1]
  15934. movq 8(%rbp), %rdx
  15935. adoxq %rcx, %r11
  15936. mulxq 24(%rdi), %rcx, %rax
  15937. adcxq %rcx, %r12
  15938. # A[2] * B[2]
  15939. movq 16(%rbp), %rdx
  15940. mulxq 16(%rdi), %rdx, %rcx
  15941. adcxq %rax, %r13
  15942. adoxq %rdx, %r12
  15943. # A[3] * B[3]
  15944. movq 24(%rbp), %rdx
  15945. adoxq %rcx, %r13
  15946. mulxq 24(%rdi), %rcx, %rax
  15947. adoxq %r15, %r14
  15948. adcxq %rcx, %r14
  15949. # A[0] * B[3]
  15950. mulxq (%rdi), %rdx, %rcx
  15951. adcxq %rax, %r15
  15952. xorq %rax, %rax
  15953. adcxq %rdx, %r11
  15954. # A[3] * B[0]
  15955. movq (%rbp), %rdx
  15956. adcxq %rcx, %r12
  15957. mulxq 24(%rdi), %rdx, %rcx
  15958. adoxq %rdx, %r11
  15959. adoxq %rcx, %r12
  15960. # A[2] * B[3]
  15961. movq 24(%rbp), %rdx
  15962. mulxq 16(%rdi), %rdx, %rcx
  15963. adcxq %rdx, %r13
  15964. # A[3] * B[2]
  15965. movq 16(%rbp), %rdx
  15966. adcxq %rcx, %r14
  15967. mulxq 24(%rdi), %rcx, %rdx
  15968. adcxq %rax, %r15
  15969. adoxq %rcx, %r13
  15970. adoxq %rdx, %r14
  15971. adoxq %rax, %r15
  15972. # Reduce
  15973. movq $0x7fffffffffffffff, %rax
  15974. # Move top half into t4-t7 and remove top bit from t3
  15975. shldq $0x01, %r14, %r15
  15976. shldq $0x01, %r13, %r14
  15977. shldq $0x01, %r12, %r13
  15978. shldq $0x01, %r11, %r12
  15979. andq %rax, %r11
  15980. # Multiply top half by 19
  15981. movq $19, %rdx
  15982. xorq %rax, %rax
  15983. mulxq %r12, %rcx, %r12
  15984. adcxq %rcx, %r8
  15985. adoxq %r12, %r9
  15986. mulxq %r13, %rcx, %r13
  15987. adcxq %rcx, %r9
  15988. adoxq %r13, %r10
  15989. mulxq %r14, %rcx, %r14
  15990. adcxq %rcx, %r10
  15991. adoxq %r14, %r11
  15992. mulxq %r15, %r15, %rdx
  15993. adcxq %r15, %r11
  15994. adoxq %rax, %rdx
  15995. adcxq %rax, %rdx
  15996. # Overflow
  15997. shldq $0x01, %r11, %rdx
  15998. movq $0x7fffffffffffffff, %rax
  15999. imulq $19, %rdx, %rcx
  16000. andq %rax, %r11
  16001. addq %rcx, %r8
  16002. adcq $0x00, %r9
  16003. adcq $0x00, %r10
  16004. adcq $0x00, %r11
  16005. # Reduce if top bit set
  16006. movq %r11, %rdx
  16007. shrq $63, %rdx
  16008. imulq $19, %rdx, %rcx
  16009. andq %rax, %r11
  16010. addq %rcx, %r8
  16011. adcq $0x00, %r9
  16012. adcq $0x00, %r10
  16013. adcq $0x00, %r11
  16014. # Store
  16015. movq %r8, (%rbx)
  16016. movq %r9, 8(%rbx)
  16017. movq %r10, 16(%rbx)
  16018. movq %r11, 24(%rbx)
  16019. movq 168(%rsp), %rbx
  16020. # Multiply
  16021. # A[0] * B[0]
  16022. movq (%rbx), %rdx
  16023. mulxq (%rsi), %r8, %r9
  16024. # A[2] * B[0]
  16025. mulxq 16(%rsi), %r10, %r11
  16026. # A[1] * B[0]
  16027. mulxq 8(%rsi), %rcx, %rax
  16028. xorq %r15, %r15
  16029. adcxq %rcx, %r9
  16030. # A[1] * B[3]
  16031. movq 24(%rbx), %rdx
  16032. mulxq 8(%rsi), %r12, %r13
  16033. adcxq %rax, %r10
  16034. # A[0] * B[1]
  16035. movq 8(%rbx), %rdx
  16036. mulxq (%rsi), %rcx, %rax
  16037. adoxq %rcx, %r9
  16038. # A[2] * B[1]
  16039. mulxq 16(%rsi), %rcx, %r14
  16040. adoxq %rax, %r10
  16041. adcxq %rcx, %r11
  16042. # A[1] * B[2]
  16043. movq 16(%rbx), %rdx
  16044. mulxq 8(%rsi), %rcx, %rax
  16045. adcxq %r14, %r12
  16046. adoxq %rcx, %r11
  16047. adcxq %r15, %r13
  16048. adoxq %rax, %r12
  16049. # A[0] * B[2]
  16050. mulxq (%rsi), %rcx, %rax
  16051. adoxq %r15, %r13
  16052. xorq %r14, %r14
  16053. adcxq %rcx, %r10
  16054. # A[1] * B[1]
  16055. movq 8(%rbx), %rdx
  16056. mulxq 8(%rsi), %rdx, %rcx
  16057. adcxq %rax, %r11
  16058. adoxq %rdx, %r10
  16059. # A[3] * B[1]
  16060. movq 8(%rbx), %rdx
  16061. adoxq %rcx, %r11
  16062. mulxq 24(%rsi), %rcx, %rax
  16063. adcxq %rcx, %r12
  16064. # A[2] * B[2]
  16065. movq 16(%rbx), %rdx
  16066. mulxq 16(%rsi), %rdx, %rcx
  16067. adcxq %rax, %r13
  16068. adoxq %rdx, %r12
  16069. # A[3] * B[3]
  16070. movq 24(%rbx), %rdx
  16071. adoxq %rcx, %r13
  16072. mulxq 24(%rsi), %rcx, %rax
  16073. adoxq %r15, %r14
  16074. adcxq %rcx, %r14
  16075. # A[0] * B[3]
  16076. mulxq (%rsi), %rdx, %rcx
  16077. adcxq %rax, %r15
  16078. xorq %rax, %rax
  16079. adcxq %rdx, %r11
  16080. # A[3] * B[0]
  16081. movq (%rbx), %rdx
  16082. adcxq %rcx, %r12
  16083. mulxq 24(%rsi), %rdx, %rcx
  16084. adoxq %rdx, %r11
  16085. adoxq %rcx, %r12
  16086. # A[2] * B[3]
  16087. movq 24(%rbx), %rdx
  16088. mulxq 16(%rsi), %rdx, %rcx
  16089. adcxq %rdx, %r13
  16090. # A[3] * B[2]
  16091. movq 16(%rbx), %rdx
  16092. adcxq %rcx, %r14
  16093. mulxq 24(%rsi), %rcx, %rdx
  16094. adcxq %rax, %r15
  16095. adoxq %rcx, %r13
  16096. adoxq %rdx, %r14
  16097. adoxq %rax, %r15
  16098. # Reduce
  16099. movq $0x7fffffffffffffff, %rax
  16100. # Move top half into t4-t7 and remove top bit from t3
  16101. shldq $0x01, %r14, %r15
  16102. shldq $0x01, %r13, %r14
  16103. shldq $0x01, %r12, %r13
  16104. shldq $0x01, %r11, %r12
  16105. andq %rax, %r11
  16106. # Multiply top half by 19
  16107. movq $19, %rdx
  16108. xorq %rax, %rax
  16109. mulxq %r12, %rcx, %r12
  16110. adcxq %rcx, %r8
  16111. adoxq %r12, %r9
  16112. mulxq %r13, %rcx, %r13
  16113. adcxq %rcx, %r9
  16114. adoxq %r13, %r10
  16115. mulxq %r14, %rcx, %r14
  16116. adcxq %rcx, %r10
  16117. adoxq %r14, %r11
  16118. mulxq %r15, %r15, %rdx
  16119. adcxq %r15, %r11
  16120. adoxq %rax, %rdx
  16121. adcxq %rax, %rdx
  16122. # Overflow
  16123. shldq $0x01, %r11, %rdx
  16124. movq $0x7fffffffffffffff, %rax
  16125. imulq $19, %rdx, %rcx
  16126. andq %rax, %r11
  16127. addq %rcx, %r8
  16128. adcq $0x00, %r9
  16129. adcq $0x00, %r10
  16130. adcq $0x00, %r11
  16131. # Reduce if top bit set
  16132. movq %r11, %rdx
  16133. shrq $63, %rdx
  16134. imulq $19, %rdx, %rcx
  16135. andq %rax, %r11
  16136. addq %rcx, %r8
  16137. adcq $0x00, %r9
  16138. adcq $0x00, %r10
  16139. adcq $0x00, %r11
  16140. # Store
  16141. movq %r8, (%rsi)
  16142. movq %r9, 8(%rsi)
  16143. movq %r10, 16(%rsi)
  16144. movq %r11, 24(%rsi)
  16145. movq 24(%rsp), %rsi
  16146. movq 160(%rsp), %rbx
  16147. movq 144(%rsp), %rbp
  16148. # Multiply
  16149. # A[0] * B[0]
  16150. movq (%rbp), %rdx
  16151. mulxq (%rbx), %r8, %r9
  16152. # A[2] * B[0]
  16153. mulxq 16(%rbx), %r10, %r11
  16154. # A[1] * B[0]
  16155. mulxq 8(%rbx), %rcx, %rax
  16156. xorq %r15, %r15
  16157. adcxq %rcx, %r9
  16158. # A[1] * B[3]
  16159. movq 24(%rbp), %rdx
  16160. mulxq 8(%rbx), %r12, %r13
  16161. adcxq %rax, %r10
  16162. # A[0] * B[1]
  16163. movq 8(%rbp), %rdx
  16164. mulxq (%rbx), %rcx, %rax
  16165. adoxq %rcx, %r9
  16166. # A[2] * B[1]
  16167. mulxq 16(%rbx), %rcx, %r14
  16168. adoxq %rax, %r10
  16169. adcxq %rcx, %r11
  16170. # A[1] * B[2]
  16171. movq 16(%rbp), %rdx
  16172. mulxq 8(%rbx), %rcx, %rax
  16173. adcxq %r14, %r12
  16174. adoxq %rcx, %r11
  16175. adcxq %r15, %r13
  16176. adoxq %rax, %r12
  16177. # A[0] * B[2]
  16178. mulxq (%rbx), %rcx, %rax
  16179. adoxq %r15, %r13
  16180. xorq %r14, %r14
  16181. adcxq %rcx, %r10
  16182. # A[1] * B[1]
  16183. movq 8(%rbp), %rdx
  16184. mulxq 8(%rbx), %rdx, %rcx
  16185. adcxq %rax, %r11
  16186. adoxq %rdx, %r10
  16187. # A[3] * B[1]
  16188. movq 8(%rbp), %rdx
  16189. adoxq %rcx, %r11
  16190. mulxq 24(%rbx), %rcx, %rax
  16191. adcxq %rcx, %r12
  16192. # A[2] * B[2]
  16193. movq 16(%rbp), %rdx
  16194. mulxq 16(%rbx), %rdx, %rcx
  16195. adcxq %rax, %r13
  16196. adoxq %rdx, %r12
  16197. # A[3] * B[3]
  16198. movq 24(%rbp), %rdx
  16199. adoxq %rcx, %r13
  16200. mulxq 24(%rbx), %rcx, %rax
  16201. adoxq %r15, %r14
  16202. adcxq %rcx, %r14
  16203. # A[0] * B[3]
  16204. mulxq (%rbx), %rdx, %rcx
  16205. adcxq %rax, %r15
  16206. xorq %rax, %rax
  16207. adcxq %rdx, %r11
  16208. # A[3] * B[0]
  16209. movq (%rbp), %rdx
  16210. adcxq %rcx, %r12
  16211. mulxq 24(%rbx), %rdx, %rcx
  16212. adoxq %rdx, %r11
  16213. adoxq %rcx, %r12
  16214. # A[2] * B[3]
  16215. movq 24(%rbp), %rdx
  16216. mulxq 16(%rbx), %rdx, %rcx
  16217. adcxq %rdx, %r13
  16218. # A[3] * B[2]
  16219. movq 16(%rbp), %rdx
  16220. adcxq %rcx, %r14
  16221. mulxq 24(%rbx), %rcx, %rdx
  16222. adcxq %rax, %r15
  16223. adoxq %rcx, %r13
  16224. adoxq %rdx, %r14
  16225. adoxq %rax, %r15
  16226. # Reduce
  16227. movq $0x7fffffffffffffff, %rax
  16228. # Move top half into t4-t7 and remove top bit from t3
  16229. shldq $0x01, %r14, %r15
  16230. shldq $0x01, %r13, %r14
  16231. shldq $0x01, %r12, %r13
  16232. shldq $0x01, %r11, %r12
  16233. andq %rax, %r11
  16234. # Multiply top half by 19
  16235. movq $19, %rdx
  16236. xorq %rax, %rax
  16237. mulxq %r12, %rcx, %r12
  16238. adcxq %rcx, %r8
  16239. adoxq %r12, %r9
  16240. mulxq %r13, %rcx, %r13
  16241. adcxq %rcx, %r9
  16242. adoxq %r13, %r10
  16243. mulxq %r14, %rcx, %r14
  16244. adcxq %rcx, %r10
  16245. adoxq %r14, %r11
  16246. mulxq %r15, %r15, %rdx
  16247. adcxq %r15, %r11
  16248. adoxq %rax, %rdx
  16249. adcxq %rax, %rdx
  16250. # Overflow
  16251. shldq $0x01, %r11, %rdx
  16252. movq $0x7fffffffffffffff, %rax
  16253. imulq $19, %rdx, %rcx
  16254. andq %rax, %r11
  16255. addq %rcx, %r8
  16256. adcq $0x00, %r9
  16257. adcq $0x00, %r10
  16258. adcq $0x00, %r11
  16259. # Reduce if top bit set
  16260. movq %r11, %rdx
  16261. shrq $63, %rdx
  16262. imulq $19, %rdx, %rcx
  16263. andq %rax, %r11
  16264. addq %rcx, %r8
  16265. adcq $0x00, %r9
  16266. adcq $0x00, %r10
  16267. adcq $0x00, %r11
  16268. # Store
  16269. movq %r8, (%rsi)
  16270. movq %r9, 8(%rsi)
  16271. movq %r10, 16(%rsi)
  16272. movq %r11, 24(%rsi)
  16273. movq 136(%rsp), %rsi
  16274. movq 152(%rsp), %rbx
  16275. # Multiply
  16276. # A[0] * B[0]
  16277. movq (%rbx), %rdx
  16278. mulxq (%rsi), %r8, %r9
  16279. # A[2] * B[0]
  16280. mulxq 16(%rsi), %r10, %r11
  16281. # A[1] * B[0]
  16282. mulxq 8(%rsi), %rcx, %rax
  16283. xorq %r15, %r15
  16284. adcxq %rcx, %r9
  16285. # A[1] * B[3]
  16286. movq 24(%rbx), %rdx
  16287. mulxq 8(%rsi), %r12, %r13
  16288. adcxq %rax, %r10
  16289. # A[0] * B[1]
  16290. movq 8(%rbx), %rdx
  16291. mulxq (%rsi), %rcx, %rax
  16292. adoxq %rcx, %r9
  16293. # A[2] * B[1]
  16294. mulxq 16(%rsi), %rcx, %r14
  16295. adoxq %rax, %r10
  16296. adcxq %rcx, %r11
  16297. # A[1] * B[2]
  16298. movq 16(%rbx), %rdx
  16299. mulxq 8(%rsi), %rcx, %rax
  16300. adcxq %r14, %r12
  16301. adoxq %rcx, %r11
  16302. adcxq %r15, %r13
  16303. adoxq %rax, %r12
  16304. # A[0] * B[2]
  16305. mulxq (%rsi), %rcx, %rax
  16306. adoxq %r15, %r13
  16307. xorq %r14, %r14
  16308. adcxq %rcx, %r10
  16309. # A[1] * B[1]
  16310. movq 8(%rbx), %rdx
  16311. mulxq 8(%rsi), %rdx, %rcx
  16312. adcxq %rax, %r11
  16313. adoxq %rdx, %r10
  16314. # A[3] * B[1]
  16315. movq 8(%rbx), %rdx
  16316. adoxq %rcx, %r11
  16317. mulxq 24(%rsi), %rcx, %rax
  16318. adcxq %rcx, %r12
  16319. # A[2] * B[2]
  16320. movq 16(%rbx), %rdx
  16321. mulxq 16(%rsi), %rdx, %rcx
  16322. adcxq %rax, %r13
  16323. adoxq %rdx, %r12
  16324. # A[3] * B[3]
  16325. movq 24(%rbx), %rdx
  16326. adoxq %rcx, %r13
  16327. mulxq 24(%rsi), %rcx, %rax
  16328. adoxq %r15, %r14
  16329. adcxq %rcx, %r14
  16330. # A[0] * B[3]
  16331. mulxq (%rsi), %rdx, %rcx
  16332. adcxq %rax, %r15
  16333. xorq %rax, %rax
  16334. adcxq %rdx, %r11
  16335. # A[3] * B[0]
  16336. movq (%rbx), %rdx
  16337. adcxq %rcx, %r12
  16338. mulxq 24(%rsi), %rdx, %rcx
  16339. adoxq %rdx, %r11
  16340. adoxq %rcx, %r12
  16341. # A[2] * B[3]
  16342. movq 24(%rbx), %rdx
  16343. mulxq 16(%rsi), %rdx, %rcx
  16344. adcxq %rdx, %r13
  16345. # A[3] * B[2]
  16346. movq 16(%rbx), %rdx
  16347. adcxq %rcx, %r14
  16348. mulxq 24(%rsi), %rcx, %rdx
  16349. adcxq %rax, %r15
  16350. adoxq %rcx, %r13
  16351. adoxq %rdx, %r14
  16352. adoxq %rax, %r15
  16353. # Reduce
  16354. movq $0x7fffffffffffffff, %rax
  16355. # Move top half into t4-t7 and remove top bit from t3
  16356. shldq $0x01, %r14, %r15
  16357. shldq $0x01, %r13, %r14
  16358. shldq $0x01, %r12, %r13
  16359. shldq $0x01, %r11, %r12
  16360. andq %rax, %r11
  16361. # Multiply top half by 19
  16362. movq $19, %rdx
  16363. xorq %rax, %rax
  16364. mulxq %r12, %rcx, %r12
  16365. adcxq %rcx, %r8
  16366. adoxq %r12, %r9
  16367. mulxq %r13, %rcx, %r13
  16368. adcxq %rcx, %r9
  16369. adoxq %r13, %r10
  16370. mulxq %r14, %rcx, %r14
  16371. adcxq %rcx, %r10
  16372. adoxq %r14, %r11
  16373. mulxq %r15, %r15, %rdx
  16374. adcxq %r15, %r11
  16375. adoxq %rax, %rdx
  16376. adcxq %rax, %rdx
  16377. # Overflow
  16378. shldq $0x01, %r11, %rdx
  16379. movq $0x7fffffffffffffff, %rax
  16380. imulq $19, %rdx, %rcx
  16381. andq %rax, %r11
  16382. addq %rcx, %r8
  16383. adcq $0x00, %r9
  16384. adcq $0x00, %r10
  16385. adcq $0x00, %r11
  16386. # Reduce if top bit set
  16387. movq %r11, %rdx
  16388. shrq $63, %rdx
  16389. imulq $19, %rdx, %rcx
  16390. andq %rax, %r11
  16391. addq %rcx, %r8
  16392. adcq $0x00, %r9
  16393. adcq $0x00, %r10
  16394. adcq $0x00, %r11
  16395. # Store
  16396. movq %r8, (%rdi)
  16397. movq %r9, 8(%rdi)
  16398. movq %r10, 16(%rdi)
  16399. movq %r11, 24(%rdi)
  16400. leaq 48(%rsp), %rsi
  16401. # Double
  16402. movq (%rdi), %r8
  16403. movq 8(%rdi), %r9
  16404. addq %r8, %r8
  16405. movq 16(%rdi), %r10
  16406. adcq %r9, %r9
  16407. movq 24(%rdi), %rdx
  16408. adcq %r10, %r10
  16409. movq $-19, %rcx
  16410. adcq %rdx, %rdx
  16411. movq $0x7fffffffffffffff, %rax
  16412. movq %rdx, %r11
  16413. sarq $63, %rdx
  16414. # Mask the modulus
  16415. andq %rdx, %rcx
  16416. andq %rdx, %rax
  16417. # Sub modulus (if overflow)
  16418. subq %rcx, %r8
  16419. sbbq %rdx, %r9
  16420. sbbq %rdx, %r10
  16421. sbbq %rax, %r11
  16422. movq %r8, (%rsi)
  16423. movq %r9, 8(%rsi)
  16424. movq %r10, 16(%rsi)
  16425. movq %r11, 24(%rsi)
  16426. movq 8(%rsp), %rbx
  16427. movq 16(%rsp), %rbp
  16428. # Add
  16429. movq (%rbp), %r8
  16430. movq 8(%rbp), %r9
  16431. movq 16(%rbp), %r10
  16432. movq 24(%rbp), %rdx
  16433. movq %r8, %r12
  16434. addq (%rbx), %r8
  16435. movq %r9, %r13
  16436. adcq 8(%rbx), %r9
  16437. movq %r10, %r14
  16438. adcq 16(%rbx), %r10
  16439. movq %rdx, %r15
  16440. adcq 24(%rbx), %rdx
  16441. movq $-19, %rcx
  16442. movq %rdx, %r11
  16443. movq $0x7fffffffffffffff, %rax
  16444. sarq $63, %rdx
  16445. # Mask the modulus
  16446. andq %rdx, %rcx
  16447. andq %rdx, %rax
  16448. # Sub modulus (if overflow)
  16449. subq %rcx, %r8
  16450. sbbq %rdx, %r9
  16451. sbbq %rdx, %r10
  16452. sbbq %rax, %r11
  16453. # Sub
  16454. subq (%rbx), %r12
  16455. movq $0x00, %rdx
  16456. sbbq 8(%rbx), %r13
  16457. movq $-19, %rcx
  16458. sbbq 16(%rbx), %r14
  16459. movq $0x7fffffffffffffff, %rax
  16460. sbbq 24(%rbx), %r15
  16461. sbbq $0x00, %rdx
  16462. # Mask the modulus
  16463. andq %rdx, %rcx
  16464. andq %rdx, %rax
  16465. # Add modulus (if underflow)
  16466. addq %rcx, %r12
  16467. adcq %rdx, %r13
  16468. adcq %rdx, %r14
  16469. adcq %rax, %r15
  16470. movq %r8, (%rbx)
  16471. movq %r9, 8(%rbx)
  16472. movq %r10, 16(%rbx)
  16473. movq %r11, 24(%rbx)
  16474. movq %r12, (%rdi)
  16475. movq %r13, 8(%rdi)
  16476. movq %r14, 16(%rdi)
  16477. movq %r15, 24(%rdi)
  16478. movq 24(%rsp), %rdi
  16479. # Add
  16480. movq (%rsi), %r8
  16481. movq 8(%rsi), %r9
  16482. movq 16(%rsi), %r10
  16483. movq 24(%rsi), %rdx
  16484. movq %r8, %r12
  16485. addq (%rdi), %r8
  16486. movq %r9, %r13
  16487. adcq 8(%rdi), %r9
  16488. movq %r10, %r14
  16489. adcq 16(%rdi), %r10
  16490. movq %rdx, %r15
  16491. adcq 24(%rdi), %rdx
  16492. movq $-19, %rcx
  16493. movq %rdx, %r11
  16494. movq $0x7fffffffffffffff, %rax
  16495. sarq $63, %rdx
  16496. # Mask the modulus
  16497. andq %rdx, %rcx
  16498. andq %rdx, %rax
  16499. # Sub modulus (if overflow)
  16500. subq %rcx, %r8
  16501. sbbq %rdx, %r9
  16502. sbbq %rdx, %r10
  16503. sbbq %rax, %r11
  16504. # Sub
  16505. subq (%rdi), %r12
  16506. movq $0x00, %rdx
  16507. sbbq 8(%rdi), %r13
  16508. movq $-19, %rcx
  16509. sbbq 16(%rdi), %r14
  16510. movq $0x7fffffffffffffff, %rax
  16511. sbbq 24(%rdi), %r15
  16512. sbbq $0x00, %rdx
  16513. # Mask the modulus
  16514. andq %rdx, %rcx
  16515. andq %rdx, %rax
  16516. # Add modulus (if underflow)
  16517. addq %rcx, %r12
  16518. adcq %rdx, %r13
  16519. adcq %rdx, %r14
  16520. adcq %rax, %r15
  16521. movq %r8, (%rdi)
  16522. movq %r9, 8(%rdi)
  16523. movq %r10, 16(%rdi)
  16524. movq %r11, 24(%rdi)
  16525. movq %r12, (%rbp)
  16526. movq %r13, 8(%rbp)
  16527. movq %r14, 16(%rbp)
  16528. movq %r15, 24(%rbp)
  16529. addq $0x50, %rsp
  16530. popq %r15
  16531. popq %r14
  16532. popq %r13
  16533. popq %r12
  16534. popq %rbp
  16535. popq %rbx
  16536. repz retq
  16537. #ifndef __APPLE__
  16538. .size fe_ge_sub_avx2,.-fe_ge_sub_avx2
  16539. #endif /* __APPLE__ */
  16540. #endif /* HAVE_INTEL_AVX2 */
  16541. #if defined(__linux__) && defined(__ELF__)
  16542. .section .note.GNU-stack,"",%progbits
  16543. #endif