fe_x25519_asm.S (404 KB, 14,308 lines)

430914310143111431214313143141431514316143171431814319143201432114322143231432414325143261432714328143291433014331143321433314334143351433614337143381433914340143411434214343143441434514346143471434814349143501435114352143531435414355143561435714358143591436014361143621436314364143651436614367143681436914370143711437214373143741437514376143771437814379143801438114382143831438414385143861438714388143891439014391143921439314394143951439614397143981439914400144011440214403144041440514406144071440814409144101441114412144131441414415144161441714418144191442014421144221442314424144251442614427144281442914430144311443214433144341443514436144371443814439144401444114442144431444414445144461444714448144491445014451144521445314454144551445614457144581445914460144611446214463144641446514466144671446814469144701447114472144731447414475144761447714478144791448014481144821448314484144851448614487144881448914490144911449214493144941449514496144971449814499145001450114502145031450414505145061450714508145091451014511145121451314514145151451614517145181451914520145211452214523145241452514526145271452814529145301453114532145331453414535145361453714538145391454014541145421454314544145451454614547145481454914550145511455214553145541455514556145571455814559145601456114562145631456414565145661456714568145691457014571145721457314574145751457614577145781457914580145811458214583145841458514586145871458814589145901459114592145931459414595145961459714598145991460014601146021460314604146051460614607146081460914610146111461214613146141461514616146171461814619146201462114622146231462414625146261462714628146291463014631146321463314634146351463614637146381463914640146411464214643146441464514646146471464814649146501465114652146531465414655146561465714658146591466014661146621466314664146651466614667146681466914670146711467214673146741467514676146771467814679146801468114682146831468414685146861468714688146891469014691146921469314694146951469614697146981469914700147011470214703147041470514706147071470814709147101471114712147131471414715147161471714718147191472014721147221472314724147251472614727147281472914730147311473214733147341473514736147371473814739147401474114742147431474414745147461474714748147491475014751147521475314754147551475614757147581475914760147611476214763147641476514766147671476814769147701477114772147731477414775147761477714778147791478014781147821478314784147851478614787147881478914790147911479214793147941479514796147971479814799148001480114802148031480414805148061480714808148091481014811148121481314814148151481614817148181481914820148211482214823148241482514826148271482814829148301483114832148331483414835148361483714838148391484014841148421484314844148451484614847148481484914850148511485214853148541485514856148571485814859148601486114862148631486414865148661486714868148691487014871148721487314874148751487614877148781487914880148811488214883148841488514886148871488814889148901489114892148931489414895148961489714898148991490014901149021490314904149051490614907149081490914910149111491214913149141491514916149171491814919149201492114922149231492414925149261492714928149291493014931149321493314934149351493614937149381493914940149411494214943149441494514946149471494814949149501495114952149531495414955149561495714958149591496014961149621496314964149651496614967149681496914970149711497214973149741497514976149771497814979149801498114982149831498414985149861498714988149891499014991149921499314994149951499614997149981499915000150011500215003150041500515006150071500815009150101501115012150131501415015150161501715018150191
502015021150221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211542215423154241542515426154271542815429154301543115432154331543415435154361543715438154391544015441154421544315444154451544615447154481544915450154511545215453154541545515456154571545815459154601546115462154631546415465154661546715468154691547015471154721547315474154751547615477154781547915480154811548215483154841548515486154871548815489154901549115492154931549415495154961549715498154991550015501155021550315504155051550615507155081550915510155111551215513155141551515516155171551815519155201552115522155231552415525155261552715528155291553015531155321553315534155351553615537155381553915540155411554215543155441554515546155471554815549155501555115552155531555415555155561555715558155591556015561155621556315564155651556615567155681556915570155711557215573155741557515576155771557815579155801558115582155831558415585155861558715588155891559015591155921559315594155951559615597155981559915600156011560215603156041560515606156071560815609156101561115612156131561415615156161561715618156191562015621156221562315624156251562615627156281562915630156311563215633156341563515636156371563815639156401564115642156431564415645156461564715648156491565015651156521565315654156551565615657156581565915660156611566215663156641566515666156671566815669156701567115672156731567415675156761567715678156791568015681156821568315684156851568615687156881568915690156911569215693156941569515696156971569815699157001570115702157031570415705157061570715708157091571015711157121571315714157151571615717157181571915720157211572215723157241572515726157271572815729157301
573115732157331573415735157361573715738157391574015741157421574315744157451574615747157481574915750157511575215753157541575515756157571575815759157601576115762157631576415765157661576715768157691577015771157721577315774157751577615777157781577915780157811578215783157841578515786157871578815789157901579115792157931579415795157961579715798157991580015801158021580315804158051580615807158081580915810158111581215813158141581515816158171581815819158201582115822158231582415825158261582715828158291583015831158321583315834158351583615837158381583915840158411584215843158441584515846158471584815849158501585115852158531585415855158561585715858158591586015861158621586315864158651586615867158681586915870158711587215873158741587515876158771587815879158801588115882158831588415885158861588715888158891589015891158921589315894158951589615897158981589915900159011590215903159041590515906159071590815909159101591115912159131591415915159161591715918159191592015921159221592315924159251592615927159281592915930159311593215933159341593515936159371593815939159401594115942159431594415945159461594715948159491595015951159521595315954159551595615957159581595915960159611596215963159641596515966159671596815969159701597115972159731597415975159761597715978159791598015981159821598315984159851598615987159881598915990159911599215993159941599515996159971599815999160001600116002160031600416005160061600716008160091601016011160121601316014160151601616017160181601916020160211602216023160241602516026160271602816029160301603116032160331603416035160361603716038160391604016041160421604316044160451604616047160481604916050160511605216053160541605516056160571605816059160601606116062160631606416065160661606716068160691607016071160721607316074160751607616077160781607916080160811608216083160841608516086160871608816089160901609116092160931609416095160961609716098160991610016101161021610316104161051610616107161081610916110161111611216113161141611516116161171611816119161201612116122161231612416125161261612716128161291613016131161321613316134161351613616137161381613916140161411614216143161441614516146161471614816149161501615116152161531615416155161561615716158161591616016161161621616316164161651616616167161681616916170161711617216173161741617516176161771617816179161801618116182161831618416185161861618716188161891619016191161921619316194161951619616197161981619916200162011620216203162041620516206162071620816209162101621116212162131621416215162161621716218162191622016221162221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411
644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596
  1. /* fe_x25519_asm
  2. *
  3. * Copyright (C) 2006-2022 wolfSSL Inc.
  4. *
  5. * This file is part of wolfSSL.
  6. *
  7. * wolfSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * wolfSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. */
  21. #ifdef WOLFSSL_USER_SETTINGS
  22. #ifdef WOLFSSL_USER_SETTINGS_ASM
  23. /*
  24. * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
  25. * The script takes in a user_settings.h and produces user_settings_asm.h, which
  26. * is a stripped down version of user_settings.h containing only preprocessor
  27. * directives. This makes the header safe to include in assembly (.S) files.
  28. */
  29. #include "user_settings_asm.h"
  30. #else
  31. /*
  32. * Note: if user_settings.h contains any C code (e.g. a typedef or function
  33. * prototype), including it here in an assembly (.S) file will cause an
  34. * assembler failure. See user_settings_asm.h above.
  35. */
  36. #include "user_settings.h"
  37. #endif /* WOLFSSL_USER_SETTINGS_ASM */
  38. #endif /* WOLFSSL_USER_SETTINGS */
  39. #ifndef HAVE_INTEL_AVX1
  40. #define HAVE_INTEL_AVX1
  41. #endif /* HAVE_INTEL_AVX1 */
  42. #ifndef NO_AVX2_SUPPORT
  43. #define HAVE_INTEL_AVX2
  44. #endif /* NO_AVX2_SUPPORT */
  45. #ifndef __APPLE__
  46. .text
  47. .globl fe_init
  48. .type fe_init,@function
  49. .align 16
  50. fe_init:
  51. #else
  52. .section __TEXT,__text
  53. .globl _fe_init
  54. .p2align 4
  55. _fe_init:
  56. #endif /* __APPLE__ */
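# One-time initialization: on the first call, query the CPU feature flags via
# cpuid_get_flags, cache them in intelFlags and set cpuFlagsSet so later calls
# return immediately. When the 0x50 mask is fully set (presumably the AVX1 and
# AVX2 feature bits), the fe_*_p and curve25519_p function pointers defined
# below are repointed from the default x64 routines to their *_avx2
# counterparts.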
  57. #ifdef HAVE_INTEL_AVX2
  58. #ifndef __APPLE__
  59. movq cpuFlagsSet@GOTPCREL(%rip), %rax
  60. movl (%rax), %eax
  61. #else
  62. movl _cpuFlagsSet(%rip), %eax
  63. #endif /* __APPLE__ */
  64. testl %eax, %eax
  65. je L_fe_init_get_flags
  66. repz retq
  67. L_fe_init_get_flags:
  68. #ifndef __APPLE__
  69. callq cpuid_get_flags@plt
  70. #else
  71. callq _cpuid_get_flags
  72. #endif /* __APPLE__ */
  73. #ifndef __APPLE__
  74. movq intelFlags@GOTPCREL(%rip), %rdx
  75. movl %eax, (%rdx)
  76. #else
  77. movl %eax, _intelFlags(%rip)
  78. #endif /* __APPLE__ */
  79. andl $0x50, %eax
  80. cmpl $0x50, %eax
  81. jne L_fe_init_flags_done
  82. #ifndef __APPLE__
  83. movq fe_mul_avx2@GOTPCREL(%rip), %rax
  84. #else
  85. leaq _fe_mul_avx2(%rip), %rax
  86. #endif /* __APPLE__ */
  87. #ifndef __APPLE__
  88. movq fe_mul_p@GOTPCREL(%rip), %rdx
  89. movq %rax, (%rdx)
  90. #else
  91. movq %rax, _fe_mul_p(%rip)
  92. #endif /* __APPLE__ */
  93. #ifndef __APPLE__
  94. movq fe_sq_avx2@GOTPCREL(%rip), %rax
  95. #else
  96. leaq _fe_sq_avx2(%rip), %rax
  97. #endif /* __APPLE__ */
  98. #ifndef __APPLE__
  99. movq fe_sq_p@GOTPCREL(%rip), %rdx
  100. movq %rax, (%rdx)
  101. #else
  102. movq %rax, _fe_sq_p(%rip)
  103. #endif /* __APPLE__ */
  104. #ifndef __APPLE__
  105. movq fe_mul121666_avx2@GOTPCREL(%rip), %rax
  106. #else
  107. leaq _fe_mul121666_avx2(%rip), %rax
  108. #endif /* __APPLE__ */
  109. #ifndef __APPLE__
  110. movq fe_mul121666_p@GOTPCREL(%rip), %rdx
  111. movq %rax, (%rdx)
  112. #else
  113. movq %rax, _fe_mul121666_p(%rip)
  114. #endif /* __APPLE__ */
  115. #ifndef __APPLE__
  116. movq fe_sq2_avx2@GOTPCREL(%rip), %rax
  117. #else
  118. leaq _fe_sq2_avx2(%rip), %rax
  119. #endif /* __APPLE__ */
  120. #ifndef __APPLE__
  121. movq fe_sq2_p@GOTPCREL(%rip), %rdx
  122. movq %rax, (%rdx)
  123. #else
  124. movq %rax, _fe_sq2_p(%rip)
  125. #endif /* __APPLE__ */
  126. #ifndef __APPLE__
  127. movq fe_invert_avx2@GOTPCREL(%rip), %rax
  128. #else
  129. leaq _fe_invert_avx2(%rip), %rax
  130. #endif /* __APPLE__ */
  131. #ifndef __APPLE__
  132. movq fe_invert_p@GOTPCREL(%rip), %rdx
  133. movq %rax, (%rdx)
  134. #else
  135. movq %rax, _fe_invert_p(%rip)
  136. #endif /* __APPLE__ */
  137. #ifndef __APPLE__
  138. movq curve25519_avx2@GOTPCREL(%rip), %rax
  139. #else
  140. leaq _curve25519_avx2(%rip), %rax
  141. #endif /* __APPLE__ */
  142. #ifndef __APPLE__
  143. movq curve25519_p@GOTPCREL(%rip), %rdx
  144. movq %rax, (%rdx)
  145. #else
  146. movq %rax, _curve25519_p(%rip)
  147. #endif /* __APPLE__ */
  148. #ifndef __APPLE__
  149. movq fe_pow22523_avx2@GOTPCREL(%rip), %rax
  150. #else
  151. leaq _fe_pow22523_avx2(%rip), %rax
  152. #endif /* __APPLE__ */
  153. #ifndef __APPLE__
  154. movq fe_pow22523_p@GOTPCREL(%rip), %rdx
  155. movq %rax, (%rdx)
  156. #else
  157. movq %rax, _fe_pow22523_p(%rip)
  158. #endif /* __APPLE__ */
  159. #ifndef __APPLE__
  160. movq fe_ge_to_p2_avx2@GOTPCREL(%rip), %rax
  161. #else
  162. leaq _fe_ge_to_p2_avx2(%rip), %rax
  163. #endif /* __APPLE__ */
  164. #ifndef __APPLE__
  165. movq fe_ge_to_p2_p@GOTPCREL(%rip), %rdx
  166. movq %rax, (%rdx)
  167. #else
  168. movq %rax, _fe_ge_to_p2_p(%rip)
  169. #endif /* __APPLE__ */
  170. #ifndef __APPLE__
  171. movq fe_ge_to_p3_avx2@GOTPCREL(%rip), %rax
  172. #else
  173. leaq _fe_ge_to_p3_avx2(%rip), %rax
  174. #endif /* __APPLE__ */
  175. #ifndef __APPLE__
  176. movq fe_ge_to_p3_p@GOTPCREL(%rip), %rdx
  177. movq %rax, (%rdx)
  178. #else
  179. movq %rax, _fe_ge_to_p3_p(%rip)
  180. #endif /* __APPLE__ */
  181. #ifndef __APPLE__
  182. movq fe_ge_dbl_avx2@GOTPCREL(%rip), %rax
  183. #else
  184. leaq _fe_ge_dbl_avx2(%rip), %rax
  185. #endif /* __APPLE__ */
  186. #ifndef __APPLE__
  187. movq fe_ge_dbl_p@GOTPCREL(%rip), %rdx
  188. movq %rax, (%rdx)
  189. #else
  190. movq %rax, _fe_ge_dbl_p(%rip)
  191. #endif /* __APPLE__ */
  192. #ifndef __APPLE__
  193. movq fe_ge_madd_avx2@GOTPCREL(%rip), %rax
  194. #else
  195. leaq _fe_ge_madd_avx2(%rip), %rax
  196. #endif /* __APPLE__ */
  197. #ifndef __APPLE__
  198. movq fe_ge_madd_p@GOTPCREL(%rip), %rdx
  199. movq %rax, (%rdx)
  200. #else
  201. movq %rax, _fe_ge_madd_p(%rip)
  202. #endif /* __APPLE__ */
  203. #ifndef __APPLE__
  204. movq fe_ge_msub_avx2@GOTPCREL(%rip), %rax
  205. #else
  206. leaq _fe_ge_msub_avx2(%rip), %rax
  207. #endif /* __APPLE__ */
  208. #ifndef __APPLE__
  209. movq fe_ge_msub_p@GOTPCREL(%rip), %rdx
  210. movq %rax, (%rdx)
  211. #else
  212. movq %rax, _fe_ge_msub_p(%rip)
  213. #endif /* __APPLE__ */
  214. #ifndef __APPLE__
  215. movq fe_ge_add_avx2@GOTPCREL(%rip), %rax
  216. #else
  217. leaq _fe_ge_add_avx2(%rip), %rax
  218. #endif /* __APPLE__ */
  219. #ifndef __APPLE__
  220. movq fe_ge_add_p@GOTPCREL(%rip), %rdx
  221. movq %rax, (%rdx)
  222. #else
  223. movq %rax, _fe_ge_add_p(%rip)
  224. #endif /* __APPLE__ */
  225. #ifndef __APPLE__
  226. movq fe_ge_sub_avx2@GOTPCREL(%rip), %rax
  227. #else
  228. leaq _fe_ge_sub_avx2(%rip), %rax
  229. #endif /* __APPLE__ */
  230. #ifndef __APPLE__
  231. movq fe_ge_sub_p@GOTPCREL(%rip), %rdx
  232. movq %rax, (%rdx)
  233. #else
  234. movq %rax, _fe_ge_sub_p(%rip)
  235. #endif /* __APPLE__ */
  236. L_fe_init_flags_done:
  237. #ifndef __APPLE__
  238. movq cpuFlagsSet@GOTPCREL(%rip), %rdx
  239. movl $0x1, (%rdx)
  240. #else
  241. movl $0x1, _cpuFlagsSet(%rip)
  242. #endif /* __APPLE__ */
  243. #endif /* HAVE_INTEL_AVX2 */
  244. repz retq
  245. #ifndef __APPLE__
  246. .size fe_init,.-fe_init
  247. #endif /* __APPLE__ */
  248. #ifndef __APPLE__
  249. .text
  250. .globl fe_frombytes
  251. .type fe_frombytes,@function
  252. .align 16
  253. fe_frombytes:
  254. #else
  255. .section __TEXT,__text
  256. .globl _fe_frombytes
  257. .p2align 4
  258. _fe_frombytes:
  259. #endif /* __APPLE__ */
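# Load a 32-byte little-endian value into four 64-bit limbs and clear the top
# bit so the result is below 2^255; no further reduction modulo 2^255-19 is
# performed here.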
  260. movq $0x7fffffffffffffff, %r9
  261. movq (%rsi), %rdx
  262. movq 8(%rsi), %rax
  263. movq 16(%rsi), %rcx
  264. movq 24(%rsi), %r8
  265. andq %r9, %r8
  266. movq %rdx, (%rdi)
  267. movq %rax, 8(%rdi)
  268. movq %rcx, 16(%rdi)
  269. movq %r8, 24(%rdi)
  270. repz retq
  271. #ifndef __APPLE__
  272. .size fe_frombytes,.-fe_frombytes
  273. #endif /* __APPLE__ */
  274. #ifndef __APPLE__
  275. .text
  276. .globl fe_tobytes
  277. .type fe_tobytes,@function
  278. .align 16
  279. fe_tobytes:
  280. #else
  281. .section __TEXT,__text
  282. .globl _fe_tobytes
  283. .p2align 4
  284. _fe_tobytes:
  285. #endif /* __APPLE__ */
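# Canonical encoding: add 19 to a copy of the input and use the carry into
# bit 255 to detect whether the value is >= 2^255-19. The original input is
# then increased by 19 times that carry and the top bit is masked off, giving
# the fully reduced value in little-endian form.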
  286. movq $0x7fffffffffffffff, %r10
  287. movq (%rsi), %rdx
  288. movq 8(%rsi), %rax
  289. movq 16(%rsi), %rcx
  290. movq 24(%rsi), %r8
  291. addq $19, %rdx
  292. adcq $0x00, %rax
  293. adcq $0x00, %rcx
  294. adcq $0x00, %r8
  295. shrq $63, %r8
  296. imulq $19, %r8, %r9
  297. movq (%rsi), %rdx
  298. movq 8(%rsi), %rax
  299. movq 16(%rsi), %rcx
  300. movq 24(%rsi), %r8
  301. addq %r9, %rdx
  302. adcq $0x00, %rax
  303. adcq $0x00, %rcx
  304. adcq $0x00, %r8
  305. andq %r10, %r8
  306. movq %rdx, (%rdi)
  307. movq %rax, 8(%rdi)
  308. movq %rcx, 16(%rdi)
  309. movq %r8, 24(%rdi)
  310. repz retq
  311. #ifndef __APPLE__
  312. .size fe_tobytes,.-fe_tobytes
  313. #endif /* __APPLE__ */
  314. #ifndef __APPLE__
  315. .text
  316. .globl fe_1
  317. .type fe_1,@function
  318. .align 16
  319. fe_1:
  320. #else
  321. .section __TEXT,__text
  322. .globl _fe_1
  323. .p2align 4
  324. _fe_1:
  325. #endif /* __APPLE__ */
  326. # Set one
  327. movq $0x01, (%rdi)
  328. movq $0x00, 8(%rdi)
  329. movq $0x00, 16(%rdi)
  330. movq $0x00, 24(%rdi)
  331. repz retq
  332. #ifndef __APPLE__
  333. .size fe_1,.-fe_1
  334. #endif /* __APPLE__ */
  335. #ifndef __APPLE__
  336. .text
  337. .globl fe_0
  338. .type fe_0,@function
  339. .align 16
  340. fe_0:
  341. #else
  342. .section __TEXT,__text
  343. .globl _fe_0
  344. .p2align 4
  345. _fe_0:
  346. #endif /* __APPLE__ */
  347. # Set zero
  348. movq $0x00, (%rdi)
  349. movq $0x00, 8(%rdi)
  350. movq $0x00, 16(%rdi)
  351. movq $0x00, 24(%rdi)
  352. repz retq
  353. #ifndef __APPLE__
  354. .size fe_0,.-fe_0
  355. #endif /* __APPLE__ */
  356. #ifndef __APPLE__
  357. .text
  358. .globl fe_copy
  359. .type fe_copy,@function
  360. .align 16
  361. fe_copy:
  362. #else
  363. .section __TEXT,__text
  364. .globl _fe_copy
  365. .p2align 4
  366. _fe_copy:
  367. #endif /* __APPLE__ */
  368. # Copy
  369. movq (%rsi), %rdx
  370. movq 8(%rsi), %rax
  371. movq 16(%rsi), %rcx
  372. movq 24(%rsi), %r8
  373. movq %rdx, (%rdi)
  374. movq %rax, 8(%rdi)
  375. movq %rcx, 16(%rdi)
  376. movq %r8, 24(%rdi)
  377. repz retq
  378. #ifndef __APPLE__
  379. .size fe_copy,.-fe_copy
  380. #endif /* __APPLE__ */
  381. #ifndef __APPLE__
  382. .text
  383. .globl fe_sub
  384. .type fe_sub,@function
  385. .align 16
  386. fe_sub:
  387. #else
  388. .section __TEXT,__text
  389. .globl _fe_sub
  390. .p2align 4
  391. _fe_sub:
  392. #endif /* __APPLE__ */
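# Subtract b from a as 256-bit integers. A borrow out of the top limb turns
# %r10 into an all-ones mask, which gates adding the modulus 2^255-19 back in
# (encoded as the limbs -19, -1, -1, 0x7fffffffffffffff).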
  393. pushq %r12
  394. # Sub
  395. movq (%rsi), %rax
  396. movq 8(%rsi), %rcx
  397. movq 16(%rsi), %r8
  398. movq 24(%rsi), %r9
  399. subq (%rdx), %rax
  400. movq $0x00, %r10
  401. sbbq 8(%rdx), %rcx
  402. movq $-19, %r11
  403. sbbq 16(%rdx), %r8
  404. movq $0x7fffffffffffffff, %r12
  405. sbbq 24(%rdx), %r9
  406. sbbq $0x00, %r10
  407. # Mask the modulus
  408. andq %r10, %r11
  409. andq %r10, %r12
  410. # Add modulus (if underflow)
  411. addq %r11, %rax
  412. adcq %r10, %rcx
  413. adcq %r10, %r8
  414. adcq %r12, %r9
  415. movq %rax, (%rdi)
  416. movq %rcx, 8(%rdi)
  417. movq %r8, 16(%rdi)
  418. movq %r9, 24(%rdi)
  419. popq %r12
  420. repz retq
  421. #ifndef __APPLE__
  422. .size fe_sub,.-fe_sub
  423. #endif /* __APPLE__ */
  424. #ifndef __APPLE__
  425. .text
  426. .globl fe_add
  427. .type fe_add,@function
  428. .align 16
  429. fe_add:
  430. #else
  431. .section __TEXT,__text
  432. .globl _fe_add
  433. .p2align 4
  434. _fe_add:
  435. #endif /* __APPLE__ */
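# Add a and b as 256-bit integers. The sign of the top limb (spread to a full
# mask with sarq) gates subtracting the modulus 2^255-19 when the sum's top
# bit is set.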
  436. pushq %r12
  437. # Add
  438. movq (%rsi), %rax
  439. movq 8(%rsi), %rcx
  440. addq (%rdx), %rax
  441. movq 16(%rsi), %r8
  442. adcq 8(%rdx), %rcx
  443. movq 24(%rsi), %r10
  444. adcq 16(%rdx), %r8
  445. movq $-19, %r11
  446. adcq 24(%rdx), %r10
  447. movq $0x7fffffffffffffff, %r12
  448. movq %r10, %r9
  449. sarq $63, %r10
  450. # Mask the modulus
  451. andq %r10, %r11
  452. andq %r10, %r12
  453. # Sub modulus (if overflow)
  454. subq %r11, %rax
  455. sbbq %r10, %rcx
  456. sbbq %r10, %r8
  457. sbbq %r12, %r9
  458. movq %rax, (%rdi)
  459. movq %rcx, 8(%rdi)
  460. movq %r8, 16(%rdi)
  461. movq %r9, 24(%rdi)
  462. popq %r12
  463. repz retq
  464. #ifndef __APPLE__
  465. .size fe_add,.-fe_add
  466. #endif /* __APPLE__ */
  467. #ifndef __APPLE__
  468. .text
  469. .globl fe_neg
  470. .type fe_neg,@function
  471. .align 16
  472. fe_neg:
  473. #else
  474. .section __TEXT,__text
  475. .globl _fe_neg
  476. .p2align 4
  477. _fe_neg:
  478. #endif /* __APPLE__ */
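# Negate modulo 2^255-19: subtract the input from the modulus, which is loaded
# as the limbs -19, -1, -1, 0x7fffffffffffffff (together exactly 2^255-19).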
  479. movq $-19, %rdx
  480. movq $-1, %rax
  481. movq $-1, %rcx
  482. movq $0x7fffffffffffffff, %r8
  483. subq (%rsi), %rdx
  484. sbbq 8(%rsi), %rax
  485. sbbq 16(%rsi), %rcx
  486. sbbq 24(%rsi), %r8
  487. movq %rdx, (%rdi)
  488. movq %rax, 8(%rdi)
  489. movq %rcx, 16(%rdi)
  490. movq %r8, 24(%rdi)
  491. repz retq
  492. #ifndef __APPLE__
  493. .size fe_neg,.-fe_neg
  494. #endif /* __APPLE__ */
  495. #ifndef __APPLE__
  496. .text
  497. .globl fe_cmov
  498. .type fe_cmov,@function
  499. .align 16
  500. fe_cmov:
  501. #else
  502. .section __TEXT,__text
  503. .globl _fe_cmov
  504. .p2align 4
  505. _fe_cmov:
  506. #endif /* __APPLE__ */
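# Constant-time conditional copy: every limb of the source is moved into the
# destination with cmove when the flag equals 1, so no branch or memory access
# depends on the (potentially secret) flag.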
  507. cmpl $0x01, %edx
  508. movq (%rdi), %rcx
  509. movq 8(%rdi), %r8
  510. movq 16(%rdi), %r9
  511. movq 24(%rdi), %r10
  512. cmoveq (%rsi), %rcx
  513. cmoveq 8(%rsi), %r8
  514. cmoveq 16(%rsi), %r9
  515. cmoveq 24(%rsi), %r10
  516. movq %rcx, (%rdi)
  517. movq %r8, 8(%rdi)
  518. movq %r9, 16(%rdi)
  519. movq %r10, 24(%rdi)
  520. repz retq
  521. #ifndef __APPLE__
  522. .size fe_cmov,.-fe_cmov
  523. #endif /* __APPLE__ */
  524. #ifndef __APPLE__
  525. .text
  526. .globl fe_isnonzero
  527. .type fe_isnonzero,@function
  528. .align 16
  529. fe_isnonzero:
  530. #else
  531. .section __TEXT,__text
  532. .globl _fe_isnonzero
  533. .p2align 4
  534. _fe_isnonzero:
  535. #endif /* __APPLE__ */
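# Reduce the input to canonical form (the same add-19/fold trick as
# fe_tobytes) and OR the limbs together: %rax is zero exactly when the element
# is congruent to zero.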
  536. movq $0x7fffffffffffffff, %r10
  537. movq (%rdi), %rax
  538. movq 8(%rdi), %rdx
  539. movq 16(%rdi), %rcx
  540. movq 24(%rdi), %r8
  541. addq $19, %rax
  542. adcq $0x00, %rdx
  543. adcq $0x00, %rcx
  544. adcq $0x00, %r8
  545. shrq $63, %r8
  546. imulq $19, %r8, %r9
  547. movq (%rdi), %rax
  548. movq 8(%rdi), %rdx
  549. movq 16(%rdi), %rcx
  550. movq 24(%rdi), %r8
  551. addq %r9, %rax
  552. adcq $0x00, %rdx
  553. adcq $0x00, %rcx
  554. adcq $0x00, %r8
  555. andq %r10, %r8
  556. orq %rdx, %rax
  557. orq %rcx, %rax
  558. orq %r8, %rax
  559. repz retq
  560. #ifndef __APPLE__
  561. .size fe_isnonzero,.-fe_isnonzero
  562. #endif /* __APPLE__ */
  563. #ifndef __APPLE__
  564. .text
  565. .globl fe_isnegative
  566. .type fe_isnegative,@function
  567. .align 16
  568. fe_isnegative:
  569. #else
  570. .section __TEXT,__text
  571. .globl _fe_isnegative
  572. .p2align 4
  573. _fe_isnegative:
  574. #endif /* __APPLE__ */
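# Return the least significant bit of the canonically reduced value (the
# parity bit used as the "sign" in Ed25519-style encodings); only the
# correction to the low limb is computed, since just bit 0 of the result is
# kept.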
  575. movq $0x7fffffffffffffff, %r11
  576. movq (%rdi), %rdx
  577. movq 8(%rdi), %rcx
  578. movq 16(%rdi), %r8
  579. movq 24(%rdi), %r9
  580. movq %rdx, %rax
  581. addq $19, %rdx
  582. adcq $0x00, %rcx
  583. adcq $0x00, %r8
  584. adcq $0x00, %r9
  585. shrq $63, %r9
  586. imulq $19, %r9, %r10
  587. addq %r10, %rax
  588. andq $0x01, %rax
  589. repz retq
  590. #ifndef __APPLE__
  591. .size fe_isnegative,.-fe_isnegative
  592. #endif /* __APPLE__ */
  593. #ifndef __APPLE__
  594. .text
  595. .globl fe_cmov_table
  596. .type fe_cmov_table,@function
  597. .align 16
  598. fe_cmov_table:
  599. #else
  600. .section __TEXT,__text
  601. .globl _fe_cmov_table
  602. .p2align 4
  603. _fe_cmov_table:
  604. #endif /* __APPLE__ */
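# Constant-time lookup of a precomputed-point table entry. The signed index is
# replaced by its absolute value in %r15b, then one of eight 96-byte entries
# is selected with cmove, starting from a default that corresponds to index 0
# (the first two 32-byte field elements set to one, the third to zero). For a
# negative index the first two field elements are swapped and the third is
# negated modulo 2^255-19, which presumably yields the negated precomputed
# point.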
  605. pushq %r12
  606. pushq %r13
  607. pushq %r14
  608. pushq %r15
  609. movq %rdx, %rcx
  610. movsbq %cl, %rax
  611. cdq
  612. xorb %dl, %al
  613. subb %dl, %al
  614. movb %al, %r15b
  615. movq $0x01, %rax
  616. xorq %rdx, %rdx
  617. xorq %r8, %r8
  618. xorq %r9, %r9
  619. movq $0x01, %r10
  620. xorq %r11, %r11
  621. xorq %r12, %r12
  622. xorq %r13, %r13
  623. cmpb $0x01, %r15b
  624. movq (%rsi), %r14
  625. cmoveq %r14, %rax
  626. movq 8(%rsi), %r14
  627. cmoveq %r14, %rdx
  628. movq 16(%rsi), %r14
  629. cmoveq %r14, %r8
  630. movq 24(%rsi), %r14
  631. cmoveq %r14, %r9
  632. movq 32(%rsi), %r14
  633. cmoveq %r14, %r10
  634. movq 40(%rsi), %r14
  635. cmoveq %r14, %r11
  636. movq 48(%rsi), %r14
  637. cmoveq %r14, %r12
  638. movq 56(%rsi), %r14
  639. cmoveq %r14, %r13
  640. cmpb $2, %r15b
  641. movq 96(%rsi), %r14
  642. cmoveq %r14, %rax
  643. movq 104(%rsi), %r14
  644. cmoveq %r14, %rdx
  645. movq 112(%rsi), %r14
  646. cmoveq %r14, %r8
  647. movq 120(%rsi), %r14
  648. cmoveq %r14, %r9
  649. movq 128(%rsi), %r14
  650. cmoveq %r14, %r10
  651. movq 136(%rsi), %r14
  652. cmoveq %r14, %r11
  653. movq 144(%rsi), %r14
  654. cmoveq %r14, %r12
  655. movq 152(%rsi), %r14
  656. cmoveq %r14, %r13
  657. cmpb $3, %r15b
  658. movq 192(%rsi), %r14
  659. cmoveq %r14, %rax
  660. movq 200(%rsi), %r14
  661. cmoveq %r14, %rdx
  662. movq 208(%rsi), %r14
  663. cmoveq %r14, %r8
  664. movq 216(%rsi), %r14
  665. cmoveq %r14, %r9
  666. movq 224(%rsi), %r14
  667. cmoveq %r14, %r10
  668. movq 232(%rsi), %r14
  669. cmoveq %r14, %r11
  670. movq 240(%rsi), %r14
  671. cmoveq %r14, %r12
  672. movq 248(%rsi), %r14
  673. cmoveq %r14, %r13
  674. cmpb $4, %r15b
  675. movq 288(%rsi), %r14
  676. cmoveq %r14, %rax
  677. movq 296(%rsi), %r14
  678. cmoveq %r14, %rdx
  679. movq 304(%rsi), %r14
  680. cmoveq %r14, %r8
  681. movq 312(%rsi), %r14
  682. cmoveq %r14, %r9
  683. movq 320(%rsi), %r14
  684. cmoveq %r14, %r10
  685. movq 328(%rsi), %r14
  686. cmoveq %r14, %r11
  687. movq 336(%rsi), %r14
  688. cmoveq %r14, %r12
  689. movq 344(%rsi), %r14
  690. cmoveq %r14, %r13
  691. cmpb $5, %r15b
  692. movq 384(%rsi), %r14
  693. cmoveq %r14, %rax
  694. movq 392(%rsi), %r14
  695. cmoveq %r14, %rdx
  696. movq 400(%rsi), %r14
  697. cmoveq %r14, %r8
  698. movq 408(%rsi), %r14
  699. cmoveq %r14, %r9
  700. movq 416(%rsi), %r14
  701. cmoveq %r14, %r10
  702. movq 424(%rsi), %r14
  703. cmoveq %r14, %r11
  704. movq 432(%rsi), %r14
  705. cmoveq %r14, %r12
  706. movq 440(%rsi), %r14
  707. cmoveq %r14, %r13
  708. cmpb $6, %r15b
  709. movq 480(%rsi), %r14
  710. cmoveq %r14, %rax
  711. movq 488(%rsi), %r14
  712. cmoveq %r14, %rdx
  713. movq 496(%rsi), %r14
  714. cmoveq %r14, %r8
  715. movq 504(%rsi), %r14
  716. cmoveq %r14, %r9
  717. movq 512(%rsi), %r14
  718. cmoveq %r14, %r10
  719. movq 520(%rsi), %r14
  720. cmoveq %r14, %r11
  721. movq 528(%rsi), %r14
  722. cmoveq %r14, %r12
  723. movq 536(%rsi), %r14
  724. cmoveq %r14, %r13
  725. cmpb $7, %r15b
  726. movq 576(%rsi), %r14
  727. cmoveq %r14, %rax
  728. movq 584(%rsi), %r14
  729. cmoveq %r14, %rdx
  730. movq 592(%rsi), %r14
  731. cmoveq %r14, %r8
  732. movq 600(%rsi), %r14
  733. cmoveq %r14, %r9
  734. movq 608(%rsi), %r14
  735. cmoveq %r14, %r10
  736. movq 616(%rsi), %r14
  737. cmoveq %r14, %r11
  738. movq 624(%rsi), %r14
  739. cmoveq %r14, %r12
  740. movq 632(%rsi), %r14
  741. cmoveq %r14, %r13
  742. cmpb $8, %r15b
  743. movq 672(%rsi), %r14
  744. cmoveq %r14, %rax
  745. movq 680(%rsi), %r14
  746. cmoveq %r14, %rdx
  747. movq 688(%rsi), %r14
  748. cmoveq %r14, %r8
  749. movq 696(%rsi), %r14
  750. cmoveq %r14, %r9
  751. movq 704(%rsi), %r14
  752. cmoveq %r14, %r10
  753. movq 712(%rsi), %r14
  754. cmoveq %r14, %r11
  755. movq 720(%rsi), %r14
  756. cmoveq %r14, %r12
  757. movq 728(%rsi), %r14
  758. cmoveq %r14, %r13
  759. cmpb $0x00, %cl
  760. movq %rax, %r14
  761. cmovlq %r10, %rax
  762. cmovlq %r14, %r10
  763. movq %rdx, %r14
  764. cmovlq %r11, %rdx
  765. cmovlq %r14, %r11
  766. movq %r8, %r14
  767. cmovlq %r12, %r8
  768. cmovlq %r14, %r12
  769. movq %r9, %r14
  770. cmovlq %r13, %r9
  771. cmovlq %r14, %r13
  772. movq %rax, (%rdi)
  773. movq %rdx, 8(%rdi)
  774. movq %r8, 16(%rdi)
  775. movq %r9, 24(%rdi)
  776. movq %r10, 32(%rdi)
  777. movq %r11, 40(%rdi)
  778. movq %r12, 48(%rdi)
  779. movq %r13, 56(%rdi)
  780. xorq %rax, %rax
  781. xorq %rdx, %rdx
  782. xorq %r8, %r8
  783. xorq %r9, %r9
  784. cmpb $0x01, %r15b
  785. movq 64(%rsi), %r14
  786. cmoveq %r14, %rax
  787. movq 72(%rsi), %r14
  788. cmoveq %r14, %rdx
  789. movq 80(%rsi), %r14
  790. cmoveq %r14, %r8
  791. movq 88(%rsi), %r14
  792. cmoveq %r14, %r9
  793. cmpb $2, %r15b
  794. movq 160(%rsi), %r14
  795. cmoveq %r14, %rax
  796. movq 168(%rsi), %r14
  797. cmoveq %r14, %rdx
  798. movq 176(%rsi), %r14
  799. cmoveq %r14, %r8
  800. movq 184(%rsi), %r14
  801. cmoveq %r14, %r9
  802. cmpb $3, %r15b
  803. movq 256(%rsi), %r14
  804. cmoveq %r14, %rax
  805. movq 264(%rsi), %r14
  806. cmoveq %r14, %rdx
  807. movq 272(%rsi), %r14
  808. cmoveq %r14, %r8
  809. movq 280(%rsi), %r14
  810. cmoveq %r14, %r9
  811. cmpb $4, %r15b
  812. movq 352(%rsi), %r14
  813. cmoveq %r14, %rax
  814. movq 360(%rsi), %r14
  815. cmoveq %r14, %rdx
  816. movq 368(%rsi), %r14
  817. cmoveq %r14, %r8
  818. movq 376(%rsi), %r14
  819. cmoveq %r14, %r9
  820. cmpb $5, %r15b
  821. movq 448(%rsi), %r14
  822. cmoveq %r14, %rax
  823. movq 456(%rsi), %r14
  824. cmoveq %r14, %rdx
  825. movq 464(%rsi), %r14
  826. cmoveq %r14, %r8
  827. movq 472(%rsi), %r14
  828. cmoveq %r14, %r9
  829. cmpb $6, %r15b
  830. movq 544(%rsi), %r14
  831. cmoveq %r14, %rax
  832. movq 552(%rsi), %r14
  833. cmoveq %r14, %rdx
  834. movq 560(%rsi), %r14
  835. cmoveq %r14, %r8
  836. movq 568(%rsi), %r14
  837. cmoveq %r14, %r9
  838. cmpb $7, %r15b
  839. movq 640(%rsi), %r14
  840. cmoveq %r14, %rax
  841. movq 648(%rsi), %r14
  842. cmoveq %r14, %rdx
  843. movq 656(%rsi), %r14
  844. cmoveq %r14, %r8
  845. movq 664(%rsi), %r14
  846. cmoveq %r14, %r9
  847. cmpb $8, %r15b
  848. movq 736(%rsi), %r14
  849. cmoveq %r14, %rax
  850. movq 744(%rsi), %r14
  851. cmoveq %r14, %rdx
  852. movq 752(%rsi), %r14
  853. cmoveq %r14, %r8
  854. movq 760(%rsi), %r14
  855. cmoveq %r14, %r9
  856. movq $-19, %r10
  857. movq $-1, %r11
  858. movq $-1, %r12
  859. movq $0x7fffffffffffffff, %r13
  860. subq %rax, %r10
  861. sbbq %rdx, %r11
  862. sbbq %r8, %r12
  863. sbbq %r9, %r13
  864. cmpb $0x00, %cl
  865. cmovlq %r10, %rax
  866. cmovlq %r11, %rdx
  867. cmovlq %r12, %r8
  868. cmovlq %r13, %r9
  869. movq %rax, 64(%rdi)
  870. movq %rdx, 72(%rdi)
  871. movq %r8, 80(%rdi)
  872. movq %r9, 88(%rdi)
  873. popq %r15
  874. popq %r14
  875. popq %r13
  876. popq %r12
  877. repz retq
  878. #ifndef __APPLE__
  879. .size fe_cmov_table,.-fe_cmov_table
  880. #endif /* __APPLE__ */
  881. #ifndef __APPLE__
  882. .text
  883. .globl fe_mul
  884. .type fe_mul,@function
  885. .align 16
  886. fe_mul:
  887. #else
  888. .section __TEXT,__text
  889. .globl _fe_mul
  890. .p2align 4
  891. _fe_mul:
  892. #endif /* __APPLE__ */
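# fe_mul and the thin wrappers that follow (fe_sq, fe_mul121666, fe_sq2,
# fe_invert, curve25519, fe_pow22523 and the fe_ge_* routines) simply
# tail-jump through the corresponding *_p function pointer, which defaults to
# the plain x64 implementation and may have been switched to the AVX2
# implementation by fe_init.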
  893. #ifndef __APPLE__
  894. jmpq *fe_mul_p(%rip)
  895. #else
  896. jmpq *_fe_mul_p(%rip)
  897. #endif /* __APPLE__ */
  898. #ifndef __APPLE__
  899. .size fe_mul,.-fe_mul
  900. #endif /* __APPLE__ */
  901. #ifndef __APPLE__
  902. .text
  903. .globl fe_sq
  904. .type fe_sq,@function
  905. .align 16
  906. fe_sq:
  907. #else
  908. .section __TEXT,__text
  909. .globl _fe_sq
  910. .p2align 4
  911. _fe_sq:
  912. #endif /* __APPLE__ */
  913. #ifndef __APPLE__
  914. jmpq *fe_sq_p(%rip)
  915. #else
  916. jmpq *_fe_sq_p(%rip)
  917. #endif /* __APPLE__ */
  918. #ifndef __APPLE__
  919. .size fe_sq,.-fe_sq
  920. #endif /* __APPLE__ */
  921. #ifndef __APPLE__
  922. .text
  923. .globl fe_mul121666
  924. .type fe_mul121666,@function
  925. .align 16
  926. fe_mul121666:
  927. #else
  928. .section __TEXT,__text
  929. .globl _fe_mul121666
  930. .p2align 4
  931. _fe_mul121666:
  932. #endif /* __APPLE__ */
  933. #ifndef __APPLE__
  934. jmpq *fe_mul121666_p(%rip)
  935. #else
  936. jmpq *_fe_mul121666_p(%rip)
  937. #endif /* __APPLE__ */
  938. #ifndef __APPLE__
  939. .size fe_mul121666,.-fe_mul121666
  940. #endif /* __APPLE__ */
  941. #ifndef __APPLE__
  942. .text
  943. .globl fe_sq2
  944. .type fe_sq2,@function
  945. .align 16
  946. fe_sq2:
  947. #else
  948. .section __TEXT,__text
  949. .globl _fe_sq2
  950. .p2align 4
  951. _fe_sq2:
  952. #endif /* __APPLE__ */
  953. #ifndef __APPLE__
  954. jmpq *fe_sq2_p(%rip)
  955. #else
  956. jmpq *_fe_sq2_p(%rip)
  957. #endif /* __APPLE__ */
  958. #ifndef __APPLE__
  959. .size fe_sq2,.-fe_sq2
  960. #endif /* __APPLE__ */
  961. #ifndef __APPLE__
  962. .text
  963. .globl fe_invert
  964. .type fe_invert,@function
  965. .align 16
  966. fe_invert:
  967. #else
  968. .section __TEXT,__text
  969. .globl _fe_invert
  970. .p2align 4
  971. _fe_invert:
  972. #endif /* __APPLE__ */
  973. #ifndef __APPLE__
  974. jmpq *fe_invert_p(%rip)
  975. #else
  976. jmpq *_fe_invert_p(%rip)
  977. #endif /* __APPLE__ */
  978. #ifndef __APPLE__
  979. .size fe_invert,.-fe_invert
  980. #endif /* __APPLE__ */
  981. #ifndef __APPLE__
  982. .text
  983. .globl curve25519
  984. .type curve25519,@function
  985. .align 16
  986. curve25519:
  987. #else
  988. .section __TEXT,__text
  989. .globl _curve25519
  990. .p2align 4
  991. _curve25519:
  992. #endif /* __APPLE__ */
  993. #ifndef __APPLE__
  994. jmpq *curve25519_p(%rip)
  995. #else
  996. jmpq *_curve25519_p(%rip)
  997. #endif /* __APPLE__ */
  998. #ifndef __APPLE__
  999. .size curve25519,.-curve25519
  1000. #endif /* __APPLE__ */
  1001. #ifndef __APPLE__
  1002. .text
  1003. .globl fe_pow22523
  1004. .type fe_pow22523,@function
  1005. .align 16
  1006. fe_pow22523:
  1007. #else
  1008. .section __TEXT,__text
  1009. .globl _fe_pow22523
  1010. .p2align 4
  1011. _fe_pow22523:
  1012. #endif /* __APPLE__ */
  1013. #ifndef __APPLE__
  1014. jmpq *fe_pow22523_p(%rip)
  1015. #else
  1016. jmpq *_fe_pow22523_p(%rip)
  1017. #endif /* __APPLE__ */
  1018. #ifndef __APPLE__
  1019. .size fe_pow22523,.-fe_pow22523
  1020. #endif /* __APPLE__ */
  1021. #ifndef __APPLE__
  1022. .text
  1023. .globl fe_ge_to_p2
  1024. .type fe_ge_to_p2,@function
  1025. .align 16
  1026. fe_ge_to_p2:
  1027. #else
  1028. .section __TEXT,__text
  1029. .globl _fe_ge_to_p2
  1030. .p2align 4
  1031. _fe_ge_to_p2:
  1032. #endif /* __APPLE__ */
  1033. #ifndef __APPLE__
  1034. jmpq *fe_ge_to_p2_p(%rip)
  1035. #else
  1036. jmpq *_fe_ge_to_p2_p(%rip)
  1037. #endif /* __APPLE__ */
  1038. #ifndef __APPLE__
  1039. .size fe_ge_to_p2,.-fe_ge_to_p2
  1040. #endif /* __APPLE__ */
  1041. #ifndef __APPLE__
  1042. .text
  1043. .globl fe_ge_to_p3
  1044. .type fe_ge_to_p3,@function
  1045. .align 16
  1046. fe_ge_to_p3:
  1047. #else
  1048. .section __TEXT,__text
  1049. .globl _fe_ge_to_p3
  1050. .p2align 4
  1051. _fe_ge_to_p3:
  1052. #endif /* __APPLE__ */
  1053. #ifndef __APPLE__
  1054. jmpq *fe_ge_to_p3_p(%rip)
  1055. #else
  1056. jmpq *_fe_ge_to_p3_p(%rip)
  1057. #endif /* __APPLE__ */
  1058. #ifndef __APPLE__
  1059. .size fe_ge_to_p3,.-fe_ge_to_p3
  1060. #endif /* __APPLE__ */
  1061. #ifndef __APPLE__
  1062. .text
  1063. .globl fe_ge_dbl
  1064. .type fe_ge_dbl,@function
  1065. .align 16
  1066. fe_ge_dbl:
  1067. #else
  1068. .section __TEXT,__text
  1069. .globl _fe_ge_dbl
  1070. .p2align 4
  1071. _fe_ge_dbl:
  1072. #endif /* __APPLE__ */
  1073. #ifndef __APPLE__
  1074. jmpq *fe_ge_dbl_p(%rip)
  1075. #else
  1076. jmpq *_fe_ge_dbl_p(%rip)
  1077. #endif /* __APPLE__ */
  1078. #ifndef __APPLE__
  1079. .size fe_ge_dbl,.-fe_ge_dbl
  1080. #endif /* __APPLE__ */
  1081. #ifndef __APPLE__
  1082. .text
  1083. .globl fe_ge_madd
  1084. .type fe_ge_madd,@function
  1085. .align 16
  1086. fe_ge_madd:
  1087. #else
  1088. .section __TEXT,__text
  1089. .globl _fe_ge_madd
  1090. .p2align 4
  1091. _fe_ge_madd:
  1092. #endif /* __APPLE__ */
  1093. #ifndef __APPLE__
  1094. jmpq *fe_ge_madd_p(%rip)
  1095. #else
  1096. jmpq *_fe_ge_madd_p(%rip)
  1097. #endif /* __APPLE__ */
  1098. #ifndef __APPLE__
  1099. .size fe_ge_madd,.-fe_ge_madd
  1100. #endif /* __APPLE__ */
  1101. #ifndef __APPLE__
  1102. .text
  1103. .globl fe_ge_msub
  1104. .type fe_ge_msub,@function
  1105. .align 16
  1106. fe_ge_msub:
  1107. #else
  1108. .section __TEXT,__text
  1109. .globl _fe_ge_msub
  1110. .p2align 4
  1111. _fe_ge_msub:
  1112. #endif /* __APPLE__ */
  1113. #ifndef __APPLE__
  1114. jmpq *fe_ge_msub_p(%rip)
  1115. #else
  1116. jmpq *_fe_ge_msub_p(%rip)
  1117. #endif /* __APPLE__ */
  1118. #ifndef __APPLE__
  1119. .size fe_ge_msub,.-fe_ge_msub
  1120. #endif /* __APPLE__ */
  1121. #ifndef __APPLE__
  1122. .text
  1123. .globl fe_ge_add
  1124. .type fe_ge_add,@function
  1125. .align 16
  1126. fe_ge_add:
  1127. #else
  1128. .section __TEXT,__text
  1129. .globl _fe_ge_add
  1130. .p2align 4
  1131. _fe_ge_add:
  1132. #endif /* __APPLE__ */
  1133. #ifndef __APPLE__
  1134. jmpq *fe_ge_add_p(%rip)
  1135. #else
  1136. jmpq *_fe_ge_add_p(%rip)
  1137. #endif /* __APPLE__ */
  1138. #ifndef __APPLE__
  1139. .size fe_ge_add,.-fe_ge_add
  1140. #endif /* __APPLE__ */
  1141. #ifndef __APPLE__
  1142. .text
  1143. .globl fe_ge_sub
  1144. .type fe_ge_sub,@function
  1145. .align 16
  1146. fe_ge_sub:
  1147. #else
  1148. .section __TEXT,__text
  1149. .globl _fe_ge_sub
  1150. .p2align 4
  1151. _fe_ge_sub:
  1152. #endif /* __APPLE__ */
  1153. #ifndef __APPLE__
  1154. jmpq *fe_ge_sub_p(%rip)
  1155. #else
  1156. jmpq *_fe_ge_sub_p(%rip)
  1157. #endif /* __APPLE__ */
  1158. #ifndef __APPLE__
  1159. .size fe_ge_sub,.-fe_ge_sub
  1160. #endif /* __APPLE__ */
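# Runtime-dispatch data: cpuFlagsSet and intelFlags cache the CPUID query, and
# each *_p pointer below starts out at the plain x64 implementation until
# fe_init selects the AVX2 variant.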
  1161. #ifndef __APPLE__
  1162. .data
  1163. .type cpuFlagsSet, @object
  1164. .size cpuFlagsSet,4
  1165. cpuFlagsSet:
  1166. .long 0
  1167. #else
  1168. .section __DATA,__data
  1169. .p2align 2
  1170. _cpuFlagsSet:
  1171. .long 0
  1172. #endif /* __APPLE__ */
  1173. #ifndef __APPLE__
  1174. .data
  1175. .type intelFlags, @object
  1176. .size intelFlags,4
  1177. intelFlags:
  1178. .long 0
  1179. #else
  1180. .section __DATA,__data
  1181. .p2align 2
  1182. _intelFlags:
  1183. .long 0
  1184. #endif /* __APPLE__ */
  1185. #ifndef __APPLE__
  1186. .data
  1187. .type fe_mul_p, @object
  1188. .size fe_mul_p,8
  1189. fe_mul_p:
  1190. .quad fe_mul_x64
  1191. #else
  1192. .section __DATA,__data
  1193. .p2align 2
  1194. _fe_mul_p:
  1195. .quad _fe_mul_x64
  1196. #endif /* __APPLE__ */
  1197. #ifndef __APPLE__
  1198. .data
  1199. .type fe_sq_p, @object
  1200. .size fe_sq_p,8
  1201. fe_sq_p:
  1202. .quad fe_sq_x64
  1203. #else
  1204. .section __DATA,__data
  1205. .p2align 2
  1206. _fe_sq_p:
  1207. .quad _fe_sq_x64
  1208. #endif /* __APPLE__ */
  1209. #ifndef __APPLE__
  1210. .data
  1211. .type fe_mul121666_p, @object
  1212. .size fe_mul121666_p,8
  1213. fe_mul121666_p:
  1214. .quad fe_mul121666_x64
  1215. #else
  1216. .section __DATA,__data
  1217. .p2align 2
  1218. _fe_mul121666_p:
  1219. .quad _fe_mul121666_x64
  1220. #endif /* __APPLE__ */
  1221. #ifndef __APPLE__
  1222. .data
  1223. .type fe_sq2_p, @object
  1224. .size fe_sq2_p,8
  1225. fe_sq2_p:
  1226. .quad fe_sq2_x64
  1227. #else
  1228. .section __DATA,__data
  1229. .p2align 2
  1230. _fe_sq2_p:
  1231. .quad _fe_sq2_x64
  1232. #endif /* __APPLE__ */
  1233. #ifndef __APPLE__
  1234. .data
  1235. .type fe_invert_p, @object
  1236. .size fe_invert_p,8
  1237. fe_invert_p:
  1238. .quad fe_invert_x64
  1239. #else
  1240. .section __DATA,__data
  1241. .p2align 2
  1242. _fe_invert_p:
  1243. .quad _fe_invert_x64
  1244. #endif /* __APPLE__ */
  1245. #ifndef __APPLE__
  1246. .data
  1247. .type curve25519_p, @object
  1248. .size curve25519_p,8
  1249. curve25519_p:
  1250. .quad curve25519_x64
  1251. #else
  1252. .section __DATA,__data
  1253. .p2align 2
  1254. _curve25519_p:
  1255. .quad _curve25519_x64
  1256. #endif /* __APPLE__ */
  1257. #ifndef __APPLE__
  1258. .data
  1259. .type fe_pow22523_p, @object
  1260. .size fe_pow22523_p,8
  1261. fe_pow22523_p:
  1262. .quad fe_pow22523_x64
  1263. #else
  1264. .section __DATA,__data
  1265. .p2align 2
  1266. _fe_pow22523_p:
  1267. .quad _fe_pow22523_x64
  1268. #endif /* __APPLE__ */
  1269. #ifndef __APPLE__
  1270. .data
  1271. .type fe_ge_to_p2_p, @object
  1272. .size fe_ge_to_p2_p,8
  1273. fe_ge_to_p2_p:
  1274. .quad fe_ge_to_p2_x64
  1275. #else
  1276. .section __DATA,__data
  1277. .p2align 2
  1278. _fe_ge_to_p2_p:
  1279. .quad _fe_ge_to_p2_x64
  1280. #endif /* __APPLE__ */
  1281. #ifndef __APPLE__
  1282. .data
  1283. .type fe_ge_to_p3_p, @object
  1284. .size fe_ge_to_p3_p,8
  1285. fe_ge_to_p3_p:
  1286. .quad fe_ge_to_p3_x64
  1287. #else
  1288. .section __DATA,__data
  1289. .p2align 2
  1290. _fe_ge_to_p3_p:
  1291. .quad _fe_ge_to_p3_x64
  1292. #endif /* __APPLE__ */
  1293. #ifndef __APPLE__
  1294. .data
  1295. .type fe_ge_dbl_p, @object
  1296. .size fe_ge_dbl_p,8
  1297. fe_ge_dbl_p:
  1298. .quad fe_ge_dbl_x64
  1299. #else
  1300. .section __DATA,__data
  1301. .p2align 2
  1302. _fe_ge_dbl_p:
  1303. .quad _fe_ge_dbl_x64
  1304. #endif /* __APPLE__ */
  1305. #ifndef __APPLE__
  1306. .data
  1307. .type fe_ge_madd_p, @object
  1308. .size fe_ge_madd_p,8
  1309. fe_ge_madd_p:
  1310. .quad fe_ge_madd_x64
  1311. #else
  1312. .section __DATA,__data
  1313. .p2align 2
  1314. _fe_ge_madd_p:
  1315. .quad _fe_ge_madd_x64
  1316. #endif /* __APPLE__ */
  1317. #ifndef __APPLE__
  1318. .data
  1319. .type fe_ge_msub_p, @object
  1320. .size fe_ge_msub_p,8
  1321. fe_ge_msub_p:
  1322. .quad fe_ge_msub_x64
  1323. #else
  1324. .section __DATA,__data
  1325. .p2align 2
  1326. _fe_ge_msub_p:
  1327. .quad _fe_ge_msub_x64
  1328. #endif /* __APPLE__ */
  1329. #ifndef __APPLE__
  1330. .data
  1331. .type fe_ge_add_p, @object
  1332. .size fe_ge_add_p,8
  1333. fe_ge_add_p:
  1334. .quad fe_ge_add_x64
  1335. #else
  1336. .section __DATA,__data
  1337. .p2align 2
  1338. _fe_ge_add_p:
  1339. .quad _fe_ge_add_x64
  1340. #endif /* __APPLE__ */
  1341. #ifndef __APPLE__
  1342. .data
  1343. .type fe_ge_sub_p, @object
  1344. .size fe_ge_sub_p,8
  1345. fe_ge_sub_p:
  1346. .quad fe_ge_sub_x64
  1347. #else
  1348. .section __DATA,__data
  1349. .p2align 2
  1350. _fe_ge_sub_p:
  1351. .quad _fe_ge_sub_x64
  1352. #endif /* __APPLE__ */
  1353. #ifndef __APPLE__
  1354. .text
  1355. .globl fe_mul_x64
  1356. .type fe_mul_x64,@function
  1357. .align 16
  1358. fe_mul_x64:
  1359. #else
  1360. .section __TEXT,__text
  1361. .globl _fe_mul_x64
  1362. .p2align 4
  1363. _fe_mul_x64:
  1364. #endif /* __APPLE__ */
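# Generic x86_64 field multiplication: a 4x4 schoolbook multiply with mulq
# builds the 512-bit product in %r8-%r15, then the high 256 bits (shifted up
# by one so that bit 255 of the low half joins them) are multiplied by 19 and
# folded back in, since 2^255 = 19 (mod 2^255-19). A final carry fold and a
# conditional add of 19 leave the result below 2^255, though not necessarily
# fully reduced.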
  1365. pushq %r12
  1366. pushq %r13
  1367. pushq %r14
  1368. pushq %r15
  1369. pushq %rbx
  1370. movq %rdx, %rcx
  1371. # Multiply
  1372. # A[0] * B[0]
  1373. movq (%rcx), %rax
  1374. mulq (%rsi)
  1375. movq %rax, %r8
  1376. movq %rdx, %r9
  1377. # A[0] * B[1]
  1378. movq 8(%rcx), %rax
  1379. mulq (%rsi)
  1380. xorq %r10, %r10
  1381. addq %rax, %r9
  1382. adcq %rdx, %r10
  1383. # A[1] * B[0]
  1384. movq (%rcx), %rax
  1385. mulq 8(%rsi)
  1386. xorq %r11, %r11
  1387. addq %rax, %r9
  1388. adcq %rdx, %r10
  1389. adcq $0x00, %r11
  1390. # A[0] * B[2]
  1391. movq 16(%rcx), %rax
  1392. mulq (%rsi)
  1393. addq %rax, %r10
  1394. adcq %rdx, %r11
  1395. # A[1] * B[1]
  1396. movq 8(%rcx), %rax
  1397. mulq 8(%rsi)
  1398. xorq %r12, %r12
  1399. addq %rax, %r10
  1400. adcq %rdx, %r11
  1401. adcq $0x00, %r12
  1402. # A[2] * B[0]
  1403. movq (%rcx), %rax
  1404. mulq 16(%rsi)
  1405. addq %rax, %r10
  1406. adcq %rdx, %r11
  1407. adcq $0x00, %r12
  1408. # A[0] * B[3]
  1409. movq 24(%rcx), %rax
  1410. mulq (%rsi)
  1411. xorq %r13, %r13
  1412. addq %rax, %r11
  1413. adcq %rdx, %r12
  1414. adcq $0x00, %r13
  1415. # A[1] * B[2]
  1416. movq 16(%rcx), %rax
  1417. mulq 8(%rsi)
  1418. addq %rax, %r11
  1419. adcq %rdx, %r12
  1420. adcq $0x00, %r13
  1421. # A[2] * B[1]
  1422. movq 8(%rcx), %rax
  1423. mulq 16(%rsi)
  1424. addq %rax, %r11
  1425. adcq %rdx, %r12
  1426. adcq $0x00, %r13
  1427. # A[3] * B[0]
  1428. movq (%rcx), %rax
  1429. mulq 24(%rsi)
  1430. addq %rax, %r11
  1431. adcq %rdx, %r12
  1432. adcq $0x00, %r13
  1433. # A[1] * B[3]
  1434. movq 24(%rcx), %rax
  1435. mulq 8(%rsi)
  1436. xorq %r14, %r14
  1437. addq %rax, %r12
  1438. adcq %rdx, %r13
  1439. adcq $0x00, %r14
  1440. # A[2] * B[2]
  1441. movq 16(%rcx), %rax
  1442. mulq 16(%rsi)
  1443. addq %rax, %r12
  1444. adcq %rdx, %r13
  1445. adcq $0x00, %r14
  1446. # A[3] * B[1]
  1447. movq 8(%rcx), %rax
  1448. mulq 24(%rsi)
  1449. addq %rax, %r12
  1450. adcq %rdx, %r13
  1451. adcq $0x00, %r14
  1452. # A[2] * B[3]
  1453. movq 24(%rcx), %rax
  1454. mulq 16(%rsi)
  1455. xorq %r15, %r15
  1456. addq %rax, %r13
  1457. adcq %rdx, %r14
  1458. adcq $0x00, %r15
  1459. # A[3] * B[2]
  1460. movq 16(%rcx), %rax
  1461. mulq 24(%rsi)
  1462. addq %rax, %r13
  1463. adcq %rdx, %r14
  1464. adcq $0x00, %r15
  1465. # A[3] * B[3]
  1466. movq 24(%rcx), %rax
  1467. mulq 24(%rsi)
  1468. addq %rax, %r14
  1469. adcq %rdx, %r15
  1470. # Reduce
  1471. movq $0x7fffffffffffffff, %rbx
  1472. # Move top half into t4-t7 and remove top bit from t3
  1473. shldq $0x01, %r14, %r15
  1474. shldq $0x01, %r13, %r14
  1475. shldq $0x01, %r12, %r13
  1476. shldq $0x01, %r11, %r12
  1477. andq %rbx, %r11
  1478. # Multiply top half by 19
  1479. movq $19, %rax
  1480. mulq %r12
  1481. xorq %r12, %r12
  1482. addq %rax, %r8
  1483. movq $19, %rax
  1484. adcq %rdx, %r12
  1485. mulq %r13
  1486. xorq %r13, %r13
  1487. addq %rax, %r9
  1488. movq $19, %rax
  1489. adcq %rdx, %r13
  1490. mulq %r14
  1491. xorq %r14, %r14
  1492. addq %rax, %r10
  1493. movq $19, %rax
  1494. adcq %rdx, %r14
  1495. mulq %r15
  1496. # Add remaining product results in
  1497. addq %r12, %r9
  1498. adcq %r13, %r10
  1499. adcq %r14, %r11
  1500. adcq %rax, %r11
  1501. adcq $0x00, %rdx
  1502. # Overflow
  1503. shldq $0x01, %r11, %rdx
  1504. imulq $19, %rdx, %rax
  1505. andq %rbx, %r11
  1506. addq %rax, %r8
  1507. adcq $0x00, %r9
  1508. adcq $0x00, %r10
  1509. adcq $0x00, %r11
  1510. # Reduce if top bit set
  1511. movq %r11, %rdx
  1512. sarq $63, %rdx
  1513. andq $19, %rdx
  1514. andq %rbx, %r11
  1515. addq %rdx, %r8
  1516. adcq $0x00, %r9
  1517. adcq $0x00, %r10
  1518. adcq $0x00, %r11
  1519. # Store
  1520. movq %r8, (%rdi)
  1521. movq %r9, 8(%rdi)
  1522. movq %r10, 16(%rdi)
  1523. movq %r11, 24(%rdi)
  1524. popq %rbx
  1525. popq %r15
  1526. popq %r14
  1527. popq %r13
  1528. popq %r12
  1529. repz retq
  1530. #ifndef __APPLE__
  1531. .size fe_mul_x64,.-fe_mul_x64
  1532. #endif /* __APPLE__ */
  1533. #ifndef __APPLE__
  1534. .text
  1535. .globl fe_sq_x64
  1536. .type fe_sq_x64,@function
  1537. .align 16
  1538. fe_sq_x64:
  1539. #else
  1540. .section __TEXT,__text
  1541. .globl _fe_sq_x64
  1542. .p2align 4
  1543. _fe_sq_x64:
  1544. #endif /* __APPLE__ */
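# Squaring: only the off-diagonal products A[i]*A[j] (i < j) are computed with
# mulq, doubled in a single carry chain, and then the diagonal squares are
# added in. The 512-bit result is reduced exactly as in fe_mul_x64.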
  1545. pushq %r12
  1546. pushq %r13
  1547. pushq %r14
  1548. pushq %r15
  1549. # Square
  1550. # A[0] * A[1]
  1551. movq (%rsi), %rax
  1552. mulq 8(%rsi)
  1553. movq %rax, %r8
  1554. movq %rdx, %r9
  1555. # A[0] * A[2]
  1556. movq (%rsi), %rax
  1557. mulq 16(%rsi)
  1558. xorq %r10, %r10
  1559. addq %rax, %r9
  1560. adcq %rdx, %r10
  1561. # A[0] * A[3]
  1562. movq (%rsi), %rax
  1563. mulq 24(%rsi)
  1564. xorq %r11, %r11
  1565. addq %rax, %r10
  1566. adcq %rdx, %r11
  1567. # A[1] * A[2]
  1568. movq 8(%rsi), %rax
  1569. mulq 16(%rsi)
  1570. xorq %r12, %r12
  1571. addq %rax, %r10
  1572. adcq %rdx, %r11
  1573. adcq $0x00, %r12
  1574. # A[1] * A[3]
  1575. movq 8(%rsi), %rax
  1576. mulq 24(%rsi)
  1577. addq %rax, %r11
  1578. adcq %rdx, %r12
  1579. # A[2] * A[3]
  1580. movq 16(%rsi), %rax
  1581. mulq 24(%rsi)
  1582. xorq %r13, %r13
  1583. addq %rax, %r12
  1584. adcq %rdx, %r13
  1585. # Double
  1586. xorq %r14, %r14
  1587. addq %r8, %r8
  1588. adcq %r9, %r9
  1589. adcq %r10, %r10
  1590. adcq %r11, %r11
  1591. adcq %r12, %r12
  1592. adcq %r13, %r13
  1593. adcq $0x00, %r14
  1594. # A[0] * A[0]
  1595. movq (%rsi), %rax
  1596. mulq %rax
  1597. movq %rax, %rcx
  1598. movq %rdx, %r15
  1599. # A[1] * A[1]
  1600. movq 8(%rsi), %rax
  1601. mulq %rax
  1602. addq %r15, %r8
  1603. adcq %rax, %r9
  1604. adcq $0x00, %rdx
  1605. movq %rdx, %r15
  1606. # A[2] * A[2]
  1607. movq 16(%rsi), %rax
  1608. mulq %rax
  1609. addq %r15, %r10
  1610. adcq %rax, %r11
  1611. adcq $0x00, %rdx
  1612. movq %rdx, %r15
  1613. # A[3] * A[3]
  1614. movq 24(%rsi), %rax
  1615. mulq %rax
  1616. addq %rax, %r13
  1617. adcq %rdx, %r14
  1618. addq %r15, %r12
  1619. adcq $0x00, %r13
  1620. adcq $0x00, %r14
  1621. # Reduce
  1622. movq $0x7fffffffffffffff, %r15
  1623. # Move top half into t4-t7 and remove top bit from t3
  1624. shldq $0x01, %r13, %r14
  1625. shldq $0x01, %r12, %r13
  1626. shldq $0x01, %r11, %r12
  1627. shldq $0x01, %r10, %r11
  1628. andq %r15, %r10
  1629. # Multiply top half by 19
  1630. movq $19, %rax
  1631. mulq %r11
  1632. xorq %r11, %r11
  1633. addq %rax, %rcx
  1634. movq $19, %rax
  1635. adcq %rdx, %r11
  1636. mulq %r12
  1637. xorq %r12, %r12
  1638. addq %rax, %r8
  1639. movq $19, %rax
  1640. adcq %rdx, %r12
  1641. mulq %r13
  1642. xorq %r13, %r13
  1643. addq %rax, %r9
  1644. movq $19, %rax
  1645. adcq %rdx, %r13
  1646. mulq %r14
  1647. # Add remaining product results in
  1648. addq %r11, %r8
  1649. adcq %r12, %r9
  1650. adcq %r13, %r10
  1651. adcq %rax, %r10
  1652. adcq $0x00, %rdx
  1653. # Overflow
  1654. shldq $0x01, %r10, %rdx
  1655. imulq $19, %rdx, %rax
  1656. andq %r15, %r10
  1657. addq %rax, %rcx
  1658. adcq $0x00, %r8
  1659. adcq $0x00, %r9
  1660. adcq $0x00, %r10
  1661. # Reduce if top bit set
  1662. movq %r10, %rdx
  1663. sarq $63, %rdx
  1664. andq $19, %rdx
  1665. andq %r15, %r10
  1666. addq %rdx, %rcx
  1667. adcq $0x00, %r8
  1668. adcq $0x00, %r9
  1669. adcq $0x00, %r10
  1670. # Store
  1671. movq %rcx, (%rdi)
  1672. movq %r8, 8(%rdi)
  1673. movq %r9, 16(%rdi)
  1674. movq %r10, 24(%rdi)
  1675. popq %r15
  1676. popq %r14
  1677. popq %r13
  1678. popq %r12
  1679. repz retq
  1680. #ifndef __APPLE__
  1681. .size fe_sq_x64,.-fe_sq_x64
  1682. #endif /* __APPLE__ */
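# fe_sq_n_x64 squares a field element %rdx times.  Each pass of the loop
# below reads %rsi and writes %rdi, so repeated squaring only makes sense
# when the caller passes the same buffer for both, which is how the invert
# code in this file uses it.  The count is kept in %cl and decremented with
# decb, so it is expected to be small (4, 9, 19, 49 or 99 here).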
  1683. #ifndef __APPLE__
  1684. .text
  1685. .globl fe_sq_n_x64
  1686. .type fe_sq_n_x64,@function
  1687. .align 16
  1688. fe_sq_n_x64:
  1689. #else
  1690. .section __TEXT,__text
  1691. .globl _fe_sq_n_x64
  1692. .p2align 4
  1693. _fe_sq_n_x64:
  1694. #endif /* __APPLE__ */
  1695. pushq %r12
  1696. pushq %r13
  1697. pushq %r14
  1698. pushq %r15
  1699. pushq %rbx
  1700. movq %rdx, %rcx
  1701. L_fe_sq_n_x64:
  1702. # Square
  1703. # A[0] * A[1]
  1704. movq (%rsi), %rax
  1705. mulq 8(%rsi)
  1706. movq %rax, %r9
  1707. movq %rdx, %r10
  1708. # A[0] * A[2]
  1709. movq (%rsi), %rax
  1710. mulq 16(%rsi)
  1711. xorq %r11, %r11
  1712. addq %rax, %r10
  1713. adcq %rdx, %r11
  1714. # A[0] * A[3]
  1715. movq (%rsi), %rax
  1716. mulq 24(%rsi)
  1717. xorq %r12, %r12
  1718. addq %rax, %r11
  1719. adcq %rdx, %r12
  1720. # A[1] * A[2]
  1721. movq 8(%rsi), %rax
  1722. mulq 16(%rsi)
  1723. xorq %r13, %r13
  1724. addq %rax, %r11
  1725. adcq %rdx, %r12
  1726. adcq $0x00, %r13
  1727. # A[1] * A[3]
  1728. movq 8(%rsi), %rax
  1729. mulq 24(%rsi)
  1730. addq %rax, %r12
  1731. adcq %rdx, %r13
  1732. # A[2] * A[3]
  1733. movq 16(%rsi), %rax
  1734. mulq 24(%rsi)
  1735. xorq %r14, %r14
  1736. addq %rax, %r13
  1737. adcq %rdx, %r14
  1738. # Double
  1739. xorq %r15, %r15
  1740. addq %r9, %r9
  1741. adcq %r10, %r10
  1742. adcq %r11, %r11
  1743. adcq %r12, %r12
  1744. adcq %r13, %r13
  1745. adcq %r14, %r14
  1746. adcq $0x00, %r15
  1747. # A[0] * A[0]
  1748. movq (%rsi), %rax
  1749. mulq %rax
  1750. movq %rax, %r8
  1751. movq %rdx, %rbx
  1752. # A[1] * A[1]
  1753. movq 8(%rsi), %rax
  1754. mulq %rax
  1755. addq %rbx, %r9
  1756. adcq %rax, %r10
  1757. adcq $0x00, %rdx
  1758. movq %rdx, %rbx
  1759. # A[2] * A[2]
  1760. movq 16(%rsi), %rax
  1761. mulq %rax
  1762. addq %rbx, %r11
  1763. adcq %rax, %r12
  1764. adcq $0x00, %rdx
  1765. movq %rdx, %rbx
  1766. # A[3] * A[3]
  1767. movq 24(%rsi), %rax
  1768. mulq %rax
  1769. addq %rax, %r14
  1770. adcq %rdx, %r15
  1771. addq %rbx, %r13
  1772. adcq $0x00, %r14
  1773. adcq $0x00, %r15
  1774. # Reduce
  1775. movq $0x7fffffffffffffff, %rbx
  1776. # Move top half into t4-t7 and remove top bit from t3
  1777. shldq $0x01, %r14, %r15
  1778. shldq $0x01, %r13, %r14
  1779. shldq $0x01, %r12, %r13
  1780. shldq $0x01, %r11, %r12
  1781. andq %rbx, %r11
  1782. # Multiply top half by 19
  1783. movq $19, %rax
  1784. mulq %r12
  1785. xorq %r12, %r12
  1786. addq %rax, %r8
  1787. movq $19, %rax
  1788. adcq %rdx, %r12
  1789. mulq %r13
  1790. xorq %r13, %r13
  1791. addq %rax, %r9
  1792. movq $19, %rax
  1793. adcq %rdx, %r13
  1794. mulq %r14
  1795. xorq %r14, %r14
  1796. addq %rax, %r10
  1797. movq $19, %rax
  1798. adcq %rdx, %r14
  1799. mulq %r15
  1800. # Add remaining product results in
  1801. addq %r12, %r9
  1802. adcq %r13, %r10
  1803. adcq %r14, %r11
  1804. adcq %rax, %r11
  1805. adcq $0x00, %rdx
  1806. # Overflow
  1807. shldq $0x01, %r11, %rdx
  1808. imulq $19, %rdx, %rax
  1809. andq %rbx, %r11
  1810. addq %rax, %r8
  1811. adcq $0x00, %r9
  1812. adcq $0x00, %r10
  1813. adcq $0x00, %r11
  1814. # Reduce if top bit set
  1815. movq %r11, %rdx
  1816. sarq $63, %rdx
  1817. andq $19, %rdx
  1818. andq %rbx, %r11
  1819. addq %rdx, %r8
  1820. adcq $0x00, %r9
  1821. adcq $0x00, %r10
  1822. adcq $0x00, %r11
  1823. # Store
  1824. movq %r8, (%rdi)
  1825. movq %r9, 8(%rdi)
  1826. movq %r10, 16(%rdi)
  1827. movq %r11, 24(%rdi)
  1828. decb %cl
  1829. jnz L_fe_sq_n_x64
  1830. popq %rbx
  1831. popq %r15
  1832. popq %r14
  1833. popq %r13
  1834. popq %r12
  1835. repz retq
  1836. #ifndef __APPLE__
  1837. .size fe_sq_n_x64,.-fe_sq_n_x64
  1838. #endif /* __APPLE__ */
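# fe_mul121666_x64 multiplies the element at %rsi by the Curve25519 ladder
# constant a24 = (486662 + 2) / 4 = 121666 = 0x1db42 and reduces modulo
# 2^255 - 19, writing the result to %rdi.  The product fits in five limbs,
# so a single fold (scale everything above bit 254 by 19) is enough.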
  1839. #ifndef __APPLE__
  1840. .text
  1841. .globl fe_mul121666_x64
  1842. .type fe_mul121666_x64,@function
  1843. .align 16
  1844. fe_mul121666_x64:
  1845. #else
  1846. .section __TEXT,__text
  1847. .globl _fe_mul121666_x64
  1848. .p2align 4
  1849. _fe_mul121666_x64:
  1850. #endif /* __APPLE__ */
  1851. pushq %r12
  1852. # Multiply by 121666
  1853. movq $0x1db42, %rax
  1854. mulq (%rsi)
  1855. xorq %r10, %r10
  1856. movq %rax, %r8
  1857. movq %rdx, %r9
  1858. movq $0x1db42, %rax
  1859. mulq 8(%rsi)
  1860. xorq %r11, %r11
  1861. addq %rax, %r9
  1862. adcq %rdx, %r10
  1863. movq $0x1db42, %rax
  1864. mulq 16(%rsi)
  1865. xorq %r12, %r12
  1866. addq %rax, %r10
  1867. adcq %rdx, %r11
  1868. movq $0x1db42, %rax
  1869. mulq 24(%rsi)
  1870. movq $0x7fffffffffffffff, %rcx
  1871. addq %rax, %r11
  1872. adcq %rdx, %r12
  1873. shldq $0x01, %r11, %r12
  1874. andq %rcx, %r11
  1875. movq $19, %rax
  1876. mulq %r12
  1877. addq %rax, %r8
  1878. adcq $0x00, %r9
  1879. adcq $0x00, %r10
  1880. adcq $0x00, %r11
  1881. movq %r8, (%rdi)
  1882. movq %r9, 8(%rdi)
  1883. movq %r10, 16(%rdi)
  1884. movq %r11, 24(%rdi)
  1885. popq %r12
  1886. repz retq
  1887. #ifndef __APPLE__
  1888. .size fe_mul121666_x64,.-fe_mul121666_x64
  1889. #endif /* __APPLE__ */
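# fe_sq2_x64 computes 2*a^2 mod 2^255 - 19 for the element at %rsi and writes
# it to %rdi.  The squaring matches fe_sq_x64; the extra doubling is folded
# into the reduction shifts (the upper limbs are shifted one bit further than
# in fe_sq_x64), which is why the very top bits are scaled by 19*19 below.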
  1890. #ifndef __APPLE__
  1891. .text
  1892. .globl fe_sq2_x64
  1893. .type fe_sq2_x64,@function
  1894. .align 16
  1895. fe_sq2_x64:
  1896. #else
  1897. .section __TEXT,__text
  1898. .globl _fe_sq2_x64
  1899. .p2align 4
  1900. _fe_sq2_x64:
  1901. #endif /* __APPLE__ */
  1902. pushq %r12
  1903. pushq %r13
  1904. pushq %r14
  1905. pushq %r15
  1906. pushq %rbx
  1907. # Square * 2
  1908. # A[0] * A[1]
  1909. movq (%rsi), %rax
  1910. mulq 8(%rsi)
  1911. movq %rax, %r8
  1912. movq %rdx, %r9
  1913. # A[0] * A[2]
  1914. movq (%rsi), %rax
  1915. mulq 16(%rsi)
  1916. xorq %r10, %r10
  1917. addq %rax, %r9
  1918. adcq %rdx, %r10
  1919. # A[0] * A[3]
  1920. movq (%rsi), %rax
  1921. mulq 24(%rsi)
  1922. xorq %r11, %r11
  1923. addq %rax, %r10
  1924. adcq %rdx, %r11
  1925. # A[1] * A[2]
  1926. movq 8(%rsi), %rax
  1927. mulq 16(%rsi)
  1928. xorq %r12, %r12
  1929. addq %rax, %r10
  1930. adcq %rdx, %r11
  1931. adcq $0x00, %r12
  1932. # A[1] * A[3]
  1933. movq 8(%rsi), %rax
  1934. mulq 24(%rsi)
  1935. addq %rax, %r11
  1936. adcq %rdx, %r12
  1937. # A[2] * A[3]
  1938. movq 16(%rsi), %rax
  1939. mulq 24(%rsi)
  1940. xorq %r13, %r13
  1941. addq %rax, %r12
  1942. adcq %rdx, %r13
  1943. # Double
  1944. xorq %r14, %r14
  1945. addq %r8, %r8
  1946. adcq %r9, %r9
  1947. adcq %r10, %r10
  1948. adcq %r11, %r11
  1949. adcq %r12, %r12
  1950. adcq %r13, %r13
  1951. adcq $0x00, %r14
  1952. # A[0] * A[0]
  1953. movq (%rsi), %rax
  1954. mulq %rax
  1955. movq %rax, %rcx
  1956. movq %rdx, %r15
  1957. # A[1] * A[1]
  1958. movq 8(%rsi), %rax
  1959. mulq %rax
  1960. addq %r15, %r8
  1961. adcq %rax, %r9
  1962. adcq $0x00, %rdx
  1963. movq %rdx, %r15
  1964. # A[2] * A[2]
  1965. movq 16(%rsi), %rax
  1966. mulq %rax
  1967. addq %r15, %r10
  1968. adcq %rax, %r11
  1969. adcq $0x00, %rdx
  1970. movq %rdx, %r15
  1971. # A[3] * A[3]
  1972. movq 24(%rsi), %rax
  1973. mulq %rax
  1974. addq %rax, %r13
  1975. adcq %rdx, %r14
  1976. addq %r15, %r12
  1977. adcq $0x00, %r13
  1978. adcq $0x00, %r14
  1979. # Reduce
  1980. movq $0x7fffffffffffffff, %rbx
  1981. xorq %rax, %rax
  1982. # Move top half into t4-t7 and remove top bit from t3
  1983. shldq $3, %r14, %rax
  1984. shldq $2, %r13, %r14
  1985. shldq $2, %r12, %r13
  1986. shldq $2, %r11, %r12
  1987. shldq $2, %r10, %r11
  1988. shldq $0x01, %r9, %r10
  1989. shldq $0x01, %r8, %r9
  1990. shldq $0x01, %rcx, %r8
  1991. shlq $0x01, %rcx
  1992. andq %rbx, %r10
  1993. # Two out left, one in right
  1994. andq %rbx, %r14
  1995. # Multiply top bits by 19*19
  1996. imulq $0x169, %rax, %r15
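# 0x169 = 361 = 19*19: the three bits gathered into %rax sit at weight 2^510
# and above of the doubled square, i.e. two factors of 2^255 up, and each
# factor of 2^255 reduces to 19.  The mask on %r14 just above also clears the
# copy of the lowest of those bits left behind by the 2-bit shift.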
  1997. # Multiply top half by 19
  1998. movq $19, %rax
  1999. mulq %r11
  2000. xorq %r11, %r11
  2001. addq %rax, %rcx
  2002. movq $19, %rax
  2003. adcq %rdx, %r11
  2004. mulq %r12
  2005. xorq %r12, %r12
  2006. addq %rax, %r8
  2007. movq $19, %rax
  2008. adcq %rdx, %r12
  2009. mulq %r13
  2010. xorq %r13, %r13
  2011. addq %rax, %r9
  2012. movq $19, %rax
  2013. adcq %rdx, %r13
  2014. mulq %r14
# Add remaining product results in
  2016. addq %r15, %rcx
  2017. adcq %r11, %r8
  2018. adcq %r12, %r9
  2019. adcq %r13, %r10
  2020. adcq %rax, %r10
  2021. adcq $0x00, %rdx
  2022. # Overflow
  2023. shldq $0x01, %r10, %rdx
  2024. imulq $19, %rdx, %rax
  2025. andq %rbx, %r10
  2026. addq %rax, %rcx
  2027. adcq $0x00, %r8
  2028. adcq $0x00, %r9
  2029. adcq $0x00, %r10
  2030. # Reduce if top bit set
  2031. movq %r10, %rdx
  2032. sarq $63, %rdx
  2033. andq $19, %rdx
  2034. andq %rbx, %r10
  2035. addq %rdx, %rcx
  2036. adcq $0x00, %r8
  2037. adcq $0x00, %r9
  2038. adcq $0x00, %r10
  2039. # Store
  2040. movq %rcx, (%rdi)
  2041. movq %r8, 8(%rdi)
  2042. movq %r9, 16(%rdi)
  2043. movq %r10, 24(%rdi)
  2044. popq %rbx
  2045. popq %r15
  2046. popq %r14
  2047. popq %r13
  2048. popq %r12
  2049. repz retq
  2050. #ifndef __APPLE__
  2051. .size fe_sq2_x64,.-fe_sq2_x64
  2052. #endif /* __APPLE__ */
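# fe_invert_x64 computes a^(p-2) mod p = 2^255 - 19, i.e. the inverse of the
# element at %rsi, using the standard Curve25519 addition chain
# (254 squarings, 11 multiplications).  Four temporaries live at 0, 32, 64
# and 96(%rsp); the output and input pointers are kept at 128(%rsp) and
# 136(%rsp) across the calls.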
  2053. #ifndef __APPLE__
  2054. .text
  2055. .globl fe_invert_x64
  2056. .type fe_invert_x64,@function
  2057. .align 16
  2058. fe_invert_x64:
  2059. #else
  2060. .section __TEXT,__text
  2061. .globl _fe_invert_x64
  2062. .p2align 4
  2063. _fe_invert_x64:
  2064. #endif /* __APPLE__ */
  2065. subq $0x90, %rsp
  2066. # Invert
  2067. movq %rdi, 128(%rsp)
  2068. movq %rsi, 136(%rsp)
  2069. movq %rsp, %rdi
  2070. movq 136(%rsp), %rsi
  2071. #ifndef __APPLE__
  2072. callq fe_sq_x64@plt
  2073. #else
  2074. callq _fe_sq_x64
  2075. #endif /* __APPLE__ */
  2076. leaq 32(%rsp), %rdi
  2077. movq %rsp, %rsi
  2078. #ifndef __APPLE__
  2079. callq fe_sq_x64@plt
  2080. #else
  2081. callq _fe_sq_x64
  2082. #endif /* __APPLE__ */
  2083. leaq 32(%rsp), %rdi
  2084. leaq 32(%rsp), %rsi
  2085. #ifndef __APPLE__
  2086. callq fe_sq_x64@plt
  2087. #else
  2088. callq _fe_sq_x64
  2089. #endif /* __APPLE__ */
  2090. leaq 32(%rsp), %rdi
  2091. movq 136(%rsp), %rsi
  2092. leaq 32(%rsp), %rdx
  2093. #ifndef __APPLE__
  2094. callq fe_mul_x64@plt
  2095. #else
  2096. callq _fe_mul_x64
  2097. #endif /* __APPLE__ */
  2098. movq %rsp, %rdi
  2099. movq %rsp, %rsi
  2100. leaq 32(%rsp), %rdx
  2101. #ifndef __APPLE__
  2102. callq fe_mul_x64@plt
  2103. #else
  2104. callq _fe_mul_x64
  2105. #endif /* __APPLE__ */
  2106. leaq 64(%rsp), %rdi
  2107. movq %rsp, %rsi
  2108. #ifndef __APPLE__
  2109. callq fe_sq_x64@plt
  2110. #else
  2111. callq _fe_sq_x64
  2112. #endif /* __APPLE__ */
  2113. leaq 32(%rsp), %rdi
  2114. leaq 32(%rsp), %rsi
  2115. leaq 64(%rsp), %rdx
  2116. #ifndef __APPLE__
  2117. callq fe_mul_x64@plt
  2118. #else
  2119. callq _fe_mul_x64
  2120. #endif /* __APPLE__ */
  2121. leaq 64(%rsp), %rdi
  2122. leaq 32(%rsp), %rsi
  2123. #ifndef __APPLE__
  2124. callq fe_sq_x64@plt
  2125. #else
  2126. callq _fe_sq_x64
  2127. #endif /* __APPLE__ */
  2128. leaq 64(%rsp), %rdi
  2129. leaq 64(%rsp), %rsi
  2130. movq $4, %rdx
  2131. #ifndef __APPLE__
  2132. callq fe_sq_n_x64@plt
  2133. #else
  2134. callq _fe_sq_n_x64
  2135. #endif /* __APPLE__ */
  2136. leaq 32(%rsp), %rdi
  2137. leaq 64(%rsp), %rsi
  2138. leaq 32(%rsp), %rdx
  2139. #ifndef __APPLE__
  2140. callq fe_mul_x64@plt
  2141. #else
  2142. callq _fe_mul_x64
  2143. #endif /* __APPLE__ */
  2144. leaq 64(%rsp), %rdi
  2145. leaq 32(%rsp), %rsi
  2146. #ifndef __APPLE__
  2147. callq fe_sq_x64@plt
  2148. #else
  2149. callq _fe_sq_x64
  2150. #endif /* __APPLE__ */
  2151. leaq 64(%rsp), %rdi
  2152. leaq 64(%rsp), %rsi
  2153. movq $9, %rdx
  2154. #ifndef __APPLE__
  2155. callq fe_sq_n_x64@plt
  2156. #else
  2157. callq _fe_sq_n_x64
  2158. #endif /* __APPLE__ */
  2159. leaq 64(%rsp), %rdi
  2160. leaq 64(%rsp), %rsi
  2161. leaq 32(%rsp), %rdx
  2162. #ifndef __APPLE__
  2163. callq fe_mul_x64@plt
  2164. #else
  2165. callq _fe_mul_x64
  2166. #endif /* __APPLE__ */
  2167. leaq 96(%rsp), %rdi
  2168. leaq 64(%rsp), %rsi
  2169. #ifndef __APPLE__
  2170. callq fe_sq_x64@plt
  2171. #else
  2172. callq _fe_sq_x64
  2173. #endif /* __APPLE__ */
  2174. leaq 96(%rsp), %rdi
  2175. leaq 96(%rsp), %rsi
  2176. movq $19, %rdx
  2177. #ifndef __APPLE__
  2178. callq fe_sq_n_x64@plt
  2179. #else
  2180. callq _fe_sq_n_x64
  2181. #endif /* __APPLE__ */
  2182. leaq 64(%rsp), %rdi
  2183. leaq 96(%rsp), %rsi
  2184. leaq 64(%rsp), %rdx
  2185. #ifndef __APPLE__
  2186. callq fe_mul_x64@plt
  2187. #else
  2188. callq _fe_mul_x64
  2189. #endif /* __APPLE__ */
  2190. leaq 64(%rsp), %rdi
  2191. leaq 64(%rsp), %rsi
  2192. #ifndef __APPLE__
  2193. callq fe_sq_x64@plt
  2194. #else
  2195. callq _fe_sq_x64
  2196. #endif /* __APPLE__ */
  2197. leaq 64(%rsp), %rdi
  2198. leaq 64(%rsp), %rsi
  2199. movq $9, %rdx
  2200. #ifndef __APPLE__
  2201. callq fe_sq_n_x64@plt
  2202. #else
  2203. callq _fe_sq_n_x64
  2204. #endif /* __APPLE__ */
  2205. leaq 32(%rsp), %rdi
  2206. leaq 64(%rsp), %rsi
  2207. leaq 32(%rsp), %rdx
  2208. #ifndef __APPLE__
  2209. callq fe_mul_x64@plt
  2210. #else
  2211. callq _fe_mul_x64
  2212. #endif /* __APPLE__ */
  2213. leaq 64(%rsp), %rdi
  2214. leaq 32(%rsp), %rsi
  2215. #ifndef __APPLE__
  2216. callq fe_sq_x64@plt
  2217. #else
  2218. callq _fe_sq_x64
  2219. #endif /* __APPLE__ */
  2220. leaq 64(%rsp), %rdi
  2221. leaq 64(%rsp), %rsi
  2222. movq $49, %rdx
  2223. #ifndef __APPLE__
  2224. callq fe_sq_n_x64@plt
  2225. #else
  2226. callq _fe_sq_n_x64
  2227. #endif /* __APPLE__ */
  2228. leaq 64(%rsp), %rdi
  2229. leaq 64(%rsp), %rsi
  2230. leaq 32(%rsp), %rdx
  2231. #ifndef __APPLE__
  2232. callq fe_mul_x64@plt
  2233. #else
  2234. callq _fe_mul_x64
  2235. #endif /* __APPLE__ */
  2236. leaq 96(%rsp), %rdi
  2237. leaq 64(%rsp), %rsi
  2238. #ifndef __APPLE__
  2239. callq fe_sq_x64@plt
  2240. #else
  2241. callq _fe_sq_x64
  2242. #endif /* __APPLE__ */
  2243. leaq 96(%rsp), %rdi
  2244. leaq 96(%rsp), %rsi
  2245. movq $0x63, %rdx
  2246. #ifndef __APPLE__
  2247. callq fe_sq_n_x64@plt
  2248. #else
  2249. callq _fe_sq_n_x64
  2250. #endif /* __APPLE__ */
  2251. leaq 64(%rsp), %rdi
  2252. leaq 96(%rsp), %rsi
  2253. leaq 64(%rsp), %rdx
  2254. #ifndef __APPLE__
  2255. callq fe_mul_x64@plt
  2256. #else
  2257. callq _fe_mul_x64
  2258. #endif /* __APPLE__ */
  2259. leaq 64(%rsp), %rdi
  2260. leaq 64(%rsp), %rsi
  2261. #ifndef __APPLE__
  2262. callq fe_sq_x64@plt
  2263. #else
  2264. callq _fe_sq_x64
  2265. #endif /* __APPLE__ */
  2266. leaq 64(%rsp), %rdi
  2267. leaq 64(%rsp), %rsi
  2268. movq $49, %rdx
  2269. #ifndef __APPLE__
  2270. callq fe_sq_n_x64@plt
  2271. #else
  2272. callq _fe_sq_n_x64
  2273. #endif /* __APPLE__ */
  2274. leaq 32(%rsp), %rdi
  2275. leaq 64(%rsp), %rsi
  2276. leaq 32(%rsp), %rdx
  2277. #ifndef __APPLE__
  2278. callq fe_mul_x64@plt
  2279. #else
  2280. callq _fe_mul_x64
  2281. #endif /* __APPLE__ */
  2282. leaq 32(%rsp), %rdi
  2283. leaq 32(%rsp), %rsi
  2284. #ifndef __APPLE__
  2285. callq fe_sq_x64@plt
  2286. #else
  2287. callq _fe_sq_x64
  2288. #endif /* __APPLE__ */
  2289. leaq 32(%rsp), %rdi
  2290. leaq 32(%rsp), %rsi
  2291. movq $4, %rdx
  2292. #ifndef __APPLE__
  2293. callq fe_sq_n_x64@plt
  2294. #else
  2295. callq _fe_sq_n_x64
  2296. #endif /* __APPLE__ */
  2297. movq 128(%rsp), %rdi
  2298. leaq 32(%rsp), %rsi
  2299. movq %rsp, %rdx
  2300. #ifndef __APPLE__
  2301. callq fe_mul_x64@plt
  2302. #else
  2303. callq _fe_mul_x64
  2304. #endif /* __APPLE__ */
  2305. movq 136(%rsp), %rsi
  2306. movq 128(%rsp), %rdi
  2307. addq $0x90, %rsp
  2308. repz retq
#ifndef __APPLE__
.size fe_invert_x64,.-fe_invert_x64
#endif /* __APPLE__ */
#ifndef __APPLE__
  2310. .text
  2311. .globl curve25519_x64
  2312. .type curve25519_x64,@function
  2313. .align 16
  2314. curve25519_x64:
  2315. #else
  2316. .section __TEXT,__text
  2317. .globl _curve25519_x64
  2318. .p2align 4
  2319. _curve25519_x64:
  2320. #endif /* __APPLE__ */
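# curve25519_x64(r, n, a): Montgomery ladder scalar multiplication over
# 2^255 - 19.  %rdi = output x-coordinate, %rsi = 32-byte scalar n (assumed
# already clamped by the caller), %rdx = x-coordinate of the base point
# (moved to %r8).  The ladder state is x2 in the output buffer, z2 at
# 0(%rsp), z3 at 32(%rsp) and x3 at 64(%rsp), updated with constant-time
# conditional swaps driven by the scalar bits.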
  2321. pushq %r12
  2322. pushq %r13
  2323. pushq %r14
  2324. pushq %r15
  2325. pushq %rbx
  2326. pushq %rbp
  2327. movq %rdx, %r8
  2328. subq $0xb8, %rsp
  2329. xorq %rbx, %rbx
  2330. movq %rdi, 176(%rsp)
  2331. # Set one
  2332. movq $0x01, (%rdi)
  2333. movq $0x00, 8(%rdi)
  2334. movq $0x00, 16(%rdi)
  2335. movq $0x00, 24(%rdi)
  2336. # Set zero
  2337. movq $0x00, (%rsp)
  2338. movq $0x00, 8(%rsp)
  2339. movq $0x00, 16(%rsp)
  2340. movq $0x00, 24(%rsp)
  2341. # Set one
  2342. movq $0x01, 32(%rsp)
  2343. movq $0x00, 40(%rsp)
  2344. movq $0x00, 48(%rsp)
  2345. movq $0x00, 56(%rsp)
  2346. # Copy
  2347. movq (%r8), %rcx
  2348. movq 8(%r8), %r9
  2349. movq 16(%r8), %r10
  2350. movq 24(%r8), %r11
  2351. movq %rcx, 64(%rsp)
  2352. movq %r9, 72(%rsp)
  2353. movq %r10, 80(%rsp)
  2354. movq %r11, 88(%rsp)
  2355. movb $62, 168(%rsp)
  2356. movq $3, 160(%rsp)
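# 160(%rsp) is the scalar word index and 168(%rsp) the bit index within that
# word; starting at word 3, bit 62 runs the ladder from scalar bit 254 down
# to bit 0 (bit 255 of a clamped scalar is zero and is skipped).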
  2357. L_curve25519_x64_words:
  2358. L_curve25519_x64_bits:
  2359. movq 160(%rsp), %r9
  2360. movb 168(%rsp), %cl
  2361. movq (%rsi,%r9,8), %rbp
  2362. shrq %cl, %rbp
  2363. andq $0x01, %rbp
  2364. xorq %rbp, %rbx
  2365. negq %rbx
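# %rbx held the previous scalar bit, so after the XOR it is 1 exactly when
# the swap state changes; negating turns that into an all-zero/all-ones mask.
# The masked XOR sequences below then swap (x2, z2) with (x3, z3), or leave
# them alone, without any data-dependent branch or access pattern.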
  2366. # Conditional Swap
  2367. movq (%rdi), %rcx
  2368. movq 8(%rdi), %r9
  2369. movq 16(%rdi), %r10
  2370. movq 24(%rdi), %r11
  2371. xorq 64(%rsp), %rcx
  2372. xorq 72(%rsp), %r9
  2373. xorq 80(%rsp), %r10
  2374. xorq 88(%rsp), %r11
  2375. andq %rbx, %rcx
  2376. andq %rbx, %r9
  2377. andq %rbx, %r10
  2378. andq %rbx, %r11
  2379. xorq %rcx, (%rdi)
  2380. xorq %r9, 8(%rdi)
  2381. xorq %r10, 16(%rdi)
  2382. xorq %r11, 24(%rdi)
  2383. xorq %rcx, 64(%rsp)
  2384. xorq %r9, 72(%rsp)
  2385. xorq %r10, 80(%rsp)
  2386. xorq %r11, 88(%rsp)
  2387. # Conditional Swap
  2388. movq (%rsp), %rcx
  2389. movq 8(%rsp), %r9
  2390. movq 16(%rsp), %r10
  2391. movq 24(%rsp), %r11
  2392. xorq 32(%rsp), %rcx
  2393. xorq 40(%rsp), %r9
  2394. xorq 48(%rsp), %r10
  2395. xorq 56(%rsp), %r11
  2396. andq %rbx, %rcx
  2397. andq %rbx, %r9
  2398. andq %rbx, %r10
  2399. andq %rbx, %r11
  2400. xorq %rcx, (%rsp)
  2401. xorq %r9, 8(%rsp)
  2402. xorq %r10, 16(%rsp)
  2403. xorq %r11, 24(%rsp)
  2404. xorq %rcx, 32(%rsp)
  2405. xorq %r9, 40(%rsp)
  2406. xorq %r10, 48(%rsp)
  2407. xorq %r11, 56(%rsp)
  2408. movq %rbp, %rbx
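# Each ladder step needs both the sum and the difference of a pair, so the
# add and sub below share the same loads.  The carry/borrow is turned into a
# 0 or all-ones mask in %rbp that selects the modulus, so the conditional
# correction (subtract p on overflow, add p on underflow) is branch-free.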
  2409. # Add
  2410. movq (%rdi), %rcx
  2411. movq 8(%rdi), %r9
  2412. movq 16(%rdi), %r10
  2413. movq 24(%rdi), %rbp
  2414. movq %rcx, %r12
  2415. addq (%rsp), %rcx
  2416. movq %r9, %r13
  2417. adcq 8(%rsp), %r9
  2418. movq %r10, %r14
  2419. adcq 16(%rsp), %r10
  2420. movq %rbp, %r15
  2421. adcq 24(%rsp), %rbp
  2422. movq $-19, %rax
  2423. movq %rbp, %r11
  2424. movq $0x7fffffffffffffff, %rdx
  2425. sarq $63, %rbp
  2426. # Mask the modulus
  2427. andq %rbp, %rax
  2428. andq %rbp, %rdx
  2429. # Sub modulus (if overflow)
  2430. subq %rax, %rcx
  2431. sbbq %rbp, %r9
  2432. sbbq %rbp, %r10
  2433. sbbq %rdx, %r11
  2434. # Sub
  2435. subq (%rsp), %r12
  2436. movq $0x00, %rbp
  2437. sbbq 8(%rsp), %r13
  2438. movq $-19, %rax
  2439. sbbq 16(%rsp), %r14
  2440. movq $0x7fffffffffffffff, %rdx
  2441. sbbq 24(%rsp), %r15
  2442. sbbq $0x00, %rbp
  2443. # Mask the modulus
  2444. andq %rbp, %rax
  2445. andq %rbp, %rdx
  2446. # Add modulus (if underflow)
  2447. addq %rax, %r12
  2448. adcq %rbp, %r13
  2449. adcq %rbp, %r14
  2450. adcq %rdx, %r15
  2451. movq %rcx, (%rdi)
  2452. movq %r9, 8(%rdi)
  2453. movq %r10, 16(%rdi)
  2454. movq %r11, 24(%rdi)
  2455. movq %r12, 128(%rsp)
  2456. movq %r13, 136(%rsp)
  2457. movq %r14, 144(%rsp)
  2458. movq %r15, 152(%rsp)
  2459. # Add
  2460. movq 64(%rsp), %rcx
  2461. movq 72(%rsp), %r9
  2462. movq 80(%rsp), %r10
  2463. movq 88(%rsp), %rbp
  2464. movq %rcx, %r12
  2465. addq 32(%rsp), %rcx
  2466. movq %r9, %r13
  2467. adcq 40(%rsp), %r9
  2468. movq %r10, %r14
  2469. adcq 48(%rsp), %r10
  2470. movq %rbp, %r15
  2471. adcq 56(%rsp), %rbp
  2472. movq $-19, %rax
  2473. movq %rbp, %r11
  2474. movq $0x7fffffffffffffff, %rdx
  2475. sarq $63, %rbp
  2476. # Mask the modulus
  2477. andq %rbp, %rax
  2478. andq %rbp, %rdx
  2479. # Sub modulus (if overflow)
  2480. subq %rax, %rcx
  2481. sbbq %rbp, %r9
  2482. sbbq %rbp, %r10
  2483. sbbq %rdx, %r11
  2484. # Sub
  2485. subq 32(%rsp), %r12
  2486. movq $0x00, %rbp
  2487. sbbq 40(%rsp), %r13
  2488. movq $-19, %rax
  2489. sbbq 48(%rsp), %r14
  2490. movq $0x7fffffffffffffff, %rdx
  2491. sbbq 56(%rsp), %r15
  2492. sbbq $0x00, %rbp
  2493. # Mask the modulus
  2494. andq %rbp, %rax
  2495. andq %rbp, %rdx
  2496. # Add modulus (if underflow)
  2497. addq %rax, %r12
  2498. adcq %rbp, %r13
  2499. adcq %rbp, %r14
  2500. adcq %rdx, %r15
  2501. movq %rcx, (%rsp)
  2502. movq %r9, 8(%rsp)
  2503. movq %r10, 16(%rsp)
  2504. movq %r11, 24(%rsp)
  2505. movq %r12, 96(%rsp)
  2506. movq %r13, 104(%rsp)
  2507. movq %r14, 112(%rsp)
  2508. movq %r15, 120(%rsp)
  2509. # Multiply
  2510. # A[0] * B[0]
  2511. movq (%rdi), %rax
  2512. mulq 96(%rsp)
  2513. movq %rax, %rcx
  2514. movq %rdx, %r9
  2515. # A[0] * B[1]
  2516. movq 8(%rdi), %rax
  2517. mulq 96(%rsp)
  2518. xorq %r10, %r10
  2519. addq %rax, %r9
  2520. adcq %rdx, %r10
  2521. # A[1] * B[0]
  2522. movq (%rdi), %rax
  2523. mulq 104(%rsp)
  2524. xorq %r11, %r11
  2525. addq %rax, %r9
  2526. adcq %rdx, %r10
  2527. adcq $0x00, %r11
  2528. # A[0] * B[2]
  2529. movq 16(%rdi), %rax
  2530. mulq 96(%rsp)
  2531. addq %rax, %r10
  2532. adcq %rdx, %r11
  2533. # A[1] * B[1]
  2534. movq 8(%rdi), %rax
  2535. mulq 104(%rsp)
  2536. xorq %r12, %r12
  2537. addq %rax, %r10
  2538. adcq %rdx, %r11
  2539. adcq $0x00, %r12
  2540. # A[2] * B[0]
  2541. movq (%rdi), %rax
  2542. mulq 112(%rsp)
  2543. addq %rax, %r10
  2544. adcq %rdx, %r11
  2545. adcq $0x00, %r12
  2546. # A[0] * B[3]
  2547. movq 24(%rdi), %rax
  2548. mulq 96(%rsp)
  2549. xorq %r13, %r13
  2550. addq %rax, %r11
  2551. adcq %rdx, %r12
  2552. adcq $0x00, %r13
  2553. # A[1] * B[2]
  2554. movq 16(%rdi), %rax
  2555. mulq 104(%rsp)
  2556. addq %rax, %r11
  2557. adcq %rdx, %r12
  2558. adcq $0x00, %r13
  2559. # A[2] * B[1]
  2560. movq 8(%rdi), %rax
  2561. mulq 112(%rsp)
  2562. addq %rax, %r11
  2563. adcq %rdx, %r12
  2564. adcq $0x00, %r13
  2565. # A[3] * B[0]
  2566. movq (%rdi), %rax
  2567. mulq 120(%rsp)
  2568. addq %rax, %r11
  2569. adcq %rdx, %r12
  2570. adcq $0x00, %r13
  2571. # A[1] * B[3]
  2572. movq 24(%rdi), %rax
  2573. mulq 104(%rsp)
  2574. xorq %r14, %r14
  2575. addq %rax, %r12
  2576. adcq %rdx, %r13
  2577. adcq $0x00, %r14
  2578. # A[2] * B[2]
  2579. movq 16(%rdi), %rax
  2580. mulq 112(%rsp)
  2581. addq %rax, %r12
  2582. adcq %rdx, %r13
  2583. adcq $0x00, %r14
  2584. # A[3] * B[1]
  2585. movq 8(%rdi), %rax
  2586. mulq 120(%rsp)
  2587. addq %rax, %r12
  2588. adcq %rdx, %r13
  2589. adcq $0x00, %r14
  2590. # A[2] * B[3]
  2591. movq 24(%rdi), %rax
  2592. mulq 112(%rsp)
  2593. xorq %r15, %r15
  2594. addq %rax, %r13
  2595. adcq %rdx, %r14
  2596. adcq $0x00, %r15
  2597. # A[3] * B[2]
  2598. movq 16(%rdi), %rax
  2599. mulq 120(%rsp)
  2600. addq %rax, %r13
  2601. adcq %rdx, %r14
  2602. adcq $0x00, %r15
  2603. # A[3] * B[3]
  2604. movq 24(%rdi), %rax
  2605. mulq 120(%rsp)
  2606. addq %rax, %r14
  2607. adcq %rdx, %r15
  2608. # Reduce
  2609. movq $0x7fffffffffffffff, %rbp
  2610. # Move top half into t4-t7 and remove top bit from t3
  2611. shldq $0x01, %r14, %r15
  2612. shldq $0x01, %r13, %r14
  2613. shldq $0x01, %r12, %r13
  2614. shldq $0x01, %r11, %r12
  2615. andq %rbp, %r11
  2616. # Multiply top half by 19
  2617. movq $19, %rax
  2618. mulq %r12
  2619. xorq %r12, %r12
  2620. addq %rax, %rcx
  2621. movq $19, %rax
  2622. adcq %rdx, %r12
  2623. mulq %r13
  2624. xorq %r13, %r13
  2625. addq %rax, %r9
  2626. movq $19, %rax
  2627. adcq %rdx, %r13
  2628. mulq %r14
  2629. xorq %r14, %r14
  2630. addq %rax, %r10
  2631. movq $19, %rax
  2632. adcq %rdx, %r14
  2633. mulq %r15
  2634. # Add remaining product results in
  2635. addq %r12, %r9
  2636. adcq %r13, %r10
  2637. adcq %r14, %r11
  2638. adcq %rax, %r11
  2639. adcq $0x00, %rdx
  2640. # Overflow
  2641. shldq $0x01, %r11, %rdx
  2642. imulq $19, %rdx, %rax
  2643. andq %rbp, %r11
  2644. addq %rax, %rcx
  2645. adcq $0x00, %r9
  2646. adcq $0x00, %r10
  2647. adcq $0x00, %r11
  2648. # Reduce if top bit set
  2649. movq %r11, %rdx
  2650. sarq $63, %rdx
  2651. andq $19, %rdx
  2652. andq %rbp, %r11
  2653. addq %rdx, %rcx
  2654. adcq $0x00, %r9
  2655. adcq $0x00, %r10
  2656. adcq $0x00, %r11
  2657. # Store
  2658. movq %rcx, 32(%rsp)
  2659. movq %r9, 40(%rsp)
  2660. movq %r10, 48(%rsp)
  2661. movq %r11, 56(%rsp)
  2662. # Multiply
  2663. # A[0] * B[0]
  2664. movq 128(%rsp), %rax
  2665. mulq (%rsp)
  2666. movq %rax, %rcx
  2667. movq %rdx, %r9
  2668. # A[0] * B[1]
  2669. movq 136(%rsp), %rax
  2670. mulq (%rsp)
  2671. xorq %r10, %r10
  2672. addq %rax, %r9
  2673. adcq %rdx, %r10
  2674. # A[1] * B[0]
  2675. movq 128(%rsp), %rax
  2676. mulq 8(%rsp)
  2677. xorq %r11, %r11
  2678. addq %rax, %r9
  2679. adcq %rdx, %r10
  2680. adcq $0x00, %r11
  2681. # A[0] * B[2]
  2682. movq 144(%rsp), %rax
  2683. mulq (%rsp)
  2684. addq %rax, %r10
  2685. adcq %rdx, %r11
  2686. # A[1] * B[1]
  2687. movq 136(%rsp), %rax
  2688. mulq 8(%rsp)
  2689. xorq %r12, %r12
  2690. addq %rax, %r10
  2691. adcq %rdx, %r11
  2692. adcq $0x00, %r12
  2693. # A[2] * B[0]
  2694. movq 128(%rsp), %rax
  2695. mulq 16(%rsp)
  2696. addq %rax, %r10
  2697. adcq %rdx, %r11
  2698. adcq $0x00, %r12
  2699. # A[0] * B[3]
  2700. movq 152(%rsp), %rax
  2701. mulq (%rsp)
  2702. xorq %r13, %r13
  2703. addq %rax, %r11
  2704. adcq %rdx, %r12
  2705. adcq $0x00, %r13
  2706. # A[1] * B[2]
  2707. movq 144(%rsp), %rax
  2708. mulq 8(%rsp)
  2709. addq %rax, %r11
  2710. adcq %rdx, %r12
  2711. adcq $0x00, %r13
  2712. # A[2] * B[1]
  2713. movq 136(%rsp), %rax
  2714. mulq 16(%rsp)
  2715. addq %rax, %r11
  2716. adcq %rdx, %r12
  2717. adcq $0x00, %r13
  2718. # A[3] * B[0]
  2719. movq 128(%rsp), %rax
  2720. mulq 24(%rsp)
  2721. addq %rax, %r11
  2722. adcq %rdx, %r12
  2723. adcq $0x00, %r13
  2724. # A[1] * B[3]
  2725. movq 152(%rsp), %rax
  2726. mulq 8(%rsp)
  2727. xorq %r14, %r14
  2728. addq %rax, %r12
  2729. adcq %rdx, %r13
  2730. adcq $0x00, %r14
  2731. # A[2] * B[2]
  2732. movq 144(%rsp), %rax
  2733. mulq 16(%rsp)
  2734. addq %rax, %r12
  2735. adcq %rdx, %r13
  2736. adcq $0x00, %r14
  2737. # A[3] * B[1]
  2738. movq 136(%rsp), %rax
  2739. mulq 24(%rsp)
  2740. addq %rax, %r12
  2741. adcq %rdx, %r13
  2742. adcq $0x00, %r14
  2743. # A[2] * B[3]
  2744. movq 152(%rsp), %rax
  2745. mulq 16(%rsp)
  2746. xorq %r15, %r15
  2747. addq %rax, %r13
  2748. adcq %rdx, %r14
  2749. adcq $0x00, %r15
  2750. # A[3] * B[2]
  2751. movq 144(%rsp), %rax
  2752. mulq 24(%rsp)
  2753. addq %rax, %r13
  2754. adcq %rdx, %r14
  2755. adcq $0x00, %r15
  2756. # A[3] * B[3]
  2757. movq 152(%rsp), %rax
  2758. mulq 24(%rsp)
  2759. addq %rax, %r14
  2760. adcq %rdx, %r15
  2761. # Reduce
  2762. movq $0x7fffffffffffffff, %rbp
  2763. # Move top half into t4-t7 and remove top bit from t3
  2764. shldq $0x01, %r14, %r15
  2765. shldq $0x01, %r13, %r14
  2766. shldq $0x01, %r12, %r13
  2767. shldq $0x01, %r11, %r12
  2768. andq %rbp, %r11
  2769. # Multiply top half by 19
  2770. movq $19, %rax
  2771. mulq %r12
  2772. xorq %r12, %r12
  2773. addq %rax, %rcx
  2774. movq $19, %rax
  2775. adcq %rdx, %r12
  2776. mulq %r13
  2777. xorq %r13, %r13
  2778. addq %rax, %r9
  2779. movq $19, %rax
  2780. adcq %rdx, %r13
  2781. mulq %r14
  2782. xorq %r14, %r14
  2783. addq %rax, %r10
  2784. movq $19, %rax
  2785. adcq %rdx, %r14
  2786. mulq %r15
  2787. # Add remaining product results in
  2788. addq %r12, %r9
  2789. adcq %r13, %r10
  2790. adcq %r14, %r11
  2791. adcq %rax, %r11
  2792. adcq $0x00, %rdx
  2793. # Overflow
  2794. shldq $0x01, %r11, %rdx
  2795. imulq $19, %rdx, %rax
  2796. andq %rbp, %r11
  2797. addq %rax, %rcx
  2798. adcq $0x00, %r9
  2799. adcq $0x00, %r10
  2800. adcq $0x00, %r11
  2801. # Reduce if top bit set
  2802. movq %r11, %rdx
  2803. sarq $63, %rdx
  2804. andq $19, %rdx
  2805. andq %rbp, %r11
  2806. addq %rdx, %rcx
  2807. adcq $0x00, %r9
  2808. adcq $0x00, %r10
  2809. adcq $0x00, %r11
  2810. # Store
  2811. movq %rcx, (%rsp)
  2812. movq %r9, 8(%rsp)
  2813. movq %r10, 16(%rsp)
  2814. movq %r11, 24(%rsp)
  2815. # Square
  2816. # A[0] * A[1]
  2817. movq 128(%rsp), %rax
  2818. mulq 136(%rsp)
  2819. movq %rax, %r9
  2820. movq %rdx, %r10
  2821. # A[0] * A[2]
  2822. movq 128(%rsp), %rax
  2823. mulq 144(%rsp)
  2824. xorq %r11, %r11
  2825. addq %rax, %r10
  2826. adcq %rdx, %r11
  2827. # A[0] * A[3]
  2828. movq 128(%rsp), %rax
  2829. mulq 152(%rsp)
  2830. xorq %r12, %r12
  2831. addq %rax, %r11
  2832. adcq %rdx, %r12
  2833. # A[1] * A[2]
  2834. movq 136(%rsp), %rax
  2835. mulq 144(%rsp)
  2836. xorq %r13, %r13
  2837. addq %rax, %r11
  2838. adcq %rdx, %r12
  2839. adcq $0x00, %r13
  2840. # A[1] * A[3]
  2841. movq 136(%rsp), %rax
  2842. mulq 152(%rsp)
  2843. addq %rax, %r12
  2844. adcq %rdx, %r13
  2845. # A[2] * A[3]
  2846. movq 144(%rsp), %rax
  2847. mulq 152(%rsp)
  2848. xorq %r14, %r14
  2849. addq %rax, %r13
  2850. adcq %rdx, %r14
  2851. # Double
  2852. xorq %r15, %r15
  2853. addq %r9, %r9
  2854. adcq %r10, %r10
  2855. adcq %r11, %r11
  2856. adcq %r12, %r12
  2857. adcq %r13, %r13
  2858. adcq %r14, %r14
  2859. adcq $0x00, %r15
  2860. # A[0] * A[0]
  2861. movq 128(%rsp), %rax
  2862. mulq %rax
  2863. movq %rax, %rcx
  2864. movq %rdx, %rbp
  2865. # A[1] * A[1]
  2866. movq 136(%rsp), %rax
  2867. mulq %rax
  2868. addq %rbp, %r9
  2869. adcq %rax, %r10
  2870. adcq $0x00, %rdx
  2871. movq %rdx, %rbp
  2872. # A[2] * A[2]
  2873. movq 144(%rsp), %rax
  2874. mulq %rax
  2875. addq %rbp, %r11
  2876. adcq %rax, %r12
  2877. adcq $0x00, %rdx
  2878. movq %rdx, %rbp
  2879. # A[3] * A[3]
  2880. movq 152(%rsp), %rax
  2881. mulq %rax
  2882. addq %rax, %r14
  2883. adcq %rdx, %r15
  2884. addq %rbp, %r13
  2885. adcq $0x00, %r14
  2886. adcq $0x00, %r15
  2887. # Reduce
  2888. movq $0x7fffffffffffffff, %rbp
  2889. # Move top half into t4-t7 and remove top bit from t3
  2890. shldq $0x01, %r14, %r15
  2891. shldq $0x01, %r13, %r14
  2892. shldq $0x01, %r12, %r13
  2893. shldq $0x01, %r11, %r12
  2894. andq %rbp, %r11
  2895. # Multiply top half by 19
  2896. movq $19, %rax
  2897. mulq %r12
  2898. xorq %r12, %r12
  2899. addq %rax, %rcx
  2900. movq $19, %rax
  2901. adcq %rdx, %r12
  2902. mulq %r13
  2903. xorq %r13, %r13
  2904. addq %rax, %r9
  2905. movq $19, %rax
  2906. adcq %rdx, %r13
  2907. mulq %r14
  2908. xorq %r14, %r14
  2909. addq %rax, %r10
  2910. movq $19, %rax
  2911. adcq %rdx, %r14
  2912. mulq %r15
  2913. # Add remaining product results in
  2914. addq %r12, %r9
  2915. adcq %r13, %r10
  2916. adcq %r14, %r11
  2917. adcq %rax, %r11
  2918. adcq $0x00, %rdx
  2919. # Overflow
  2920. shldq $0x01, %r11, %rdx
  2921. imulq $19, %rdx, %rax
  2922. andq %rbp, %r11
  2923. addq %rax, %rcx
  2924. adcq $0x00, %r9
  2925. adcq $0x00, %r10
  2926. adcq $0x00, %r11
  2927. # Reduce if top bit set
  2928. movq %r11, %rdx
  2929. sarq $63, %rdx
  2930. andq $19, %rdx
  2931. andq %rbp, %r11
  2932. addq %rdx, %rcx
  2933. adcq $0x00, %r9
  2934. adcq $0x00, %r10
  2935. adcq $0x00, %r11
  2936. # Store
  2937. movq %rcx, 96(%rsp)
  2938. movq %r9, 104(%rsp)
  2939. movq %r10, 112(%rsp)
  2940. movq %r11, 120(%rsp)
  2941. # Square
  2942. # A[0] * A[1]
  2943. movq (%rdi), %rax
  2944. mulq 8(%rdi)
  2945. movq %rax, %r9
  2946. movq %rdx, %r10
  2947. # A[0] * A[2]
  2948. movq (%rdi), %rax
  2949. mulq 16(%rdi)
  2950. xorq %r11, %r11
  2951. addq %rax, %r10
  2952. adcq %rdx, %r11
  2953. # A[0] * A[3]
  2954. movq (%rdi), %rax
  2955. mulq 24(%rdi)
  2956. xorq %r12, %r12
  2957. addq %rax, %r11
  2958. adcq %rdx, %r12
  2959. # A[1] * A[2]
  2960. movq 8(%rdi), %rax
  2961. mulq 16(%rdi)
  2962. xorq %r13, %r13
  2963. addq %rax, %r11
  2964. adcq %rdx, %r12
  2965. adcq $0x00, %r13
  2966. # A[1] * A[3]
  2967. movq 8(%rdi), %rax
  2968. mulq 24(%rdi)
  2969. addq %rax, %r12
  2970. adcq %rdx, %r13
  2971. # A[2] * A[3]
  2972. movq 16(%rdi), %rax
  2973. mulq 24(%rdi)
  2974. xorq %r14, %r14
  2975. addq %rax, %r13
  2976. adcq %rdx, %r14
  2977. # Double
  2978. xorq %r15, %r15
  2979. addq %r9, %r9
  2980. adcq %r10, %r10
  2981. adcq %r11, %r11
  2982. adcq %r12, %r12
  2983. adcq %r13, %r13
  2984. adcq %r14, %r14
  2985. adcq $0x00, %r15
  2986. # A[0] * A[0]
  2987. movq (%rdi), %rax
  2988. mulq %rax
  2989. movq %rax, %rcx
  2990. movq %rdx, %rbp
  2991. # A[1] * A[1]
  2992. movq 8(%rdi), %rax
  2993. mulq %rax
  2994. addq %rbp, %r9
  2995. adcq %rax, %r10
  2996. adcq $0x00, %rdx
  2997. movq %rdx, %rbp
  2998. # A[2] * A[2]
  2999. movq 16(%rdi), %rax
  3000. mulq %rax
  3001. addq %rbp, %r11
  3002. adcq %rax, %r12
  3003. adcq $0x00, %rdx
  3004. movq %rdx, %rbp
  3005. # A[3] * A[3]
  3006. movq 24(%rdi), %rax
  3007. mulq %rax
  3008. addq %rax, %r14
  3009. adcq %rdx, %r15
  3010. addq %rbp, %r13
  3011. adcq $0x00, %r14
  3012. adcq $0x00, %r15
  3013. # Reduce
  3014. movq $0x7fffffffffffffff, %rbp
  3015. # Move top half into t4-t7 and remove top bit from t3
  3016. shldq $0x01, %r14, %r15
  3017. shldq $0x01, %r13, %r14
  3018. shldq $0x01, %r12, %r13
  3019. shldq $0x01, %r11, %r12
  3020. andq %rbp, %r11
  3021. # Multiply top half by 19
  3022. movq $19, %rax
  3023. mulq %r12
  3024. xorq %r12, %r12
  3025. addq %rax, %rcx
  3026. movq $19, %rax
  3027. adcq %rdx, %r12
  3028. mulq %r13
  3029. xorq %r13, %r13
  3030. addq %rax, %r9
  3031. movq $19, %rax
  3032. adcq %rdx, %r13
  3033. mulq %r14
  3034. xorq %r14, %r14
  3035. addq %rax, %r10
  3036. movq $19, %rax
  3037. adcq %rdx, %r14
  3038. mulq %r15
  3039. # Add remaining product results in
  3040. addq %r12, %r9
  3041. adcq %r13, %r10
  3042. adcq %r14, %r11
  3043. adcq %rax, %r11
  3044. adcq $0x00, %rdx
  3045. # Overflow
  3046. shldq $0x01, %r11, %rdx
  3047. imulq $19, %rdx, %rax
  3048. andq %rbp, %r11
  3049. addq %rax, %rcx
  3050. adcq $0x00, %r9
  3051. adcq $0x00, %r10
  3052. adcq $0x00, %r11
  3053. # Reduce if top bit set
  3054. movq %r11, %rdx
  3055. sarq $63, %rdx
  3056. andq $19, %rdx
  3057. andq %rbp, %r11
  3058. addq %rdx, %rcx
  3059. adcq $0x00, %r9
  3060. adcq $0x00, %r10
  3061. adcq $0x00, %r11
  3062. # Store
  3063. movq %rcx, 128(%rsp)
  3064. movq %r9, 136(%rsp)
  3065. movq %r10, 144(%rsp)
  3066. movq %r11, 152(%rsp)
  3067. # Add
  3068. movq 32(%rsp), %rcx
  3069. movq 40(%rsp), %r9
  3070. movq 48(%rsp), %r10
  3071. movq 56(%rsp), %rbp
  3072. movq %rcx, %r12
  3073. addq (%rsp), %rcx
  3074. movq %r9, %r13
  3075. adcq 8(%rsp), %r9
  3076. movq %r10, %r14
  3077. adcq 16(%rsp), %r10
  3078. movq %rbp, %r15
  3079. adcq 24(%rsp), %rbp
  3080. movq $-19, %rax
  3081. movq %rbp, %r11
  3082. movq $0x7fffffffffffffff, %rdx
  3083. sarq $63, %rbp
  3084. # Mask the modulus
  3085. andq %rbp, %rax
  3086. andq %rbp, %rdx
  3087. # Sub modulus (if overflow)
  3088. subq %rax, %rcx
  3089. sbbq %rbp, %r9
  3090. sbbq %rbp, %r10
  3091. sbbq %rdx, %r11
  3092. # Sub
  3093. subq (%rsp), %r12
  3094. movq $0x00, %rbp
  3095. sbbq 8(%rsp), %r13
  3096. movq $-19, %rax
  3097. sbbq 16(%rsp), %r14
  3098. movq $0x7fffffffffffffff, %rdx
  3099. sbbq 24(%rsp), %r15
  3100. sbbq $0x00, %rbp
  3101. # Mask the modulus
  3102. andq %rbp, %rax
  3103. andq %rbp, %rdx
  3104. # Add modulus (if underflow)
  3105. addq %rax, %r12
  3106. adcq %rbp, %r13
  3107. adcq %rbp, %r14
  3108. adcq %rdx, %r15
  3109. movq %rcx, 64(%rsp)
  3110. movq %r9, 72(%rsp)
  3111. movq %r10, 80(%rsp)
  3112. movq %r11, 88(%rsp)
  3113. movq %r12, (%rsp)
  3114. movq %r13, 8(%rsp)
  3115. movq %r14, 16(%rsp)
  3116. movq %r15, 24(%rsp)
  3117. # Multiply
  3118. # A[0] * B[0]
  3119. movq 96(%rsp), %rax
  3120. mulq 128(%rsp)
  3121. movq %rax, %rcx
  3122. movq %rdx, %r9
  3123. # A[0] * B[1]
  3124. movq 104(%rsp), %rax
  3125. mulq 128(%rsp)
  3126. xorq %r10, %r10
  3127. addq %rax, %r9
  3128. adcq %rdx, %r10
  3129. # A[1] * B[0]
  3130. movq 96(%rsp), %rax
  3131. mulq 136(%rsp)
  3132. xorq %r11, %r11
  3133. addq %rax, %r9
  3134. adcq %rdx, %r10
  3135. adcq $0x00, %r11
  3136. # A[0] * B[2]
  3137. movq 112(%rsp), %rax
  3138. mulq 128(%rsp)
  3139. addq %rax, %r10
  3140. adcq %rdx, %r11
  3141. # A[1] * B[1]
  3142. movq 104(%rsp), %rax
  3143. mulq 136(%rsp)
  3144. xorq %r12, %r12
  3145. addq %rax, %r10
  3146. adcq %rdx, %r11
  3147. adcq $0x00, %r12
  3148. # A[2] * B[0]
  3149. movq 96(%rsp), %rax
  3150. mulq 144(%rsp)
  3151. addq %rax, %r10
  3152. adcq %rdx, %r11
  3153. adcq $0x00, %r12
  3154. # A[0] * B[3]
  3155. movq 120(%rsp), %rax
  3156. mulq 128(%rsp)
  3157. xorq %r13, %r13
  3158. addq %rax, %r11
  3159. adcq %rdx, %r12
  3160. adcq $0x00, %r13
  3161. # A[1] * B[2]
  3162. movq 112(%rsp), %rax
  3163. mulq 136(%rsp)
  3164. addq %rax, %r11
  3165. adcq %rdx, %r12
  3166. adcq $0x00, %r13
  3167. # A[2] * B[1]
  3168. movq 104(%rsp), %rax
  3169. mulq 144(%rsp)
  3170. addq %rax, %r11
  3171. adcq %rdx, %r12
  3172. adcq $0x00, %r13
  3173. # A[3] * B[0]
  3174. movq 96(%rsp), %rax
  3175. mulq 152(%rsp)
  3176. addq %rax, %r11
  3177. adcq %rdx, %r12
  3178. adcq $0x00, %r13
  3179. # A[1] * B[3]
  3180. movq 120(%rsp), %rax
  3181. mulq 136(%rsp)
  3182. xorq %r14, %r14
  3183. addq %rax, %r12
  3184. adcq %rdx, %r13
  3185. adcq $0x00, %r14
  3186. # A[2] * B[2]
  3187. movq 112(%rsp), %rax
  3188. mulq 144(%rsp)
  3189. addq %rax, %r12
  3190. adcq %rdx, %r13
  3191. adcq $0x00, %r14
  3192. # A[3] * B[1]
  3193. movq 104(%rsp), %rax
  3194. mulq 152(%rsp)
  3195. addq %rax, %r12
  3196. adcq %rdx, %r13
  3197. adcq $0x00, %r14
  3198. # A[2] * B[3]
  3199. movq 120(%rsp), %rax
  3200. mulq 144(%rsp)
  3201. xorq %r15, %r15
  3202. addq %rax, %r13
  3203. adcq %rdx, %r14
  3204. adcq $0x00, %r15
  3205. # A[3] * B[2]
  3206. movq 112(%rsp), %rax
  3207. mulq 152(%rsp)
  3208. addq %rax, %r13
  3209. adcq %rdx, %r14
  3210. adcq $0x00, %r15
  3211. # A[3] * B[3]
  3212. movq 120(%rsp), %rax
  3213. mulq 152(%rsp)
  3214. addq %rax, %r14
  3215. adcq %rdx, %r15
  3216. # Reduce
  3217. movq $0x7fffffffffffffff, %rbp
  3218. # Move top half into t4-t7 and remove top bit from t3
  3219. shldq $0x01, %r14, %r15
  3220. shldq $0x01, %r13, %r14
  3221. shldq $0x01, %r12, %r13
  3222. shldq $0x01, %r11, %r12
  3223. andq %rbp, %r11
  3224. # Multiply top half by 19
  3225. movq $19, %rax
  3226. mulq %r12
  3227. xorq %r12, %r12
  3228. addq %rax, %rcx
  3229. movq $19, %rax
  3230. adcq %rdx, %r12
  3231. mulq %r13
  3232. xorq %r13, %r13
  3233. addq %rax, %r9
  3234. movq $19, %rax
  3235. adcq %rdx, %r13
  3236. mulq %r14
  3237. xorq %r14, %r14
  3238. addq %rax, %r10
  3239. movq $19, %rax
  3240. adcq %rdx, %r14
  3241. mulq %r15
  3242. # Add remaining product results in
  3243. addq %r12, %r9
  3244. adcq %r13, %r10
  3245. adcq %r14, %r11
  3246. adcq %rax, %r11
  3247. adcq $0x00, %rdx
  3248. # Overflow
  3249. shldq $0x01, %r11, %rdx
  3250. imulq $19, %rdx, %rax
  3251. andq %rbp, %r11
  3252. addq %rax, %rcx
  3253. adcq $0x00, %r9
  3254. adcq $0x00, %r10
  3255. adcq $0x00, %r11
  3256. # Reduce if top bit set
  3257. movq %r11, %rdx
  3258. sarq $63, %rdx
  3259. andq $19, %rdx
  3260. andq %rbp, %r11
  3261. addq %rdx, %rcx
  3262. adcq $0x00, %r9
  3263. adcq $0x00, %r10
  3264. adcq $0x00, %r11
  3265. # Store
  3266. movq %rcx, (%rdi)
  3267. movq %r9, 8(%rdi)
  3268. movq %r10, 16(%rdi)
  3269. movq %r11, 24(%rdi)
  3270. # Sub
  3271. movq 128(%rsp), %rcx
  3272. movq 136(%rsp), %r9
  3273. movq 144(%rsp), %r10
  3274. movq 152(%rsp), %r11
  3275. subq 96(%rsp), %rcx
  3276. movq $0x00, %rbp
  3277. sbbq 104(%rsp), %r9
  3278. movq $-19, %rax
  3279. sbbq 112(%rsp), %r10
  3280. movq $0x7fffffffffffffff, %rdx
  3281. sbbq 120(%rsp), %r11
  3282. sbbq $0x00, %rbp
  3283. # Mask the modulus
  3284. andq %rbp, %rax
  3285. andq %rbp, %rdx
  3286. # Add modulus (if underflow)
  3287. addq %rax, %rcx
  3288. adcq %rbp, %r9
  3289. adcq %rbp, %r10
  3290. adcq %rdx, %r11
  3291. movq %rcx, 128(%rsp)
  3292. movq %r9, 136(%rsp)
  3293. movq %r10, 144(%rsp)
  3294. movq %r11, 152(%rsp)
  3295. # Square
  3296. # A[0] * A[1]
  3297. movq (%rsp), %rax
  3298. mulq 8(%rsp)
  3299. movq %rax, %r9
  3300. movq %rdx, %r10
  3301. # A[0] * A[2]
  3302. movq (%rsp), %rax
  3303. mulq 16(%rsp)
  3304. xorq %r11, %r11
  3305. addq %rax, %r10
  3306. adcq %rdx, %r11
  3307. # A[0] * A[3]
  3308. movq (%rsp), %rax
  3309. mulq 24(%rsp)
  3310. xorq %r12, %r12
  3311. addq %rax, %r11
  3312. adcq %rdx, %r12
  3313. # A[1] * A[2]
  3314. movq 8(%rsp), %rax
  3315. mulq 16(%rsp)
  3316. xorq %r13, %r13
  3317. addq %rax, %r11
  3318. adcq %rdx, %r12
  3319. adcq $0x00, %r13
  3320. # A[1] * A[3]
  3321. movq 8(%rsp), %rax
  3322. mulq 24(%rsp)
  3323. addq %rax, %r12
  3324. adcq %rdx, %r13
  3325. # A[2] * A[3]
  3326. movq 16(%rsp), %rax
  3327. mulq 24(%rsp)
  3328. xorq %r14, %r14
  3329. addq %rax, %r13
  3330. adcq %rdx, %r14
  3331. # Double
  3332. xorq %r15, %r15
  3333. addq %r9, %r9
  3334. adcq %r10, %r10
  3335. adcq %r11, %r11
  3336. adcq %r12, %r12
  3337. adcq %r13, %r13
  3338. adcq %r14, %r14
  3339. adcq $0x00, %r15
  3340. # A[0] * A[0]
  3341. movq (%rsp), %rax
  3342. mulq %rax
  3343. movq %rax, %rcx
  3344. movq %rdx, %rbp
  3345. # A[1] * A[1]
  3346. movq 8(%rsp), %rax
  3347. mulq %rax
  3348. addq %rbp, %r9
  3349. adcq %rax, %r10
  3350. adcq $0x00, %rdx
  3351. movq %rdx, %rbp
  3352. # A[2] * A[2]
  3353. movq 16(%rsp), %rax
  3354. mulq %rax
  3355. addq %rbp, %r11
  3356. adcq %rax, %r12
  3357. adcq $0x00, %rdx
  3358. movq %rdx, %rbp
  3359. # A[3] * A[3]
  3360. movq 24(%rsp), %rax
  3361. mulq %rax
  3362. addq %rax, %r14
  3363. adcq %rdx, %r15
  3364. addq %rbp, %r13
  3365. adcq $0x00, %r14
  3366. adcq $0x00, %r15
  3367. # Reduce
  3368. movq $0x7fffffffffffffff, %rbp
  3369. # Move top half into t4-t7 and remove top bit from t3
  3370. shldq $0x01, %r14, %r15
  3371. shldq $0x01, %r13, %r14
  3372. shldq $0x01, %r12, %r13
  3373. shldq $0x01, %r11, %r12
  3374. andq %rbp, %r11
  3375. # Multiply top half by 19
  3376. movq $19, %rax
  3377. mulq %r12
  3378. xorq %r12, %r12
  3379. addq %rax, %rcx
  3380. movq $19, %rax
  3381. adcq %rdx, %r12
  3382. mulq %r13
  3383. xorq %r13, %r13
  3384. addq %rax, %r9
  3385. movq $19, %rax
  3386. adcq %rdx, %r13
  3387. mulq %r14
  3388. xorq %r14, %r14
  3389. addq %rax, %r10
  3390. movq $19, %rax
  3391. adcq %rdx, %r14
  3392. mulq %r15
  3393. # Add remaining product results in
  3394. addq %r12, %r9
  3395. adcq %r13, %r10
  3396. adcq %r14, %r11
  3397. adcq %rax, %r11
  3398. adcq $0x00, %rdx
  3399. # Overflow
  3400. shldq $0x01, %r11, %rdx
  3401. imulq $19, %rdx, %rax
  3402. andq %rbp, %r11
  3403. addq %rax, %rcx
  3404. adcq $0x00, %r9
  3405. adcq $0x00, %r10
  3406. adcq $0x00, %r11
  3407. # Reduce if top bit set
  3408. movq %r11, %rdx
  3409. sarq $63, %rdx
  3410. andq $19, %rdx
  3411. andq %rbp, %r11
  3412. addq %rdx, %rcx
  3413. adcq $0x00, %r9
  3414. adcq $0x00, %r10
  3415. adcq $0x00, %r11
  3416. # Store
  3417. movq %rcx, (%rsp)
  3418. movq %r9, 8(%rsp)
  3419. movq %r10, 16(%rsp)
  3420. movq %r11, 24(%rsp)
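# Doubling half of the step: z2 = E * (BB + a24*E) with E = AA - BB and
# a24 = (486662 + 2)/4 = 121666.  E is at 128..152(%rsp); it is scaled by
# a24 here, added to BB further down, and finally multiplied by E.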
  3421. # Multiply by 121666
  3422. movq $0x1db42, %rax
  3423. mulq 128(%rsp)
  3424. xorq %r10, %r10
  3425. movq %rax, %rcx
  3426. movq %rdx, %r9
  3427. movq $0x1db42, %rax
  3428. mulq 136(%rsp)
  3429. xorq %r11, %r11
  3430. addq %rax, %r9
  3431. adcq %rdx, %r10
  3432. movq $0x1db42, %rax
  3433. mulq 144(%rsp)
  3434. xorq %r13, %r13
  3435. addq %rax, %r10
  3436. adcq %rdx, %r11
  3437. movq $0x1db42, %rax
  3438. mulq 152(%rsp)
  3439. movq $0x7fffffffffffffff, %r12
  3440. addq %rax, %r11
  3441. adcq %rdx, %r13
  3442. shldq $0x01, %r11, %r13
  3443. andq %r12, %r11
  3444. movq $19, %rax
  3445. mulq %r13
  3446. addq %rax, %rcx
  3447. adcq $0x00, %r9
  3448. adcq $0x00, %r10
  3449. adcq $0x00, %r11
  3450. movq %rcx, 32(%rsp)
  3451. movq %r9, 40(%rsp)
  3452. movq %r10, 48(%rsp)
  3453. movq %r11, 56(%rsp)
  3454. # Square
  3455. # A[0] * A[1]
  3456. movq 64(%rsp), %rax
  3457. mulq 72(%rsp)
  3458. movq %rax, %r9
  3459. movq %rdx, %r10
  3460. # A[0] * A[2]
  3461. movq 64(%rsp), %rax
  3462. mulq 80(%rsp)
  3463. xorq %r11, %r11
  3464. addq %rax, %r10
  3465. adcq %rdx, %r11
  3466. # A[0] * A[3]
  3467. movq 64(%rsp), %rax
  3468. mulq 88(%rsp)
  3469. xorq %r12, %r12
  3470. addq %rax, %r11
  3471. adcq %rdx, %r12
  3472. # A[1] * A[2]
  3473. movq 72(%rsp), %rax
  3474. mulq 80(%rsp)
  3475. xorq %r13, %r13
  3476. addq %rax, %r11
  3477. adcq %rdx, %r12
  3478. adcq $0x00, %r13
  3479. # A[1] * A[3]
  3480. movq 72(%rsp), %rax
  3481. mulq 88(%rsp)
  3482. addq %rax, %r12
  3483. adcq %rdx, %r13
  3484. # A[2] * A[3]
  3485. movq 80(%rsp), %rax
  3486. mulq 88(%rsp)
  3487. xorq %r14, %r14
  3488. addq %rax, %r13
  3489. adcq %rdx, %r14
  3490. # Double
  3491. xorq %r15, %r15
  3492. addq %r9, %r9
  3493. adcq %r10, %r10
  3494. adcq %r11, %r11
  3495. adcq %r12, %r12
  3496. adcq %r13, %r13
  3497. adcq %r14, %r14
  3498. adcq $0x00, %r15
  3499. # A[0] * A[0]
  3500. movq 64(%rsp), %rax
  3501. mulq %rax
  3502. movq %rax, %rcx
  3503. movq %rdx, %rbp
  3504. # A[1] * A[1]
  3505. movq 72(%rsp), %rax
  3506. mulq %rax
  3507. addq %rbp, %r9
  3508. adcq %rax, %r10
  3509. adcq $0x00, %rdx
  3510. movq %rdx, %rbp
  3511. # A[2] * A[2]
  3512. movq 80(%rsp), %rax
  3513. mulq %rax
  3514. addq %rbp, %r11
  3515. adcq %rax, %r12
  3516. adcq $0x00, %rdx
  3517. movq %rdx, %rbp
  3518. # A[3] * A[3]
  3519. movq 88(%rsp), %rax
  3520. mulq %rax
  3521. addq %rax, %r14
  3522. adcq %rdx, %r15
  3523. addq %rbp, %r13
  3524. adcq $0x00, %r14
  3525. adcq $0x00, %r15
  3526. # Reduce
  3527. movq $0x7fffffffffffffff, %rbp
  3528. # Move top half into t4-t7 and remove top bit from t3
  3529. shldq $0x01, %r14, %r15
  3530. shldq $0x01, %r13, %r14
  3531. shldq $0x01, %r12, %r13
  3532. shldq $0x01, %r11, %r12
  3533. andq %rbp, %r11
  3534. # Multiply top half by 19
  3535. movq $19, %rax
  3536. mulq %r12
  3537. xorq %r12, %r12
  3538. addq %rax, %rcx
  3539. movq $19, %rax
  3540. adcq %rdx, %r12
  3541. mulq %r13
  3542. xorq %r13, %r13
  3543. addq %rax, %r9
  3544. movq $19, %rax
  3545. adcq %rdx, %r13
  3546. mulq %r14
  3547. xorq %r14, %r14
  3548. addq %rax, %r10
  3549. movq $19, %rax
  3550. adcq %rdx, %r14
  3551. mulq %r15
  3552. # Add remaining product results in
  3553. addq %r12, %r9
  3554. adcq %r13, %r10
  3555. adcq %r14, %r11
  3556. adcq %rax, %r11
  3557. adcq $0x00, %rdx
  3558. # Overflow
  3559. shldq $0x01, %r11, %rdx
  3560. imulq $19, %rdx, %rax
  3561. andq %rbp, %r11
  3562. addq %rax, %rcx
  3563. adcq $0x00, %r9
  3564. adcq $0x00, %r10
  3565. adcq $0x00, %r11
  3566. # Reduce if top bit set
  3567. movq %r11, %rdx
  3568. sarq $63, %rdx
  3569. andq $19, %rdx
  3570. andq %rbp, %r11
  3571. addq %rdx, %rcx
  3572. adcq $0x00, %r9
  3573. adcq $0x00, %r10
  3574. adcq $0x00, %r11
  3575. # Store
  3576. movq %rcx, 64(%rsp)
  3577. movq %r9, 72(%rsp)
  3578. movq %r10, 80(%rsp)
  3579. movq %r11, 88(%rsp)
  3580. # Add
  3581. movq 96(%rsp), %rcx
  3582. movq 104(%rsp), %r9
  3583. addq 32(%rsp), %rcx
  3584. movq 112(%rsp), %r10
  3585. adcq 40(%rsp), %r9
  3586. movq 120(%rsp), %rbp
  3587. adcq 48(%rsp), %r10
  3588. movq $-19, %rax
  3589. adcq 56(%rsp), %rbp
  3590. movq $0x7fffffffffffffff, %rdx
  3591. movq %rbp, %r11
  3592. sarq $63, %rbp
  3593. # Mask the modulus
  3594. andq %rbp, %rax
  3595. andq %rbp, %rdx
  3596. # Sub modulus (if overflow)
  3597. subq %rax, %rcx
  3598. sbbq %rbp, %r9
  3599. sbbq %rbp, %r10
  3600. sbbq %rdx, %r11
  3601. movq %rcx, 96(%rsp)
  3602. movq %r9, 104(%rsp)
  3603. movq %r10, 112(%rsp)
  3604. movq %r11, 120(%rsp)
  3605. # Multiply
  3606. # A[0] * B[0]
  3607. movq (%rsp), %rax
  3608. mulq (%r8)
  3609. movq %rax, %rcx
  3610. movq %rdx, %r9
  3611. # A[0] * B[1]
  3612. movq 8(%rsp), %rax
  3613. mulq (%r8)
  3614. xorq %r10, %r10
  3615. addq %rax, %r9
  3616. adcq %rdx, %r10
  3617. # A[1] * B[0]
  3618. movq (%rsp), %rax
  3619. mulq 8(%r8)
  3620. xorq %r11, %r11
  3621. addq %rax, %r9
  3622. adcq %rdx, %r10
  3623. adcq $0x00, %r11
  3624. # A[0] * B[2]
  3625. movq 16(%rsp), %rax
  3626. mulq (%r8)
  3627. addq %rax, %r10
  3628. adcq %rdx, %r11
  3629. # A[1] * B[1]
  3630. movq 8(%rsp), %rax
  3631. mulq 8(%r8)
  3632. xorq %r12, %r12
  3633. addq %rax, %r10
  3634. adcq %rdx, %r11
  3635. adcq $0x00, %r12
  3636. # A[2] * B[0]
  3637. movq (%rsp), %rax
  3638. mulq 16(%r8)
  3639. addq %rax, %r10
  3640. adcq %rdx, %r11
  3641. adcq $0x00, %r12
  3642. # A[0] * B[3]
  3643. movq 24(%rsp), %rax
  3644. mulq (%r8)
  3645. xorq %r13, %r13
  3646. addq %rax, %r11
  3647. adcq %rdx, %r12
  3648. adcq $0x00, %r13
  3649. # A[1] * B[2]
  3650. movq 16(%rsp), %rax
  3651. mulq 8(%r8)
  3652. addq %rax, %r11
  3653. adcq %rdx, %r12
  3654. adcq $0x00, %r13
  3655. # A[2] * B[1]
  3656. movq 8(%rsp), %rax
  3657. mulq 16(%r8)
  3658. addq %rax, %r11
  3659. adcq %rdx, %r12
  3660. adcq $0x00, %r13
  3661. # A[3] * B[0]
  3662. movq (%rsp), %rax
  3663. mulq 24(%r8)
  3664. addq %rax, %r11
  3665. adcq %rdx, %r12
  3666. adcq $0x00, %r13
  3667. # A[1] * B[3]
  3668. movq 24(%rsp), %rax
  3669. mulq 8(%r8)
  3670. xorq %r14, %r14
  3671. addq %rax, %r12
  3672. adcq %rdx, %r13
  3673. adcq $0x00, %r14
  3674. # A[2] * B[2]
  3675. movq 16(%rsp), %rax
  3676. mulq 16(%r8)
  3677. addq %rax, %r12
  3678. adcq %rdx, %r13
  3679. adcq $0x00, %r14
  3680. # A[3] * B[1]
  3681. movq 8(%rsp), %rax
  3682. mulq 24(%r8)
  3683. addq %rax, %r12
  3684. adcq %rdx, %r13
  3685. adcq $0x00, %r14
  3686. # A[2] * B[3]
  3687. movq 24(%rsp), %rax
  3688. mulq 16(%r8)
  3689. xorq %r15, %r15
  3690. addq %rax, %r13
  3691. adcq %rdx, %r14
  3692. adcq $0x00, %r15
  3693. # A[3] * B[2]
  3694. movq 16(%rsp), %rax
  3695. mulq 24(%r8)
  3696. addq %rax, %r13
  3697. adcq %rdx, %r14
  3698. adcq $0x00, %r15
  3699. # A[3] * B[3]
  3700. movq 24(%rsp), %rax
  3701. mulq 24(%r8)
  3702. addq %rax, %r14
  3703. adcq %rdx, %r15
  3704. # Reduce
  3705. movq $0x7fffffffffffffff, %rbp
  3706. # Move top half into t4-t7 and remove top bit from t3
  3707. shldq $0x01, %r14, %r15
  3708. shldq $0x01, %r13, %r14
  3709. shldq $0x01, %r12, %r13
  3710. shldq $0x01, %r11, %r12
  3711. andq %rbp, %r11
  3712. # Multiply top half by 19
  3713. movq $19, %rax
  3714. mulq %r12
  3715. xorq %r12, %r12
  3716. addq %rax, %rcx
  3717. movq $19, %rax
  3718. adcq %rdx, %r12
  3719. mulq %r13
  3720. xorq %r13, %r13
  3721. addq %rax, %r9
  3722. movq $19, %rax
  3723. adcq %rdx, %r13
  3724. mulq %r14
  3725. xorq %r14, %r14
  3726. addq %rax, %r10
  3727. movq $19, %rax
  3728. adcq %rdx, %r14
  3729. mulq %r15
  3730. # Add remaining product results in
  3731. addq %r12, %r9
  3732. adcq %r13, %r10
  3733. adcq %r14, %r11
  3734. adcq %rax, %r11
  3735. adcq $0x00, %rdx
  3736. # Overflow
  3737. shldq $0x01, %r11, %rdx
  3738. imulq $19, %rdx, %rax
  3739. andq %rbp, %r11
  3740. addq %rax, %rcx
  3741. adcq $0x00, %r9
  3742. adcq $0x00, %r10
  3743. adcq $0x00, %r11
  3744. # Reduce if top bit set
  3745. movq %r11, %rdx
  3746. sarq $63, %rdx
  3747. andq $19, %rdx
  3748. andq %rbp, %r11
  3749. addq %rdx, %rcx
  3750. adcq $0x00, %r9
  3751. adcq $0x00, %r10
  3752. adcq $0x00, %r11
  3753. # Store
  3754. movq %rcx, 32(%rsp)
  3755. movq %r9, 40(%rsp)
  3756. movq %r10, 48(%rsp)
  3757. movq %r11, 56(%rsp)
  3758. # Multiply
  3759. # A[0] * B[0]
  3760. movq 96(%rsp), %rax
  3761. mulq 128(%rsp)
  3762. movq %rax, %rcx
  3763. movq %rdx, %r9
  3764. # A[0] * B[1]
  3765. movq 104(%rsp), %rax
  3766. mulq 128(%rsp)
  3767. xorq %r10, %r10
  3768. addq %rax, %r9
  3769. adcq %rdx, %r10
  3770. # A[1] * B[0]
  3771. movq 96(%rsp), %rax
  3772. mulq 136(%rsp)
  3773. xorq %r11, %r11
  3774. addq %rax, %r9
  3775. adcq %rdx, %r10
  3776. adcq $0x00, %r11
  3777. # A[0] * B[2]
  3778. movq 112(%rsp), %rax
  3779. mulq 128(%rsp)
  3780. addq %rax, %r10
  3781. adcq %rdx, %r11
  3782. # A[1] * B[1]
  3783. movq 104(%rsp), %rax
  3784. mulq 136(%rsp)
  3785. xorq %r12, %r12
  3786. addq %rax, %r10
  3787. adcq %rdx, %r11
  3788. adcq $0x00, %r12
  3789. # A[2] * B[0]
  3790. movq 96(%rsp), %rax
  3791. mulq 144(%rsp)
  3792. addq %rax, %r10
  3793. adcq %rdx, %r11
  3794. adcq $0x00, %r12
  3795. # A[0] * B[3]
  3796. movq 120(%rsp), %rax
  3797. mulq 128(%rsp)
  3798. xorq %r13, %r13
  3799. addq %rax, %r11
  3800. adcq %rdx, %r12
  3801. adcq $0x00, %r13
  3802. # A[1] * B[2]
  3803. movq 112(%rsp), %rax
  3804. mulq 136(%rsp)
  3805. addq %rax, %r11
  3806. adcq %rdx, %r12
  3807. adcq $0x00, %r13
  3808. # A[2] * B[1]
  3809. movq 104(%rsp), %rax
  3810. mulq 144(%rsp)
  3811. addq %rax, %r11
  3812. adcq %rdx, %r12
  3813. adcq $0x00, %r13
  3814. # A[3] * B[0]
  3815. movq 96(%rsp), %rax
  3816. mulq 152(%rsp)
  3817. addq %rax, %r11
  3818. adcq %rdx, %r12
  3819. adcq $0x00, %r13
  3820. # A[1] * B[3]
  3821. movq 120(%rsp), %rax
  3822. mulq 136(%rsp)
  3823. xorq %r14, %r14
  3824. addq %rax, %r12
  3825. adcq %rdx, %r13
  3826. adcq $0x00, %r14
  3827. # A[2] * B[2]
  3828. movq 112(%rsp), %rax
  3829. mulq 144(%rsp)
  3830. addq %rax, %r12
  3831. adcq %rdx, %r13
  3832. adcq $0x00, %r14
  3833. # A[3] * B[1]
  3834. movq 104(%rsp), %rax
  3835. mulq 152(%rsp)
  3836. addq %rax, %r12
  3837. adcq %rdx, %r13
  3838. adcq $0x00, %r14
  3839. # A[2] * B[3]
  3840. movq 120(%rsp), %rax
  3841. mulq 144(%rsp)
  3842. xorq %r15, %r15
  3843. addq %rax, %r13
  3844. adcq %rdx, %r14
  3845. adcq $0x00, %r15
  3846. # A[3] * B[2]
  3847. movq 112(%rsp), %rax
  3848. mulq 152(%rsp)
  3849. addq %rax, %r13
  3850. adcq %rdx, %r14
  3851. adcq $0x00, %r15
  3852. # A[3] * B[3]
  3853. movq 120(%rsp), %rax
  3854. mulq 152(%rsp)
  3855. addq %rax, %r14
  3856. adcq %rdx, %r15
  3857. # Reduce
  3858. movq $0x7fffffffffffffff, %rbp
  3859. # Move top half into t4-t7 and remove top bit from t3
  3860. shldq $0x01, %r14, %r15
  3861. shldq $0x01, %r13, %r14
  3862. shldq $0x01, %r12, %r13
  3863. shldq $0x01, %r11, %r12
  3864. andq %rbp, %r11
  3865. # Multiply top half by 19
  3866. movq $19, %rax
  3867. mulq %r12
  3868. xorq %r12, %r12
  3869. addq %rax, %rcx
  3870. movq $19, %rax
  3871. adcq %rdx, %r12
  3872. mulq %r13
  3873. xorq %r13, %r13
  3874. addq %rax, %r9
  3875. movq $19, %rax
  3876. adcq %rdx, %r13
  3877. mulq %r14
  3878. xorq %r14, %r14
  3879. addq %rax, %r10
  3880. movq $19, %rax
  3881. adcq %rdx, %r14
  3882. mulq %r15
  3883. # Add remaining product results in
  3884. addq %r12, %r9
  3885. adcq %r13, %r10
  3886. adcq %r14, %r11
  3887. adcq %rax, %r11
  3888. adcq $0x00, %rdx
  3889. # Overflow
  3890. shldq $0x01, %r11, %rdx
  3891. imulq $19, %rdx, %rax
  3892. andq %rbp, %r11
  3893. addq %rax, %rcx
  3894. adcq $0x00, %r9
  3895. adcq $0x00, %r10
  3896. adcq $0x00, %r11
  3897. # Reduce if top bit set
  3898. movq %r11, %rdx
  3899. sarq $63, %rdx
  3900. andq $19, %rdx
  3901. andq %rbp, %r11
  3902. addq %rdx, %rcx
  3903. adcq $0x00, %r9
  3904. adcq $0x00, %r10
  3905. adcq $0x00, %r11
  3906. # Store
  3907. movq %rcx, (%rsp)
  3908. movq %r9, 8(%rsp)
  3909. movq %r10, 16(%rsp)
  3910. movq %r11, 24(%rsp)
  3911. decb 168(%rsp)
  3912. jge L_curve25519_x64_bits
  3913. movq $63, 168(%rsp)
  3914. decb 160(%rsp)
  3915. jge L_curve25519_x64_words
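# Ladder finished: (x2, z2), in the output buffer and at 0(%rsp), is the
# projective result.  The chain below mirrors fe_invert_x64 and raises z2 to
# p - 2 so the affine x-coordinate x2 * z2^(p-2) can be produced.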
  3916. # Invert
  3917. leaq 32(%rsp), %rdi
  3918. movq %rsp, %rsi
  3919. #ifndef __APPLE__
  3920. callq fe_sq_x64@plt
  3921. #else
  3922. callq _fe_sq_x64
  3923. #endif /* __APPLE__ */
  3924. leaq 64(%rsp), %rdi
  3925. leaq 32(%rsp), %rsi
  3926. #ifndef __APPLE__
  3927. callq fe_sq_x64@plt
  3928. #else
  3929. callq _fe_sq_x64
  3930. #endif /* __APPLE__ */
  3931. leaq 64(%rsp), %rdi
  3932. leaq 64(%rsp), %rsi
  3933. #ifndef __APPLE__
  3934. callq fe_sq_x64@plt
  3935. #else
  3936. callq _fe_sq_x64
  3937. #endif /* __APPLE__ */
  3938. leaq 64(%rsp), %rdi
  3939. movq %rsp, %rsi
  3940. leaq 64(%rsp), %rdx
  3941. #ifndef __APPLE__
  3942. callq fe_mul_x64@plt
  3943. #else
  3944. callq _fe_mul_x64
  3945. #endif /* __APPLE__ */
  3946. leaq 32(%rsp), %rdi
  3947. leaq 32(%rsp), %rsi
  3948. leaq 64(%rsp), %rdx
  3949. #ifndef __APPLE__
  3950. callq fe_mul_x64@plt
  3951. #else
  3952. callq _fe_mul_x64
  3953. #endif /* __APPLE__ */
  3954. leaq 96(%rsp), %rdi
  3955. leaq 32(%rsp), %rsi
  3956. #ifndef __APPLE__
  3957. callq fe_sq_x64@plt
  3958. #else
  3959. callq _fe_sq_x64
  3960. #endif /* __APPLE__ */
  3961. leaq 64(%rsp), %rdi
  3962. leaq 64(%rsp), %rsi
  3963. leaq 96(%rsp), %rdx
  3964. #ifndef __APPLE__
  3965. callq fe_mul_x64@plt
  3966. #else
  3967. callq _fe_mul_x64
  3968. #endif /* __APPLE__ */
  3969. leaq 96(%rsp), %rdi
  3970. leaq 64(%rsp), %rsi
  3971. #ifndef __APPLE__
  3972. callq fe_sq_x64@plt
  3973. #else
  3974. callq _fe_sq_x64
  3975. #endif /* __APPLE__ */
  3976. leaq 96(%rsp), %rdi
  3977. leaq 96(%rsp), %rsi
  3978. movq $4, %rdx
  3979. #ifndef __APPLE__
  3980. callq fe_sq_n_x64@plt
  3981. #else
  3982. callq _fe_sq_n_x64
  3983. #endif /* __APPLE__ */
  3984. leaq 64(%rsp), %rdi
  3985. leaq 96(%rsp), %rsi
  3986. leaq 64(%rsp), %rdx
  3987. #ifndef __APPLE__
  3988. callq fe_mul_x64@plt
  3989. #else
  3990. callq _fe_mul_x64
  3991. #endif /* __APPLE__ */
  3992. leaq 96(%rsp), %rdi
  3993. leaq 64(%rsp), %rsi
  3994. #ifndef __APPLE__
  3995. callq fe_sq_x64@plt
  3996. #else
  3997. callq _fe_sq_x64
  3998. #endif /* __APPLE__ */
  3999. leaq 96(%rsp), %rdi
  4000. leaq 96(%rsp), %rsi
  4001. movq $9, %rdx
  4002. #ifndef __APPLE__
  4003. callq fe_sq_n_x64@plt
  4004. #else
  4005. callq _fe_sq_n_x64
  4006. #endif /* __APPLE__ */
  4007. leaq 96(%rsp), %rdi
  4008. leaq 96(%rsp), %rsi
  4009. leaq 64(%rsp), %rdx
  4010. #ifndef __APPLE__
  4011. callq fe_mul_x64@plt
  4012. #else
  4013. callq _fe_mul_x64
  4014. #endif /* __APPLE__ */
  4015. leaq 128(%rsp), %rdi
  4016. leaq 96(%rsp), %rsi
  4017. #ifndef __APPLE__
  4018. callq fe_sq_x64@plt
  4019. #else
  4020. callq _fe_sq_x64
  4021. #endif /* __APPLE__ */
  4022. leaq 128(%rsp), %rdi
  4023. leaq 128(%rsp), %rsi
  4024. movq $19, %rdx
  4025. #ifndef __APPLE__
  4026. callq fe_sq_n_x64@plt
  4027. #else
  4028. callq _fe_sq_n_x64
  4029. #endif /* __APPLE__ */
  4030. leaq 96(%rsp), %rdi
  4031. leaq 128(%rsp), %rsi
  4032. leaq 96(%rsp), %rdx
  4033. #ifndef __APPLE__
  4034. callq fe_mul_x64@plt
  4035. #else
  4036. callq _fe_mul_x64
  4037. #endif /* __APPLE__ */
  4038. leaq 96(%rsp), %rdi
  4039. leaq 96(%rsp), %rsi
  4040. #ifndef __APPLE__
  4041. callq fe_sq_x64@plt
  4042. #else
  4043. callq _fe_sq_x64
  4044. #endif /* __APPLE__ */
  4045. leaq 96(%rsp), %rdi
  4046. leaq 96(%rsp), %rsi
  4047. movq $9, %rdx
  4048. #ifndef __APPLE__
  4049. callq fe_sq_n_x64@plt
  4050. #else
  4051. callq _fe_sq_n_x64
  4052. #endif /* __APPLE__ */
  4053. leaq 64(%rsp), %rdi
  4054. leaq 96(%rsp), %rsi
  4055. leaq 64(%rsp), %rdx
  4056. #ifndef __APPLE__
  4057. callq fe_mul_x64@plt
  4058. #else
  4059. callq _fe_mul_x64
  4060. #endif /* __APPLE__ */
  4061. leaq 96(%rsp), %rdi
  4062. leaq 64(%rsp), %rsi
  4063. #ifndef __APPLE__
  4064. callq fe_sq_x64@plt
  4065. #else
  4066. callq _fe_sq_x64
  4067. #endif /* __APPLE__ */
  4068. leaq 96(%rsp), %rdi
  4069. leaq 96(%rsp), %rsi
  4070. movq $49, %rdx
  4071. #ifndef __APPLE__
  4072. callq fe_sq_n_x64@plt
  4073. #else
  4074. callq _fe_sq_n_x64
  4075. #endif /* __APPLE__ */
  4076. leaq 96(%rsp), %rdi
  4077. leaq 96(%rsp), %rsi
  4078. leaq 64(%rsp), %rdx
  4079. #ifndef __APPLE__
  4080. callq fe_mul_x64@plt
  4081. #else
  4082. callq _fe_mul_x64
  4083. #endif /* __APPLE__ */
  4084. leaq 128(%rsp), %rdi
  4085. leaq 96(%rsp), %rsi
  4086. #ifndef __APPLE__
  4087. callq fe_sq_x64@plt
  4088. #else
  4089. callq _fe_sq_x64
  4090. #endif /* __APPLE__ */
  4091. leaq 128(%rsp), %rdi
  4092. leaq 128(%rsp), %rsi
  4093. movq $0x63, %rdx
  4094. #ifndef __APPLE__
  4095. callq fe_sq_n_x64@plt
  4096. #else
  4097. callq _fe_sq_n_x64
  4098. #endif /* __APPLE__ */
  4099. leaq 96(%rsp), %rdi
  4100. leaq 128(%rsp), %rsi
  4101. leaq 96(%rsp), %rdx
  4102. #ifndef __APPLE__
  4103. callq fe_mul_x64@plt
  4104. #else
  4105. callq _fe_mul_x64
  4106. #endif /* __APPLE__ */
  4107. leaq 96(%rsp), %rdi
  4108. leaq 96(%rsp), %rsi
  4109. #ifndef __APPLE__
  4110. callq fe_sq_x64@plt
  4111. #else
  4112. callq _fe_sq_x64
  4113. #endif /* __APPLE__ */
  4114. leaq 96(%rsp), %rdi
  4115. leaq 96(%rsp), %rsi
  4116. movq $49, %rdx
  4117. #ifndef __APPLE__
  4118. callq fe_sq_n_x64@plt
  4119. #else
  4120. callq _fe_sq_n_x64
  4121. #endif /* __APPLE__ */
  4122. leaq 64(%rsp), %rdi
  4123. leaq 96(%rsp), %rsi
  4124. leaq 64(%rsp), %rdx
  4125. #ifndef __APPLE__
  4126. callq fe_mul_x64@plt
  4127. #else
  4128. callq _fe_mul_x64
  4129. #endif /* __APPLE__ */
  4130. leaq 64(%rsp), %rdi
  4131. leaq 64(%rsp), %rsi
  4132. #ifndef __APPLE__
  4133. callq fe_sq_x64@plt
  4134. #else
  4135. callq _fe_sq_x64
  4136. #endif /* __APPLE__ */
  4137. leaq 64(%rsp), %rdi
  4138. leaq 64(%rsp), %rsi
  4139. movq $4, %rdx
  4140. #ifndef __APPLE__
  4141. callq fe_sq_n_x64@plt
  4142. #else
  4143. callq _fe_sq_n_x64
  4144. #endif /* __APPLE__ */
  4145. movq %rsp, %rdi
  4146. leaq 64(%rsp), %rsi
  4147. leaq 32(%rsp), %rdx
  4148. #ifndef __APPLE__
  4149. callq fe_mul_x64@plt
  4150. #else
  4151. callq _fe_mul_x64
  4152. #endif /* __APPLE__ */
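# 0(%rsp) now holds Z^-1.  The multiply below combines it with the field
# element addressed by the pointer reloaded from 176(%rsp) (the saved
# destination argument, which appears to hold the ladder's X result),
# forming the x25519 output x2 * z2^-1, fully reduced before the store.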
  4153. movq 176(%rsp), %rdi
  4154. # Multiply
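# Schoolbook 4x4-limb multiply in product-scanning (column) order: the
# partial products A[i]*B[j] are accumulated column by column
# (i + j = 0, 1, ..., 6) into the 8-limb result rcx, r9-r15; the xorq at
# the head of each new column supplies a zeroed register to collect that
# column's carries (a third carry limb is needed because a column of four
# 128-bit products does not fit in 128 bits).  Conceptually, as a C-style
# sketch only (not how the registers are actually scheduled):
#   acc = 0;                                   /* wide accumulator */
#   for (k = 0; k <= 6; k++) {
#       for (i = 0; i <= 3; i++)
#           if (k - i >= 0 && k - i <= 3)
#               acc += (wide)a[i] * b[k - i];
#       r[k] = low64(acc);  acc >>= 64;
#   }
#   r[7] = low64(acc);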
  4155. # A[0] * B[0]
  4156. movq (%rsp), %rax
  4157. mulq (%rdi)
  4158. movq %rax, %rcx
  4159. movq %rdx, %r9
  4160. # A[0] * B[1]
  4161. movq 8(%rsp), %rax
  4162. mulq (%rdi)
  4163. xorq %r10, %r10
  4164. addq %rax, %r9
  4165. adcq %rdx, %r10
  4166. # A[1] * B[0]
  4167. movq (%rsp), %rax
  4168. mulq 8(%rdi)
  4169. xorq %r11, %r11
  4170. addq %rax, %r9
  4171. adcq %rdx, %r10
  4172. adcq $0x00, %r11
  4173. # A[0] * B[2]
  4174. movq 16(%rsp), %rax
  4175. mulq (%rdi)
  4176. addq %rax, %r10
  4177. adcq %rdx, %r11
  4178. # A[1] * B[1]
  4179. movq 8(%rsp), %rax
  4180. mulq 8(%rdi)
  4181. xorq %r12, %r12
  4182. addq %rax, %r10
  4183. adcq %rdx, %r11
  4184. adcq $0x00, %r12
  4185. # A[2] * B[0]
  4186. movq (%rsp), %rax
  4187. mulq 16(%rdi)
  4188. addq %rax, %r10
  4189. adcq %rdx, %r11
  4190. adcq $0x00, %r12
  4191. # A[0] * B[3]
  4192. movq 24(%rsp), %rax
  4193. mulq (%rdi)
  4194. xorq %r13, %r13
  4195. addq %rax, %r11
  4196. adcq %rdx, %r12
  4197. adcq $0x00, %r13
  4198. # A[1] * B[2]
  4199. movq 16(%rsp), %rax
  4200. mulq 8(%rdi)
  4201. addq %rax, %r11
  4202. adcq %rdx, %r12
  4203. adcq $0x00, %r13
  4204. # A[2] * B[1]
  4205. movq 8(%rsp), %rax
  4206. mulq 16(%rdi)
  4207. addq %rax, %r11
  4208. adcq %rdx, %r12
  4209. adcq $0x00, %r13
  4210. # A[3] * B[0]
  4211. movq (%rsp), %rax
  4212. mulq 24(%rdi)
  4213. addq %rax, %r11
  4214. adcq %rdx, %r12
  4215. adcq $0x00, %r13
  4216. # A[1] * B[3]
  4217. movq 24(%rsp), %rax
  4218. mulq 8(%rdi)
  4219. xorq %r14, %r14
  4220. addq %rax, %r12
  4221. adcq %rdx, %r13
  4222. adcq $0x00, %r14
  4223. # A[2] * B[2]
  4224. movq 16(%rsp), %rax
  4225. mulq 16(%rdi)
  4226. addq %rax, %r12
  4227. adcq %rdx, %r13
  4228. adcq $0x00, %r14
  4229. # A[3] * B[1]
  4230. movq 8(%rsp), %rax
  4231. mulq 24(%rdi)
  4232. addq %rax, %r12
  4233. adcq %rdx, %r13
  4234. adcq $0x00, %r14
  4235. # A[2] * B[3]
  4236. movq 24(%rsp), %rax
  4237. mulq 16(%rdi)
  4238. xorq %r15, %r15
  4239. addq %rax, %r13
  4240. adcq %rdx, %r14
  4241. adcq $0x00, %r15
  4242. # A[3] * B[2]
  4243. movq 16(%rsp), %rax
  4244. mulq 24(%rdi)
  4245. addq %rax, %r13
  4246. adcq %rdx, %r14
  4247. adcq $0x00, %r15
  4248. # A[3] * B[3]
  4249. movq 24(%rsp), %rax
  4250. mulq 24(%rdi)
  4251. addq %rax, %r14
  4252. adcq %rdx, %r15
  4253. # Reduce
  4254. movq $0x7fffffffffffffff, %rbp
  4255. # Move top half into t4-t7 and remove top bit from t3
  4256. shldq $0x01, %r14, %r15
  4257. shldq $0x01, %r13, %r14
  4258. shldq $0x01, %r12, %r13
  4259. shldq $0x01, %r11, %r12
  4260. andq %rbp, %r11
  4261. # Multiply top half by 19
  4262. movq $19, %rax
  4263. mulq %r12
  4264. xorq %r12, %r12
  4265. addq %rax, %rcx
  4266. movq $19, %rax
  4267. adcq %rdx, %r12
  4268. mulq %r13
  4269. xorq %r13, %r13
  4270. addq %rax, %r9
  4271. movq $19, %rax
  4272. adcq %rdx, %r13
  4273. mulq %r14
  4274. xorq %r14, %r14
  4275. addq %rax, %r10
  4276. movq $19, %rax
  4277. adcq %rdx, %r14
  4278. mulq %r15
  4279. # Add remaining product results in
  4280. addq %r12, %r9
  4281. adcq %r13, %r10
  4282. adcq %r14, %r11
  4283. adcq %rax, %r11
  4284. adcq $0x00, %rdx
  4285. # Overflow
  4286. shldq $0x01, %r11, %rdx
  4287. imulq $19, %rdx, %rax
  4288. andq %rbp, %r11
  4289. addq %rax, %rcx
  4290. adcq $0x00, %r9
  4291. adcq $0x00, %r10
  4292. adcq $0x00, %r11
  4293. # Reduce if top bit set
  4294. movq %r11, %rdx
  4295. sarq $63, %rdx
  4296. andq $19, %rdx
  4297. andq %rbp, %r11
  4298. addq %rdx, %rcx
  4299. adcq $0x00, %r9
  4300. adcq $0x00, %r10
  4301. adcq $0x00, %r11
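# Normalize to the canonical value in [0, 2^255 - 19): adding 19 and
# checking whether the sum carries into bit 255 tells whether the current
# value is >= p, and in that case adding 19 and masking off bit 255 is
# exactly a subtraction of p.  The probe below does the trial add into
# rax copies so the real limbs are only adjusted once.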
  4302. movq %rcx, %rax
  4303. addq $19, %rax
  4304. movq %r9, %rax
  4305. adcq $0x00, %rax
  4306. movq %r10, %rax
  4307. adcq $0x00, %rax
  4308. movq %r11, %rax
  4309. adcq $0x00, %rax
  4310. sarq $63, %rax
  4311. andq $19, %rax
  4312. addq %rax, %rcx
  4313. adcq $0x00, %r9
  4314. adcq $0x00, %r10
  4315. adcq $0x00, %r11
  4316. andq %rbp, %r11
  4317. # Store
  4318. movq %rcx, (%rdi)
  4319. movq %r9, 8(%rdi)
  4320. movq %r10, 16(%rdi)
  4321. movq %r11, 24(%rdi)
  4322. xorq %rax, %rax
  4323. addq $0xb8, %rsp
  4324. popq %rbp
  4325. popq %rbx
  4326. popq %r15
  4327. popq %r14
  4328. popq %r13
  4329. popq %r12
  4330. repz retq
  4331. #ifndef __APPLE__
  4332. .size curve25519_x64,.-curve25519_x64
  4333. #endif /* __APPLE__ */
  4334. #ifndef __APPLE__
  4335. .text
  4336. .globl fe_pow22523_x64
  4337. .type fe_pow22523_x64,@function
  4338. .align 16
  4339. fe_pow22523_x64:
  4340. #else
  4341. .section __TEXT,__text
  4342. .globl _fe_pow22523_x64
  4343. .p2align 4
  4344. _fe_pow22523_x64:
  4345. #endif /* __APPLE__ */
  4346. subq $0x70, %rsp
  4347. # pow22523
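# Raises the input to (p - 5)/8 = 2^252 - 3 (p = 2^255 - 19), the power
# needed by the Ed25519 inverse-square-root / point-decompression path.
# The chain is the same 2^k - 1 ladder used by the inversion above, but
# it ends with two squarings and a final multiply by the input.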
  4348. movq %rdi, 96(%rsp)
  4349. movq %rsi, 104(%rsp)
  4350. movq %rsp, %rdi
  4351. movq 104(%rsp), %rsi
  4352. #ifndef __APPLE__
  4353. callq fe_sq_x64@plt
  4354. #else
  4355. callq _fe_sq_x64
  4356. #endif /* __APPLE__ */
  4357. leaq 32(%rsp), %rdi
  4358. movq %rsp, %rsi
  4359. #ifndef __APPLE__
  4360. callq fe_sq_x64@plt
  4361. #else
  4362. callq _fe_sq_x64
  4363. #endif /* __APPLE__ */
  4364. leaq 32(%rsp), %rdi
  4365. leaq 32(%rsp), %rsi
  4366. #ifndef __APPLE__
  4367. callq fe_sq_x64@plt
  4368. #else
  4369. callq _fe_sq_x64
  4370. #endif /* __APPLE__ */
  4371. leaq 32(%rsp), %rdi
  4372. movq 104(%rsp), %rsi
  4373. leaq 32(%rsp), %rdx
  4374. #ifndef __APPLE__
  4375. callq fe_mul_x64@plt
  4376. #else
  4377. callq _fe_mul_x64
  4378. #endif /* __APPLE__ */
  4379. movq %rsp, %rdi
  4380. movq %rsp, %rsi
  4381. leaq 32(%rsp), %rdx
  4382. #ifndef __APPLE__
  4383. callq fe_mul_x64@plt
  4384. #else
  4385. callq _fe_mul_x64
  4386. #endif /* __APPLE__ */
  4387. movq %rsp, %rdi
  4388. movq %rsp, %rsi
  4389. #ifndef __APPLE__
  4390. callq fe_sq_x64@plt
  4391. #else
  4392. callq _fe_sq_x64
  4393. #endif /* __APPLE__ */
  4394. movq %rsp, %rdi
  4395. leaq 32(%rsp), %rsi
  4396. movq %rsp, %rdx
  4397. #ifndef __APPLE__
  4398. callq fe_mul_x64@plt
  4399. #else
  4400. callq _fe_mul_x64
  4401. #endif /* __APPLE__ */
  4402. leaq 32(%rsp), %rdi
  4403. movq %rsp, %rsi
  4404. #ifndef __APPLE__
  4405. callq fe_sq_x64@plt
  4406. #else
  4407. callq _fe_sq_x64
  4408. #endif /* __APPLE__ */
  4409. leaq 32(%rsp), %rdi
  4410. leaq 32(%rsp), %rsi
  4411. movq $4, %rdx
  4412. #ifndef __APPLE__
  4413. callq fe_sq_n_x64@plt
  4414. #else
  4415. callq _fe_sq_n_x64
  4416. #endif /* __APPLE__ */
  4417. movq %rsp, %rdi
  4418. leaq 32(%rsp), %rsi
  4419. movq %rsp, %rdx
  4420. #ifndef __APPLE__
  4421. callq fe_mul_x64@plt
  4422. #else
  4423. callq _fe_mul_x64
  4424. #endif /* __APPLE__ */
  4425. leaq 32(%rsp), %rdi
  4426. movq %rsp, %rsi
  4427. #ifndef __APPLE__
  4428. callq fe_sq_x64@plt
  4429. #else
  4430. callq _fe_sq_x64
  4431. #endif /* __APPLE__ */
  4432. leaq 32(%rsp), %rdi
  4433. leaq 32(%rsp), %rsi
  4434. movq $9, %rdx
  4435. #ifndef __APPLE__
  4436. callq fe_sq_n_x64@plt
  4437. #else
  4438. callq _fe_sq_n_x64
  4439. #endif /* __APPLE__ */
  4440. leaq 32(%rsp), %rdi
  4441. leaq 32(%rsp), %rsi
  4442. movq %rsp, %rdx
  4443. #ifndef __APPLE__
  4444. callq fe_mul_x64@plt
  4445. #else
  4446. callq _fe_mul_x64
  4447. #endif /* __APPLE__ */
  4448. leaq 64(%rsp), %rdi
  4449. leaq 32(%rsp), %rsi
  4450. #ifndef __APPLE__
  4451. callq fe_sq_x64@plt
  4452. #else
  4453. callq _fe_sq_x64
  4454. #endif /* __APPLE__ */
  4455. leaq 64(%rsp), %rdi
  4456. leaq 64(%rsp), %rsi
  4457. movq $19, %rdx
  4458. #ifndef __APPLE__
  4459. callq fe_sq_n_x64@plt
  4460. #else
  4461. callq _fe_sq_n_x64
  4462. #endif /* __APPLE__ */
  4463. leaq 32(%rsp), %rdi
  4464. leaq 64(%rsp), %rsi
  4465. leaq 32(%rsp), %rdx
  4466. #ifndef __APPLE__
  4467. callq fe_mul_x64@plt
  4468. #else
  4469. callq _fe_mul_x64
  4470. #endif /* __APPLE__ */
  4471. leaq 32(%rsp), %rdi
  4472. leaq 32(%rsp), %rsi
  4473. #ifndef __APPLE__
  4474. callq fe_sq_x64@plt
  4475. #else
  4476. callq _fe_sq_x64
  4477. #endif /* __APPLE__ */
  4478. leaq 32(%rsp), %rdi
  4479. leaq 32(%rsp), %rsi
  4480. movq $9, %rdx
  4481. #ifndef __APPLE__
  4482. callq fe_sq_n_x64@plt
  4483. #else
  4484. callq _fe_sq_n_x64
  4485. #endif /* __APPLE__ */
  4486. movq %rsp, %rdi
  4487. leaq 32(%rsp), %rsi
  4488. movq %rsp, %rdx
  4489. #ifndef __APPLE__
  4490. callq fe_mul_x64@plt
  4491. #else
  4492. callq _fe_mul_x64
  4493. #endif /* __APPLE__ */
  4494. leaq 32(%rsp), %rdi
  4495. movq %rsp, %rsi
  4496. #ifndef __APPLE__
  4497. callq fe_sq_x64@plt
  4498. #else
  4499. callq _fe_sq_x64
  4500. #endif /* __APPLE__ */
  4501. leaq 32(%rsp), %rdi
  4502. leaq 32(%rsp), %rsi
  4503. movq $49, %rdx
  4504. #ifndef __APPLE__
  4505. callq fe_sq_n_x64@plt
  4506. #else
  4507. callq _fe_sq_n_x64
  4508. #endif /* __APPLE__ */
  4509. leaq 32(%rsp), %rdi
  4510. leaq 32(%rsp), %rsi
  4511. movq %rsp, %rdx
  4512. #ifndef __APPLE__
  4513. callq fe_mul_x64@plt
  4514. #else
  4515. callq _fe_mul_x64
  4516. #endif /* __APPLE__ */
  4517. leaq 64(%rsp), %rdi
  4518. leaq 32(%rsp), %rsi
  4519. #ifndef __APPLE__
  4520. callq fe_sq_x64@plt
  4521. #else
  4522. callq _fe_sq_x64
  4523. #endif /* __APPLE__ */
  4524. leaq 64(%rsp), %rdi
  4525. leaq 64(%rsp), %rsi
  4526. movq $0x63, %rdx
  4527. #ifndef __APPLE__
  4528. callq fe_sq_n_x64@plt
  4529. #else
  4530. callq _fe_sq_n_x64
  4531. #endif /* __APPLE__ */
  4532. leaq 32(%rsp), %rdi
  4533. leaq 64(%rsp), %rsi
  4534. leaq 32(%rsp), %rdx
  4535. #ifndef __APPLE__
  4536. callq fe_mul_x64@plt
  4537. #else
  4538. callq _fe_mul_x64
  4539. #endif /* __APPLE__ */
  4540. leaq 32(%rsp), %rdi
  4541. leaq 32(%rsp), %rsi
  4542. #ifndef __APPLE__
  4543. callq fe_sq_x64@plt
  4544. #else
  4545. callq _fe_sq_x64
  4546. #endif /* __APPLE__ */
  4547. leaq 32(%rsp), %rdi
  4548. leaq 32(%rsp), %rsi
  4549. movq $49, %rdx
  4550. #ifndef __APPLE__
  4551. callq fe_sq_n_x64@plt
  4552. #else
  4553. callq _fe_sq_n_x64
  4554. #endif /* __APPLE__ */
  4555. movq %rsp, %rdi
  4556. leaq 32(%rsp), %rsi
  4557. movq %rsp, %rdx
  4558. #ifndef __APPLE__
  4559. callq fe_mul_x64@plt
  4560. #else
  4561. callq _fe_mul_x64
  4562. #endif /* __APPLE__ */
  4563. movq %rsp, %rdi
  4564. movq %rsp, %rsi
  4565. #ifndef __APPLE__
  4566. callq fe_sq_x64@plt
  4567. #else
  4568. callq _fe_sq_x64
  4569. #endif /* __APPLE__ */
  4570. movq %rsp, %rdi
  4571. movq %rsp, %rsi
  4572. #ifndef __APPLE__
  4573. callq fe_sq_x64@plt
  4574. #else
  4575. callq _fe_sq_x64
  4576. #endif /* __APPLE__ */
  4577. movq 96(%rsp), %rdi
  4578. movq %rsp, %rsi
  4579. movq 104(%rsp), %rdx
  4580. #ifndef __APPLE__
  4581. callq fe_mul_x64@plt
  4582. #else
  4583. callq _fe_mul_x64
  4584. #endif /* __APPLE__ */
  4585. movq 104(%rsp), %rsi
  4586. movq 96(%rsp), %rdi
  4587. addq $0x70, %rsp
  4588. repz retq
  4589. #ifndef __APPLE__
  4590. .text
  4591. .globl fe_ge_to_p2_x64
  4592. .type fe_ge_to_p2_x64,@function
  4593. .align 16
  4594. fe_ge_to_p2_x64:
  4595. #else
  4596. .section __TEXT,__text
  4597. .globl _fe_ge_to_p2_x64
  4598. .p2align 4
  4599. _fe_ge_to_p2_x64:
  4600. #endif /* __APPLE__ */
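# Convert a "completed" (p1p1) group element to projective (p2)
# coordinates.  The three inlined multiply/reduce blocks below compute,
# in ref10 terms:
#   X3 = X1 * T1,  Y3 = Y1 * Z1,  Z3 = Z1 * T1
# with the destination pointers arriving in rdi/rsi/rdx, the source
# X, Y, Z in rcx/r8/r9 and the source T as the single stack argument.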
  4601. pushq %rbx
  4602. pushq %r12
  4603. pushq %r13
  4604. pushq %r14
  4605. pushq %r15
  4606. subq $40, %rsp
  4607. movq %rsi, (%rsp)
  4608. movq %rdx, 8(%rsp)
  4609. movq %rcx, 16(%rsp)
  4610. movq %r8, 24(%rsp)
  4611. movq %r9, 32(%rsp)
  4612. movq 16(%rsp), %rsi
  4613. movq 88(%rsp), %rbx
  4614. # Multiply
  4615. # A[0] * B[0]
  4616. movq (%rbx), %rax
  4617. mulq (%rsi)
  4618. movq %rax, %r8
  4619. movq %rdx, %r9
  4620. # A[0] * B[1]
  4621. movq 8(%rbx), %rax
  4622. mulq (%rsi)
  4623. xorq %r10, %r10
  4624. addq %rax, %r9
  4625. adcq %rdx, %r10
  4626. # A[1] * B[0]
  4627. movq (%rbx), %rax
  4628. mulq 8(%rsi)
  4629. xorq %r11, %r11
  4630. addq %rax, %r9
  4631. adcq %rdx, %r10
  4632. adcq $0x00, %r11
  4633. # A[0] * B[2]
  4634. movq 16(%rbx), %rax
  4635. mulq (%rsi)
  4636. addq %rax, %r10
  4637. adcq %rdx, %r11
  4638. # A[1] * B[1]
  4639. movq 8(%rbx), %rax
  4640. mulq 8(%rsi)
  4641. xorq %r12, %r12
  4642. addq %rax, %r10
  4643. adcq %rdx, %r11
  4644. adcq $0x00, %r12
  4645. # A[2] * B[0]
  4646. movq (%rbx), %rax
  4647. mulq 16(%rsi)
  4648. addq %rax, %r10
  4649. adcq %rdx, %r11
  4650. adcq $0x00, %r12
  4651. # A[0] * B[3]
  4652. movq 24(%rbx), %rax
  4653. mulq (%rsi)
  4654. xorq %r13, %r13
  4655. addq %rax, %r11
  4656. adcq %rdx, %r12
  4657. adcq $0x00, %r13
  4658. # A[1] * B[2]
  4659. movq 16(%rbx), %rax
  4660. mulq 8(%rsi)
  4661. addq %rax, %r11
  4662. adcq %rdx, %r12
  4663. adcq $0x00, %r13
  4664. # A[2] * B[1]
  4665. movq 8(%rbx), %rax
  4666. mulq 16(%rsi)
  4667. addq %rax, %r11
  4668. adcq %rdx, %r12
  4669. adcq $0x00, %r13
  4670. # A[3] * B[0]
  4671. movq (%rbx), %rax
  4672. mulq 24(%rsi)
  4673. addq %rax, %r11
  4674. adcq %rdx, %r12
  4675. adcq $0x00, %r13
  4676. # A[1] * B[3]
  4677. movq 24(%rbx), %rax
  4678. mulq 8(%rsi)
  4679. xorq %r14, %r14
  4680. addq %rax, %r12
  4681. adcq %rdx, %r13
  4682. adcq $0x00, %r14
  4683. # A[2] * B[2]
  4684. movq 16(%rbx), %rax
  4685. mulq 16(%rsi)
  4686. addq %rax, %r12
  4687. adcq %rdx, %r13
  4688. adcq $0x00, %r14
  4689. # A[3] * B[1]
  4690. movq 8(%rbx), %rax
  4691. mulq 24(%rsi)
  4692. addq %rax, %r12
  4693. adcq %rdx, %r13
  4694. adcq $0x00, %r14
  4695. # A[2] * B[3]
  4696. movq 24(%rbx), %rax
  4697. mulq 16(%rsi)
  4698. xorq %r15, %r15
  4699. addq %rax, %r13
  4700. adcq %rdx, %r14
  4701. adcq $0x00, %r15
  4702. # A[3] * B[2]
  4703. movq 16(%rbx), %rax
  4704. mulq 24(%rsi)
  4705. addq %rax, %r13
  4706. adcq %rdx, %r14
  4707. adcq $0x00, %r15
  4708. # A[3] * B[3]
  4709. movq 24(%rbx), %rax
  4710. mulq 24(%rsi)
  4711. addq %rax, %r14
  4712. adcq %rdx, %r15
  4713. # Reduce
  4714. movq $0x7fffffffffffffff, %rcx
  4715. # Move top half into t4-t7 and remove top bit from t3
  4716. shldq $0x01, %r14, %r15
  4717. shldq $0x01, %r13, %r14
  4718. shldq $0x01, %r12, %r13
  4719. shldq $0x01, %r11, %r12
  4720. andq %rcx, %r11
  4721. # Multiply top half by 19
  4722. movq $19, %rax
  4723. mulq %r12
  4724. xorq %r12, %r12
  4725. addq %rax, %r8
  4726. movq $19, %rax
  4727. adcq %rdx, %r12
  4728. mulq %r13
  4729. xorq %r13, %r13
  4730. addq %rax, %r9
  4731. movq $19, %rax
  4732. adcq %rdx, %r13
  4733. mulq %r14
  4734. xorq %r14, %r14
  4735. addq %rax, %r10
  4736. movq $19, %rax
  4737. adcq %rdx, %r14
  4738. mulq %r15
  4739. # Add remaining product results in
  4740. addq %r12, %r9
  4741. adcq %r13, %r10
  4742. adcq %r14, %r11
  4743. adcq %rax, %r11
  4744. adcq $0x00, %rdx
  4745. # Overflow
  4746. shldq $0x01, %r11, %rdx
  4747. imulq $19, %rdx, %rax
  4748. andq %rcx, %r11
  4749. addq %rax, %r8
  4750. adcq $0x00, %r9
  4751. adcq $0x00, %r10
  4752. adcq $0x00, %r11
  4753. # Reduce if top bit set
  4754. movq %r11, %rdx
  4755. sarq $63, %rdx
  4756. andq $19, %rdx
  4757. andq %rcx, %r11
  4758. addq %rdx, %r8
  4759. adcq $0x00, %r9
  4760. adcq $0x00, %r10
  4761. adcq $0x00, %r11
  4762. # Store
  4763. movq %r8, (%rdi)
  4764. movq %r9, 8(%rdi)
  4765. movq %r10, 16(%rdi)
  4766. movq %r11, 24(%rdi)
  4767. movq (%rsp), %rdi
  4768. movq 24(%rsp), %rsi
  4769. movq 32(%rsp), %rbx
  4770. # Multiply
  4771. # A[0] * B[0]
  4772. movq (%rbx), %rax
  4773. mulq (%rsi)
  4774. movq %rax, %r8
  4775. movq %rdx, %r9
  4776. # A[0] * B[1]
  4777. movq 8(%rbx), %rax
  4778. mulq (%rsi)
  4779. xorq %r10, %r10
  4780. addq %rax, %r9
  4781. adcq %rdx, %r10
  4782. # A[1] * B[0]
  4783. movq (%rbx), %rax
  4784. mulq 8(%rsi)
  4785. xorq %r11, %r11
  4786. addq %rax, %r9
  4787. adcq %rdx, %r10
  4788. adcq $0x00, %r11
  4789. # A[0] * B[2]
  4790. movq 16(%rbx), %rax
  4791. mulq (%rsi)
  4792. addq %rax, %r10
  4793. adcq %rdx, %r11
  4794. # A[1] * B[1]
  4795. movq 8(%rbx), %rax
  4796. mulq 8(%rsi)
  4797. xorq %r12, %r12
  4798. addq %rax, %r10
  4799. adcq %rdx, %r11
  4800. adcq $0x00, %r12
  4801. # A[2] * B[0]
  4802. movq (%rbx), %rax
  4803. mulq 16(%rsi)
  4804. addq %rax, %r10
  4805. adcq %rdx, %r11
  4806. adcq $0x00, %r12
  4807. # A[0] * B[3]
  4808. movq 24(%rbx), %rax
  4809. mulq (%rsi)
  4810. xorq %r13, %r13
  4811. addq %rax, %r11
  4812. adcq %rdx, %r12
  4813. adcq $0x00, %r13
  4814. # A[1] * B[2]
  4815. movq 16(%rbx), %rax
  4816. mulq 8(%rsi)
  4817. addq %rax, %r11
  4818. adcq %rdx, %r12
  4819. adcq $0x00, %r13
  4820. # A[2] * B[1]
  4821. movq 8(%rbx), %rax
  4822. mulq 16(%rsi)
  4823. addq %rax, %r11
  4824. adcq %rdx, %r12
  4825. adcq $0x00, %r13
  4826. # A[3] * B[0]
  4827. movq (%rbx), %rax
  4828. mulq 24(%rsi)
  4829. addq %rax, %r11
  4830. adcq %rdx, %r12
  4831. adcq $0x00, %r13
  4832. # A[1] * B[3]
  4833. movq 24(%rbx), %rax
  4834. mulq 8(%rsi)
  4835. xorq %r14, %r14
  4836. addq %rax, %r12
  4837. adcq %rdx, %r13
  4838. adcq $0x00, %r14
  4839. # A[2] * B[2]
  4840. movq 16(%rbx), %rax
  4841. mulq 16(%rsi)
  4842. addq %rax, %r12
  4843. adcq %rdx, %r13
  4844. adcq $0x00, %r14
  4845. # A[3] * B[1]
  4846. movq 8(%rbx), %rax
  4847. mulq 24(%rsi)
  4848. addq %rax, %r12
  4849. adcq %rdx, %r13
  4850. adcq $0x00, %r14
  4851. # A[2] * B[3]
  4852. movq 24(%rbx), %rax
  4853. mulq 16(%rsi)
  4854. xorq %r15, %r15
  4855. addq %rax, %r13
  4856. adcq %rdx, %r14
  4857. adcq $0x00, %r15
  4858. # A[3] * B[2]
  4859. movq 16(%rbx), %rax
  4860. mulq 24(%rsi)
  4861. addq %rax, %r13
  4862. adcq %rdx, %r14
  4863. adcq $0x00, %r15
  4864. # A[3] * B[3]
  4865. movq 24(%rbx), %rax
  4866. mulq 24(%rsi)
  4867. addq %rax, %r14
  4868. adcq %rdx, %r15
  4869. # Reduce
  4870. movq $0x7fffffffffffffff, %rcx
  4871. # Move top half into t4-t7 and remove top bit from t3
  4872. shldq $0x01, %r14, %r15
  4873. shldq $0x01, %r13, %r14
  4874. shldq $0x01, %r12, %r13
  4875. shldq $0x01, %r11, %r12
  4876. andq %rcx, %r11
  4877. # Multiply top half by 19
  4878. movq $19, %rax
  4879. mulq %r12
  4880. xorq %r12, %r12
  4881. addq %rax, %r8
  4882. movq $19, %rax
  4883. adcq %rdx, %r12
  4884. mulq %r13
  4885. xorq %r13, %r13
  4886. addq %rax, %r9
  4887. movq $19, %rax
  4888. adcq %rdx, %r13
  4889. mulq %r14
  4890. xorq %r14, %r14
  4891. addq %rax, %r10
  4892. movq $19, %rax
  4893. adcq %rdx, %r14
  4894. mulq %r15
  4895. # Add remaining product results in
  4896. addq %r12, %r9
  4897. adcq %r13, %r10
  4898. adcq %r14, %r11
  4899. adcq %rax, %r11
  4900. adcq $0x00, %rdx
  4901. # Overflow
  4902. shldq $0x01, %r11, %rdx
  4903. imulq $19, %rdx, %rax
  4904. andq %rcx, %r11
  4905. addq %rax, %r8
  4906. adcq $0x00, %r9
  4907. adcq $0x00, %r10
  4908. adcq $0x00, %r11
  4909. # Reduce if top bit set
  4910. movq %r11, %rdx
  4911. sarq $63, %rdx
  4912. andq $19, %rdx
  4913. andq %rcx, %r11
  4914. addq %rdx, %r8
  4915. adcq $0x00, %r9
  4916. adcq $0x00, %r10
  4917. adcq $0x00, %r11
  4918. # Store
  4919. movq %r8, (%rdi)
  4920. movq %r9, 8(%rdi)
  4921. movq %r10, 16(%rdi)
  4922. movq %r11, 24(%rdi)
  4923. movq 8(%rsp), %rdi
  4924. movq 32(%rsp), %rsi
  4925. movq 88(%rsp), %rbx
  4926. # Multiply
  4927. # A[0] * B[0]
  4928. movq (%rbx), %rax
  4929. mulq (%rsi)
  4930. movq %rax, %r8
  4931. movq %rdx, %r9
  4932. # A[0] * B[1]
  4933. movq 8(%rbx), %rax
  4934. mulq (%rsi)
  4935. xorq %r10, %r10
  4936. addq %rax, %r9
  4937. adcq %rdx, %r10
  4938. # A[1] * B[0]
  4939. movq (%rbx), %rax
  4940. mulq 8(%rsi)
  4941. xorq %r11, %r11
  4942. addq %rax, %r9
  4943. adcq %rdx, %r10
  4944. adcq $0x00, %r11
  4945. # A[0] * B[2]
  4946. movq 16(%rbx), %rax
  4947. mulq (%rsi)
  4948. addq %rax, %r10
  4949. adcq %rdx, %r11
  4950. # A[1] * B[1]
  4951. movq 8(%rbx), %rax
  4952. mulq 8(%rsi)
  4953. xorq %r12, %r12
  4954. addq %rax, %r10
  4955. adcq %rdx, %r11
  4956. adcq $0x00, %r12
  4957. # A[2] * B[0]
  4958. movq (%rbx), %rax
  4959. mulq 16(%rsi)
  4960. addq %rax, %r10
  4961. adcq %rdx, %r11
  4962. adcq $0x00, %r12
  4963. # A[0] * B[3]
  4964. movq 24(%rbx), %rax
  4965. mulq (%rsi)
  4966. xorq %r13, %r13
  4967. addq %rax, %r11
  4968. adcq %rdx, %r12
  4969. adcq $0x00, %r13
  4970. # A[1] * B[2]
  4971. movq 16(%rbx), %rax
  4972. mulq 8(%rsi)
  4973. addq %rax, %r11
  4974. adcq %rdx, %r12
  4975. adcq $0x00, %r13
  4976. # A[2] * B[1]
  4977. movq 8(%rbx), %rax
  4978. mulq 16(%rsi)
  4979. addq %rax, %r11
  4980. adcq %rdx, %r12
  4981. adcq $0x00, %r13
  4982. # A[3] * B[0]
  4983. movq (%rbx), %rax
  4984. mulq 24(%rsi)
  4985. addq %rax, %r11
  4986. adcq %rdx, %r12
  4987. adcq $0x00, %r13
  4988. # A[1] * B[3]
  4989. movq 24(%rbx), %rax
  4990. mulq 8(%rsi)
  4991. xorq %r14, %r14
  4992. addq %rax, %r12
  4993. adcq %rdx, %r13
  4994. adcq $0x00, %r14
  4995. # A[2] * B[2]
  4996. movq 16(%rbx), %rax
  4997. mulq 16(%rsi)
  4998. addq %rax, %r12
  4999. adcq %rdx, %r13
  5000. adcq $0x00, %r14
  5001. # A[3] * B[1]
  5002. movq 8(%rbx), %rax
  5003. mulq 24(%rsi)
  5004. addq %rax, %r12
  5005. adcq %rdx, %r13
  5006. adcq $0x00, %r14
  5007. # A[2] * B[3]
  5008. movq 24(%rbx), %rax
  5009. mulq 16(%rsi)
  5010. xorq %r15, %r15
  5011. addq %rax, %r13
  5012. adcq %rdx, %r14
  5013. adcq $0x00, %r15
  5014. # A[3] * B[2]
  5015. movq 16(%rbx), %rax
  5016. mulq 24(%rsi)
  5017. addq %rax, %r13
  5018. adcq %rdx, %r14
  5019. adcq $0x00, %r15
  5020. # A[3] * B[3]
  5021. movq 24(%rbx), %rax
  5022. mulq 24(%rsi)
  5023. addq %rax, %r14
  5024. adcq %rdx, %r15
  5025. # Reduce
  5026. movq $0x7fffffffffffffff, %rcx
  5027. # Move top half into t4-t7 and remove top bit from t3
  5028. shldq $0x01, %r14, %r15
  5029. shldq $0x01, %r13, %r14
  5030. shldq $0x01, %r12, %r13
  5031. shldq $0x01, %r11, %r12
  5032. andq %rcx, %r11
  5033. # Multiply top half by 19
  5034. movq $19, %rax
  5035. mulq %r12
  5036. xorq %r12, %r12
  5037. addq %rax, %r8
  5038. movq $19, %rax
  5039. adcq %rdx, %r12
  5040. mulq %r13
  5041. xorq %r13, %r13
  5042. addq %rax, %r9
  5043. movq $19, %rax
  5044. adcq %rdx, %r13
  5045. mulq %r14
  5046. xorq %r14, %r14
  5047. addq %rax, %r10
  5048. movq $19, %rax
  5049. adcq %rdx, %r14
  5050. mulq %r15
  5051. # Add remaining product results in
  5052. addq %r12, %r9
  5053. adcq %r13, %r10
  5054. adcq %r14, %r11
  5055. adcq %rax, %r11
  5056. adcq $0x00, %rdx
  5057. # Overflow
  5058. shldq $0x01, %r11, %rdx
  5059. imulq $19, %rdx, %rax
  5060. andq %rcx, %r11
  5061. addq %rax, %r8
  5062. adcq $0x00, %r9
  5063. adcq $0x00, %r10
  5064. adcq $0x00, %r11
  5065. # Reduce if top bit set
  5066. movq %r11, %rdx
  5067. sarq $63, %rdx
  5068. andq $19, %rdx
  5069. andq %rcx, %r11
  5070. addq %rdx, %r8
  5071. adcq $0x00, %r9
  5072. adcq $0x00, %r10
  5073. adcq $0x00, %r11
  5074. # Store
  5075. movq %r8, (%rdi)
  5076. movq %r9, 8(%rdi)
  5077. movq %r10, 16(%rdi)
  5078. movq %r11, 24(%rdi)
  5079. addq $40, %rsp
  5080. popq %r15
  5081. popq %r14
  5082. popq %r13
  5083. popq %r12
  5084. popq %rbx
  5085. repz retq
  5086. #ifndef __APPLE__
  5087. .size fe_ge_to_p2_x64,.-fe_ge_to_p2_x64
  5088. #endif /* __APPLE__ */
  5089. #ifndef __APPLE__
  5090. .text
  5091. .globl fe_ge_to_p3_x64
  5092. .type fe_ge_to_p3_x64,@function
  5093. .align 16
  5094. fe_ge_to_p3_x64:
  5095. #else
  5096. .section __TEXT,__text
  5097. .globl _fe_ge_to_p3_x64
  5098. .p2align 4
  5099. _fe_ge_to_p3_x64:
  5100. #endif /* __APPLE__ */
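# Convert a "completed" (p1p1) group element to the extended (p3)
# representation.  The four inlined multiply/reduce blocks compute, in
# ref10 terms:
#   X3 = X1 * T1,  Y3 = Y1 * Z1,  Z3 = Z1 * T1,  T3 = X1 * Y1
# with destinations in rdi/rsi/rdx/rcx, the source X, Y in r8/r9 and the
# source Z, T in the two stack arguments.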
  5101. pushq %rbx
  5102. pushq %r12
  5103. pushq %r13
  5104. pushq %r14
  5105. pushq %r15
  5106. subq $40, %rsp
  5107. movq %rsi, (%rsp)
  5108. movq %rdx, 8(%rsp)
  5109. movq %rcx, 16(%rsp)
  5110. movq %r8, 24(%rsp)
  5111. movq %r9, 32(%rsp)
  5112. movq 24(%rsp), %rsi
  5113. movq 96(%rsp), %rbx
  5114. # Multiply
  5115. # A[0] * B[0]
  5116. movq (%rbx), %rax
  5117. mulq (%rsi)
  5118. movq %rax, %r8
  5119. movq %rdx, %r9
  5120. # A[0] * B[1]
  5121. movq 8(%rbx), %rax
  5122. mulq (%rsi)
  5123. xorq %r10, %r10
  5124. addq %rax, %r9
  5125. adcq %rdx, %r10
  5126. # A[1] * B[0]
  5127. movq (%rbx), %rax
  5128. mulq 8(%rsi)
  5129. xorq %r11, %r11
  5130. addq %rax, %r9
  5131. adcq %rdx, %r10
  5132. adcq $0x00, %r11
  5133. # A[0] * B[2]
  5134. movq 16(%rbx), %rax
  5135. mulq (%rsi)
  5136. addq %rax, %r10
  5137. adcq %rdx, %r11
  5138. # A[1] * B[1]
  5139. movq 8(%rbx), %rax
  5140. mulq 8(%rsi)
  5141. xorq %r12, %r12
  5142. addq %rax, %r10
  5143. adcq %rdx, %r11
  5144. adcq $0x00, %r12
  5145. # A[2] * B[0]
  5146. movq (%rbx), %rax
  5147. mulq 16(%rsi)
  5148. addq %rax, %r10
  5149. adcq %rdx, %r11
  5150. adcq $0x00, %r12
  5151. # A[0] * B[3]
  5152. movq 24(%rbx), %rax
  5153. mulq (%rsi)
  5154. xorq %r13, %r13
  5155. addq %rax, %r11
  5156. adcq %rdx, %r12
  5157. adcq $0x00, %r13
  5158. # A[1] * B[2]
  5159. movq 16(%rbx), %rax
  5160. mulq 8(%rsi)
  5161. addq %rax, %r11
  5162. adcq %rdx, %r12
  5163. adcq $0x00, %r13
  5164. # A[2] * B[1]
  5165. movq 8(%rbx), %rax
  5166. mulq 16(%rsi)
  5167. addq %rax, %r11
  5168. adcq %rdx, %r12
  5169. adcq $0x00, %r13
  5170. # A[3] * B[0]
  5171. movq (%rbx), %rax
  5172. mulq 24(%rsi)
  5173. addq %rax, %r11
  5174. adcq %rdx, %r12
  5175. adcq $0x00, %r13
  5176. # A[1] * B[3]
  5177. movq 24(%rbx), %rax
  5178. mulq 8(%rsi)
  5179. xorq %r14, %r14
  5180. addq %rax, %r12
  5181. adcq %rdx, %r13
  5182. adcq $0x00, %r14
  5183. # A[2] * B[2]
  5184. movq 16(%rbx), %rax
  5185. mulq 16(%rsi)
  5186. addq %rax, %r12
  5187. adcq %rdx, %r13
  5188. adcq $0x00, %r14
  5189. # A[3] * B[1]
  5190. movq 8(%rbx), %rax
  5191. mulq 24(%rsi)
  5192. addq %rax, %r12
  5193. adcq %rdx, %r13
  5194. adcq $0x00, %r14
  5195. # A[2] * B[3]
  5196. movq 24(%rbx), %rax
  5197. mulq 16(%rsi)
  5198. xorq %r15, %r15
  5199. addq %rax, %r13
  5200. adcq %rdx, %r14
  5201. adcq $0x00, %r15
  5202. # A[3] * B[2]
  5203. movq 16(%rbx), %rax
  5204. mulq 24(%rsi)
  5205. addq %rax, %r13
  5206. adcq %rdx, %r14
  5207. adcq $0x00, %r15
  5208. # A[3] * B[3]
  5209. movq 24(%rbx), %rax
  5210. mulq 24(%rsi)
  5211. addq %rax, %r14
  5212. adcq %rdx, %r15
  5213. # Reduce
  5214. movq $0x7fffffffffffffff, %rcx
  5215. # Move top half into t4-t7 and remove top bit from t3
  5216. shldq $0x01, %r14, %r15
  5217. shldq $0x01, %r13, %r14
  5218. shldq $0x01, %r12, %r13
  5219. shldq $0x01, %r11, %r12
  5220. andq %rcx, %r11
  5221. # Multiply top half by 19
  5222. movq $19, %rax
  5223. mulq %r12
  5224. xorq %r12, %r12
  5225. addq %rax, %r8
  5226. movq $19, %rax
  5227. adcq %rdx, %r12
  5228. mulq %r13
  5229. xorq %r13, %r13
  5230. addq %rax, %r9
  5231. movq $19, %rax
  5232. adcq %rdx, %r13
  5233. mulq %r14
  5234. xorq %r14, %r14
  5235. addq %rax, %r10
  5236. movq $19, %rax
  5237. adcq %rdx, %r14
  5238. mulq %r15
  5239. # Add remaining product results in
  5240. addq %r12, %r9
  5241. adcq %r13, %r10
  5242. adcq %r14, %r11
  5243. adcq %rax, %r11
  5244. adcq $0x00, %rdx
  5245. # Overflow
  5246. shldq $0x01, %r11, %rdx
  5247. imulq $19, %rdx, %rax
  5248. andq %rcx, %r11
  5249. addq %rax, %r8
  5250. adcq $0x00, %r9
  5251. adcq $0x00, %r10
  5252. adcq $0x00, %r11
  5253. # Reduce if top bit set
  5254. movq %r11, %rdx
  5255. sarq $63, %rdx
  5256. andq $19, %rdx
  5257. andq %rcx, %r11
  5258. addq %rdx, %r8
  5259. adcq $0x00, %r9
  5260. adcq $0x00, %r10
  5261. adcq $0x00, %r11
  5262. # Store
  5263. movq %r8, (%rdi)
  5264. movq %r9, 8(%rdi)
  5265. movq %r10, 16(%rdi)
  5266. movq %r11, 24(%rdi)
  5267. movq (%rsp), %rdi
  5268. movq 32(%rsp), %rsi
  5269. movq 88(%rsp), %rbx
  5270. # Multiply
  5271. # A[0] * B[0]
  5272. movq (%rbx), %rax
  5273. mulq (%rsi)
  5274. movq %rax, %r8
  5275. movq %rdx, %r9
  5276. # A[0] * B[1]
  5277. movq 8(%rbx), %rax
  5278. mulq (%rsi)
  5279. xorq %r10, %r10
  5280. addq %rax, %r9
  5281. adcq %rdx, %r10
  5282. # A[1] * B[0]
  5283. movq (%rbx), %rax
  5284. mulq 8(%rsi)
  5285. xorq %r11, %r11
  5286. addq %rax, %r9
  5287. adcq %rdx, %r10
  5288. adcq $0x00, %r11
  5289. # A[0] * B[2]
  5290. movq 16(%rbx), %rax
  5291. mulq (%rsi)
  5292. addq %rax, %r10
  5293. adcq %rdx, %r11
  5294. # A[1] * B[1]
  5295. movq 8(%rbx), %rax
  5296. mulq 8(%rsi)
  5297. xorq %r12, %r12
  5298. addq %rax, %r10
  5299. adcq %rdx, %r11
  5300. adcq $0x00, %r12
  5301. # A[2] * B[0]
  5302. movq (%rbx), %rax
  5303. mulq 16(%rsi)
  5304. addq %rax, %r10
  5305. adcq %rdx, %r11
  5306. adcq $0x00, %r12
  5307. # A[0] * B[3]
  5308. movq 24(%rbx), %rax
  5309. mulq (%rsi)
  5310. xorq %r13, %r13
  5311. addq %rax, %r11
  5312. adcq %rdx, %r12
  5313. adcq $0x00, %r13
  5314. # A[1] * B[2]
  5315. movq 16(%rbx), %rax
  5316. mulq 8(%rsi)
  5317. addq %rax, %r11
  5318. adcq %rdx, %r12
  5319. adcq $0x00, %r13
  5320. # A[2] * B[1]
  5321. movq 8(%rbx), %rax
  5322. mulq 16(%rsi)
  5323. addq %rax, %r11
  5324. adcq %rdx, %r12
  5325. adcq $0x00, %r13
  5326. # A[3] * B[0]
  5327. movq (%rbx), %rax
  5328. mulq 24(%rsi)
  5329. addq %rax, %r11
  5330. adcq %rdx, %r12
  5331. adcq $0x00, %r13
  5332. # A[1] * B[3]
  5333. movq 24(%rbx), %rax
  5334. mulq 8(%rsi)
  5335. xorq %r14, %r14
  5336. addq %rax, %r12
  5337. adcq %rdx, %r13
  5338. adcq $0x00, %r14
  5339. # A[2] * B[2]
  5340. movq 16(%rbx), %rax
  5341. mulq 16(%rsi)
  5342. addq %rax, %r12
  5343. adcq %rdx, %r13
  5344. adcq $0x00, %r14
  5345. # A[3] * B[1]
  5346. movq 8(%rbx), %rax
  5347. mulq 24(%rsi)
  5348. addq %rax, %r12
  5349. adcq %rdx, %r13
  5350. adcq $0x00, %r14
  5351. # A[2] * B[3]
  5352. movq 24(%rbx), %rax
  5353. mulq 16(%rsi)
  5354. xorq %r15, %r15
  5355. addq %rax, %r13
  5356. adcq %rdx, %r14
  5357. adcq $0x00, %r15
  5358. # A[3] * B[2]
  5359. movq 16(%rbx), %rax
  5360. mulq 24(%rsi)
  5361. addq %rax, %r13
  5362. adcq %rdx, %r14
  5363. adcq $0x00, %r15
  5364. # A[3] * B[3]
  5365. movq 24(%rbx), %rax
  5366. mulq 24(%rsi)
  5367. addq %rax, %r14
  5368. adcq %rdx, %r15
  5369. # Reduce
  5370. movq $0x7fffffffffffffff, %rcx
  5371. # Move top half into t4-t7 and remove top bit from t3
  5372. shldq $0x01, %r14, %r15
  5373. shldq $0x01, %r13, %r14
  5374. shldq $0x01, %r12, %r13
  5375. shldq $0x01, %r11, %r12
  5376. andq %rcx, %r11
  5377. # Multiply top half by 19
  5378. movq $19, %rax
  5379. mulq %r12
  5380. xorq %r12, %r12
  5381. addq %rax, %r8
  5382. movq $19, %rax
  5383. adcq %rdx, %r12
  5384. mulq %r13
  5385. xorq %r13, %r13
  5386. addq %rax, %r9
  5387. movq $19, %rax
  5388. adcq %rdx, %r13
  5389. mulq %r14
  5390. xorq %r14, %r14
  5391. addq %rax, %r10
  5392. movq $19, %rax
  5393. adcq %rdx, %r14
  5394. mulq %r15
  5395. # Add remaining product results in
  5396. addq %r12, %r9
  5397. adcq %r13, %r10
  5398. adcq %r14, %r11
  5399. adcq %rax, %r11
  5400. adcq $0x00, %rdx
  5401. # Overflow
  5402. shldq $0x01, %r11, %rdx
  5403. imulq $19, %rdx, %rax
  5404. andq %rcx, %r11
  5405. addq %rax, %r8
  5406. adcq $0x00, %r9
  5407. adcq $0x00, %r10
  5408. adcq $0x00, %r11
  5409. # Reduce if top bit set
  5410. movq %r11, %rdx
  5411. sarq $63, %rdx
  5412. andq $19, %rdx
  5413. andq %rcx, %r11
  5414. addq %rdx, %r8
  5415. adcq $0x00, %r9
  5416. adcq $0x00, %r10
  5417. adcq $0x00, %r11
  5418. # Store
  5419. movq %r8, (%rdi)
  5420. movq %r9, 8(%rdi)
  5421. movq %r10, 16(%rdi)
  5422. movq %r11, 24(%rdi)
  5423. movq 8(%rsp), %rdi
  5424. movq 88(%rsp), %rsi
  5425. movq 96(%rsp), %rbx
  5426. # Multiply
  5427. # A[0] * B[0]
  5428. movq (%rbx), %rax
  5429. mulq (%rsi)
  5430. movq %rax, %r8
  5431. movq %rdx, %r9
  5432. # A[0] * B[1]
  5433. movq 8(%rbx), %rax
  5434. mulq (%rsi)
  5435. xorq %r10, %r10
  5436. addq %rax, %r9
  5437. adcq %rdx, %r10
  5438. # A[1] * B[0]
  5439. movq (%rbx), %rax
  5440. mulq 8(%rsi)
  5441. xorq %r11, %r11
  5442. addq %rax, %r9
  5443. adcq %rdx, %r10
  5444. adcq $0x00, %r11
  5445. # A[0] * B[2]
  5446. movq 16(%rbx), %rax
  5447. mulq (%rsi)
  5448. addq %rax, %r10
  5449. adcq %rdx, %r11
  5450. # A[1] * B[1]
  5451. movq 8(%rbx), %rax
  5452. mulq 8(%rsi)
  5453. xorq %r12, %r12
  5454. addq %rax, %r10
  5455. adcq %rdx, %r11
  5456. adcq $0x00, %r12
  5457. # A[2] * B[0]
  5458. movq (%rbx), %rax
  5459. mulq 16(%rsi)
  5460. addq %rax, %r10
  5461. adcq %rdx, %r11
  5462. adcq $0x00, %r12
  5463. # A[0] * B[3]
  5464. movq 24(%rbx), %rax
  5465. mulq (%rsi)
  5466. xorq %r13, %r13
  5467. addq %rax, %r11
  5468. adcq %rdx, %r12
  5469. adcq $0x00, %r13
  5470. # A[1] * B[2]
  5471. movq 16(%rbx), %rax
  5472. mulq 8(%rsi)
  5473. addq %rax, %r11
  5474. adcq %rdx, %r12
  5475. adcq $0x00, %r13
  5476. # A[2] * B[1]
  5477. movq 8(%rbx), %rax
  5478. mulq 16(%rsi)
  5479. addq %rax, %r11
  5480. adcq %rdx, %r12
  5481. adcq $0x00, %r13
  5482. # A[3] * B[0]
  5483. movq (%rbx), %rax
  5484. mulq 24(%rsi)
  5485. addq %rax, %r11
  5486. adcq %rdx, %r12
  5487. adcq $0x00, %r13
  5488. # A[1] * B[3]
  5489. movq 24(%rbx), %rax
  5490. mulq 8(%rsi)
  5491. xorq %r14, %r14
  5492. addq %rax, %r12
  5493. adcq %rdx, %r13
  5494. adcq $0x00, %r14
  5495. # A[2] * B[2]
  5496. movq 16(%rbx), %rax
  5497. mulq 16(%rsi)
  5498. addq %rax, %r12
  5499. adcq %rdx, %r13
  5500. adcq $0x00, %r14
  5501. # A[3] * B[1]
  5502. movq 8(%rbx), %rax
  5503. mulq 24(%rsi)
  5504. addq %rax, %r12
  5505. adcq %rdx, %r13
  5506. adcq $0x00, %r14
  5507. # A[2] * B[3]
  5508. movq 24(%rbx), %rax
  5509. mulq 16(%rsi)
  5510. xorq %r15, %r15
  5511. addq %rax, %r13
  5512. adcq %rdx, %r14
  5513. adcq $0x00, %r15
  5514. # A[3] * B[2]
  5515. movq 16(%rbx), %rax
  5516. mulq 24(%rsi)
  5517. addq %rax, %r13
  5518. adcq %rdx, %r14
  5519. adcq $0x00, %r15
  5520. # A[3] * B[3]
  5521. movq 24(%rbx), %rax
  5522. mulq 24(%rsi)
  5523. addq %rax, %r14
  5524. adcq %rdx, %r15
  5525. # Reduce
  5526. movq $0x7fffffffffffffff, %rcx
  5527. # Move top half into t4-t7 and remove top bit from t3
  5528. shldq $0x01, %r14, %r15
  5529. shldq $0x01, %r13, %r14
  5530. shldq $0x01, %r12, %r13
  5531. shldq $0x01, %r11, %r12
  5532. andq %rcx, %r11
  5533. # Multiply top half by 19
  5534. movq $19, %rax
  5535. mulq %r12
  5536. xorq %r12, %r12
  5537. addq %rax, %r8
  5538. movq $19, %rax
  5539. adcq %rdx, %r12
  5540. mulq %r13
  5541. xorq %r13, %r13
  5542. addq %rax, %r9
  5543. movq $19, %rax
  5544. adcq %rdx, %r13
  5545. mulq %r14
  5546. xorq %r14, %r14
  5547. addq %rax, %r10
  5548. movq $19, %rax
  5549. adcq %rdx, %r14
  5550. mulq %r15
  5551. # Add remaining product results in
  5552. addq %r12, %r9
  5553. adcq %r13, %r10
  5554. adcq %r14, %r11
  5555. adcq %rax, %r11
  5556. adcq $0x00, %rdx
  5557. # Overflow
  5558. shldq $0x01, %r11, %rdx
  5559. imulq $19, %rdx, %rax
  5560. andq %rcx, %r11
  5561. addq %rax, %r8
  5562. adcq $0x00, %r9
  5563. adcq $0x00, %r10
  5564. adcq $0x00, %r11
  5565. # Reduce if top bit set
  5566. movq %r11, %rdx
  5567. sarq $63, %rdx
  5568. andq $19, %rdx
  5569. andq %rcx, %r11
  5570. addq %rdx, %r8
  5571. adcq $0x00, %r9
  5572. adcq $0x00, %r10
  5573. adcq $0x00, %r11
  5574. # Store
  5575. movq %r8, (%rdi)
  5576. movq %r9, 8(%rdi)
  5577. movq %r10, 16(%rdi)
  5578. movq %r11, 24(%rdi)
  5579. movq 16(%rsp), %rdi
  5580. movq 24(%rsp), %rsi
  5581. movq 32(%rsp), %rbx
  5582. # Multiply
  5583. # A[0] * B[0]
  5584. movq (%rbx), %rax
  5585. mulq (%rsi)
  5586. movq %rax, %r8
  5587. movq %rdx, %r9
  5588. # A[0] * B[1]
  5589. movq 8(%rbx), %rax
  5590. mulq (%rsi)
  5591. xorq %r10, %r10
  5592. addq %rax, %r9
  5593. adcq %rdx, %r10
  5594. # A[1] * B[0]
  5595. movq (%rbx), %rax
  5596. mulq 8(%rsi)
  5597. xorq %r11, %r11
  5598. addq %rax, %r9
  5599. adcq %rdx, %r10
  5600. adcq $0x00, %r11
  5601. # A[0] * B[2]
  5602. movq 16(%rbx), %rax
  5603. mulq (%rsi)
  5604. addq %rax, %r10
  5605. adcq %rdx, %r11
  5606. # A[1] * B[1]
  5607. movq 8(%rbx), %rax
  5608. mulq 8(%rsi)
  5609. xorq %r12, %r12
  5610. addq %rax, %r10
  5611. adcq %rdx, %r11
  5612. adcq $0x00, %r12
  5613. # A[2] * B[0]
  5614. movq (%rbx), %rax
  5615. mulq 16(%rsi)
  5616. addq %rax, %r10
  5617. adcq %rdx, %r11
  5618. adcq $0x00, %r12
  5619. # A[0] * B[3]
  5620. movq 24(%rbx), %rax
  5621. mulq (%rsi)
  5622. xorq %r13, %r13
  5623. addq %rax, %r11
  5624. adcq %rdx, %r12
  5625. adcq $0x00, %r13
  5626. # A[1] * B[2]
  5627. movq 16(%rbx), %rax
  5628. mulq 8(%rsi)
  5629. addq %rax, %r11
  5630. adcq %rdx, %r12
  5631. adcq $0x00, %r13
  5632. # A[2] * B[1]
  5633. movq 8(%rbx), %rax
  5634. mulq 16(%rsi)
  5635. addq %rax, %r11
  5636. adcq %rdx, %r12
  5637. adcq $0x00, %r13
  5638. # A[3] * B[0]
  5639. movq (%rbx), %rax
  5640. mulq 24(%rsi)
  5641. addq %rax, %r11
  5642. adcq %rdx, %r12
  5643. adcq $0x00, %r13
  5644. # A[1] * B[3]
  5645. movq 24(%rbx), %rax
  5646. mulq 8(%rsi)
  5647. xorq %r14, %r14
  5648. addq %rax, %r12
  5649. adcq %rdx, %r13
  5650. adcq $0x00, %r14
  5651. # A[2] * B[2]
  5652. movq 16(%rbx), %rax
  5653. mulq 16(%rsi)
  5654. addq %rax, %r12
  5655. adcq %rdx, %r13
  5656. adcq $0x00, %r14
  5657. # A[3] * B[1]
  5658. movq 8(%rbx), %rax
  5659. mulq 24(%rsi)
  5660. addq %rax, %r12
  5661. adcq %rdx, %r13
  5662. adcq $0x00, %r14
  5663. # A[2] * B[3]
  5664. movq 24(%rbx), %rax
  5665. mulq 16(%rsi)
  5666. xorq %r15, %r15
  5667. addq %rax, %r13
  5668. adcq %rdx, %r14
  5669. adcq $0x00, %r15
  5670. # A[3] * B[2]
  5671. movq 16(%rbx), %rax
  5672. mulq 24(%rsi)
  5673. addq %rax, %r13
  5674. adcq %rdx, %r14
  5675. adcq $0x00, %r15
  5676. # A[3] * B[3]
  5677. movq 24(%rbx), %rax
  5678. mulq 24(%rsi)
  5679. addq %rax, %r14
  5680. adcq %rdx, %r15
  5681. # Reduce
  5682. movq $0x7fffffffffffffff, %rcx
  5683. # Move top half into t4-t7 and remove top bit from t3
  5684. shldq $0x01, %r14, %r15
  5685. shldq $0x01, %r13, %r14
  5686. shldq $0x01, %r12, %r13
  5687. shldq $0x01, %r11, %r12
  5688. andq %rcx, %r11
  5689. # Multiply top half by 19
  5690. movq $19, %rax
  5691. mulq %r12
  5692. xorq %r12, %r12
  5693. addq %rax, %r8
  5694. movq $19, %rax
  5695. adcq %rdx, %r12
  5696. mulq %r13
  5697. xorq %r13, %r13
  5698. addq %rax, %r9
  5699. movq $19, %rax
  5700. adcq %rdx, %r13
  5701. mulq %r14
  5702. xorq %r14, %r14
  5703. addq %rax, %r10
  5704. movq $19, %rax
  5705. adcq %rdx, %r14
  5706. mulq %r15
  5707. # Add remaining product results in
  5708. addq %r12, %r9
  5709. adcq %r13, %r10
  5710. adcq %r14, %r11
  5711. adcq %rax, %r11
  5712. adcq $0x00, %rdx
  5713. # Overflow
  5714. shldq $0x01, %r11, %rdx
  5715. imulq $19, %rdx, %rax
  5716. andq %rcx, %r11
  5717. addq %rax, %r8
  5718. adcq $0x00, %r9
  5719. adcq $0x00, %r10
  5720. adcq $0x00, %r11
  5721. # Reduce if top bit set
  5722. movq %r11, %rdx
  5723. sarq $63, %rdx
  5724. andq $19, %rdx
  5725. andq %rcx, %r11
  5726. addq %rdx, %r8
  5727. adcq $0x00, %r9
  5728. adcq $0x00, %r10
  5729. adcq $0x00, %r11
  5730. # Store
  5731. movq %r8, (%rdi)
  5732. movq %r9, 8(%rdi)
  5733. movq %r10, 16(%rdi)
  5734. movq %r11, 24(%rdi)
  5735. addq $40, %rsp
  5736. popq %r15
  5737. popq %r14
  5738. popq %r13
  5739. popq %r12
  5740. popq %rbx
  5741. repz retq
  5742. #ifndef __APPLE__
  5743. .size fe_ge_to_p3_x64,.-fe_ge_to_p3_x64
  5744. #endif /* __APPLE__ */
  5745. #ifndef __APPLE__
  5746. .text
  5747. .globl fe_ge_dbl_x64
  5748. .type fe_ge_dbl_x64,@function
  5749. .align 16
  5750. fe_ge_dbl_x64:
  5751. #else
  5752. .section __TEXT,__text
  5753. .globl _fe_ge_dbl_x64
  5754. .p2align 4
  5755. _fe_ge_dbl_x64:
  5756. #endif /* __APPLE__ */
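# Doubling of a projective (p2) point, producing the "completed" (p1p1)
# form, following the usual ref10 formulas.  With A = X1^2, B = Y1^2 and
# C = 2*Z1^2:
#   X3 = (X1+Y1)^2 - A - B,  Y3 = B + A,  Z3 = B - A,  T3 = C - Z3
# The two squarings and the squared-and-doubled block below produce A, B
# and C; the add/sub blocks that follow combine them (the square of
# X1+Y1 is kept in a temporary at 48(%rsp)).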
  5757. pushq %rbx
  5758. pushq %r12
  5759. pushq %r13
  5760. pushq %r14
  5761. pushq %r15
  5762. subq $0x50, %rsp
  5763. movq %rdi, (%rsp)
  5764. movq %rsi, 8(%rsp)
  5765. movq %rdx, 16(%rsp)
  5766. movq %rcx, 24(%rsp)
  5767. movq %r8, 32(%rsp)
  5768. movq %r9, 40(%rsp)
  5769. movq (%rsp), %rdi
  5770. movq 32(%rsp), %rsi
  5771. # Square
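# Squaring exploits the symmetry of the schoolbook product: only the
# off-diagonal terms A[i]*A[j] with i < j are computed, the partial sum
# is doubled, and the diagonal squares A[i]^2 are then added in.
# Sketch (illustrative only):
#   cross  = sum_{i<j} a[i]*a[j] * 2^(64*(i+j))
#   square = 2*cross + sum_i a[i]^2 * 2^(128*i)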
  5772. # A[0] * A[1]
  5773. movq (%rsi), %rax
  5774. mulq 8(%rsi)
  5775. movq %rax, %r9
  5776. movq %rdx, %r10
  5777. # A[0] * A[2]
  5778. movq (%rsi), %rax
  5779. mulq 16(%rsi)
  5780. xorq %r11, %r11
  5781. addq %rax, %r10
  5782. adcq %rdx, %r11
  5783. # A[0] * A[3]
  5784. movq (%rsi), %rax
  5785. mulq 24(%rsi)
  5786. xorq %r12, %r12
  5787. addq %rax, %r11
  5788. adcq %rdx, %r12
  5789. # A[1] * A[2]
  5790. movq 8(%rsi), %rax
  5791. mulq 16(%rsi)
  5792. xorq %r13, %r13
  5793. addq %rax, %r11
  5794. adcq %rdx, %r12
  5795. adcq $0x00, %r13
  5796. # A[1] * A[3]
  5797. movq 8(%rsi), %rax
  5798. mulq 24(%rsi)
  5799. addq %rax, %r12
  5800. adcq %rdx, %r13
  5801. # A[2] * A[3]
  5802. movq 16(%rsi), %rax
  5803. mulq 24(%rsi)
  5804. xorq %r14, %r14
  5805. addq %rax, %r13
  5806. adcq %rdx, %r14
  5807. # Double
  5808. xorq %r15, %r15
  5809. addq %r9, %r9
  5810. adcq %r10, %r10
  5811. adcq %r11, %r11
  5812. adcq %r12, %r12
  5813. adcq %r13, %r13
  5814. adcq %r14, %r14
  5815. adcq $0x00, %r15
  5816. # A[0] * A[0]
  5817. movq (%rsi), %rax
  5818. mulq %rax
  5819. movq %rax, %r8
  5820. movq %rdx, %rcx
  5821. # A[1] * A[1]
  5822. movq 8(%rsi), %rax
  5823. mulq %rax
  5824. addq %rcx, %r9
  5825. adcq %rax, %r10
  5826. adcq $0x00, %rdx
  5827. movq %rdx, %rcx
  5828. # A[2] * A[2]
  5829. movq 16(%rsi), %rax
  5830. mulq %rax
  5831. addq %rcx, %r11
  5832. adcq %rax, %r12
  5833. adcq $0x00, %rdx
  5834. movq %rdx, %rcx
  5835. # A[3] * A[3]
  5836. movq 24(%rsi), %rax
  5837. mulq %rax
  5838. addq %rax, %r14
  5839. adcq %rdx, %r15
  5840. addq %rcx, %r13
  5841. adcq $0x00, %r14
  5842. adcq $0x00, %r15
  5843. # Reduce
  5844. movq $0x7fffffffffffffff, %rcx
  5845. # Move top half into t4-t7 and remove top bit from t3
  5846. shldq $0x01, %r14, %r15
  5847. shldq $0x01, %r13, %r14
  5848. shldq $0x01, %r12, %r13
  5849. shldq $0x01, %r11, %r12
  5850. andq %rcx, %r11
  5851. # Multiply top half by 19
  5852. movq $19, %rax
  5853. mulq %r12
  5854. xorq %r12, %r12
  5855. addq %rax, %r8
  5856. movq $19, %rax
  5857. adcq %rdx, %r12
  5858. mulq %r13
  5859. xorq %r13, %r13
  5860. addq %rax, %r9
  5861. movq $19, %rax
  5862. adcq %rdx, %r13
  5863. mulq %r14
  5864. xorq %r14, %r14
  5865. addq %rax, %r10
  5866. movq $19, %rax
  5867. adcq %rdx, %r14
  5868. mulq %r15
  5869. # Add remaining product results in
  5870. addq %r12, %r9
  5871. adcq %r13, %r10
  5872. adcq %r14, %r11
  5873. adcq %rax, %r11
  5874. adcq $0x00, %rdx
  5875. # Overflow
  5876. shldq $0x01, %r11, %rdx
  5877. imulq $19, %rdx, %rax
  5878. andq %rcx, %r11
  5879. addq %rax, %r8
  5880. adcq $0x00, %r9
  5881. adcq $0x00, %r10
  5882. adcq $0x00, %r11
  5883. # Reduce if top bit set
  5884. movq %r11, %rdx
  5885. sarq $63, %rdx
  5886. andq $19, %rdx
  5887. andq %rcx, %r11
  5888. addq %rdx, %r8
  5889. adcq $0x00, %r9
  5890. adcq $0x00, %r10
  5891. adcq $0x00, %r11
  5892. # Store
  5893. movq %r8, (%rdi)
  5894. movq %r9, 8(%rdi)
  5895. movq %r10, 16(%rdi)
  5896. movq %r11, 24(%rdi)
  5897. movq 16(%rsp), %rdi
  5898. movq 40(%rsp), %rsi
  5899. # Square
  5900. # A[0] * A[1]
  5901. movq (%rsi), %rax
  5902. mulq 8(%rsi)
  5903. movq %rax, %r9
  5904. movq %rdx, %r10
  5905. # A[0] * A[2]
  5906. movq (%rsi), %rax
  5907. mulq 16(%rsi)
  5908. xorq %r11, %r11
  5909. addq %rax, %r10
  5910. adcq %rdx, %r11
  5911. # A[0] * A[3]
  5912. movq (%rsi), %rax
  5913. mulq 24(%rsi)
  5914. xorq %r12, %r12
  5915. addq %rax, %r11
  5916. adcq %rdx, %r12
  5917. # A[1] * A[2]
  5918. movq 8(%rsi), %rax
  5919. mulq 16(%rsi)
  5920. xorq %r13, %r13
  5921. addq %rax, %r11
  5922. adcq %rdx, %r12
  5923. adcq $0x00, %r13
  5924. # A[1] * A[3]
  5925. movq 8(%rsi), %rax
  5926. mulq 24(%rsi)
  5927. addq %rax, %r12
  5928. adcq %rdx, %r13
  5929. # A[2] * A[3]
  5930. movq 16(%rsi), %rax
  5931. mulq 24(%rsi)
  5932. xorq %r14, %r14
  5933. addq %rax, %r13
  5934. adcq %rdx, %r14
  5935. # Double
  5936. xorq %r15, %r15
  5937. addq %r9, %r9
  5938. adcq %r10, %r10
  5939. adcq %r11, %r11
  5940. adcq %r12, %r12
  5941. adcq %r13, %r13
  5942. adcq %r14, %r14
  5943. adcq $0x00, %r15
  5944. # A[0] * A[0]
  5945. movq (%rsi), %rax
  5946. mulq %rax
  5947. movq %rax, %r8
  5948. movq %rdx, %rcx
  5949. # A[1] * A[1]
  5950. movq 8(%rsi), %rax
  5951. mulq %rax
  5952. addq %rcx, %r9
  5953. adcq %rax, %r10
  5954. adcq $0x00, %rdx
  5955. movq %rdx, %rcx
  5956. # A[2] * A[2]
  5957. movq 16(%rsi), %rax
  5958. mulq %rax
  5959. addq %rcx, %r11
  5960. adcq %rax, %r12
  5961. adcq $0x00, %rdx
  5962. movq %rdx, %rcx
  5963. # A[3] * A[3]
  5964. movq 24(%rsi), %rax
  5965. mulq %rax
  5966. addq %rax, %r14
  5967. adcq %rdx, %r15
  5968. addq %rcx, %r13
  5969. adcq $0x00, %r14
  5970. adcq $0x00, %r15
  5971. # Reduce
  5972. movq $0x7fffffffffffffff, %rcx
  5973. # Move top half into t4-t7 and remove top bit from t3
  5974. shldq $0x01, %r14, %r15
  5975. shldq $0x01, %r13, %r14
  5976. shldq $0x01, %r12, %r13
  5977. shldq $0x01, %r11, %r12
  5978. andq %rcx, %r11
  5979. # Multiply top half by 19
  5980. movq $19, %rax
  5981. mulq %r12
  5982. xorq %r12, %r12
  5983. addq %rax, %r8
  5984. movq $19, %rax
  5985. adcq %rdx, %r12
  5986. mulq %r13
  5987. xorq %r13, %r13
  5988. addq %rax, %r9
  5989. movq $19, %rax
  5990. adcq %rdx, %r13
  5991. mulq %r14
  5992. xorq %r14, %r14
  5993. addq %rax, %r10
  5994. movq $19, %rax
  5995. adcq %rdx, %r14
  5996. mulq %r15
  5997. # Add remaining product results in
  5998. addq %r12, %r9
  5999. adcq %r13, %r10
  6000. adcq %r14, %r11
  6001. adcq %rax, %r11
  6002. adcq $0x00, %rdx
  6003. # Overflow
  6004. shldq $0x01, %r11, %rdx
  6005. imulq $19, %rdx, %rax
  6006. andq %rcx, %r11
  6007. addq %rax, %r8
  6008. adcq $0x00, %r9
  6009. adcq $0x00, %r10
  6010. adcq $0x00, %r11
  6011. # Reduce if top bit set
  6012. movq %r11, %rdx
  6013. sarq $63, %rdx
  6014. andq $19, %rdx
  6015. andq %rcx, %r11
  6016. addq %rdx, %r8
  6017. adcq $0x00, %r9
  6018. adcq $0x00, %r10
  6019. adcq $0x00, %r11
  6020. # Store
  6021. movq %r8, (%rdi)
  6022. movq %r9, 8(%rdi)
  6023. movq %r10, 16(%rdi)
  6024. movq %r11, 24(%rdi)
  6025. movq 24(%rsp), %rdi
  6026. movq 128(%rsp), %rsi
  6027. # Square * 2
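# Same squaring pattern as above, but the value wanted is 2*Z1^2, so the
# doubling is folded into the reduction: the low half is shifted left by
# one, the top half by two (one bit for the doubling, one to line it up
# with 2^255 before the multiply-by-19 fold), and the few bits pushed out
# of the very top are reduced with a single multiply by 19*19 = 361
# (0x169).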
  6028. # A[0] * A[1]
  6029. movq (%rsi), %rax
  6030. mulq 8(%rsi)
  6031. movq %rax, %r9
  6032. movq %rdx, %r10
  6033. # A[0] * A[2]
  6034. movq (%rsi), %rax
  6035. mulq 16(%rsi)
  6036. xorq %r11, %r11
  6037. addq %rax, %r10
  6038. adcq %rdx, %r11
  6039. # A[0] * A[3]
  6040. movq (%rsi), %rax
  6041. mulq 24(%rsi)
  6042. xorq %r12, %r12
  6043. addq %rax, %r11
  6044. adcq %rdx, %r12
  6045. # A[1] * A[2]
  6046. movq 8(%rsi), %rax
  6047. mulq 16(%rsi)
  6048. xorq %r13, %r13
  6049. addq %rax, %r11
  6050. adcq %rdx, %r12
  6051. adcq $0x00, %r13
  6052. # A[1] * A[3]
  6053. movq 8(%rsi), %rax
  6054. mulq 24(%rsi)
  6055. addq %rax, %r12
  6056. adcq %rdx, %r13
  6057. # A[2] * A[3]
  6058. movq 16(%rsi), %rax
  6059. mulq 24(%rsi)
  6060. xorq %r14, %r14
  6061. addq %rax, %r13
  6062. adcq %rdx, %r14
  6063. # Double
  6064. xorq %r15, %r15
  6065. addq %r9, %r9
  6066. adcq %r10, %r10
  6067. adcq %r11, %r11
  6068. adcq %r12, %r12
  6069. adcq %r13, %r13
  6070. adcq %r14, %r14
  6071. adcq $0x00, %r15
  6072. # A[0] * A[0]
  6073. movq (%rsi), %rax
  6074. mulq %rax
  6075. movq %rax, %r8
  6076. movq %rdx, %rcx
  6077. # A[1] * A[1]
  6078. movq 8(%rsi), %rax
  6079. mulq %rax
  6080. addq %rcx, %r9
  6081. adcq %rax, %r10
  6082. adcq $0x00, %rdx
  6083. movq %rdx, %rcx
  6084. # A[2] * A[2]
  6085. movq 16(%rsi), %rax
  6086. mulq %rax
  6087. addq %rcx, %r11
  6088. adcq %rax, %r12
  6089. adcq $0x00, %rdx
  6090. movq %rdx, %rcx
  6091. # A[3] * A[3]
  6092. movq 24(%rsi), %rax
  6093. mulq %rax
  6094. addq %rax, %r14
  6095. adcq %rdx, %r15
  6096. addq %rcx, %r13
  6097. adcq $0x00, %r14
  6098. adcq $0x00, %r15
  6099. # Reduce
  6100. movq $0x7fffffffffffffff, %rbx
  6101. xorq %rax, %rax
  6102. # Move top half into t4-t7 and remove top bit from t3
  6103. shldq $3, %r15, %rax
  6104. shldq $2, %r14, %r15
  6105. shldq $2, %r13, %r14
  6106. shldq $2, %r12, %r13
  6107. shldq $2, %r11, %r12
  6108. shldq $0x01, %r10, %r11
  6109. shldq $0x01, %r9, %r10
  6110. shldq $0x01, %r8, %r9
  6111. shlq $0x01, %r8
  6112. andq %rbx, %r11
  6113. # Two out left, one in right
  6114. andq %rbx, %r15
  6115. # Multiply top bits by 19*19
  6116. imulq $0x169, %rax, %rcx
  6117. # Multiply top half by 19
  6118. movq $19, %rax
  6119. mulq %r12
  6120. xorq %r12, %r12
  6121. addq %rax, %r8
  6122. movq $19, %rax
  6123. adcq %rdx, %r12
  6124. mulq %r13
  6125. xorq %r13, %r13
  6126. addq %rax, %r9
  6127. movq $19, %rax
  6128. adcq %rdx, %r13
  6129. mulq %r14
  6130. xorq %r14, %r14
  6131. addq %rax, %r10
  6132. movq $19, %rax
  6133. adcq %rdx, %r14
  6134. mulq %r15
6135. # Add remaining product results in
  6136. addq %rcx, %r8
  6137. adcq %r12, %r9
  6138. adcq %r13, %r10
  6139. adcq %r14, %r11
  6140. adcq %rax, %r11
  6141. adcq $0x00, %rdx
  6142. # Overflow
  6143. shldq $0x01, %r11, %rdx
  6144. imulq $19, %rdx, %rax
  6145. andq %rbx, %r11
  6146. addq %rax, %r8
  6147. adcq $0x00, %r9
  6148. adcq $0x00, %r10
  6149. adcq $0x00, %r11
  6150. # Reduce if top bit set
  6151. movq %r11, %rdx
  6152. sarq $63, %rdx
  6153. andq $19, %rdx
  6154. andq %rbx, %r11
  6155. addq %rdx, %r8
  6156. adcq $0x00, %r9
  6157. adcq $0x00, %r10
  6158. adcq $0x00, %r11
  6159. # Store
  6160. movq %r8, (%rdi)
  6161. movq %r9, 8(%rdi)
  6162. movq %r10, 16(%rdi)
  6163. movq %r11, 24(%rdi)
  6164. movq 8(%rsp), %rdi
  6165. movq 32(%rsp), %rsi
  6166. movq 40(%rsp), %rbx
  6167. # Add
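# Field addition with a masked correction: add the four limbs, then look
# at bit 255 of the sum (the sign of the top limb).  If it is set the sum
# is at least 2^255 > p, so p = 2^255 - 19 is subtracted; the subtraction
# is made unconditional by ANDing p's limbs with an all-ones/all-zero
# mask derived from that bit, keeping the sequence constant-time.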
  6168. movq (%rsi), %r8
  6169. movq 8(%rsi), %r9
  6170. addq (%rbx), %r8
  6171. movq 16(%rsi), %r10
  6172. adcq 8(%rbx), %r9
  6173. movq 24(%rsi), %rcx
  6174. adcq 16(%rbx), %r10
  6175. movq $-19, %rax
  6176. adcq 24(%rbx), %rcx
  6177. movq $0x7fffffffffffffff, %rdx
  6178. movq %rcx, %r11
  6179. sarq $63, %rcx
  6180. # Mask the modulus
  6181. andq %rcx, %rax
  6182. andq %rcx, %rdx
  6183. # Sub modulus (if overflow)
  6184. subq %rax, %r8
  6185. sbbq %rcx, %r9
  6186. sbbq %rcx, %r10
  6187. sbbq %rdx, %r11
  6188. movq %r8, (%rdi)
  6189. movq %r9, 8(%rdi)
  6190. movq %r10, 16(%rdi)
  6191. movq %r11, 24(%rdi)
  6192. leaq 48(%rsp), %rdi
  6193. movq 8(%rsp), %rsi
  6194. # Square
  6195. # A[0] * A[1]
  6196. movq (%rsi), %rax
  6197. mulq 8(%rsi)
  6198. movq %rax, %r9
  6199. movq %rdx, %r10
  6200. # A[0] * A[2]
  6201. movq (%rsi), %rax
  6202. mulq 16(%rsi)
  6203. xorq %r11, %r11
  6204. addq %rax, %r10
  6205. adcq %rdx, %r11
  6206. # A[0] * A[3]
  6207. movq (%rsi), %rax
  6208. mulq 24(%rsi)
  6209. xorq %r12, %r12
  6210. addq %rax, %r11
  6211. adcq %rdx, %r12
  6212. # A[1] * A[2]
  6213. movq 8(%rsi), %rax
  6214. mulq 16(%rsi)
  6215. xorq %r13, %r13
  6216. addq %rax, %r11
  6217. adcq %rdx, %r12
  6218. adcq $0x00, %r13
  6219. # A[1] * A[3]
  6220. movq 8(%rsi), %rax
  6221. mulq 24(%rsi)
  6222. addq %rax, %r12
  6223. adcq %rdx, %r13
  6224. # A[2] * A[3]
  6225. movq 16(%rsi), %rax
  6226. mulq 24(%rsi)
  6227. xorq %r14, %r14
  6228. addq %rax, %r13
  6229. adcq %rdx, %r14
  6230. # Double
  6231. xorq %r15, %r15
  6232. addq %r9, %r9
  6233. adcq %r10, %r10
  6234. adcq %r11, %r11
  6235. adcq %r12, %r12
  6236. adcq %r13, %r13
  6237. adcq %r14, %r14
  6238. adcq $0x00, %r15
  6239. # A[0] * A[0]
  6240. movq (%rsi), %rax
  6241. mulq %rax
  6242. movq %rax, %r8
  6243. movq %rdx, %rcx
  6244. # A[1] * A[1]
  6245. movq 8(%rsi), %rax
  6246. mulq %rax
  6247. addq %rcx, %r9
  6248. adcq %rax, %r10
  6249. adcq $0x00, %rdx
  6250. movq %rdx, %rcx
  6251. # A[2] * A[2]
  6252. movq 16(%rsi), %rax
  6253. mulq %rax
  6254. addq %rcx, %r11
  6255. adcq %rax, %r12
  6256. adcq $0x00, %rdx
  6257. movq %rdx, %rcx
  6258. # A[3] * A[3]
  6259. movq 24(%rsi), %rax
  6260. mulq %rax
  6261. addq %rax, %r14
  6262. adcq %rdx, %r15
  6263. addq %rcx, %r13
  6264. adcq $0x00, %r14
  6265. adcq $0x00, %r15
  6266. # Reduce
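# (Reduction mod p = 2^255 - 19: the value is split at bit 255; everything
#  above that bit is multiplied by 19 and added back into the low half,
#  because 2^255 is congruent to 19 mod p. The small overflow from that
#  fold is handled the same way, and a final conditional step keeps the
#  result below 2^255.)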
  6267. movq $0x7fffffffffffffff, %rcx
  6268. # Move top half into t4-t7 and remove top bit from t3
  6269. shldq $0x01, %r14, %r15
  6270. shldq $0x01, %r13, %r14
  6271. shldq $0x01, %r12, %r13
  6272. shldq $0x01, %r11, %r12
  6273. andq %rcx, %r11
  6274. # Multiply top half by 19
  6275. movq $19, %rax
  6276. mulq %r12
  6277. xorq %r12, %r12
  6278. addq %rax, %r8
  6279. movq $19, %rax
  6280. adcq %rdx, %r12
  6281. mulq %r13
  6282. xorq %r13, %r13
  6283. addq %rax, %r9
  6284. movq $19, %rax
  6285. adcq %rdx, %r13
  6286. mulq %r14
  6287. xorq %r14, %r14
  6288. addq %rax, %r10
  6289. movq $19, %rax
  6290. adcq %rdx, %r14
  6291. mulq %r15
  6292. # Add remaining product results in
  6293. addq %r12, %r9
  6294. adcq %r13, %r10
  6295. adcq %r14, %r11
  6296. adcq %rax, %r11
  6297. adcq $0x00, %rdx
  6298. # Overflow
  6299. shldq $0x01, %r11, %rdx
  6300. imulq $19, %rdx, %rax
  6301. andq %rcx, %r11
  6302. addq %rax, %r8
  6303. adcq $0x00, %r9
  6304. adcq $0x00, %r10
  6305. adcq $0x00, %r11
  6306. # Reduce if top bit set
  6307. movq %r11, %rdx
  6308. sarq $63, %rdx
  6309. andq $19, %rdx
  6310. andq %rcx, %r11
  6311. addq %rdx, %r8
  6312. adcq $0x00, %r9
  6313. adcq $0x00, %r10
  6314. adcq $0x00, %r11
  6315. # Store
  6316. movq %r8, (%rdi)
  6317. movq %r9, 8(%rdi)
  6318. movq %r10, 16(%rdi)
  6319. movq %r11, 24(%rdi)
  6320. movq 8(%rsp), %rdi
  6321. movq 16(%rsp), %rsi
  6322. movq (%rsp), %rbx
  6323. # Add
  6324. movq (%rsi), %r8
  6325. movq 8(%rsi), %r9
  6326. addq (%rbx), %r8
  6327. movq 16(%rsi), %r10
  6328. adcq 8(%rbx), %r9
  6329. movq 24(%rsi), %rcx
  6330. adcq 16(%rbx), %r10
  6331. movq $-19, %rax
  6332. adcq 24(%rbx), %rcx
  6333. movq $0x7fffffffffffffff, %rdx
  6334. movq %rcx, %r11
  6335. sarq $63, %rcx
  6336. # Mask the modulus
  6337. andq %rcx, %rax
  6338. andq %rcx, %rdx
  6339. # Sub modulus (if overflow)
  6340. subq %rax, %r8
  6341. sbbq %rcx, %r9
  6342. sbbq %rcx, %r10
  6343. sbbq %rdx, %r11
  6344. movq %r8, (%rdi)
  6345. movq %r9, 8(%rdi)
  6346. movq %r10, 16(%rdi)
  6347. movq %r11, 24(%rdi)
  6348. movq 16(%rsp), %rdi
  6349. movq 16(%rsp), %rsi
  6350. movq (%rsp), %rbx
  6351. # Sub
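# (Field subtraction: a - b over four 64-bit limbs; the trailing sbbq turns
#  a borrow into an all-ones mask in %rcx, which selects the limbs of
#  p = 2^255 - 19 to be added back in, with no branch.)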
  6352. movq (%rsi), %r8
  6353. movq 8(%rsi), %r9
  6354. movq 16(%rsi), %r10
  6355. movq 24(%rsi), %r11
  6356. subq (%rbx), %r8
  6357. movq $0x00, %rcx
  6358. sbbq 8(%rbx), %r9
  6359. movq $-19, %rax
  6360. sbbq 16(%rbx), %r10
  6361. movq $0x7fffffffffffffff, %rdx
  6362. sbbq 24(%rbx), %r11
  6363. sbbq $0x00, %rcx
  6364. # Mask the modulus
  6365. andq %rcx, %rax
  6366. andq %rcx, %rdx
  6367. # Add modulus (if underflow)
  6368. addq %rax, %r8
  6369. adcq %rcx, %r9
  6370. adcq %rcx, %r10
  6371. adcq %rdx, %r11
  6372. movq %r8, (%rdi)
  6373. movq %r9, 8(%rdi)
  6374. movq %r10, 16(%rdi)
  6375. movq %r11, 24(%rdi)
  6376. movq (%rsp), %rdi
  6377. leaq 48(%rsp), %rsi
  6378. movq 8(%rsp), %rbx
  6379. # Sub
  6380. movq (%rsi), %r8
  6381. movq 8(%rsi), %r9
  6382. movq 16(%rsi), %r10
  6383. movq 24(%rsi), %r11
  6384. subq (%rbx), %r8
  6385. movq $0x00, %rcx
  6386. sbbq 8(%rbx), %r9
  6387. movq $-19, %rax
  6388. sbbq 16(%rbx), %r10
  6389. movq $0x7fffffffffffffff, %rdx
  6390. sbbq 24(%rbx), %r11
  6391. sbbq $0x00, %rcx
  6392. # Mask the modulus
  6393. andq %rcx, %rax
  6394. andq %rcx, %rdx
  6395. # Add modulus (if underflow)
  6396. addq %rax, %r8
  6397. adcq %rcx, %r9
  6398. adcq %rcx, %r10
  6399. adcq %rdx, %r11
  6400. movq %r8, (%rdi)
  6401. movq %r9, 8(%rdi)
  6402. movq %r10, 16(%rdi)
  6403. movq %r11, 24(%rdi)
  6404. movq 24(%rsp), %rdi
  6405. movq 24(%rsp), %rsi
  6406. movq 16(%rsp), %rbx
  6407. # Sub
  6408. movq (%rsi), %r8
  6409. movq 8(%rsi), %r9
  6410. movq 16(%rsi), %r10
  6411. movq 24(%rsi), %r11
  6412. subq (%rbx), %r8
  6413. movq $0x00, %rcx
  6414. sbbq 8(%rbx), %r9
  6415. movq $-19, %rax
  6416. sbbq 16(%rbx), %r10
  6417. movq $0x7fffffffffffffff, %rdx
  6418. sbbq 24(%rbx), %r11
  6419. sbbq $0x00, %rcx
  6420. # Mask the modulus
  6421. andq %rcx, %rax
  6422. andq %rcx, %rdx
  6423. # Add modulus (if underflow)
  6424. addq %rax, %r8
  6425. adcq %rcx, %r9
  6426. adcq %rcx, %r10
  6427. adcq %rdx, %r11
  6428. movq %r8, (%rdi)
  6429. movq %r9, 8(%rdi)
  6430. movq %r10, 16(%rdi)
  6431. movq %r11, 24(%rdi)
  6432. addq $0x50, %rsp
  6433. popq %r15
  6434. popq %r14
  6435. popq %r13
  6436. popq %r12
  6437. popq %rbx
  6438. repz retq
  6439. #ifndef __APPLE__
  6440. .size fe_ge_dbl_x64,.-fe_ge_dbl_x64
  6441. #endif /* __APPLE__ */
  6442. #ifndef __APPLE__
  6443. .text
  6444. .globl fe_ge_madd_x64
  6445. .type fe_ge_madd_x64,@function
  6446. .align 16
  6447. fe_ge_madd_x64:
  6448. #else
  6449. .section __TEXT,__text
  6450. .globl _fe_ge_madd_x64
  6451. .p2align 4
  6452. _fe_ge_madd_x64:
  6453. #endif /* __APPLE__ */
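# Note: judging by the name and the sequence of add/sub/multiply blocks,
# this routine performs the mixed point addition used by the Ed25519 group
# element code. Eleven field-element pointers are consumed: six arrive in
# registers and are spilled to the local frame below, and five more are read
# from the caller's stack at 128(%rsp)..160(%rsp) once the prologue has
# moved %rsp down by 120 bytes (five pushes plus the 0x50-byte frame).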
  6454. pushq %rbx
  6455. pushq %r12
  6456. pushq %r13
  6457. pushq %r14
  6458. pushq %r15
  6459. subq $0x50, %rsp
  6460. movq %rdi, (%rsp)
  6461. movq %rsi, 8(%rsp)
  6462. movq %rdx, 16(%rsp)
  6463. movq %rcx, 24(%rsp)
  6464. movq %r8, 32(%rsp)
  6465. movq %r9, 40(%rsp)
  6466. movq (%rsp), %rdi
  6467. movq 40(%rsp), %rsi
  6468. movq 32(%rsp), %rbx
  6469. # Add
  6470. movq (%rsi), %r8
  6471. movq 8(%rsi), %r9
  6472. addq (%rbx), %r8
  6473. movq 16(%rsi), %r10
  6474. adcq 8(%rbx), %r9
  6475. movq 24(%rsi), %rcx
  6476. adcq 16(%rbx), %r10
  6477. movq $-19, %rax
  6478. adcq 24(%rbx), %rcx
  6479. movq $0x7fffffffffffffff, %rdx
  6480. movq %rcx, %r11
  6481. sarq $63, %rcx
  6482. # Mask the modulus
  6483. andq %rcx, %rax
  6484. andq %rcx, %rdx
  6485. # Sub modulus (if overflow)
  6486. subq %rax, %r8
  6487. sbbq %rcx, %r9
  6488. sbbq %rcx, %r10
  6489. sbbq %rdx, %r11
  6490. movq %r8, (%rdi)
  6491. movq %r9, 8(%rdi)
  6492. movq %r10, 16(%rdi)
  6493. movq %r11, 24(%rdi)
  6494. movq 8(%rsp), %rdi
  6495. movq 40(%rsp), %rsi
  6496. movq 32(%rsp), %rbx
  6497. # Sub
  6498. movq (%rsi), %r8
  6499. movq 8(%rsi), %r9
  6500. movq 16(%rsi), %r10
  6501. movq 24(%rsi), %r11
  6502. subq (%rbx), %r8
  6503. movq $0x00, %rcx
  6504. sbbq 8(%rbx), %r9
  6505. movq $-19, %rax
  6506. sbbq 16(%rbx), %r10
  6507. movq $0x7fffffffffffffff, %rdx
  6508. sbbq 24(%rbx), %r11
  6509. sbbq $0x00, %rcx
  6510. # Mask the modulus
  6511. andq %rcx, %rax
  6512. andq %rcx, %rdx
  6513. # Add modulus (if underflow)
  6514. addq %rax, %r8
  6515. adcq %rcx, %r9
  6516. adcq %rcx, %r10
  6517. adcq %rdx, %r11
  6518. movq %r8, (%rdi)
  6519. movq %r9, 8(%rdi)
  6520. movq %r10, 16(%rdi)
  6521. movq %r11, 24(%rdi)
  6522. movq 16(%rsp), %rdi
  6523. movq (%rsp), %rsi
  6524. movq 152(%rsp), %rbx
  6525. # Multiply
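# (Schoolbook 4x4-limb multiplication: each 64x64->128-bit mulq is
#  accumulated column by column into r8..r15, giving the full 512-bit
#  product A*B ahead of the reduction below.)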
  6526. # A[0] * B[0]
  6527. movq (%rbx), %rax
  6528. mulq (%rsi)
  6529. movq %rax, %r8
  6530. movq %rdx, %r9
  6531. # A[0] * B[1]
  6532. movq 8(%rbx), %rax
  6533. mulq (%rsi)
  6534. xorq %r10, %r10
  6535. addq %rax, %r9
  6536. adcq %rdx, %r10
  6537. # A[1] * B[0]
  6538. movq (%rbx), %rax
  6539. mulq 8(%rsi)
  6540. xorq %r11, %r11
  6541. addq %rax, %r9
  6542. adcq %rdx, %r10
  6543. adcq $0x00, %r11
  6544. # A[0] * B[2]
  6545. movq 16(%rbx), %rax
  6546. mulq (%rsi)
  6547. addq %rax, %r10
  6548. adcq %rdx, %r11
  6549. # A[1] * B[1]
  6550. movq 8(%rbx), %rax
  6551. mulq 8(%rsi)
  6552. xorq %r12, %r12
  6553. addq %rax, %r10
  6554. adcq %rdx, %r11
  6555. adcq $0x00, %r12
  6556. # A[2] * B[0]
  6557. movq (%rbx), %rax
  6558. mulq 16(%rsi)
  6559. addq %rax, %r10
  6560. adcq %rdx, %r11
  6561. adcq $0x00, %r12
  6562. # A[0] * B[3]
  6563. movq 24(%rbx), %rax
  6564. mulq (%rsi)
  6565. xorq %r13, %r13
  6566. addq %rax, %r11
  6567. adcq %rdx, %r12
  6568. adcq $0x00, %r13
  6569. # A[1] * B[2]
  6570. movq 16(%rbx), %rax
  6571. mulq 8(%rsi)
  6572. addq %rax, %r11
  6573. adcq %rdx, %r12
  6574. adcq $0x00, %r13
  6575. # A[2] * B[1]
  6576. movq 8(%rbx), %rax
  6577. mulq 16(%rsi)
  6578. addq %rax, %r11
  6579. adcq %rdx, %r12
  6580. adcq $0x00, %r13
  6581. # A[3] * B[0]
  6582. movq (%rbx), %rax
  6583. mulq 24(%rsi)
  6584. addq %rax, %r11
  6585. adcq %rdx, %r12
  6586. adcq $0x00, %r13
  6587. # A[1] * B[3]
  6588. movq 24(%rbx), %rax
  6589. mulq 8(%rsi)
  6590. xorq %r14, %r14
  6591. addq %rax, %r12
  6592. adcq %rdx, %r13
  6593. adcq $0x00, %r14
  6594. # A[2] * B[2]
  6595. movq 16(%rbx), %rax
  6596. mulq 16(%rsi)
  6597. addq %rax, %r12
  6598. adcq %rdx, %r13
  6599. adcq $0x00, %r14
  6600. # A[3] * B[1]
  6601. movq 8(%rbx), %rax
  6602. mulq 24(%rsi)
  6603. addq %rax, %r12
  6604. adcq %rdx, %r13
  6605. adcq $0x00, %r14
  6606. # A[2] * B[3]
  6607. movq 24(%rbx), %rax
  6608. mulq 16(%rsi)
  6609. xorq %r15, %r15
  6610. addq %rax, %r13
  6611. adcq %rdx, %r14
  6612. adcq $0x00, %r15
  6613. # A[3] * B[2]
  6614. movq 16(%rbx), %rax
  6615. mulq 24(%rsi)
  6616. addq %rax, %r13
  6617. adcq %rdx, %r14
  6618. adcq $0x00, %r15
  6619. # A[3] * B[3]
  6620. movq 24(%rbx), %rax
  6621. mulq 24(%rsi)
  6622. addq %rax, %r14
  6623. adcq %rdx, %r15
  6624. # Reduce
  6625. movq $0x7fffffffffffffff, %rcx
  6626. # Move top half into t4-t7 and remove top bit from t3
  6627. shldq $0x01, %r14, %r15
  6628. shldq $0x01, %r13, %r14
  6629. shldq $0x01, %r12, %r13
  6630. shldq $0x01, %r11, %r12
  6631. andq %rcx, %r11
  6632. # Multiply top half by 19
  6633. movq $19, %rax
  6634. mulq %r12
  6635. xorq %r12, %r12
  6636. addq %rax, %r8
  6637. movq $19, %rax
  6638. adcq %rdx, %r12
  6639. mulq %r13
  6640. xorq %r13, %r13
  6641. addq %rax, %r9
  6642. movq $19, %rax
  6643. adcq %rdx, %r13
  6644. mulq %r14
  6645. xorq %r14, %r14
  6646. addq %rax, %r10
  6647. movq $19, %rax
  6648. adcq %rdx, %r14
  6649. mulq %r15
  6650. # Add remaining product results in
  6651. addq %r12, %r9
  6652. adcq %r13, %r10
  6653. adcq %r14, %r11
  6654. adcq %rax, %r11
  6655. adcq $0x00, %rdx
  6656. # Overflow
  6657. shldq $0x01, %r11, %rdx
  6658. imulq $19, %rdx, %rax
  6659. andq %rcx, %r11
  6660. addq %rax, %r8
  6661. adcq $0x00, %r9
  6662. adcq $0x00, %r10
  6663. adcq $0x00, %r11
  6664. # Reduce if top bit set
  6665. movq %r11, %rdx
  6666. sarq $63, %rdx
  6667. andq $19, %rdx
  6668. andq %rcx, %r11
  6669. addq %rdx, %r8
  6670. adcq $0x00, %r9
  6671. adcq $0x00, %r10
  6672. adcq $0x00, %r11
  6673. # Store
  6674. movq %r8, (%rdi)
  6675. movq %r9, 8(%rdi)
  6676. movq %r10, 16(%rdi)
  6677. movq %r11, 24(%rdi)
  6678. movq 8(%rsp), %rdi
  6679. movq 8(%rsp), %rsi
  6680. movq 160(%rsp), %rbx
  6681. # Multiply
  6682. # A[0] * B[0]
  6683. movq (%rbx), %rax
  6684. mulq (%rsi)
  6685. movq %rax, %r8
  6686. movq %rdx, %r9
  6687. # A[0] * B[1]
  6688. movq 8(%rbx), %rax
  6689. mulq (%rsi)
  6690. xorq %r10, %r10
  6691. addq %rax, %r9
  6692. adcq %rdx, %r10
  6693. # A[1] * B[0]
  6694. movq (%rbx), %rax
  6695. mulq 8(%rsi)
  6696. xorq %r11, %r11
  6697. addq %rax, %r9
  6698. adcq %rdx, %r10
  6699. adcq $0x00, %r11
  6700. # A[0] * B[2]
  6701. movq 16(%rbx), %rax
  6702. mulq (%rsi)
  6703. addq %rax, %r10
  6704. adcq %rdx, %r11
  6705. # A[1] * B[1]
  6706. movq 8(%rbx), %rax
  6707. mulq 8(%rsi)
  6708. xorq %r12, %r12
  6709. addq %rax, %r10
  6710. adcq %rdx, %r11
  6711. adcq $0x00, %r12
  6712. # A[2] * B[0]
  6713. movq (%rbx), %rax
  6714. mulq 16(%rsi)
  6715. addq %rax, %r10
  6716. adcq %rdx, %r11
  6717. adcq $0x00, %r12
  6718. # A[0] * B[3]
  6719. movq 24(%rbx), %rax
  6720. mulq (%rsi)
  6721. xorq %r13, %r13
  6722. addq %rax, %r11
  6723. adcq %rdx, %r12
  6724. adcq $0x00, %r13
  6725. # A[1] * B[2]
  6726. movq 16(%rbx), %rax
  6727. mulq 8(%rsi)
  6728. addq %rax, %r11
  6729. adcq %rdx, %r12
  6730. adcq $0x00, %r13
  6731. # A[2] * B[1]
  6732. movq 8(%rbx), %rax
  6733. mulq 16(%rsi)
  6734. addq %rax, %r11
  6735. adcq %rdx, %r12
  6736. adcq $0x00, %r13
  6737. # A[3] * B[0]
  6738. movq (%rbx), %rax
  6739. mulq 24(%rsi)
  6740. addq %rax, %r11
  6741. adcq %rdx, %r12
  6742. adcq $0x00, %r13
  6743. # A[1] * B[3]
  6744. movq 24(%rbx), %rax
  6745. mulq 8(%rsi)
  6746. xorq %r14, %r14
  6747. addq %rax, %r12
  6748. adcq %rdx, %r13
  6749. adcq $0x00, %r14
  6750. # A[2] * B[2]
  6751. movq 16(%rbx), %rax
  6752. mulq 16(%rsi)
  6753. addq %rax, %r12
  6754. adcq %rdx, %r13
  6755. adcq $0x00, %r14
  6756. # A[3] * B[1]
  6757. movq 8(%rbx), %rax
  6758. mulq 24(%rsi)
  6759. addq %rax, %r12
  6760. adcq %rdx, %r13
  6761. adcq $0x00, %r14
  6762. # A[2] * B[3]
  6763. movq 24(%rbx), %rax
  6764. mulq 16(%rsi)
  6765. xorq %r15, %r15
  6766. addq %rax, %r13
  6767. adcq %rdx, %r14
  6768. adcq $0x00, %r15
  6769. # A[3] * B[2]
  6770. movq 16(%rbx), %rax
  6771. mulq 24(%rsi)
  6772. addq %rax, %r13
  6773. adcq %rdx, %r14
  6774. adcq $0x00, %r15
  6775. # A[3] * B[3]
  6776. movq 24(%rbx), %rax
  6777. mulq 24(%rsi)
  6778. addq %rax, %r14
  6779. adcq %rdx, %r15
  6780. # Reduce
  6781. movq $0x7fffffffffffffff, %rcx
  6782. # Move top half into t4-t7 and remove top bit from t3
  6783. shldq $0x01, %r14, %r15
  6784. shldq $0x01, %r13, %r14
  6785. shldq $0x01, %r12, %r13
  6786. shldq $0x01, %r11, %r12
  6787. andq %rcx, %r11
  6788. # Multiply top half by 19
  6789. movq $19, %rax
  6790. mulq %r12
  6791. xorq %r12, %r12
  6792. addq %rax, %r8
  6793. movq $19, %rax
  6794. adcq %rdx, %r12
  6795. mulq %r13
  6796. xorq %r13, %r13
  6797. addq %rax, %r9
  6798. movq $19, %rax
  6799. adcq %rdx, %r13
  6800. mulq %r14
  6801. xorq %r14, %r14
  6802. addq %rax, %r10
  6803. movq $19, %rax
  6804. adcq %rdx, %r14
  6805. mulq %r15
  6806. # Add remaining product results in
  6807. addq %r12, %r9
  6808. adcq %r13, %r10
  6809. adcq %r14, %r11
  6810. adcq %rax, %r11
  6811. adcq $0x00, %rdx
  6812. # Overflow
  6813. shldq $0x01, %r11, %rdx
  6814. imulq $19, %rdx, %rax
  6815. andq %rcx, %r11
  6816. addq %rax, %r8
  6817. adcq $0x00, %r9
  6818. adcq $0x00, %r10
  6819. adcq $0x00, %r11
  6820. # Reduce if top bit set
  6821. movq %r11, %rdx
  6822. sarq $63, %rdx
  6823. andq $19, %rdx
  6824. andq %rcx, %r11
  6825. addq %rdx, %r8
  6826. adcq $0x00, %r9
  6827. adcq $0x00, %r10
  6828. adcq $0x00, %r11
  6829. # Store
  6830. movq %r8, (%rdi)
  6831. movq %r9, 8(%rdi)
  6832. movq %r10, 16(%rdi)
  6833. movq %r11, 24(%rdi)
  6834. movq 24(%rsp), %rdi
  6835. movq 144(%rsp), %rsi
  6836. movq 136(%rsp), %rbx
  6837. # Multiply
  6838. # A[0] * B[0]
  6839. movq (%rbx), %rax
  6840. mulq (%rsi)
  6841. movq %rax, %r8
  6842. movq %rdx, %r9
  6843. # A[0] * B[1]
  6844. movq 8(%rbx), %rax
  6845. mulq (%rsi)
  6846. xorq %r10, %r10
  6847. addq %rax, %r9
  6848. adcq %rdx, %r10
  6849. # A[1] * B[0]
  6850. movq (%rbx), %rax
  6851. mulq 8(%rsi)
  6852. xorq %r11, %r11
  6853. addq %rax, %r9
  6854. adcq %rdx, %r10
  6855. adcq $0x00, %r11
  6856. # A[0] * B[2]
  6857. movq 16(%rbx), %rax
  6858. mulq (%rsi)
  6859. addq %rax, %r10
  6860. adcq %rdx, %r11
  6861. # A[1] * B[1]
  6862. movq 8(%rbx), %rax
  6863. mulq 8(%rsi)
  6864. xorq %r12, %r12
  6865. addq %rax, %r10
  6866. adcq %rdx, %r11
  6867. adcq $0x00, %r12
  6868. # A[2] * B[0]
  6869. movq (%rbx), %rax
  6870. mulq 16(%rsi)
  6871. addq %rax, %r10
  6872. adcq %rdx, %r11
  6873. adcq $0x00, %r12
  6874. # A[0] * B[3]
  6875. movq 24(%rbx), %rax
  6876. mulq (%rsi)
  6877. xorq %r13, %r13
  6878. addq %rax, %r11
  6879. adcq %rdx, %r12
  6880. adcq $0x00, %r13
  6881. # A[1] * B[2]
  6882. movq 16(%rbx), %rax
  6883. mulq 8(%rsi)
  6884. addq %rax, %r11
  6885. adcq %rdx, %r12
  6886. adcq $0x00, %r13
  6887. # A[2] * B[1]
  6888. movq 8(%rbx), %rax
  6889. mulq 16(%rsi)
  6890. addq %rax, %r11
  6891. adcq %rdx, %r12
  6892. adcq $0x00, %r13
  6893. # A[3] * B[0]
  6894. movq (%rbx), %rax
  6895. mulq 24(%rsi)
  6896. addq %rax, %r11
  6897. adcq %rdx, %r12
  6898. adcq $0x00, %r13
  6899. # A[1] * B[3]
  6900. movq 24(%rbx), %rax
  6901. mulq 8(%rsi)
  6902. xorq %r14, %r14
  6903. addq %rax, %r12
  6904. adcq %rdx, %r13
  6905. adcq $0x00, %r14
  6906. # A[2] * B[2]
  6907. movq 16(%rbx), %rax
  6908. mulq 16(%rsi)
  6909. addq %rax, %r12
  6910. adcq %rdx, %r13
  6911. adcq $0x00, %r14
  6912. # A[3] * B[1]
  6913. movq 8(%rbx), %rax
  6914. mulq 24(%rsi)
  6915. addq %rax, %r12
  6916. adcq %rdx, %r13
  6917. adcq $0x00, %r14
  6918. # A[2] * B[3]
  6919. movq 24(%rbx), %rax
  6920. mulq 16(%rsi)
  6921. xorq %r15, %r15
  6922. addq %rax, %r13
  6923. adcq %rdx, %r14
  6924. adcq $0x00, %r15
  6925. # A[3] * B[2]
  6926. movq 16(%rbx), %rax
  6927. mulq 24(%rsi)
  6928. addq %rax, %r13
  6929. adcq %rdx, %r14
  6930. adcq $0x00, %r15
  6931. # A[3] * B[3]
  6932. movq 24(%rbx), %rax
  6933. mulq 24(%rsi)
  6934. addq %rax, %r14
  6935. adcq %rdx, %r15
  6936. # Reduce
  6937. movq $0x7fffffffffffffff, %rcx
  6938. # Move top half into t4-t7 and remove top bit from t3
  6939. shldq $0x01, %r14, %r15
  6940. shldq $0x01, %r13, %r14
  6941. shldq $0x01, %r12, %r13
  6942. shldq $0x01, %r11, %r12
  6943. andq %rcx, %r11
  6944. # Multiply top half by 19
  6945. movq $19, %rax
  6946. mulq %r12
  6947. xorq %r12, %r12
  6948. addq %rax, %r8
  6949. movq $19, %rax
  6950. adcq %rdx, %r12
  6951. mulq %r13
  6952. xorq %r13, %r13
  6953. addq %rax, %r9
  6954. movq $19, %rax
  6955. adcq %rdx, %r13
  6956. mulq %r14
  6957. xorq %r14, %r14
  6958. addq %rax, %r10
  6959. movq $19, %rax
  6960. adcq %rdx, %r14
  6961. mulq %r15
  6962. # Add remaining product results in
  6963. addq %r12, %r9
  6964. adcq %r13, %r10
  6965. adcq %r14, %r11
  6966. adcq %rax, %r11
  6967. adcq $0x00, %rdx
  6968. # Overflow
  6969. shldq $0x01, %r11, %rdx
  6970. imulq $19, %rdx, %rax
  6971. andq %rcx, %r11
  6972. addq %rax, %r8
  6973. adcq $0x00, %r9
  6974. adcq $0x00, %r10
  6975. adcq $0x00, %r11
  6976. # Reduce if top bit set
  6977. movq %r11, %rdx
  6978. sarq $63, %rdx
  6979. andq $19, %rdx
  6980. andq %rcx, %r11
  6981. addq %rdx, %r8
  6982. adcq $0x00, %r9
  6983. adcq $0x00, %r10
  6984. adcq $0x00, %r11
  6985. # Store
  6986. movq %r8, (%rdi)
  6987. movq %r9, 8(%rdi)
  6988. movq %r10, 16(%rdi)
  6989. movq %r11, 24(%rdi)
  6990. leaq 48(%rsp), %rdi
  6991. movq 128(%rsp), %rsi
  6992. movq 128(%rsp), %rbx
  6993. # Add
  6994. movq (%rsi), %r8
  6995. movq 8(%rsi), %r9
  6996. addq (%rbx), %r8
  6997. movq 16(%rsi), %r10
  6998. adcq 8(%rbx), %r9
  6999. movq 24(%rsi), %rcx
  7000. adcq 16(%rbx), %r10
  7001. movq $-19, %rax
  7002. adcq 24(%rbx), %rcx
  7003. movq $0x7fffffffffffffff, %rdx
  7004. movq %rcx, %r11
  7005. sarq $63, %rcx
  7006. # Mask the modulus
  7007. andq %rcx, %rax
  7008. andq %rcx, %rdx
  7009. # Sub modulus (if overflow)
  7010. subq %rax, %r8
  7011. sbbq %rcx, %r9
  7012. sbbq %rcx, %r10
  7013. sbbq %rdx, %r11
  7014. movq %r8, (%rdi)
  7015. movq %r9, 8(%rdi)
  7016. movq %r10, 16(%rdi)
  7017. movq %r11, 24(%rdi)
  7018. movq (%rsp), %rdi
  7019. movq 16(%rsp), %rsi
  7020. movq 8(%rsp), %rbx
  7021. # Sub
  7022. movq (%rsi), %r8
  7023. movq 8(%rsi), %r9
  7024. movq 16(%rsi), %r10
  7025. movq 24(%rsi), %r11
  7026. subq (%rbx), %r8
  7027. movq $0x00, %rcx
  7028. sbbq 8(%rbx), %r9
  7029. movq $-19, %rax
  7030. sbbq 16(%rbx), %r10
  7031. movq $0x7fffffffffffffff, %rdx
  7032. sbbq 24(%rbx), %r11
  7033. sbbq $0x00, %rcx
  7034. # Mask the modulus
  7035. andq %rcx, %rax
  7036. andq %rcx, %rdx
  7037. # Add modulus (if underflow)
  7038. addq %rax, %r8
  7039. adcq %rcx, %r9
  7040. adcq %rcx, %r10
  7041. adcq %rdx, %r11
  7042. movq %r8, (%rdi)
  7043. movq %r9, 8(%rdi)
  7044. movq %r10, 16(%rdi)
  7045. movq %r11, 24(%rdi)
  7046. movq 8(%rsp), %rdi
  7047. movq 16(%rsp), %rsi
  7048. movq 8(%rsp), %rbx
  7049. # Add
  7050. movq (%rsi), %r8
  7051. movq 8(%rsi), %r9
  7052. addq (%rbx), %r8
  7053. movq 16(%rsi), %r10
  7054. adcq 8(%rbx), %r9
  7055. movq 24(%rsi), %rcx
  7056. adcq 16(%rbx), %r10
  7057. movq $-19, %rax
  7058. adcq 24(%rbx), %rcx
  7059. movq $0x7fffffffffffffff, %rdx
  7060. movq %rcx, %r11
  7061. sarq $63, %rcx
  7062. # Mask the modulus
  7063. andq %rcx, %rax
  7064. andq %rcx, %rdx
  7065. # Sub modulus (if overflow)
  7066. subq %rax, %r8
  7067. sbbq %rcx, %r9
  7068. sbbq %rcx, %r10
  7069. sbbq %rdx, %r11
  7070. movq %r8, (%rdi)
  7071. movq %r9, 8(%rdi)
  7072. movq %r10, 16(%rdi)
  7073. movq %r11, 24(%rdi)
  7074. movq 16(%rsp), %rdi
  7075. leaq 48(%rsp), %rsi
  7076. movq 24(%rsp), %rbx
  7077. # Add
  7078. movq (%rsi), %r8
  7079. movq 8(%rsi), %r9
  7080. addq (%rbx), %r8
  7081. movq 16(%rsi), %r10
  7082. adcq 8(%rbx), %r9
  7083. movq 24(%rsi), %rcx
  7084. adcq 16(%rbx), %r10
  7085. movq $-19, %rax
  7086. adcq 24(%rbx), %rcx
  7087. movq $0x7fffffffffffffff, %rdx
  7088. movq %rcx, %r11
  7089. sarq $63, %rcx
  7090. # Mask the modulus
  7091. andq %rcx, %rax
  7092. andq %rcx, %rdx
  7093. # Sub modulus (if overflow)
  7094. subq %rax, %r8
  7095. sbbq %rcx, %r9
  7096. sbbq %rcx, %r10
  7097. sbbq %rdx, %r11
  7098. movq %r8, (%rdi)
  7099. movq %r9, 8(%rdi)
  7100. movq %r10, 16(%rdi)
  7101. movq %r11, 24(%rdi)
  7102. movq 24(%rsp), %rdi
  7103. leaq 48(%rsp), %rsi
  7104. movq 24(%rsp), %rbx
  7105. # Sub
  7106. movq (%rsi), %r8
  7107. movq 8(%rsi), %r9
  7108. movq 16(%rsi), %r10
  7109. movq 24(%rsi), %r11
  7110. subq (%rbx), %r8
  7111. movq $0x00, %rcx
  7112. sbbq 8(%rbx), %r9
  7113. movq $-19, %rax
  7114. sbbq 16(%rbx), %r10
  7115. movq $0x7fffffffffffffff, %rdx
  7116. sbbq 24(%rbx), %r11
  7117. sbbq $0x00, %rcx
  7118. # Mask the modulus
  7119. andq %rcx, %rax
  7120. andq %rcx, %rdx
  7121. # Add modulus (if underflow)
  7122. addq %rax, %r8
  7123. adcq %rcx, %r9
  7124. adcq %rcx, %r10
  7125. adcq %rdx, %r11
  7126. movq %r8, (%rdi)
  7127. movq %r9, 8(%rdi)
  7128. movq %r10, 16(%rdi)
  7129. movq %r11, 24(%rdi)
  7130. addq $0x50, %rsp
  7131. popq %r15
  7132. popq %r14
  7133. popq %r13
  7134. popq %r12
  7135. popq %rbx
  7136. repz retq
  7137. #ifndef __APPLE__
  7138. .size fe_ge_madd_x64,.-fe_ge_madd_x64
  7139. #endif /* __APPLE__ */
  7140. #ifndef __APPLE__
  7141. .text
  7142. .globl fe_ge_msub_x64
  7143. .type fe_ge_msub_x64,@function
  7144. .align 16
  7145. fe_ge_msub_x64:
  7146. #else
  7147. .section __TEXT,__text
  7148. .globl _fe_ge_msub_x64
  7149. .p2align 4
  7150. _fe_ge_msub_x64:
  7151. #endif /* __APPLE__ */
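# Note: this routine mirrors fe_ge_madd_x64 with the same argument layout;
# the two precomputed multipliers (152(%rsp)/160(%rsp)) and the final
# add/sub pair are swapped, which yields the corresponding mixed point
# subtraction.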
  7152. pushq %rbx
  7153. pushq %r12
  7154. pushq %r13
  7155. pushq %r14
  7156. pushq %r15
  7157. subq $0x50, %rsp
  7158. movq %rdi, (%rsp)
  7159. movq %rsi, 8(%rsp)
  7160. movq %rdx, 16(%rsp)
  7161. movq %rcx, 24(%rsp)
  7162. movq %r8, 32(%rsp)
  7163. movq %r9, 40(%rsp)
  7164. movq (%rsp), %rdi
  7165. movq 40(%rsp), %rsi
  7166. movq 32(%rsp), %rbx
  7167. # Add
  7168. movq (%rsi), %r8
  7169. movq 8(%rsi), %r9
  7170. addq (%rbx), %r8
  7171. movq 16(%rsi), %r10
  7172. adcq 8(%rbx), %r9
  7173. movq 24(%rsi), %rcx
  7174. adcq 16(%rbx), %r10
  7175. movq $-19, %rax
  7176. adcq 24(%rbx), %rcx
  7177. movq $0x7fffffffffffffff, %rdx
  7178. movq %rcx, %r11
  7179. sarq $63, %rcx
  7180. # Mask the modulus
  7181. andq %rcx, %rax
  7182. andq %rcx, %rdx
  7183. # Sub modulus (if overflow)
  7184. subq %rax, %r8
  7185. sbbq %rcx, %r9
  7186. sbbq %rcx, %r10
  7187. sbbq %rdx, %r11
  7188. movq %r8, (%rdi)
  7189. movq %r9, 8(%rdi)
  7190. movq %r10, 16(%rdi)
  7191. movq %r11, 24(%rdi)
  7192. movq 8(%rsp), %rdi
  7193. movq 40(%rsp), %rsi
  7194. movq 32(%rsp), %rbx
  7195. # Sub
  7196. movq (%rsi), %r8
  7197. movq 8(%rsi), %r9
  7198. movq 16(%rsi), %r10
  7199. movq 24(%rsi), %r11
  7200. subq (%rbx), %r8
  7201. movq $0x00, %rcx
  7202. sbbq 8(%rbx), %r9
  7203. movq $-19, %rax
  7204. sbbq 16(%rbx), %r10
  7205. movq $0x7fffffffffffffff, %rdx
  7206. sbbq 24(%rbx), %r11
  7207. sbbq $0x00, %rcx
  7208. # Mask the modulus
  7209. andq %rcx, %rax
  7210. andq %rcx, %rdx
  7211. # Add modulus (if underflow)
  7212. addq %rax, %r8
  7213. adcq %rcx, %r9
  7214. adcq %rcx, %r10
  7215. adcq %rdx, %r11
  7216. movq %r8, (%rdi)
  7217. movq %r9, 8(%rdi)
  7218. movq %r10, 16(%rdi)
  7219. movq %r11, 24(%rdi)
  7220. movq 16(%rsp), %rdi
  7221. movq (%rsp), %rsi
  7222. movq 160(%rsp), %rbx
  7223. # Multiply
  7224. # A[0] * B[0]
  7225. movq (%rbx), %rax
  7226. mulq (%rsi)
  7227. movq %rax, %r8
  7228. movq %rdx, %r9
  7229. # A[0] * B[1]
  7230. movq 8(%rbx), %rax
  7231. mulq (%rsi)
  7232. xorq %r10, %r10
  7233. addq %rax, %r9
  7234. adcq %rdx, %r10
  7235. # A[1] * B[0]
  7236. movq (%rbx), %rax
  7237. mulq 8(%rsi)
  7238. xorq %r11, %r11
  7239. addq %rax, %r9
  7240. adcq %rdx, %r10
  7241. adcq $0x00, %r11
  7242. # A[0] * B[2]
  7243. movq 16(%rbx), %rax
  7244. mulq (%rsi)
  7245. addq %rax, %r10
  7246. adcq %rdx, %r11
  7247. # A[1] * B[1]
  7248. movq 8(%rbx), %rax
  7249. mulq 8(%rsi)
  7250. xorq %r12, %r12
  7251. addq %rax, %r10
  7252. adcq %rdx, %r11
  7253. adcq $0x00, %r12
  7254. # A[2] * B[0]
  7255. movq (%rbx), %rax
  7256. mulq 16(%rsi)
  7257. addq %rax, %r10
  7258. adcq %rdx, %r11
  7259. adcq $0x00, %r12
  7260. # A[0] * B[3]
  7261. movq 24(%rbx), %rax
  7262. mulq (%rsi)
  7263. xorq %r13, %r13
  7264. addq %rax, %r11
  7265. adcq %rdx, %r12
  7266. adcq $0x00, %r13
  7267. # A[1] * B[2]
  7268. movq 16(%rbx), %rax
  7269. mulq 8(%rsi)
  7270. addq %rax, %r11
  7271. adcq %rdx, %r12
  7272. adcq $0x00, %r13
  7273. # A[2] * B[1]
  7274. movq 8(%rbx), %rax
  7275. mulq 16(%rsi)
  7276. addq %rax, %r11
  7277. adcq %rdx, %r12
  7278. adcq $0x00, %r13
  7279. # A[3] * B[0]
  7280. movq (%rbx), %rax
  7281. mulq 24(%rsi)
  7282. addq %rax, %r11
  7283. adcq %rdx, %r12
  7284. adcq $0x00, %r13
  7285. # A[1] * B[3]
  7286. movq 24(%rbx), %rax
  7287. mulq 8(%rsi)
  7288. xorq %r14, %r14
  7289. addq %rax, %r12
  7290. adcq %rdx, %r13
  7291. adcq $0x00, %r14
  7292. # A[2] * B[2]
  7293. movq 16(%rbx), %rax
  7294. mulq 16(%rsi)
  7295. addq %rax, %r12
  7296. adcq %rdx, %r13
  7297. adcq $0x00, %r14
  7298. # A[3] * B[1]
  7299. movq 8(%rbx), %rax
  7300. mulq 24(%rsi)
  7301. addq %rax, %r12
  7302. adcq %rdx, %r13
  7303. adcq $0x00, %r14
  7304. # A[2] * B[3]
  7305. movq 24(%rbx), %rax
  7306. mulq 16(%rsi)
  7307. xorq %r15, %r15
  7308. addq %rax, %r13
  7309. adcq %rdx, %r14
  7310. adcq $0x00, %r15
  7311. # A[3] * B[2]
  7312. movq 16(%rbx), %rax
  7313. mulq 24(%rsi)
  7314. addq %rax, %r13
  7315. adcq %rdx, %r14
  7316. adcq $0x00, %r15
  7317. # A[3] * B[3]
  7318. movq 24(%rbx), %rax
  7319. mulq 24(%rsi)
  7320. addq %rax, %r14
  7321. adcq %rdx, %r15
  7322. # Reduce
  7323. movq $0x7fffffffffffffff, %rcx
  7324. # Move top half into t4-t7 and remove top bit from t3
  7325. shldq $0x01, %r14, %r15
  7326. shldq $0x01, %r13, %r14
  7327. shldq $0x01, %r12, %r13
  7328. shldq $0x01, %r11, %r12
  7329. andq %rcx, %r11
  7330. # Multiply top half by 19
  7331. movq $19, %rax
  7332. mulq %r12
  7333. xorq %r12, %r12
  7334. addq %rax, %r8
  7335. movq $19, %rax
  7336. adcq %rdx, %r12
  7337. mulq %r13
  7338. xorq %r13, %r13
  7339. addq %rax, %r9
  7340. movq $19, %rax
  7341. adcq %rdx, %r13
  7342. mulq %r14
  7343. xorq %r14, %r14
  7344. addq %rax, %r10
  7345. movq $19, %rax
  7346. adcq %rdx, %r14
  7347. mulq %r15
  7348. # Add remaining product results in
  7349. addq %r12, %r9
  7350. adcq %r13, %r10
  7351. adcq %r14, %r11
  7352. adcq %rax, %r11
  7353. adcq $0x00, %rdx
  7354. # Overflow
  7355. shldq $0x01, %r11, %rdx
  7356. imulq $19, %rdx, %rax
  7357. andq %rcx, %r11
  7358. addq %rax, %r8
  7359. adcq $0x00, %r9
  7360. adcq $0x00, %r10
  7361. adcq $0x00, %r11
  7362. # Reduce if top bit set
  7363. movq %r11, %rdx
  7364. sarq $63, %rdx
  7365. andq $19, %rdx
  7366. andq %rcx, %r11
  7367. addq %rdx, %r8
  7368. adcq $0x00, %r9
  7369. adcq $0x00, %r10
  7370. adcq $0x00, %r11
  7371. # Store
  7372. movq %r8, (%rdi)
  7373. movq %r9, 8(%rdi)
  7374. movq %r10, 16(%rdi)
  7375. movq %r11, 24(%rdi)
  7376. movq 8(%rsp), %rdi
  7377. movq 8(%rsp), %rsi
  7378. movq 152(%rsp), %rbx
  7379. # Multiply
  7380. # A[0] * B[0]
  7381. movq (%rbx), %rax
  7382. mulq (%rsi)
  7383. movq %rax, %r8
  7384. movq %rdx, %r9
  7385. # A[0] * B[1]
  7386. movq 8(%rbx), %rax
  7387. mulq (%rsi)
  7388. xorq %r10, %r10
  7389. addq %rax, %r9
  7390. adcq %rdx, %r10
  7391. # A[1] * B[0]
  7392. movq (%rbx), %rax
  7393. mulq 8(%rsi)
  7394. xorq %r11, %r11
  7395. addq %rax, %r9
  7396. adcq %rdx, %r10
  7397. adcq $0x00, %r11
  7398. # A[0] * B[2]
  7399. movq 16(%rbx), %rax
  7400. mulq (%rsi)
  7401. addq %rax, %r10
  7402. adcq %rdx, %r11
  7403. # A[1] * B[1]
  7404. movq 8(%rbx), %rax
  7405. mulq 8(%rsi)
  7406. xorq %r12, %r12
  7407. addq %rax, %r10
  7408. adcq %rdx, %r11
  7409. adcq $0x00, %r12
  7410. # A[2] * B[0]
  7411. movq (%rbx), %rax
  7412. mulq 16(%rsi)
  7413. addq %rax, %r10
  7414. adcq %rdx, %r11
  7415. adcq $0x00, %r12
  7416. # A[0] * B[3]
  7417. movq 24(%rbx), %rax
  7418. mulq (%rsi)
  7419. xorq %r13, %r13
  7420. addq %rax, %r11
  7421. adcq %rdx, %r12
  7422. adcq $0x00, %r13
  7423. # A[1] * B[2]
  7424. movq 16(%rbx), %rax
  7425. mulq 8(%rsi)
  7426. addq %rax, %r11
  7427. adcq %rdx, %r12
  7428. adcq $0x00, %r13
  7429. # A[2] * B[1]
  7430. movq 8(%rbx), %rax
  7431. mulq 16(%rsi)
  7432. addq %rax, %r11
  7433. adcq %rdx, %r12
  7434. adcq $0x00, %r13
  7435. # A[3] * B[0]
  7436. movq (%rbx), %rax
  7437. mulq 24(%rsi)
  7438. addq %rax, %r11
  7439. adcq %rdx, %r12
  7440. adcq $0x00, %r13
  7441. # A[1] * B[3]
  7442. movq 24(%rbx), %rax
  7443. mulq 8(%rsi)
  7444. xorq %r14, %r14
  7445. addq %rax, %r12
  7446. adcq %rdx, %r13
  7447. adcq $0x00, %r14
  7448. # A[2] * B[2]
  7449. movq 16(%rbx), %rax
  7450. mulq 16(%rsi)
  7451. addq %rax, %r12
  7452. adcq %rdx, %r13
  7453. adcq $0x00, %r14
  7454. # A[3] * B[1]
  7455. movq 8(%rbx), %rax
  7456. mulq 24(%rsi)
  7457. addq %rax, %r12
  7458. adcq %rdx, %r13
  7459. adcq $0x00, %r14
  7460. # A[2] * B[3]
  7461. movq 24(%rbx), %rax
  7462. mulq 16(%rsi)
  7463. xorq %r15, %r15
  7464. addq %rax, %r13
  7465. adcq %rdx, %r14
  7466. adcq $0x00, %r15
  7467. # A[3] * B[2]
  7468. movq 16(%rbx), %rax
  7469. mulq 24(%rsi)
  7470. addq %rax, %r13
  7471. adcq %rdx, %r14
  7472. adcq $0x00, %r15
  7473. # A[3] * B[3]
  7474. movq 24(%rbx), %rax
  7475. mulq 24(%rsi)
  7476. addq %rax, %r14
  7477. adcq %rdx, %r15
  7478. # Reduce
  7479. movq $0x7fffffffffffffff, %rcx
  7480. # Move top half into t4-t7 and remove top bit from t3
  7481. shldq $0x01, %r14, %r15
  7482. shldq $0x01, %r13, %r14
  7483. shldq $0x01, %r12, %r13
  7484. shldq $0x01, %r11, %r12
  7485. andq %rcx, %r11
  7486. # Multiply top half by 19
  7487. movq $19, %rax
  7488. mulq %r12
  7489. xorq %r12, %r12
  7490. addq %rax, %r8
  7491. movq $19, %rax
  7492. adcq %rdx, %r12
  7493. mulq %r13
  7494. xorq %r13, %r13
  7495. addq %rax, %r9
  7496. movq $19, %rax
  7497. adcq %rdx, %r13
  7498. mulq %r14
  7499. xorq %r14, %r14
  7500. addq %rax, %r10
  7501. movq $19, %rax
  7502. adcq %rdx, %r14
  7503. mulq %r15
  7504. # Add remaining product results in
  7505. addq %r12, %r9
  7506. adcq %r13, %r10
  7507. adcq %r14, %r11
  7508. adcq %rax, %r11
  7509. adcq $0x00, %rdx
  7510. # Overflow
  7511. shldq $0x01, %r11, %rdx
  7512. imulq $19, %rdx, %rax
  7513. andq %rcx, %r11
  7514. addq %rax, %r8
  7515. adcq $0x00, %r9
  7516. adcq $0x00, %r10
  7517. adcq $0x00, %r11
  7518. # Reduce if top bit set
  7519. movq %r11, %rdx
  7520. sarq $63, %rdx
  7521. andq $19, %rdx
  7522. andq %rcx, %r11
  7523. addq %rdx, %r8
  7524. adcq $0x00, %r9
  7525. adcq $0x00, %r10
  7526. adcq $0x00, %r11
  7527. # Store
  7528. movq %r8, (%rdi)
  7529. movq %r9, 8(%rdi)
  7530. movq %r10, 16(%rdi)
  7531. movq %r11, 24(%rdi)
  7532. movq 24(%rsp), %rdi
  7533. movq 144(%rsp), %rsi
  7534. movq 136(%rsp), %rbx
  7535. # Multiply
  7536. # A[0] * B[0]
  7537. movq (%rbx), %rax
  7538. mulq (%rsi)
  7539. movq %rax, %r8
  7540. movq %rdx, %r9
  7541. # A[0] * B[1]
  7542. movq 8(%rbx), %rax
  7543. mulq (%rsi)
  7544. xorq %r10, %r10
  7545. addq %rax, %r9
  7546. adcq %rdx, %r10
  7547. # A[1] * B[0]
  7548. movq (%rbx), %rax
  7549. mulq 8(%rsi)
  7550. xorq %r11, %r11
  7551. addq %rax, %r9
  7552. adcq %rdx, %r10
  7553. adcq $0x00, %r11
  7554. # A[0] * B[2]
  7555. movq 16(%rbx), %rax
  7556. mulq (%rsi)
  7557. addq %rax, %r10
  7558. adcq %rdx, %r11
  7559. # A[1] * B[1]
  7560. movq 8(%rbx), %rax
  7561. mulq 8(%rsi)
  7562. xorq %r12, %r12
  7563. addq %rax, %r10
  7564. adcq %rdx, %r11
  7565. adcq $0x00, %r12
  7566. # A[2] * B[0]
  7567. movq (%rbx), %rax
  7568. mulq 16(%rsi)
  7569. addq %rax, %r10
  7570. adcq %rdx, %r11
  7571. adcq $0x00, %r12
  7572. # A[0] * B[3]
  7573. movq 24(%rbx), %rax
  7574. mulq (%rsi)
  7575. xorq %r13, %r13
  7576. addq %rax, %r11
  7577. adcq %rdx, %r12
  7578. adcq $0x00, %r13
  7579. # A[1] * B[2]
  7580. movq 16(%rbx), %rax
  7581. mulq 8(%rsi)
  7582. addq %rax, %r11
  7583. adcq %rdx, %r12
  7584. adcq $0x00, %r13
  7585. # A[2] * B[1]
  7586. movq 8(%rbx), %rax
  7587. mulq 16(%rsi)
  7588. addq %rax, %r11
  7589. adcq %rdx, %r12
  7590. adcq $0x00, %r13
  7591. # A[3] * B[0]
  7592. movq (%rbx), %rax
  7593. mulq 24(%rsi)
  7594. addq %rax, %r11
  7595. adcq %rdx, %r12
  7596. adcq $0x00, %r13
  7597. # A[1] * B[3]
  7598. movq 24(%rbx), %rax
  7599. mulq 8(%rsi)
  7600. xorq %r14, %r14
  7601. addq %rax, %r12
  7602. adcq %rdx, %r13
  7603. adcq $0x00, %r14
  7604. # A[2] * B[2]
  7605. movq 16(%rbx), %rax
  7606. mulq 16(%rsi)
  7607. addq %rax, %r12
  7608. adcq %rdx, %r13
  7609. adcq $0x00, %r14
  7610. # A[3] * B[1]
  7611. movq 8(%rbx), %rax
  7612. mulq 24(%rsi)
  7613. addq %rax, %r12
  7614. adcq %rdx, %r13
  7615. adcq $0x00, %r14
  7616. # A[2] * B[3]
  7617. movq 24(%rbx), %rax
  7618. mulq 16(%rsi)
  7619. xorq %r15, %r15
  7620. addq %rax, %r13
  7621. adcq %rdx, %r14
  7622. adcq $0x00, %r15
  7623. # A[3] * B[2]
  7624. movq 16(%rbx), %rax
  7625. mulq 24(%rsi)
  7626. addq %rax, %r13
  7627. adcq %rdx, %r14
  7628. adcq $0x00, %r15
  7629. # A[3] * B[3]
  7630. movq 24(%rbx), %rax
  7631. mulq 24(%rsi)
  7632. addq %rax, %r14
  7633. adcq %rdx, %r15
  7634. # Reduce
  7635. movq $0x7fffffffffffffff, %rcx
  7636. # Move top half into t4-t7 and remove top bit from t3
  7637. shldq $0x01, %r14, %r15
  7638. shldq $0x01, %r13, %r14
  7639. shldq $0x01, %r12, %r13
  7640. shldq $0x01, %r11, %r12
  7641. andq %rcx, %r11
  7642. # Multiply top half by 19
  7643. movq $19, %rax
  7644. mulq %r12
  7645. xorq %r12, %r12
  7646. addq %rax, %r8
  7647. movq $19, %rax
  7648. adcq %rdx, %r12
  7649. mulq %r13
  7650. xorq %r13, %r13
  7651. addq %rax, %r9
  7652. movq $19, %rax
  7653. adcq %rdx, %r13
  7654. mulq %r14
  7655. xorq %r14, %r14
  7656. addq %rax, %r10
  7657. movq $19, %rax
  7658. adcq %rdx, %r14
  7659. mulq %r15
  7660. # Add remaining product results in
  7661. addq %r12, %r9
  7662. adcq %r13, %r10
  7663. adcq %r14, %r11
  7664. adcq %rax, %r11
  7665. adcq $0x00, %rdx
  7666. # Overflow
  7667. shldq $0x01, %r11, %rdx
  7668. imulq $19, %rdx, %rax
  7669. andq %rcx, %r11
  7670. addq %rax, %r8
  7671. adcq $0x00, %r9
  7672. adcq $0x00, %r10
  7673. adcq $0x00, %r11
  7674. # Reduce if top bit set
  7675. movq %r11, %rdx
  7676. sarq $63, %rdx
  7677. andq $19, %rdx
  7678. andq %rcx, %r11
  7679. addq %rdx, %r8
  7680. adcq $0x00, %r9
  7681. adcq $0x00, %r10
  7682. adcq $0x00, %r11
  7683. # Store
  7684. movq %r8, (%rdi)
  7685. movq %r9, 8(%rdi)
  7686. movq %r10, 16(%rdi)
  7687. movq %r11, 24(%rdi)
  7688. leaq 48(%rsp), %rdi
  7689. movq 128(%rsp), %rsi
  7690. movq 128(%rsp), %rbx
  7691. # Add
  7692. movq (%rsi), %r8
  7693. movq 8(%rsi), %r9
  7694. addq (%rbx), %r8
  7695. movq 16(%rsi), %r10
  7696. adcq 8(%rbx), %r9
  7697. movq 24(%rsi), %rcx
  7698. adcq 16(%rbx), %r10
  7699. movq $-19, %rax
  7700. adcq 24(%rbx), %rcx
  7701. movq $0x7fffffffffffffff, %rdx
  7702. movq %rcx, %r11
  7703. sarq $63, %rcx
  7704. # Mask the modulus
  7705. andq %rcx, %rax
  7706. andq %rcx, %rdx
  7707. # Sub modulus (if overflow)
  7708. subq %rax, %r8
  7709. sbbq %rcx, %r9
  7710. sbbq %rcx, %r10
  7711. sbbq %rdx, %r11
  7712. movq %r8, (%rdi)
  7713. movq %r9, 8(%rdi)
  7714. movq %r10, 16(%rdi)
  7715. movq %r11, 24(%rdi)
  7716. movq (%rsp), %rdi
  7717. movq 16(%rsp), %rsi
  7718. movq 8(%rsp), %rbx
  7719. # Sub
  7720. movq (%rsi), %r8
  7721. movq 8(%rsi), %r9
  7722. movq 16(%rsi), %r10
  7723. movq 24(%rsi), %r11
  7724. subq (%rbx), %r8
  7725. movq $0x00, %rcx
  7726. sbbq 8(%rbx), %r9
  7727. movq $-19, %rax
  7728. sbbq 16(%rbx), %r10
  7729. movq $0x7fffffffffffffff, %rdx
  7730. sbbq 24(%rbx), %r11
  7731. sbbq $0x00, %rcx
  7732. # Mask the modulus
  7733. andq %rcx, %rax
  7734. andq %rcx, %rdx
  7735. # Add modulus (if underflow)
  7736. addq %rax, %r8
  7737. adcq %rcx, %r9
  7738. adcq %rcx, %r10
  7739. adcq %rdx, %r11
  7740. movq %r8, (%rdi)
  7741. movq %r9, 8(%rdi)
  7742. movq %r10, 16(%rdi)
  7743. movq %r11, 24(%rdi)
  7744. movq 8(%rsp), %rdi
  7745. movq 16(%rsp), %rsi
  7746. movq 8(%rsp), %rbx
  7747. # Add
  7748. movq (%rsi), %r8
  7749. movq 8(%rsi), %r9
  7750. addq (%rbx), %r8
  7751. movq 16(%rsi), %r10
  7752. adcq 8(%rbx), %r9
  7753. movq 24(%rsi), %rcx
  7754. adcq 16(%rbx), %r10
  7755. movq $-19, %rax
  7756. adcq 24(%rbx), %rcx
  7757. movq $0x7fffffffffffffff, %rdx
  7758. movq %rcx, %r11
  7759. sarq $63, %rcx
  7760. # Mask the modulus
  7761. andq %rcx, %rax
  7762. andq %rcx, %rdx
  7763. # Sub modulus (if overflow)
  7764. subq %rax, %r8
  7765. sbbq %rcx, %r9
  7766. sbbq %rcx, %r10
  7767. sbbq %rdx, %r11
  7768. movq %r8, (%rdi)
  7769. movq %r9, 8(%rdi)
  7770. movq %r10, 16(%rdi)
  7771. movq %r11, 24(%rdi)
  7772. movq 16(%rsp), %rdi
  7773. leaq 48(%rsp), %rsi
  7774. movq 24(%rsp), %rbx
  7775. # Sub
  7776. movq (%rsi), %r8
  7777. movq 8(%rsi), %r9
  7778. movq 16(%rsi), %r10
  7779. movq 24(%rsi), %r11
  7780. subq (%rbx), %r8
  7781. movq $0x00, %rcx
  7782. sbbq 8(%rbx), %r9
  7783. movq $-19, %rax
  7784. sbbq 16(%rbx), %r10
  7785. movq $0x7fffffffffffffff, %rdx
  7786. sbbq 24(%rbx), %r11
  7787. sbbq $0x00, %rcx
  7788. # Mask the modulus
  7789. andq %rcx, %rax
  7790. andq %rcx, %rdx
  7791. # Add modulus (if underflow)
  7792. addq %rax, %r8
  7793. adcq %rcx, %r9
  7794. adcq %rcx, %r10
  7795. adcq %rdx, %r11
  7796. movq %r8, (%rdi)
  7797. movq %r9, 8(%rdi)
  7798. movq %r10, 16(%rdi)
  7799. movq %r11, 24(%rdi)
  7800. movq 24(%rsp), %rdi
  7801. leaq 48(%rsp), %rsi
  7802. movq 24(%rsp), %rbx
  7803. # Add
  7804. movq (%rsi), %r8
  7805. movq 8(%rsi), %r9
  7806. addq (%rbx), %r8
  7807. movq 16(%rsi), %r10
  7808. adcq 8(%rbx), %r9
  7809. movq 24(%rsi), %rcx
  7810. adcq 16(%rbx), %r10
  7811. movq $-19, %rax
  7812. adcq 24(%rbx), %rcx
  7813. movq $0x7fffffffffffffff, %rdx
  7814. movq %rcx, %r11
  7815. sarq $63, %rcx
  7816. # Mask the modulus
  7817. andq %rcx, %rax
  7818. andq %rcx, %rdx
  7819. # Sub modulus (if overflow)
  7820. subq %rax, %r8
  7821. sbbq %rcx, %r9
  7822. sbbq %rcx, %r10
  7823. sbbq %rdx, %r11
  7824. movq %r8, (%rdi)
  7825. movq %r9, 8(%rdi)
  7826. movq %r10, 16(%rdi)
  7827. movq %r11, 24(%rdi)
  7828. addq $0x50, %rsp
  7829. popq %r15
  7830. popq %r14
  7831. popq %r13
  7832. popq %r12
  7833. popq %rbx
  7834. repz retq
  7835. #ifndef __APPLE__
  7836. .size fe_ge_msub_x64,.-fe_ge_msub_x64
  7837. #endif /* __APPLE__ */
  7838. #ifndef __APPLE__
  7839. .text
  7840. .globl fe_ge_add_x64
  7841. .type fe_ge_add_x64,@function
  7842. .align 16
  7843. fe_ge_add_x64:
  7844. #else
  7845. .section __TEXT,__text
  7846. .globl _fe_ge_add_x64
  7847. .p2align 4
  7848. _fe_ge_add_x64:
  7849. #endif /* __APPLE__ */
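# Note: full point addition variant built from the same add, sub, multiply
# and reduce blocks. Twelve field-element pointers are used: six in
# registers and six on the caller's stack at 128(%rsp)..168(%rsp).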
  7850. pushq %rbx
  7851. pushq %r12
  7852. pushq %r13
  7853. pushq %r14
  7854. pushq %r15
  7855. subq $0x50, %rsp
  7856. movq %rdi, (%rsp)
  7857. movq %rsi, 8(%rsp)
  7858. movq %rdx, 16(%rsp)
  7859. movq %rcx, 24(%rsp)
  7860. movq %r8, 32(%rsp)
  7861. movq %r9, 40(%rsp)
  7862. movq (%rsp), %rdi
  7863. movq 40(%rsp), %rsi
  7864. movq 32(%rsp), %rbx
  7865. # Add
  7866. movq (%rsi), %r8
  7867. movq 8(%rsi), %r9
  7868. addq (%rbx), %r8
  7869. movq 16(%rsi), %r10
  7870. adcq 8(%rbx), %r9
  7871. movq 24(%rsi), %rcx
  7872. adcq 16(%rbx), %r10
  7873. movq $-19, %rax
  7874. adcq 24(%rbx), %rcx
  7875. movq $0x7fffffffffffffff, %rdx
  7876. movq %rcx, %r11
  7877. sarq $63, %rcx
  7878. # Mask the modulus
  7879. andq %rcx, %rax
  7880. andq %rcx, %rdx
  7881. # Sub modulus (if overflow)
  7882. subq %rax, %r8
  7883. sbbq %rcx, %r9
  7884. sbbq %rcx, %r10
  7885. sbbq %rdx, %r11
  7886. movq %r8, (%rdi)
  7887. movq %r9, 8(%rdi)
  7888. movq %r10, 16(%rdi)
  7889. movq %r11, 24(%rdi)
  7890. movq 8(%rsp), %rdi
  7891. movq 40(%rsp), %rsi
  7892. movq 32(%rsp), %rbx
  7893. # Sub
  7894. movq (%rsi), %r8
  7895. movq 8(%rsi), %r9
  7896. movq 16(%rsi), %r10
  7897. movq 24(%rsi), %r11
  7898. subq (%rbx), %r8
  7899. movq $0x00, %rcx
  7900. sbbq 8(%rbx), %r9
  7901. movq $-19, %rax
  7902. sbbq 16(%rbx), %r10
  7903. movq $0x7fffffffffffffff, %rdx
  7904. sbbq 24(%rbx), %r11
  7905. sbbq $0x00, %rcx
  7906. # Mask the modulus
  7907. andq %rcx, %rax
  7908. andq %rcx, %rdx
  7909. # Add modulus (if underflow)
  7910. addq %rax, %r8
  7911. adcq %rcx, %r9
  7912. adcq %rcx, %r10
  7913. adcq %rdx, %r11
  7914. movq %r8, (%rdi)
  7915. movq %r9, 8(%rdi)
  7916. movq %r10, 16(%rdi)
  7917. movq %r11, 24(%rdi)
  7918. movq 16(%rsp), %rdi
  7919. movq (%rsp), %rsi
  7920. movq 160(%rsp), %rbx
  7921. # Multiply
  7922. # A[0] * B[0]
  7923. movq (%rbx), %rax
  7924. mulq (%rsi)
  7925. movq %rax, %r8
  7926. movq %rdx, %r9
  7927. # A[0] * B[1]
  7928. movq 8(%rbx), %rax
  7929. mulq (%rsi)
  7930. xorq %r10, %r10
  7931. addq %rax, %r9
  7932. adcq %rdx, %r10
  7933. # A[1] * B[0]
  7934. movq (%rbx), %rax
  7935. mulq 8(%rsi)
  7936. xorq %r11, %r11
  7937. addq %rax, %r9
  7938. adcq %rdx, %r10
  7939. adcq $0x00, %r11
  7940. # A[0] * B[2]
  7941. movq 16(%rbx), %rax
  7942. mulq (%rsi)
  7943. addq %rax, %r10
  7944. adcq %rdx, %r11
  7945. # A[1] * B[1]
  7946. movq 8(%rbx), %rax
  7947. mulq 8(%rsi)
  7948. xorq %r12, %r12
  7949. addq %rax, %r10
  7950. adcq %rdx, %r11
  7951. adcq $0x00, %r12
  7952. # A[2] * B[0]
  7953. movq (%rbx), %rax
  7954. mulq 16(%rsi)
  7955. addq %rax, %r10
  7956. adcq %rdx, %r11
  7957. adcq $0x00, %r12
  7958. # A[0] * B[3]
  7959. movq 24(%rbx), %rax
  7960. mulq (%rsi)
  7961. xorq %r13, %r13
  7962. addq %rax, %r11
  7963. adcq %rdx, %r12
  7964. adcq $0x00, %r13
  7965. # A[1] * B[2]
  7966. movq 16(%rbx), %rax
  7967. mulq 8(%rsi)
  7968. addq %rax, %r11
  7969. adcq %rdx, %r12
  7970. adcq $0x00, %r13
  7971. # A[2] * B[1]
  7972. movq 8(%rbx), %rax
  7973. mulq 16(%rsi)
  7974. addq %rax, %r11
  7975. adcq %rdx, %r12
  7976. adcq $0x00, %r13
  7977. # A[3] * B[0]
  7978. movq (%rbx), %rax
  7979. mulq 24(%rsi)
  7980. addq %rax, %r11
  7981. adcq %rdx, %r12
  7982. adcq $0x00, %r13
  7983. # A[1] * B[3]
  7984. movq 24(%rbx), %rax
  7985. mulq 8(%rsi)
  7986. xorq %r14, %r14
  7987. addq %rax, %r12
  7988. adcq %rdx, %r13
  7989. adcq $0x00, %r14
  7990. # A[2] * B[2]
  7991. movq 16(%rbx), %rax
  7992. mulq 16(%rsi)
  7993. addq %rax, %r12
  7994. adcq %rdx, %r13
  7995. adcq $0x00, %r14
  7996. # A[3] * B[1]
  7997. movq 8(%rbx), %rax
  7998. mulq 24(%rsi)
  7999. addq %rax, %r12
  8000. adcq %rdx, %r13
  8001. adcq $0x00, %r14
  8002. # A[2] * B[3]
  8003. movq 24(%rbx), %rax
  8004. mulq 16(%rsi)
  8005. xorq %r15, %r15
  8006. addq %rax, %r13
  8007. adcq %rdx, %r14
  8008. adcq $0x00, %r15
  8009. # A[3] * B[2]
  8010. movq 16(%rbx), %rax
  8011. mulq 24(%rsi)
  8012. addq %rax, %r13
  8013. adcq %rdx, %r14
  8014. adcq $0x00, %r15
  8015. # A[3] * B[3]
  8016. movq 24(%rbx), %rax
  8017. mulq 24(%rsi)
  8018. addq %rax, %r14
  8019. adcq %rdx, %r15
  8020. # Reduce
  8021. movq $0x7fffffffffffffff, %rcx
  8022. # Move top half into t4-t7 and remove top bit from t3
  8023. shldq $0x01, %r14, %r15
  8024. shldq $0x01, %r13, %r14
  8025. shldq $0x01, %r12, %r13
  8026. shldq $0x01, %r11, %r12
  8027. andq %rcx, %r11
  8028. # Multiply top half by 19
  8029. movq $19, %rax
  8030. mulq %r12
  8031. xorq %r12, %r12
  8032. addq %rax, %r8
  8033. movq $19, %rax
  8034. adcq %rdx, %r12
  8035. mulq %r13
  8036. xorq %r13, %r13
  8037. addq %rax, %r9
  8038. movq $19, %rax
  8039. adcq %rdx, %r13
  8040. mulq %r14
  8041. xorq %r14, %r14
  8042. addq %rax, %r10
  8043. movq $19, %rax
  8044. adcq %rdx, %r14
  8045. mulq %r15
  8046. # Add remaining product results in
  8047. addq %r12, %r9
  8048. adcq %r13, %r10
  8049. adcq %r14, %r11
  8050. adcq %rax, %r11
  8051. adcq $0x00, %rdx
  8052. # Overflow
  8053. shldq $0x01, %r11, %rdx
  8054. imulq $19, %rdx, %rax
  8055. andq %rcx, %r11
  8056. addq %rax, %r8
  8057. adcq $0x00, %r9
  8058. adcq $0x00, %r10
  8059. adcq $0x00, %r11
  8060. # Reduce if top bit set
  8061. movq %r11, %rdx
  8062. sarq $63, %rdx
  8063. andq $19, %rdx
  8064. andq %rcx, %r11
  8065. addq %rdx, %r8
  8066. adcq $0x00, %r9
  8067. adcq $0x00, %r10
  8068. adcq $0x00, %r11
  8069. # Store
  8070. movq %r8, (%rdi)
  8071. movq %r9, 8(%rdi)
  8072. movq %r10, 16(%rdi)
  8073. movq %r11, 24(%rdi)
  8074. movq 8(%rsp), %rdi
  8075. movq 8(%rsp), %rsi
  8076. movq 168(%rsp), %rbx
  8077. # Multiply
  8078. # A[0] * B[0]
  8079. movq (%rbx), %rax
  8080. mulq (%rsi)
  8081. movq %rax, %r8
  8082. movq %rdx, %r9
  8083. # A[0] * B[1]
  8084. movq 8(%rbx), %rax
  8085. mulq (%rsi)
  8086. xorq %r10, %r10
  8087. addq %rax, %r9
  8088. adcq %rdx, %r10
  8089. # A[1] * B[0]
  8090. movq (%rbx), %rax
  8091. mulq 8(%rsi)
  8092. xorq %r11, %r11
  8093. addq %rax, %r9
  8094. adcq %rdx, %r10
  8095. adcq $0x00, %r11
  8096. # A[0] * B[2]
  8097. movq 16(%rbx), %rax
  8098. mulq (%rsi)
  8099. addq %rax, %r10
  8100. adcq %rdx, %r11
  8101. # A[1] * B[1]
  8102. movq 8(%rbx), %rax
  8103. mulq 8(%rsi)
  8104. xorq %r12, %r12
  8105. addq %rax, %r10
  8106. adcq %rdx, %r11
  8107. adcq $0x00, %r12
  8108. # A[2] * B[0]
  8109. movq (%rbx), %rax
  8110. mulq 16(%rsi)
  8111. addq %rax, %r10
  8112. adcq %rdx, %r11
  8113. adcq $0x00, %r12
  8114. # A[0] * B[3]
  8115. movq 24(%rbx), %rax
  8116. mulq (%rsi)
  8117. xorq %r13, %r13
  8118. addq %rax, %r11
  8119. adcq %rdx, %r12
  8120. adcq $0x00, %r13
  8121. # A[1] * B[2]
  8122. movq 16(%rbx), %rax
  8123. mulq 8(%rsi)
  8124. addq %rax, %r11
  8125. adcq %rdx, %r12
  8126. adcq $0x00, %r13
  8127. # A[2] * B[1]
  8128. movq 8(%rbx), %rax
  8129. mulq 16(%rsi)
  8130. addq %rax, %r11
  8131. adcq %rdx, %r12
  8132. adcq $0x00, %r13
  8133. # A[3] * B[0]
  8134. movq (%rbx), %rax
  8135. mulq 24(%rsi)
  8136. addq %rax, %r11
  8137. adcq %rdx, %r12
  8138. adcq $0x00, %r13
  8139. # A[1] * B[3]
  8140. movq 24(%rbx), %rax
  8141. mulq 8(%rsi)
  8142. xorq %r14, %r14
  8143. addq %rax, %r12
  8144. adcq %rdx, %r13
  8145. adcq $0x00, %r14
  8146. # A[2] * B[2]
  8147. movq 16(%rbx), %rax
  8148. mulq 16(%rsi)
  8149. addq %rax, %r12
  8150. adcq %rdx, %r13
  8151. adcq $0x00, %r14
  8152. # A[3] * B[1]
  8153. movq 8(%rbx), %rax
  8154. mulq 24(%rsi)
  8155. addq %rax, %r12
  8156. adcq %rdx, %r13
  8157. adcq $0x00, %r14
  8158. # A[2] * B[3]
  8159. movq 24(%rbx), %rax
  8160. mulq 16(%rsi)
  8161. xorq %r15, %r15
  8162. addq %rax, %r13
  8163. adcq %rdx, %r14
  8164. adcq $0x00, %r15
  8165. # A[3] * B[2]
  8166. movq 16(%rbx), %rax
  8167. mulq 24(%rsi)
  8168. addq %rax, %r13
  8169. adcq %rdx, %r14
  8170. adcq $0x00, %r15
  8171. # A[3] * B[3]
  8172. movq 24(%rbx), %rax
  8173. mulq 24(%rsi)
  8174. addq %rax, %r14
  8175. adcq %rdx, %r15
  8176. # Reduce
  8177. movq $0x7fffffffffffffff, %rcx
  8178. # Move top half into t4-t7 and remove top bit from t3
  8179. shldq $0x01, %r14, %r15
  8180. shldq $0x01, %r13, %r14
  8181. shldq $0x01, %r12, %r13
  8182. shldq $0x01, %r11, %r12
  8183. andq %rcx, %r11
  8184. # Multiply top half by 19
  8185. movq $19, %rax
  8186. mulq %r12
  8187. xorq %r12, %r12
  8188. addq %rax, %r8
  8189. movq $19, %rax
  8190. adcq %rdx, %r12
  8191. mulq %r13
  8192. xorq %r13, %r13
  8193. addq %rax, %r9
  8194. movq $19, %rax
  8195. adcq %rdx, %r13
  8196. mulq %r14
  8197. xorq %r14, %r14
  8198. addq %rax, %r10
  8199. movq $19, %rax
  8200. adcq %rdx, %r14
  8201. mulq %r15
  8202. # Add remaining product results in
  8203. addq %r12, %r9
  8204. adcq %r13, %r10
  8205. adcq %r14, %r11
  8206. adcq %rax, %r11
  8207. adcq $0x00, %rdx
  8208. # Overflow
  8209. shldq $0x01, %r11, %rdx
  8210. imulq $19, %rdx, %rax
  8211. andq %rcx, %r11
  8212. addq %rax, %r8
  8213. adcq $0x00, %r9
  8214. adcq $0x00, %r10
  8215. adcq $0x00, %r11
  8216. # Reduce if top bit set
  8217. movq %r11, %rdx
  8218. sarq $63, %rdx
  8219. andq $19, %rdx
  8220. andq %rcx, %r11
  8221. addq %rdx, %r8
  8222. adcq $0x00, %r9
  8223. adcq $0x00, %r10
  8224. adcq $0x00, %r11
  8225. # Store
  8226. movq %r8, (%rdi)
  8227. movq %r9, 8(%rdi)
  8228. movq %r10, 16(%rdi)
  8229. movq %r11, 24(%rdi)
  8230. movq 24(%rsp), %rdi
  8231. movq 152(%rsp), %rsi
  8232. movq 136(%rsp), %rbx
  8233. # Multiply
  8234. # A[0] * B[0]
  8235. movq (%rbx), %rax
  8236. mulq (%rsi)
  8237. movq %rax, %r8
  8238. movq %rdx, %r9
  8239. # A[0] * B[1]
  8240. movq 8(%rbx), %rax
  8241. mulq (%rsi)
  8242. xorq %r10, %r10
  8243. addq %rax, %r9
  8244. adcq %rdx, %r10
  8245. # A[1] * B[0]
  8246. movq (%rbx), %rax
  8247. mulq 8(%rsi)
  8248. xorq %r11, %r11
  8249. addq %rax, %r9
  8250. adcq %rdx, %r10
  8251. adcq $0x00, %r11
  8252. # A[0] * B[2]
  8253. movq 16(%rbx), %rax
  8254. mulq (%rsi)
  8255. addq %rax, %r10
  8256. adcq %rdx, %r11
  8257. # A[1] * B[1]
  8258. movq 8(%rbx), %rax
  8259. mulq 8(%rsi)
  8260. xorq %r12, %r12
  8261. addq %rax, %r10
  8262. adcq %rdx, %r11
  8263. adcq $0x00, %r12
  8264. # A[2] * B[0]
  8265. movq (%rbx), %rax
  8266. mulq 16(%rsi)
  8267. addq %rax, %r10
  8268. adcq %rdx, %r11
  8269. adcq $0x00, %r12
  8270. # A[0] * B[3]
  8271. movq 24(%rbx), %rax
  8272. mulq (%rsi)
  8273. xorq %r13, %r13
  8274. addq %rax, %r11
  8275. adcq %rdx, %r12
  8276. adcq $0x00, %r13
  8277. # A[1] * B[2]
  8278. movq 16(%rbx), %rax
  8279. mulq 8(%rsi)
  8280. addq %rax, %r11
  8281. adcq %rdx, %r12
  8282. adcq $0x00, %r13
  8283. # A[2] * B[1]
  8284. movq 8(%rbx), %rax
  8285. mulq 16(%rsi)
  8286. addq %rax, %r11
  8287. adcq %rdx, %r12
  8288. adcq $0x00, %r13
  8289. # A[3] * B[0]
  8290. movq (%rbx), %rax
  8291. mulq 24(%rsi)
  8292. addq %rax, %r11
  8293. adcq %rdx, %r12
  8294. adcq $0x00, %r13
  8295. # A[1] * B[3]
  8296. movq 24(%rbx), %rax
  8297. mulq 8(%rsi)
  8298. xorq %r14, %r14
  8299. addq %rax, %r12
  8300. adcq %rdx, %r13
  8301. adcq $0x00, %r14
  8302. # A[2] * B[2]
  8303. movq 16(%rbx), %rax
  8304. mulq 16(%rsi)
  8305. addq %rax, %r12
  8306. adcq %rdx, %r13
  8307. adcq $0x00, %r14
  8308. # A[3] * B[1]
  8309. movq 8(%rbx), %rax
  8310. mulq 24(%rsi)
  8311. addq %rax, %r12
  8312. adcq %rdx, %r13
  8313. adcq $0x00, %r14
  8314. # A[2] * B[3]
  8315. movq 24(%rbx), %rax
  8316. mulq 16(%rsi)
  8317. xorq %r15, %r15
  8318. addq %rax, %r13
  8319. adcq %rdx, %r14
  8320. adcq $0x00, %r15
  8321. # A[3] * B[2]
  8322. movq 16(%rbx), %rax
  8323. mulq 24(%rsi)
  8324. addq %rax, %r13
  8325. adcq %rdx, %r14
  8326. adcq $0x00, %r15
  8327. # A[3] * B[3]
  8328. movq 24(%rbx), %rax
  8329. mulq 24(%rsi)
  8330. addq %rax, %r14
  8331. adcq %rdx, %r15
  8332. # Reduce
  8333. movq $0x7fffffffffffffff, %rcx
  8334. # Move top half into t4-t7 and remove top bit from t3
  8335. shldq $0x01, %r14, %r15
  8336. shldq $0x01, %r13, %r14
  8337. shldq $0x01, %r12, %r13
  8338. shldq $0x01, %r11, %r12
  8339. andq %rcx, %r11
  8340. # Multiply top half by 19
  8341. movq $19, %rax
  8342. mulq %r12
  8343. xorq %r12, %r12
  8344. addq %rax, %r8
  8345. movq $19, %rax
  8346. adcq %rdx, %r12
  8347. mulq %r13
  8348. xorq %r13, %r13
  8349. addq %rax, %r9
  8350. movq $19, %rax
  8351. adcq %rdx, %r13
  8352. mulq %r14
  8353. xorq %r14, %r14
  8354. addq %rax, %r10
  8355. movq $19, %rax
  8356. adcq %rdx, %r14
  8357. mulq %r15
  8358. # Add remaining product results in
  8359. addq %r12, %r9
  8360. adcq %r13, %r10
  8361. adcq %r14, %r11
  8362. adcq %rax, %r11
  8363. adcq $0x00, %rdx
  8364. # Overflow
  8365. shldq $0x01, %r11, %rdx
  8366. imulq $19, %rdx, %rax
  8367. andq %rcx, %r11
  8368. addq %rax, %r8
  8369. adcq $0x00, %r9
  8370. adcq $0x00, %r10
  8371. adcq $0x00, %r11
  8372. # Reduce if top bit set
  8373. movq %r11, %rdx
  8374. sarq $63, %rdx
  8375. andq $19, %rdx
  8376. andq %rcx, %r11
  8377. addq %rdx, %r8
  8378. adcq $0x00, %r9
  8379. adcq $0x00, %r10
  8380. adcq $0x00, %r11
  8381. # Store
  8382. movq %r8, (%rdi)
  8383. movq %r9, 8(%rdi)
  8384. movq %r10, 16(%rdi)
  8385. movq %r11, 24(%rdi)
  8386. movq (%rsp), %rdi
  8387. movq 128(%rsp), %rsi
  8388. movq 144(%rsp), %rbx
  8389. # Multiply
  8390. # A[0] * B[0]
  8391. movq (%rbx), %rax
  8392. mulq (%rsi)
  8393. movq %rax, %r8
  8394. movq %rdx, %r9
  8395. # A[0] * B[1]
  8396. movq 8(%rbx), %rax
  8397. mulq (%rsi)
  8398. xorq %r10, %r10
  8399. addq %rax, %r9
  8400. adcq %rdx, %r10
  8401. # A[1] * B[0]
  8402. movq (%rbx), %rax
  8403. mulq 8(%rsi)
  8404. xorq %r11, %r11
  8405. addq %rax, %r9
  8406. adcq %rdx, %r10
  8407. adcq $0x00, %r11
  8408. # A[0] * B[2]
  8409. movq 16(%rbx), %rax
  8410. mulq (%rsi)
  8411. addq %rax, %r10
  8412. adcq %rdx, %r11
  8413. # A[1] * B[1]
  8414. movq 8(%rbx), %rax
  8415. mulq 8(%rsi)
  8416. xorq %r12, %r12
  8417. addq %rax, %r10
  8418. adcq %rdx, %r11
  8419. adcq $0x00, %r12
  8420. # A[2] * B[0]
  8421. movq (%rbx), %rax
  8422. mulq 16(%rsi)
  8423. addq %rax, %r10
  8424. adcq %rdx, %r11
  8425. adcq $0x00, %r12
  8426. # A[0] * B[3]
  8427. movq 24(%rbx), %rax
  8428. mulq (%rsi)
  8429. xorq %r13, %r13
  8430. addq %rax, %r11
  8431. adcq %rdx, %r12
  8432. adcq $0x00, %r13
  8433. # A[1] * B[2]
  8434. movq 16(%rbx), %rax
  8435. mulq 8(%rsi)
  8436. addq %rax, %r11
  8437. adcq %rdx, %r12
  8438. adcq $0x00, %r13
  8439. # A[2] * B[1]
  8440. movq 8(%rbx), %rax
  8441. mulq 16(%rsi)
  8442. addq %rax, %r11
  8443. adcq %rdx, %r12
  8444. adcq $0x00, %r13
  8445. # A[3] * B[0]
  8446. movq (%rbx), %rax
  8447. mulq 24(%rsi)
  8448. addq %rax, %r11
  8449. adcq %rdx, %r12
  8450. adcq $0x00, %r13
  8451. # A[1] * B[3]
  8452. movq 24(%rbx), %rax
  8453. mulq 8(%rsi)
  8454. xorq %r14, %r14
  8455. addq %rax, %r12
  8456. adcq %rdx, %r13
  8457. adcq $0x00, %r14
  8458. # A[2] * B[2]
  8459. movq 16(%rbx), %rax
  8460. mulq 16(%rsi)
  8461. addq %rax, %r12
  8462. adcq %rdx, %r13
  8463. adcq $0x00, %r14
  8464. # A[3] * B[1]
  8465. movq 8(%rbx), %rax
  8466. mulq 24(%rsi)
  8467. addq %rax, %r12
  8468. adcq %rdx, %r13
  8469. adcq $0x00, %r14
  8470. # A[2] * B[3]
  8471. movq 24(%rbx), %rax
  8472. mulq 16(%rsi)
  8473. xorq %r15, %r15
  8474. addq %rax, %r13
  8475. adcq %rdx, %r14
  8476. adcq $0x00, %r15
  8477. # A[3] * B[2]
  8478. movq 16(%rbx), %rax
  8479. mulq 24(%rsi)
  8480. addq %rax, %r13
  8481. adcq %rdx, %r14
  8482. adcq $0x00, %r15
  8483. # A[3] * B[3]
  8484. movq 24(%rbx), %rax
  8485. mulq 24(%rsi)
  8486. addq %rax, %r14
  8487. adcq %rdx, %r15
  8488. # Reduce
  8489. movq $0x7fffffffffffffff, %rcx
  8490. # Move top half into t4-t7 and remove top bit from t3
  8491. shldq $0x01, %r14, %r15
  8492. shldq $0x01, %r13, %r14
  8493. shldq $0x01, %r12, %r13
  8494. shldq $0x01, %r11, %r12
  8495. andq %rcx, %r11
  8496. # Multiply top half by 19
  8497. movq $19, %rax
  8498. mulq %r12
  8499. xorq %r12, %r12
  8500. addq %rax, %r8
  8501. movq $19, %rax
  8502. adcq %rdx, %r12
  8503. mulq %r13
  8504. xorq %r13, %r13
  8505. addq %rax, %r9
  8506. movq $19, %rax
  8507. adcq %rdx, %r13
  8508. mulq %r14
  8509. xorq %r14, %r14
  8510. addq %rax, %r10
  8511. movq $19, %rax
  8512. adcq %rdx, %r14
  8513. mulq %r15
  8514. # Add remaining product results in
  8515. addq %r12, %r9
  8516. adcq %r13, %r10
  8517. adcq %r14, %r11
  8518. adcq %rax, %r11
  8519. adcq $0x00, %rdx
  8520. # Overflow
  8521. shldq $0x01, %r11, %rdx
  8522. imulq $19, %rdx, %rax
  8523. andq %rcx, %r11
  8524. addq %rax, %r8
  8525. adcq $0x00, %r9
  8526. adcq $0x00, %r10
  8527. adcq $0x00, %r11
  8528. # Reduce if top bit set
  8529. movq %r11, %rdx
  8530. sarq $63, %rdx
  8531. andq $19, %rdx
  8532. andq %rcx, %r11
  8533. addq %rdx, %r8
  8534. adcq $0x00, %r9
  8535. adcq $0x00, %r10
  8536. adcq $0x00, %r11
  8537. # Store
  8538. movq %r8, (%rdi)
  8539. movq %r9, 8(%rdi)
  8540. movq %r10, 16(%rdi)
  8541. movq %r11, 24(%rdi)
  8542. leaq 48(%rsp), %rdi
  8543. movq (%rsp), %rsi
  8544. movq (%rsp), %rbx
  8545. # Add
  8546. movq (%rsi), %r8
  8547. movq 8(%rsi), %r9
  8548. addq (%rbx), %r8
  8549. movq 16(%rsi), %r10
  8550. adcq 8(%rbx), %r9
  8551. movq 24(%rsi), %rcx
  8552. adcq 16(%rbx), %r10
  8553. movq $-19, %rax
  8554. adcq 24(%rbx), %rcx
  8555. movq $0x7fffffffffffffff, %rdx
  8556. movq %rcx, %r11
  8557. sarq $63, %rcx
  8558. # Mask the modulus
  8559. andq %rcx, %rax
  8560. andq %rcx, %rdx
  8561. # Sub modulus (if overflow)
  8562. subq %rax, %r8
  8563. sbbq %rcx, %r9
  8564. sbbq %rcx, %r10
  8565. sbbq %rdx, %r11
  8566. movq %r8, (%rdi)
  8567. movq %r9, 8(%rdi)
  8568. movq %r10, 16(%rdi)
  8569. movq %r11, 24(%rdi)
  8570. movq (%rsp), %rdi
  8571. movq 16(%rsp), %rsi
  8572. movq 8(%rsp), %rbx
  8573. # Sub
  8574. movq (%rsi), %r8
  8575. movq 8(%rsi), %r9
  8576. movq 16(%rsi), %r10
  8577. movq 24(%rsi), %r11
  8578. subq (%rbx), %r8
  8579. movq $0x00, %rcx
  8580. sbbq 8(%rbx), %r9
  8581. movq $-19, %rax
  8582. sbbq 16(%rbx), %r10
  8583. movq $0x7fffffffffffffff, %rdx
  8584. sbbq 24(%rbx), %r11
  8585. sbbq $0x00, %rcx
  8586. # Mask the modulus
  8587. andq %rcx, %rax
  8588. andq %rcx, %rdx
  8589. # Add modulus (if underflow)
  8590. addq %rax, %r8
  8591. adcq %rcx, %r9
  8592. adcq %rcx, %r10
  8593. adcq %rdx, %r11
  8594. movq %r8, (%rdi)
  8595. movq %r9, 8(%rdi)
  8596. movq %r10, 16(%rdi)
  8597. movq %r11, 24(%rdi)
  8598. movq 8(%rsp), %rdi
  8599. movq 16(%rsp), %rsi
  8600. movq 8(%rsp), %rbx
  8601. # Add
  8602. movq (%rsi), %r8
  8603. movq 8(%rsi), %r9
  8604. addq (%rbx), %r8
  8605. movq 16(%rsi), %r10
  8606. adcq 8(%rbx), %r9
  8607. movq 24(%rsi), %rcx
  8608. adcq 16(%rbx), %r10
  8609. movq $-19, %rax
  8610. adcq 24(%rbx), %rcx
  8611. movq $0x7fffffffffffffff, %rdx
  8612. movq %rcx, %r11
  8613. sarq $63, %rcx
  8614. # Mask the modulus
  8615. andq %rcx, %rax
  8616. andq %rcx, %rdx
  8617. # Sub modulus (if overflow)
  8618. subq %rax, %r8
  8619. sbbq %rcx, %r9
  8620. sbbq %rcx, %r10
  8621. sbbq %rdx, %r11
  8622. movq %r8, (%rdi)
  8623. movq %r9, 8(%rdi)
  8624. movq %r10, 16(%rdi)
  8625. movq %r11, 24(%rdi)
  8626. movq 16(%rsp), %rdi
  8627. leaq 48(%rsp), %rsi
  8628. movq 24(%rsp), %rbx
  8629. # Add
  8630. movq (%rsi), %r8
  8631. movq 8(%rsi), %r9
  8632. addq (%rbx), %r8
  8633. movq 16(%rsi), %r10
  8634. adcq 8(%rbx), %r9
  8635. movq 24(%rsi), %rcx
  8636. adcq 16(%rbx), %r10
  8637. movq $-19, %rax
  8638. adcq 24(%rbx), %rcx
  8639. movq $0x7fffffffffffffff, %rdx
  8640. movq %rcx, %r11
  8641. sarq $63, %rcx
  8642. # Mask the modulus
  8643. andq %rcx, %rax
  8644. andq %rcx, %rdx
  8645. # Sub modulus (if overflow)
  8646. subq %rax, %r8
  8647. sbbq %rcx, %r9
  8648. sbbq %rcx, %r10
  8649. sbbq %rdx, %r11
  8650. movq %r8, (%rdi)
  8651. movq %r9, 8(%rdi)
  8652. movq %r10, 16(%rdi)
  8653. movq %r11, 24(%rdi)
  8654. movq 24(%rsp), %rdi
  8655. leaq 48(%rsp), %rsi
  8656. movq 24(%rsp), %rbx
  8657. # Sub
  8658. movq (%rsi), %r8
  8659. movq 8(%rsi), %r9
  8660. movq 16(%rsi), %r10
  8661. movq 24(%rsi), %r11
  8662. subq (%rbx), %r8
  8663. movq $0x00, %rcx
  8664. sbbq 8(%rbx), %r9
  8665. movq $-19, %rax
  8666. sbbq 16(%rbx), %r10
  8667. movq $0x7fffffffffffffff, %rdx
  8668. sbbq 24(%rbx), %r11
  8669. sbbq $0x00, %rcx
  8670. # Mask the modulus
  8671. andq %rcx, %rax
  8672. andq %rcx, %rdx
  8673. # Add modulus (if underflow)
  8674. addq %rax, %r8
  8675. adcq %rcx, %r9
  8676. adcq %rcx, %r10
  8677. adcq %rdx, %r11
  8678. movq %r8, (%rdi)
  8679. movq %r9, 8(%rdi)
  8680. movq %r10, 16(%rdi)
  8681. movq %r11, 24(%rdi)
  8682. addq $0x50, %rsp
  8683. popq %r15
  8684. popq %r14
  8685. popq %r13
  8686. popq %r12
  8687. popq %rbx
  8688. repz retq
  8689. #ifndef __APPLE__
  8690. .size fe_ge_add_x64,.-fe_ge_add_x64
  8691. #endif /* __APPLE__ */
  8692. #ifndef __APPLE__
  8693. .text
  8694. .globl fe_ge_sub_x64
  8695. .type fe_ge_sub_x64,@function
  8696. .align 16
  8697. fe_ge_sub_x64:
  8698. #else
  8699. .section __TEXT,__text
  8700. .globl _fe_ge_sub_x64
  8701. .p2align 4
  8702. _fe_ge_sub_x64:
  8703. #endif /* __APPLE__ */
  8704. pushq %rbx
  8705. pushq %r12
  8706. pushq %r13
  8707. pushq %r14
  8708. pushq %r15
  8709. subq $0x50, %rsp
  8710. movq %rdi, (%rsp)
  8711. movq %rsi, 8(%rsp)
  8712. movq %rdx, 16(%rsp)
  8713. movq %rcx, 24(%rsp)
  8714. movq %r8, 32(%rsp)
  8715. movq %r9, 40(%rsp)
  8716. movq (%rsp), %rdi
  8717. movq 40(%rsp), %rsi
  8718. movq 32(%rsp), %rbx
  8719. # Add
  8720. movq (%rsi), %r8
  8721. movq 8(%rsi), %r9
  8722. addq (%rbx), %r8
  8723. movq 16(%rsi), %r10
  8724. adcq 8(%rbx), %r9
  8725. movq 24(%rsi), %rcx
  8726. adcq 16(%rbx), %r10
  8727. movq $-19, %rax
  8728. adcq 24(%rbx), %rcx
  8729. movq $0x7fffffffffffffff, %rdx
  8730. movq %rcx, %r11
  8731. sarq $63, %rcx
  8732. # Mask the modulus
  8733. andq %rcx, %rax
  8734. andq %rcx, %rdx
  8735. # Sub modulus (if overflow)
  8736. subq %rax, %r8
  8737. sbbq %rcx, %r9
  8738. sbbq %rcx, %r10
  8739. sbbq %rdx, %r11
  8740. movq %r8, (%rdi)
  8741. movq %r9, 8(%rdi)
  8742. movq %r10, 16(%rdi)
  8743. movq %r11, 24(%rdi)
  8744. movq 8(%rsp), %rdi
  8745. movq 40(%rsp), %rsi
  8746. movq 32(%rsp), %rbx
  8747. # Sub
  8748. movq (%rsi), %r8
  8749. movq 8(%rsi), %r9
  8750. movq 16(%rsi), %r10
  8751. movq 24(%rsi), %r11
  8752. subq (%rbx), %r8
  8753. movq $0x00, %rcx
  8754. sbbq 8(%rbx), %r9
  8755. movq $-19, %rax
  8756. sbbq 16(%rbx), %r10
  8757. movq $0x7fffffffffffffff, %rdx
  8758. sbbq 24(%rbx), %r11
  8759. sbbq $0x00, %rcx
  8760. # Mask the modulus
  8761. andq %rcx, %rax
  8762. andq %rcx, %rdx
  8763. # Add modulus (if underflow)
  8764. addq %rax, %r8
  8765. adcq %rcx, %r9
  8766. adcq %rcx, %r10
  8767. adcq %rdx, %r11
  8768. movq %r8, (%rdi)
  8769. movq %r9, 8(%rdi)
  8770. movq %r10, 16(%rdi)
  8771. movq %r11, 24(%rdi)
  8772. movq 16(%rsp), %rdi
  8773. movq (%rsp), %rsi
  8774. movq 168(%rsp), %rbx
  8775. # Multiply
  8776. # A[0] * B[0]
  8777. movq (%rbx), %rax
  8778. mulq (%rsi)
  8779. movq %rax, %r8
  8780. movq %rdx, %r9
  8781. # A[0] * B[1]
  8782. movq 8(%rbx), %rax
  8783. mulq (%rsi)
  8784. xorq %r10, %r10
  8785. addq %rax, %r9
  8786. adcq %rdx, %r10
  8787. # A[1] * B[0]
  8788. movq (%rbx), %rax
  8789. mulq 8(%rsi)
  8790. xorq %r11, %r11
  8791. addq %rax, %r9
  8792. adcq %rdx, %r10
  8793. adcq $0x00, %r11
  8794. # A[0] * B[2]
  8795. movq 16(%rbx), %rax
  8796. mulq (%rsi)
  8797. addq %rax, %r10
  8798. adcq %rdx, %r11
  8799. # A[1] * B[1]
  8800. movq 8(%rbx), %rax
  8801. mulq 8(%rsi)
  8802. xorq %r12, %r12
  8803. addq %rax, %r10
  8804. adcq %rdx, %r11
  8805. adcq $0x00, %r12
  8806. # A[2] * B[0]
  8807. movq (%rbx), %rax
  8808. mulq 16(%rsi)
  8809. addq %rax, %r10
  8810. adcq %rdx, %r11
  8811. adcq $0x00, %r12
  8812. # A[0] * B[3]
  8813. movq 24(%rbx), %rax
  8814. mulq (%rsi)
  8815. xorq %r13, %r13
  8816. addq %rax, %r11
  8817. adcq %rdx, %r12
  8818. adcq $0x00, %r13
  8819. # A[1] * B[2]
  8820. movq 16(%rbx), %rax
  8821. mulq 8(%rsi)
  8822. addq %rax, %r11
  8823. adcq %rdx, %r12
  8824. adcq $0x00, %r13
  8825. # A[2] * B[1]
  8826. movq 8(%rbx), %rax
  8827. mulq 16(%rsi)
  8828. addq %rax, %r11
  8829. adcq %rdx, %r12
  8830. adcq $0x00, %r13
  8831. # A[3] * B[0]
  8832. movq (%rbx), %rax
  8833. mulq 24(%rsi)
  8834. addq %rax, %r11
  8835. adcq %rdx, %r12
  8836. adcq $0x00, %r13
  8837. # A[1] * B[3]
  8838. movq 24(%rbx), %rax
  8839. mulq 8(%rsi)
  8840. xorq %r14, %r14
  8841. addq %rax, %r12
  8842. adcq %rdx, %r13
  8843. adcq $0x00, %r14
  8844. # A[2] * B[2]
  8845. movq 16(%rbx), %rax
  8846. mulq 16(%rsi)
  8847. addq %rax, %r12
  8848. adcq %rdx, %r13
  8849. adcq $0x00, %r14
  8850. # A[3] * B[1]
  8851. movq 8(%rbx), %rax
  8852. mulq 24(%rsi)
  8853. addq %rax, %r12
  8854. adcq %rdx, %r13
  8855. adcq $0x00, %r14
  8856. # A[2] * B[3]
  8857. movq 24(%rbx), %rax
  8858. mulq 16(%rsi)
  8859. xorq %r15, %r15
  8860. addq %rax, %r13
  8861. adcq %rdx, %r14
  8862. adcq $0x00, %r15
  8863. # A[3] * B[2]
  8864. movq 16(%rbx), %rax
  8865. mulq 24(%rsi)
  8866. addq %rax, %r13
  8867. adcq %rdx, %r14
  8868. adcq $0x00, %r15
  8869. # A[3] * B[3]
  8870. movq 24(%rbx), %rax
  8871. mulq 24(%rsi)
  8872. addq %rax, %r14
  8873. adcq %rdx, %r15
  8874. # Reduce
  8875. movq $0x7fffffffffffffff, %rcx
  8876. # Move top half into t4-t7 and remove top bit from t3
  8877. shldq $0x01, %r14, %r15
  8878. shldq $0x01, %r13, %r14
  8879. shldq $0x01, %r12, %r13
  8880. shldq $0x01, %r11, %r12
  8881. andq %rcx, %r11
  8882. # Multiply top half by 19
  8883. movq $19, %rax
  8884. mulq %r12
  8885. xorq %r12, %r12
  8886. addq %rax, %r8
  8887. movq $19, %rax
  8888. adcq %rdx, %r12
  8889. mulq %r13
  8890. xorq %r13, %r13
  8891. addq %rax, %r9
  8892. movq $19, %rax
  8893. adcq %rdx, %r13
  8894. mulq %r14
  8895. xorq %r14, %r14
  8896. addq %rax, %r10
  8897. movq $19, %rax
  8898. adcq %rdx, %r14
  8899. mulq %r15
  8900. # Add remaining product results in
  8901. addq %r12, %r9
  8902. adcq %r13, %r10
  8903. adcq %r14, %r11
  8904. adcq %rax, %r11
  8905. adcq $0x00, %rdx
  8906. # Overflow
  8907. shldq $0x01, %r11, %rdx
  8908. imulq $19, %rdx, %rax
  8909. andq %rcx, %r11
  8910. addq %rax, %r8
  8911. adcq $0x00, %r9
  8912. adcq $0x00, %r10
  8913. adcq $0x00, %r11
  8914. # Reduce if top bit set
  8915. movq %r11, %rdx
  8916. sarq $63, %rdx
  8917. andq $19, %rdx
  8918. andq %rcx, %r11
  8919. addq %rdx, %r8
  8920. adcq $0x00, %r9
  8921. adcq $0x00, %r10
  8922. adcq $0x00, %r11
  8923. # Store
  8924. movq %r8, (%rdi)
  8925. movq %r9, 8(%rdi)
  8926. movq %r10, 16(%rdi)
  8927. movq %r11, 24(%rdi)
  8928. movq 8(%rsp), %rdi
  8929. movq 8(%rsp), %rsi
  8930. movq 160(%rsp), %rbx
  8931. # Multiply
  8932. # A[0] * B[0]
  8933. movq (%rbx), %rax
  8934. mulq (%rsi)
  8935. movq %rax, %r8
  8936. movq %rdx, %r9
  8937. # A[0] * B[1]
  8938. movq 8(%rbx), %rax
  8939. mulq (%rsi)
  8940. xorq %r10, %r10
  8941. addq %rax, %r9
  8942. adcq %rdx, %r10
  8943. # A[1] * B[0]
  8944. movq (%rbx), %rax
  8945. mulq 8(%rsi)
  8946. xorq %r11, %r11
  8947. addq %rax, %r9
  8948. adcq %rdx, %r10
  8949. adcq $0x00, %r11
  8950. # A[0] * B[2]
  8951. movq 16(%rbx), %rax
  8952. mulq (%rsi)
  8953. addq %rax, %r10
  8954. adcq %rdx, %r11
  8955. # A[1] * B[1]
  8956. movq 8(%rbx), %rax
  8957. mulq 8(%rsi)
  8958. xorq %r12, %r12
  8959. addq %rax, %r10
  8960. adcq %rdx, %r11
  8961. adcq $0x00, %r12
  8962. # A[2] * B[0]
  8963. movq (%rbx), %rax
  8964. mulq 16(%rsi)
  8965. addq %rax, %r10
  8966. adcq %rdx, %r11
  8967. adcq $0x00, %r12
  8968. # A[0] * B[3]
  8969. movq 24(%rbx), %rax
  8970. mulq (%rsi)
  8971. xorq %r13, %r13
  8972. addq %rax, %r11
  8973. adcq %rdx, %r12
  8974. adcq $0x00, %r13
  8975. # A[1] * B[2]
  8976. movq 16(%rbx), %rax
  8977. mulq 8(%rsi)
  8978. addq %rax, %r11
  8979. adcq %rdx, %r12
  8980. adcq $0x00, %r13
  8981. # A[2] * B[1]
  8982. movq 8(%rbx), %rax
  8983. mulq 16(%rsi)
  8984. addq %rax, %r11
  8985. adcq %rdx, %r12
  8986. adcq $0x00, %r13
  8987. # A[3] * B[0]
  8988. movq (%rbx), %rax
  8989. mulq 24(%rsi)
  8990. addq %rax, %r11
  8991. adcq %rdx, %r12
  8992. adcq $0x00, %r13
  8993. # A[1] * B[3]
  8994. movq 24(%rbx), %rax
  8995. mulq 8(%rsi)
  8996. xorq %r14, %r14
  8997. addq %rax, %r12
  8998. adcq %rdx, %r13
  8999. adcq $0x00, %r14
  9000. # A[2] * B[2]
  9001. movq 16(%rbx), %rax
  9002. mulq 16(%rsi)
  9003. addq %rax, %r12
  9004. adcq %rdx, %r13
  9005. adcq $0x00, %r14
  9006. # A[3] * B[1]
  9007. movq 8(%rbx), %rax
  9008. mulq 24(%rsi)
  9009. addq %rax, %r12
  9010. adcq %rdx, %r13
  9011. adcq $0x00, %r14
  9012. # A[2] * B[3]
  9013. movq 24(%rbx), %rax
  9014. mulq 16(%rsi)
  9015. xorq %r15, %r15
  9016. addq %rax, %r13
  9017. adcq %rdx, %r14
  9018. adcq $0x00, %r15
  9019. # A[3] * B[2]
  9020. movq 16(%rbx), %rax
  9021. mulq 24(%rsi)
  9022. addq %rax, %r13
  9023. adcq %rdx, %r14
  9024. adcq $0x00, %r15
  9025. # A[3] * B[3]
  9026. movq 24(%rbx), %rax
  9027. mulq 24(%rsi)
  9028. addq %rax, %r14
  9029. adcq %rdx, %r15
  9030. # Reduce
  9031. movq $0x7fffffffffffffff, %rcx
  9032. # Move top half into t4-t7 and remove top bit from t3
  9033. shldq $0x01, %r14, %r15
  9034. shldq $0x01, %r13, %r14
  9035. shldq $0x01, %r12, %r13
  9036. shldq $0x01, %r11, %r12
  9037. andq %rcx, %r11
  9038. # Multiply top half by 19
  9039. movq $19, %rax
  9040. mulq %r12
  9041. xorq %r12, %r12
  9042. addq %rax, %r8
  9043. movq $19, %rax
  9044. adcq %rdx, %r12
  9045. mulq %r13
  9046. xorq %r13, %r13
  9047. addq %rax, %r9
  9048. movq $19, %rax
  9049. adcq %rdx, %r13
  9050. mulq %r14
  9051. xorq %r14, %r14
  9052. addq %rax, %r10
  9053. movq $19, %rax
  9054. adcq %rdx, %r14
  9055. mulq %r15
  9056. # Add remaining product results in
  9057. addq %r12, %r9
  9058. adcq %r13, %r10
  9059. adcq %r14, %r11
  9060. adcq %rax, %r11
  9061. adcq $0x00, %rdx
  9062. # Overflow
  9063. shldq $0x01, %r11, %rdx
  9064. imulq $19, %rdx, %rax
  9065. andq %rcx, %r11
  9066. addq %rax, %r8
  9067. adcq $0x00, %r9
  9068. adcq $0x00, %r10
  9069. adcq $0x00, %r11
  9070. # Reduce if top bit set
  9071. movq %r11, %rdx
  9072. sarq $63, %rdx
  9073. andq $19, %rdx
  9074. andq %rcx, %r11
  9075. addq %rdx, %r8
  9076. adcq $0x00, %r9
  9077. adcq $0x00, %r10
  9078. adcq $0x00, %r11
  9079. # Store
  9080. movq %r8, (%rdi)
  9081. movq %r9, 8(%rdi)
  9082. movq %r10, 16(%rdi)
  9083. movq %r11, 24(%rdi)
  9084. movq 24(%rsp), %rdi
  9085. movq 152(%rsp), %rsi
  9086. movq 136(%rsp), %rbx
  9087. # Multiply
  9088. # A[0] * B[0]
  9089. movq (%rbx), %rax
  9090. mulq (%rsi)
  9091. movq %rax, %r8
  9092. movq %rdx, %r9
  9093. # A[0] * B[1]
  9094. movq 8(%rbx), %rax
  9095. mulq (%rsi)
  9096. xorq %r10, %r10
  9097. addq %rax, %r9
  9098. adcq %rdx, %r10
  9099. # A[1] * B[0]
  9100. movq (%rbx), %rax
  9101. mulq 8(%rsi)
  9102. xorq %r11, %r11
  9103. addq %rax, %r9
  9104. adcq %rdx, %r10
  9105. adcq $0x00, %r11
  9106. # A[0] * B[2]
  9107. movq 16(%rbx), %rax
  9108. mulq (%rsi)
  9109. addq %rax, %r10
  9110. adcq %rdx, %r11
  9111. # A[1] * B[1]
  9112. movq 8(%rbx), %rax
  9113. mulq 8(%rsi)
  9114. xorq %r12, %r12
  9115. addq %rax, %r10
  9116. adcq %rdx, %r11
  9117. adcq $0x00, %r12
  9118. # A[2] * B[0]
  9119. movq (%rbx), %rax
  9120. mulq 16(%rsi)
  9121. addq %rax, %r10
  9122. adcq %rdx, %r11
  9123. adcq $0x00, %r12
  9124. # A[0] * B[3]
  9125. movq 24(%rbx), %rax
  9126. mulq (%rsi)
  9127. xorq %r13, %r13
  9128. addq %rax, %r11
  9129. adcq %rdx, %r12
  9130. adcq $0x00, %r13
  9131. # A[1] * B[2]
  9132. movq 16(%rbx), %rax
  9133. mulq 8(%rsi)
  9134. addq %rax, %r11
  9135. adcq %rdx, %r12
  9136. adcq $0x00, %r13
  9137. # A[2] * B[1]
  9138. movq 8(%rbx), %rax
  9139. mulq 16(%rsi)
  9140. addq %rax, %r11
  9141. adcq %rdx, %r12
  9142. adcq $0x00, %r13
  9143. # A[3] * B[0]
  9144. movq (%rbx), %rax
  9145. mulq 24(%rsi)
  9146. addq %rax, %r11
  9147. adcq %rdx, %r12
  9148. adcq $0x00, %r13
  9149. # A[1] * B[3]
  9150. movq 24(%rbx), %rax
  9151. mulq 8(%rsi)
  9152. xorq %r14, %r14
  9153. addq %rax, %r12
  9154. adcq %rdx, %r13
  9155. adcq $0x00, %r14
  9156. # A[2] * B[2]
  9157. movq 16(%rbx), %rax
  9158. mulq 16(%rsi)
  9159. addq %rax, %r12
  9160. adcq %rdx, %r13
  9161. adcq $0x00, %r14
  9162. # A[3] * B[1]
  9163. movq 8(%rbx), %rax
  9164. mulq 24(%rsi)
  9165. addq %rax, %r12
  9166. adcq %rdx, %r13
  9167. adcq $0x00, %r14
  9168. # A[2] * B[3]
  9169. movq 24(%rbx), %rax
  9170. mulq 16(%rsi)
  9171. xorq %r15, %r15
  9172. addq %rax, %r13
  9173. adcq %rdx, %r14
  9174. adcq $0x00, %r15
  9175. # A[3] * B[2]
  9176. movq 16(%rbx), %rax
  9177. mulq 24(%rsi)
  9178. addq %rax, %r13
  9179. adcq %rdx, %r14
  9180. adcq $0x00, %r15
  9181. # A[3] * B[3]
  9182. movq 24(%rbx), %rax
  9183. mulq 24(%rsi)
  9184. addq %rax, %r14
  9185. adcq %rdx, %r15
  9186. # Reduce
  9187. movq $0x7fffffffffffffff, %rcx
  9188. # Move top half into t4-t7 and remove top bit from t3
  9189. shldq $0x01, %r14, %r15
  9190. shldq $0x01, %r13, %r14
  9191. shldq $0x01, %r12, %r13
  9192. shldq $0x01, %r11, %r12
  9193. andq %rcx, %r11
  9194. # Multiply top half by 19
  9195. movq $19, %rax
  9196. mulq %r12
  9197. xorq %r12, %r12
  9198. addq %rax, %r8
  9199. movq $19, %rax
  9200. adcq %rdx, %r12
  9201. mulq %r13
  9202. xorq %r13, %r13
  9203. addq %rax, %r9
  9204. movq $19, %rax
  9205. adcq %rdx, %r13
  9206. mulq %r14
  9207. xorq %r14, %r14
  9208. addq %rax, %r10
  9209. movq $19, %rax
  9210. adcq %rdx, %r14
  9211. mulq %r15
  9212. # Add remaining product results in
  9213. addq %r12, %r9
  9214. adcq %r13, %r10
  9215. adcq %r14, %r11
  9216. adcq %rax, %r11
  9217. adcq $0x00, %rdx
  9218. # Overflow
  9219. shldq $0x01, %r11, %rdx
  9220. imulq $19, %rdx, %rax
  9221. andq %rcx, %r11
  9222. addq %rax, %r8
  9223. adcq $0x00, %r9
  9224. adcq $0x00, %r10
  9225. adcq $0x00, %r11
  9226. # Reduce if top bit set
  9227. movq %r11, %rdx
  9228. sarq $63, %rdx
  9229. andq $19, %rdx
  9230. andq %rcx, %r11
  9231. addq %rdx, %r8
  9232. adcq $0x00, %r9
  9233. adcq $0x00, %r10
  9234. adcq $0x00, %r11
  9235. # Store
  9236. movq %r8, (%rdi)
  9237. movq %r9, 8(%rdi)
  9238. movq %r10, 16(%rdi)
  9239. movq %r11, 24(%rdi)
  9240. movq (%rsp), %rdi
  9241. movq 128(%rsp), %rsi
  9242. movq 144(%rsp), %rbx
  9243. # Multiply
  9244. # A[0] * B[0]
  9245. movq (%rbx), %rax
  9246. mulq (%rsi)
  9247. movq %rax, %r8
  9248. movq %rdx, %r9
  9249. # A[0] * B[1]
  9250. movq 8(%rbx), %rax
  9251. mulq (%rsi)
  9252. xorq %r10, %r10
  9253. addq %rax, %r9
  9254. adcq %rdx, %r10
  9255. # A[1] * B[0]
  9256. movq (%rbx), %rax
  9257. mulq 8(%rsi)
  9258. xorq %r11, %r11
  9259. addq %rax, %r9
  9260. adcq %rdx, %r10
  9261. adcq $0x00, %r11
  9262. # A[0] * B[2]
  9263. movq 16(%rbx), %rax
  9264. mulq (%rsi)
  9265. addq %rax, %r10
  9266. adcq %rdx, %r11
  9267. # A[1] * B[1]
  9268. movq 8(%rbx), %rax
  9269. mulq 8(%rsi)
  9270. xorq %r12, %r12
  9271. addq %rax, %r10
  9272. adcq %rdx, %r11
  9273. adcq $0x00, %r12
  9274. # A[2] * B[0]
  9275. movq (%rbx), %rax
  9276. mulq 16(%rsi)
  9277. addq %rax, %r10
  9278. adcq %rdx, %r11
  9279. adcq $0x00, %r12
  9280. # A[0] * B[3]
  9281. movq 24(%rbx), %rax
  9282. mulq (%rsi)
  9283. xorq %r13, %r13
  9284. addq %rax, %r11
  9285. adcq %rdx, %r12
  9286. adcq $0x00, %r13
  9287. # A[1] * B[2]
  9288. movq 16(%rbx), %rax
  9289. mulq 8(%rsi)
  9290. addq %rax, %r11
  9291. adcq %rdx, %r12
  9292. adcq $0x00, %r13
  9293. # A[2] * B[1]
  9294. movq 8(%rbx), %rax
  9295. mulq 16(%rsi)
  9296. addq %rax, %r11
  9297. adcq %rdx, %r12
  9298. adcq $0x00, %r13
  9299. # A[3] * B[0]
  9300. movq (%rbx), %rax
  9301. mulq 24(%rsi)
  9302. addq %rax, %r11
  9303. adcq %rdx, %r12
  9304. adcq $0x00, %r13
  9305. # A[1] * B[3]
  9306. movq 24(%rbx), %rax
  9307. mulq 8(%rsi)
  9308. xorq %r14, %r14
  9309. addq %rax, %r12
  9310. adcq %rdx, %r13
  9311. adcq $0x00, %r14
  9312. # A[2] * B[2]
  9313. movq 16(%rbx), %rax
  9314. mulq 16(%rsi)
  9315. addq %rax, %r12
  9316. adcq %rdx, %r13
  9317. adcq $0x00, %r14
  9318. # A[3] * B[1]
  9319. movq 8(%rbx), %rax
  9320. mulq 24(%rsi)
  9321. addq %rax, %r12
  9322. adcq %rdx, %r13
  9323. adcq $0x00, %r14
  9324. # A[2] * B[3]
  9325. movq 24(%rbx), %rax
  9326. mulq 16(%rsi)
  9327. xorq %r15, %r15
  9328. addq %rax, %r13
  9329. adcq %rdx, %r14
  9330. adcq $0x00, %r15
  9331. # A[3] * B[2]
  9332. movq 16(%rbx), %rax
  9333. mulq 24(%rsi)
  9334. addq %rax, %r13
  9335. adcq %rdx, %r14
  9336. adcq $0x00, %r15
  9337. # A[3] * B[3]
  9338. movq 24(%rbx), %rax
  9339. mulq 24(%rsi)
  9340. addq %rax, %r14
  9341. adcq %rdx, %r15
  9342. # Reduce
  9343. movq $0x7fffffffffffffff, %rcx
  9344. # Move top half into t4-t7 and remove top bit from t3
  9345. shldq $0x01, %r14, %r15
  9346. shldq $0x01, %r13, %r14
  9347. shldq $0x01, %r12, %r13
  9348. shldq $0x01, %r11, %r12
  9349. andq %rcx, %r11
  9350. # Multiply top half by 19
  9351. movq $19, %rax
  9352. mulq %r12
  9353. xorq %r12, %r12
  9354. addq %rax, %r8
  9355. movq $19, %rax
  9356. adcq %rdx, %r12
  9357. mulq %r13
  9358. xorq %r13, %r13
  9359. addq %rax, %r9
  9360. movq $19, %rax
  9361. adcq %rdx, %r13
  9362. mulq %r14
  9363. xorq %r14, %r14
  9364. addq %rax, %r10
  9365. movq $19, %rax
  9366. adcq %rdx, %r14
  9367. mulq %r15
  9368. # Add remaining product results in
  9369. addq %r12, %r9
  9370. adcq %r13, %r10
  9371. adcq %r14, %r11
  9372. adcq %rax, %r11
  9373. adcq $0x00, %rdx
  9374. # Overflow
  9375. shldq $0x01, %r11, %rdx
  9376. imulq $19, %rdx, %rax
  9377. andq %rcx, %r11
  9378. addq %rax, %r8
  9379. adcq $0x00, %r9
  9380. adcq $0x00, %r10
  9381. adcq $0x00, %r11
  9382. # Reduce if top bit set
  9383. movq %r11, %rdx
  9384. sarq $63, %rdx
  9385. andq $19, %rdx
  9386. andq %rcx, %r11
  9387. addq %rdx, %r8
  9388. adcq $0x00, %r9
  9389. adcq $0x00, %r10
  9390. adcq $0x00, %r11
  9391. # Store
  9392. movq %r8, (%rdi)
  9393. movq %r9, 8(%rdi)
  9394. movq %r10, 16(%rdi)
  9395. movq %r11, 24(%rdi)
  9396. leaq 48(%rsp), %rdi
  9397. movq (%rsp), %rsi
  9398. movq (%rsp), %rbx
  9399. # Add
  9400. movq (%rsi), %r8
  9401. movq 8(%rsi), %r9
  9402. addq (%rbx), %r8
  9403. movq 16(%rsi), %r10
  9404. adcq 8(%rbx), %r9
  9405. movq 24(%rsi), %rcx
  9406. adcq 16(%rbx), %r10
  9407. movq $-19, %rax
  9408. adcq 24(%rbx), %rcx
  9409. movq $0x7fffffffffffffff, %rdx
  9410. movq %rcx, %r11
  9411. sarq $63, %rcx
  9412. # Mask the modulus
  9413. andq %rcx, %rax
  9414. andq %rcx, %rdx
  9415. # Sub modulus (if overflow)
  9416. subq %rax, %r8
  9417. sbbq %rcx, %r9
  9418. sbbq %rcx, %r10
  9419. sbbq %rdx, %r11
  9420. movq %r8, (%rdi)
  9421. movq %r9, 8(%rdi)
  9422. movq %r10, 16(%rdi)
  9423. movq %r11, 24(%rdi)
  9424. movq (%rsp), %rdi
  9425. movq 16(%rsp), %rsi
  9426. movq 8(%rsp), %rbx
  9427. # Sub
  9428. movq (%rsi), %r8
  9429. movq 8(%rsi), %r9
  9430. movq 16(%rsi), %r10
  9431. movq 24(%rsi), %r11
  9432. subq (%rbx), %r8
  9433. movq $0x00, %rcx
  9434. sbbq 8(%rbx), %r9
  9435. movq $-19, %rax
  9436. sbbq 16(%rbx), %r10
  9437. movq $0x7fffffffffffffff, %rdx
  9438. sbbq 24(%rbx), %r11
  9439. sbbq $0x00, %rcx
  9440. # Mask the modulus
  9441. andq %rcx, %rax
  9442. andq %rcx, %rdx
  9443. # Add modulus (if underflow)
  9444. addq %rax, %r8
  9445. adcq %rcx, %r9
  9446. adcq %rcx, %r10
  9447. adcq %rdx, %r11
  9448. movq %r8, (%rdi)
  9449. movq %r9, 8(%rdi)
  9450. movq %r10, 16(%rdi)
  9451. movq %r11, 24(%rdi)
  9452. movq 8(%rsp), %rdi
  9453. movq 16(%rsp), %rsi
  9454. movq 8(%rsp), %rbx
  9455. # Add
  9456. movq (%rsi), %r8
  9457. movq 8(%rsi), %r9
  9458. addq (%rbx), %r8
  9459. movq 16(%rsi), %r10
  9460. adcq 8(%rbx), %r9
  9461. movq 24(%rsi), %rcx
  9462. adcq 16(%rbx), %r10
  9463. movq $-19, %rax
  9464. adcq 24(%rbx), %rcx
  9465. movq $0x7fffffffffffffff, %rdx
  9466. movq %rcx, %r11
  9467. sarq $63, %rcx
  9468. # Mask the modulus
  9469. andq %rcx, %rax
  9470. andq %rcx, %rdx
  9471. # Sub modulus (if overflow)
  9472. subq %rax, %r8
  9473. sbbq %rcx, %r9
  9474. sbbq %rcx, %r10
  9475. sbbq %rdx, %r11
  9476. movq %r8, (%rdi)
  9477. movq %r9, 8(%rdi)
  9478. movq %r10, 16(%rdi)
  9479. movq %r11, 24(%rdi)
  9480. movq 16(%rsp), %rdi
  9481. leaq 48(%rsp), %rsi
  9482. movq 24(%rsp), %rbx
  9483. # Sub
  9484. movq (%rsi), %r8
  9485. movq 8(%rsi), %r9
  9486. movq 16(%rsi), %r10
  9487. movq 24(%rsi), %r11
  9488. subq (%rbx), %r8
  9489. movq $0x00, %rcx
  9490. sbbq 8(%rbx), %r9
  9491. movq $-19, %rax
  9492. sbbq 16(%rbx), %r10
  9493. movq $0x7fffffffffffffff, %rdx
  9494. sbbq 24(%rbx), %r11
  9495. sbbq $0x00, %rcx
  9496. # Mask the modulus
  9497. andq %rcx, %rax
  9498. andq %rcx, %rdx
  9499. # Add modulus (if underflow)
  9500. addq %rax, %r8
  9501. adcq %rcx, %r9
  9502. adcq %rcx, %r10
  9503. adcq %rdx, %r11
  9504. movq %r8, (%rdi)
  9505. movq %r9, 8(%rdi)
  9506. movq %r10, 16(%rdi)
  9507. movq %r11, 24(%rdi)
  9508. movq 24(%rsp), %rdi
  9509. leaq 48(%rsp), %rsi
  9510. movq 24(%rsp), %rbx
  9511. # Add
  9512. movq (%rsi), %r8
  9513. movq 8(%rsi), %r9
  9514. addq (%rbx), %r8
  9515. movq 16(%rsi), %r10
  9516. adcq 8(%rbx), %r9
  9517. movq 24(%rsi), %rcx
  9518. adcq 16(%rbx), %r10
  9519. movq $-19, %rax
  9520. adcq 24(%rbx), %rcx
  9521. movq $0x7fffffffffffffff, %rdx
  9522. movq %rcx, %r11
  9523. sarq $63, %rcx
  9524. # Mask the modulus
  9525. andq %rcx, %rax
  9526. andq %rcx, %rdx
  9527. # Sub modulus (if overflow)
  9528. subq %rax, %r8
  9529. sbbq %rcx, %r9
  9530. sbbq %rcx, %r10
  9531. sbbq %rdx, %r11
  9532. movq %r8, (%rdi)
  9533. movq %r9, 8(%rdi)
  9534. movq %r10, 16(%rdi)
  9535. movq %r11, 24(%rdi)
  9536. addq $0x50, %rsp
  9537. popq %r15
  9538. popq %r14
  9539. popq %r13
  9540. popq %r12
  9541. popq %rbx
  9542. repz retq
  9543. #ifndef __APPLE__
  9544. .size fe_ge_sub_x64,.-fe_ge_sub_x64
  9545. #endif /* __APPLE__ */
  9546. #ifdef HAVE_INTEL_AVX2
  9547. #ifndef __APPLE__
  9548. .text
  9549. .globl fe_mul_avx2
  9550. .type fe_mul_avx2,@function
  9551. .align 16
  9552. fe_mul_avx2:
  9553. #else
  9554. .section __TEXT,__text
  9555. .globl _fe_mul_avx2
  9556. .p2align 4
  9557. _fe_mul_avx2:
  9558. #endif /* __APPLE__ */
  9559. pushq %r12
  9560. pushq %r13
  9561. pushq %r14
  9562. pushq %r15
  9563. pushq %rbx
  9564. movq %rdx, %rbx
  9565. # Multiply
  9566. # A[0] * B[0]
  9567. movq (%rbx), %rdx
  9568. mulxq (%rsi), %r8, %r9
  9569. # A[2] * B[0]
  9570. mulxq 16(%rsi), %r10, %r11
  9571. # A[1] * B[0]
  9572. mulxq 8(%rsi), %rax, %rcx
  9573. xorq %r15, %r15
  9574. adcxq %rax, %r9
  9575. # A[1] * B[3]
  9576. movq 24(%rbx), %rdx
  9577. mulxq 8(%rsi), %r12, %r13
  9578. adcxq %rcx, %r10
  9579. # A[0] * B[1]
  9580. movq 8(%rbx), %rdx
  9581. mulxq (%rsi), %rax, %rcx
  9582. adoxq %rax, %r9
  9583. # A[2] * B[1]
  9584. mulxq 16(%rsi), %rax, %r14
  9585. adoxq %rcx, %r10
  9586. adcxq %rax, %r11
  9587. # A[1] * B[2]
  9588. movq 16(%rbx), %rdx
  9589. mulxq 8(%rsi), %rax, %rcx
  9590. adcxq %r14, %r12
  9591. adoxq %rax, %r11
  9592. adcxq %r15, %r13
  9593. adoxq %rcx, %r12
  9594. # A[0] * B[2]
  9595. mulxq (%rsi), %rax, %rcx
  9596. adoxq %r15, %r13
  9597. xorq %r14, %r14
  9598. adcxq %rax, %r10
  9599. # A[1] * B[1]
  9600. movq 8(%rbx), %rdx
  9601. mulxq 8(%rsi), %rdx, %rax
  9602. adcxq %rcx, %r11
  9603. adoxq %rdx, %r10
  9604. # A[3] * B[1]
  9605. movq 8(%rbx), %rdx
  9606. adoxq %rax, %r11
  9607. mulxq 24(%rsi), %rax, %rcx
  9608. adcxq %rax, %r12
  9609. # A[2] * B[2]
  9610. movq 16(%rbx), %rdx
  9611. mulxq 16(%rsi), %rdx, %rax
  9612. adcxq %rcx, %r13
  9613. adoxq %rdx, %r12
  9614. # A[3] * B[3]
  9615. movq 24(%rbx), %rdx
  9616. adoxq %rax, %r13
  9617. mulxq 24(%rsi), %rax, %rcx
  9618. adoxq %r15, %r14
  9619. adcxq %rax, %r14
  9620. # A[0] * B[3]
  9621. mulxq (%rsi), %rdx, %rax
  9622. adcxq %rcx, %r15
  9623. xorq %rcx, %rcx
  9624. adcxq %rdx, %r11
  9625. # A[3] * B[0]
  9626. movq (%rbx), %rdx
  9627. adcxq %rax, %r12
  9628. mulxq 24(%rsi), %rdx, %rax
  9629. adoxq %rdx, %r11
  9630. adoxq %rax, %r12
  9631. # A[2] * B[3]
  9632. movq 24(%rbx), %rdx
  9633. mulxq 16(%rsi), %rdx, %rax
  9634. adcxq %rdx, %r13
  9635. # A[3] * B[2]
  9636. movq 16(%rbx), %rdx
  9637. adcxq %rax, %r14
  9638. mulxq 24(%rsi), %rax, %rdx
  9639. adcxq %rcx, %r15
  9640. adoxq %rax, %r13
  9641. adoxq %rdx, %r14
  9642. adoxq %rcx, %r15
  9643. # Reduce
  9644. movq $0x7fffffffffffffff, %rcx
  9645. # Move top half into t4-t7 and remove top bit from t3
  9646. shldq $0x01, %r14, %r15
  9647. shldq $0x01, %r13, %r14
  9648. shldq $0x01, %r12, %r13
  9649. shldq $0x01, %r11, %r12
  9650. andq %rcx, %r11
  9651. # Multiply top half by 19
  9652. movq $19, %rdx
  9653. xorq %rcx, %rcx
  9654. mulxq %r12, %rax, %r12
  9655. adcxq %rax, %r8
  9656. adoxq %r12, %r9
  9657. mulxq %r13, %rax, %r13
  9658. adcxq %rax, %r9
  9659. adoxq %r13, %r10
  9660. mulxq %r14, %rax, %r14
  9661. adcxq %rax, %r10
  9662. adoxq %r14, %r11
  9663. mulxq %r15, %r15, %rdx
  9664. adcxq %r15, %r11
  9665. adoxq %rcx, %rdx
  9666. adcxq %rcx, %rdx
  9667. # Overflow
  9668. shldq $0x01, %r11, %rdx
  9669. movq $0x7fffffffffffffff, %rcx
  9670. imulq $19, %rdx, %rax
  9671. andq %rcx, %r11
  9672. addq %rax, %r8
  9673. adcq $0x00, %r9
  9674. adcq $0x00, %r10
  9675. adcq $0x00, %r11
  9676. # Reduce if top bit set
  9677. movq %r11, %rdx
  9678. sarq $63, %rdx
  9679. andq $19, %rdx
  9680. andq %rcx, %r11
  9681. addq %rdx, %r8
  9682. adcq $0x00, %r9
  9683. adcq $0x00, %r10
  9684. adcq $0x00, %r11
  9685. # Store
  9686. movq %r8, (%rdi)
  9687. movq %r9, 8(%rdi)
  9688. movq %r10, 16(%rdi)
  9689. movq %r11, 24(%rdi)
  9690. popq %rbx
  9691. popq %r15
  9692. popq %r14
  9693. popq %r13
  9694. popq %r12
  9695. repz retq
  9696. #ifndef __APPLE__
  9697. .size fe_mul_avx2,.-fe_mul_avx2
  9698. #endif /* __APPLE__ */
  9699. #ifndef __APPLE__
  9700. .text
  9701. .globl fe_sq_avx2
  9702. .type fe_sq_avx2,@function
  9703. .align 16
  9704. fe_sq_avx2:
  9705. #else
  9706. .section __TEXT,__text
  9707. .globl _fe_sq_avx2
  9708. .p2align 4
  9709. _fe_sq_avx2:
  9710. #endif /* __APPLE__ */
  9711. pushq %rbx
  9712. pushq %r12
  9713. pushq %r13
  9714. pushq %r14
  9715. pushq %r15
  9716. # Square
  9717. # A[0] * A[1]
  9718. movq (%rsi), %rdx
  9719. mulxq 8(%rsi), %r9, %r10
  9720. # A[0] * A[3]
  9721. mulxq 24(%rsi), %r11, %r12
  9722. # A[2] * A[1]
  9723. movq 16(%rsi), %rdx
  9724. mulxq 8(%rsi), %rcx, %rbx
  9725. xorq %r15, %r15
  9726. adoxq %rcx, %r11
  9727. # A[2] * A[3]
  9728. mulxq 24(%rsi), %r13, %r14
  9729. adoxq %rbx, %r12
  9730. # A[2] * A[0]
  9731. mulxq (%rsi), %rcx, %rbx
  9732. adoxq %r15, %r13
  9733. adcxq %rcx, %r10
  9734. adoxq %r15, %r14
  9735. # A[1] * A[3]
  9736. movq 8(%rsi), %rdx
  9737. mulxq 24(%rsi), %rax, %r8
  9738. adcxq %rbx, %r11
  9739. adcxq %rax, %r12
  9740. adcxq %r8, %r13
  9741. adcxq %r15, %r14
  9742. # Double with Carry Flag
  9743. xorq %r15, %r15
  9744. # A[0] * A[0]
  9745. movq (%rsi), %rdx
  9746. mulxq %rdx, %r8, %rax
  9747. adcxq %r9, %r9
  9748. # A[1] * A[1]
  9749. movq 8(%rsi), %rdx
  9750. mulxq %rdx, %rcx, %rbx
  9751. adcxq %r10, %r10
  9752. adoxq %rax, %r9
  9753. adcxq %r11, %r11
  9754. adoxq %rcx, %r10
  9755. # A[2] * A[2]
  9756. movq 16(%rsi), %rdx
  9757. mulxq %rdx, %rax, %rcx
  9758. adcxq %r12, %r12
  9759. adoxq %rbx, %r11
  9760. adcxq %r13, %r13
  9761. adoxq %rax, %r12
  9762. # A[3] * A[3]
  9763. movq 24(%rsi), %rdx
  9764. mulxq %rdx, %rax, %rbx
  9765. adcxq %r14, %r14
  9766. adoxq %rcx, %r13
  9767. adcxq %r15, %r15
  9768. adoxq %rax, %r14
  9769. adoxq %rbx, %r15
  9770. # Reduce
  9771. movq $0x7fffffffffffffff, %rcx
  9772. # Move top half into t4-t7 and remove top bit from t3
  9773. shldq $0x01, %r14, %r15
  9774. shldq $0x01, %r13, %r14
  9775. shldq $0x01, %r12, %r13
  9776. shldq $0x01, %r11, %r12
  9777. andq %rcx, %r11
  9778. # Multiply top half by 19
  9779. movq $19, %rdx
  9780. xorq %rcx, %rcx
  9781. mulxq %r12, %rax, %r12
  9782. adcxq %rax, %r8
  9783. adoxq %r12, %r9
  9784. mulxq %r13, %rax, %r13
  9785. adcxq %rax, %r9
  9786. adoxq %r13, %r10
  9787. mulxq %r14, %rax, %r14
  9788. adcxq %rax, %r10
  9789. adoxq %r14, %r11
  9790. mulxq %r15, %r15, %rdx
  9791. adcxq %r15, %r11
  9792. adoxq %rcx, %rdx
  9793. adcxq %rcx, %rdx
  9794. # Overflow
  9795. shldq $0x01, %r11, %rdx
  9796. movq $0x7fffffffffffffff, %rcx
  9797. imulq $19, %rdx, %rax
  9798. andq %rcx, %r11
  9799. addq %rax, %r8
  9800. adcq $0x00, %r9
  9801. adcq $0x00, %r10
  9802. adcq $0x00, %r11
  9803. # Reduce if top bit set
  9804. movq %r11, %rdx
  9805. sarq $63, %rdx
  9806. andq $19, %rdx
  9807. andq %rcx, %r11
  9808. addq %rdx, %r8
  9809. adcq $0x00, %r9
  9810. adcq $0x00, %r10
  9811. adcq $0x00, %r11
  9812. # Store
  9813. movq %r8, (%rdi)
  9814. movq %r9, 8(%rdi)
  9815. movq %r10, 16(%rdi)
  9816. movq %r11, 24(%rdi)
  9817. popq %r15
  9818. popq %r14
  9819. popq %r13
  9820. popq %r12
  9821. popq %rbx
  9822. repz retq
  9823. #ifndef __APPLE__
  9824. .size fe_sq_avx2,.-fe_sq_avx2
  9825. #endif /* __APPLE__ */
  9826. #ifndef __APPLE__
  9827. .text
  9828. .globl fe_sq_n_avx2
  9829. .type fe_sq_n_avx2,@function
  9830. .align 16
  9831. fe_sq_n_avx2:
  9832. #else
  9833. .section __TEXT,__text
  9834. .globl _fe_sq_n_avx2
  9835. .p2align 4
  9836. _fe_sq_n_avx2:
  9837. #endif /* __APPLE__ */
  9838. pushq %rbx
  9839. pushq %r12
  9840. pushq %r13
  9841. pushq %r14
  9842. pushq %r15
  9843. pushq %rbp
  9844. movq %rdx, %rbp
  9845. L_fe_sq_n_avx2:
  9846. # Square
  9847. # A[0] * A[1]
  9848. movq (%rsi), %rdx
  9849. mulxq 8(%rsi), %r9, %r10
  9850. # A[0] * A[3]
  9851. mulxq 24(%rsi), %r11, %r12
  9852. # A[2] * A[1]
  9853. movq 16(%rsi), %rdx
  9854. mulxq 8(%rsi), %rcx, %rbx
  9855. xorq %r15, %r15
  9856. adoxq %rcx, %r11
  9857. # A[2] * A[3]
  9858. mulxq 24(%rsi), %r13, %r14
  9859. adoxq %rbx, %r12
  9860. # A[2] * A[0]
  9861. mulxq (%rsi), %rcx, %rbx
  9862. adoxq %r15, %r13
  9863. adcxq %rcx, %r10
  9864. adoxq %r15, %r14
  9865. # A[1] * A[3]
  9866. movq 8(%rsi), %rdx
  9867. mulxq 24(%rsi), %rax, %r8
  9868. adcxq %rbx, %r11
  9869. adcxq %rax, %r12
  9870. adcxq %r8, %r13
  9871. adcxq %r15, %r14
  9872. # Double with Carry Flag
  9873. xorq %r15, %r15
  9874. # A[0] * A[0]
  9875. movq (%rsi), %rdx
  9876. mulxq %rdx, %r8, %rax
  9877. adcxq %r9, %r9
  9878. # A[1] * A[1]
  9879. movq 8(%rsi), %rdx
  9880. mulxq %rdx, %rcx, %rbx
  9881. adcxq %r10, %r10
  9882. adoxq %rax, %r9
  9883. adcxq %r11, %r11
  9884. adoxq %rcx, %r10
  9885. # A[2] * A[2]
  9886. movq 16(%rsi), %rdx
  9887. mulxq %rdx, %rax, %rcx
  9888. adcxq %r12, %r12
  9889. adoxq %rbx, %r11
  9890. adcxq %r13, %r13
  9891. adoxq %rax, %r12
  9892. # A[3] * A[3]
  9893. movq 24(%rsi), %rdx
  9894. mulxq %rdx, %rax, %rbx
  9895. adcxq %r14, %r14
  9896. adoxq %rcx, %r13
  9897. adcxq %r15, %r15
  9898. adoxq %rax, %r14
  9899. adoxq %rbx, %r15
  9900. # Reduce
  9901. movq $0x7fffffffffffffff, %rcx
  9902. # Move top half into t4-t7 and remove top bit from t3
  9903. shldq $0x01, %r14, %r15
  9904. shldq $0x01, %r13, %r14
  9905. shldq $0x01, %r12, %r13
  9906. shldq $0x01, %r11, %r12
  9907. andq %rcx, %r11
  9908. # Multiply top half by 19
  9909. movq $19, %rdx
  9910. xorq %rcx, %rcx
  9911. mulxq %r12, %rax, %r12
  9912. adcxq %rax, %r8
  9913. adoxq %r12, %r9
  9914. mulxq %r13, %rax, %r13
  9915. adcxq %rax, %r9
  9916. adoxq %r13, %r10
  9917. mulxq %r14, %rax, %r14
  9918. adcxq %rax, %r10
  9919. adoxq %r14, %r11
  9920. mulxq %r15, %r15, %rdx
  9921. adcxq %r15, %r11
  9922. adoxq %rcx, %rdx
  9923. adcxq %rcx, %rdx
  9924. # Overflow
  9925. shldq $0x01, %r11, %rdx
  9926. movq $0x7fffffffffffffff, %rcx
  9927. imulq $19, %rdx, %rax
  9928. andq %rcx, %r11
  9929. addq %rax, %r8
  9930. adcq $0x00, %r9
  9931. adcq $0x00, %r10
  9932. adcq $0x00, %r11
  9933. # Reduce if top bit set
  9934. movq %r11, %rdx
  9935. sarq $63, %rdx
  9936. andq $19, %rdx
  9937. andq %rcx, %r11
  9938. addq %rdx, %r8
  9939. adcq $0x00, %r9
  9940. adcq $0x00, %r10
  9941. adcq $0x00, %r11
  9942. # Store
  9943. movq %r8, (%rdi)
  9944. movq %r9, 8(%rdi)
  9945. movq %r10, 16(%rdi)
  9946. movq %r11, 24(%rdi)
  9947. decb %bpl
  9948. jnz L_fe_sq_n_avx2
  9949. popq %rbp
  9950. popq %r15
  9951. popq %r14
  9952. popq %r13
  9953. popq %r12
  9954. popq %rbx
  9955. repz retq
  9956. #ifndef __APPLE__
  9957. .size fe_sq_n_avx2,.-fe_sq_n_avx2
  9958. #endif /* __APPLE__ */
  9959. #ifndef __APPLE__
  9960. .text
  9961. .globl fe_mul121666_avx2
  9962. .type fe_mul121666_avx2,@function
  9963. .align 16
  9964. fe_mul121666_avx2:
  9965. #else
  9966. .section __TEXT,__text
  9967. .globl _fe_mul121666_avx2
  9968. .p2align 4
  9969. _fe_mul121666_avx2:
  9970. #endif /* __APPLE__ */
  9971. pushq %r12
  9972. pushq %r13
  9973. movq $0x1db42, %rdx
  9974. mulxq (%rsi), %rax, %r13
  9975. mulxq 8(%rsi), %rcx, %r12
  9976. mulxq 16(%rsi), %r8, %r11
  9977. mulxq 24(%rsi), %r9, %r10
  9978. addq %r13, %rcx
  9979. adcq %r12, %r8
  9980. adcq %r11, %r9
  9981. adcq $0x00, %r10
  9982. movq $0x7fffffffffffffff, %r13
  9983. shldq $0x01, %r9, %r10
  9984. andq %r13, %r9
  9985. imulq $19, %r10, %r10
  9986. addq %r10, %rax
  9987. adcq $0x00, %rcx
  9988. adcq $0x00, %r8
  9989. adcq $0x00, %r9
  9990. movq %rax, (%rdi)
  9991. movq %rcx, 8(%rdi)
  9992. movq %r8, 16(%rdi)
  9993. movq %r9, 24(%rdi)
  9994. popq %r13
  9995. popq %r12
  9996. repz retq
  9997. #ifndef __APPLE__
  9998. .size fe_mul121666_avx2,.-fe_mul121666_avx2
  9999. #endif /* __APPLE__ */
  10000. #ifndef __APPLE__
  10001. .text
  10002. .globl fe_sq2_avx2
  10003. .type fe_sq2_avx2,@function
  10004. .align 16
  10005. fe_sq2_avx2:
  10006. #else
  10007. .section __TEXT,__text
  10008. .globl _fe_sq2_avx2
  10009. .p2align 4
  10010. _fe_sq2_avx2:
  10011. #endif /* __APPLE__ */
  10012. pushq %rbx
  10013. pushq %r12
  10014. pushq %r13
  10015. pushq %r14
  10016. pushq %r15
  10017. # Square * 2
  10018. # A[0] * A[1]
  10019. movq (%rsi), %rdx
  10020. mulxq 8(%rsi), %r9, %r10
  10021. # A[0] * A[3]
  10022. mulxq 24(%rsi), %r11, %r12
  10023. # A[2] * A[1]
  10024. movq 16(%rsi), %rdx
  10025. mulxq 8(%rsi), %rcx, %rbx
  10026. xorq %r15, %r15
  10027. adoxq %rcx, %r11
  10028. # A[2] * A[3]
  10029. mulxq 24(%rsi), %r13, %r14
  10030. adoxq %rbx, %r12
  10031. # A[2] * A[0]
  10032. mulxq (%rsi), %rcx, %rbx
  10033. adoxq %r15, %r13
  10034. adcxq %rcx, %r10
  10035. adoxq %r15, %r14
  10036. # A[1] * A[3]
  10037. movq 8(%rsi), %rdx
  10038. mulxq 24(%rsi), %rax, %r8
  10039. adcxq %rbx, %r11
  10040. adcxq %rax, %r12
  10041. adcxq %r8, %r13
  10042. adcxq %r15, %r14
  10043. # Double with Carry Flag
  10044. xorq %r15, %r15
  10045. # A[0] * A[0]
  10046. movq (%rsi), %rdx
  10047. mulxq %rdx, %r8, %rax
  10048. adcxq %r9, %r9
  10049. # A[1] * A[1]
  10050. movq 8(%rsi), %rdx
  10051. mulxq %rdx, %rcx, %rbx
  10052. adcxq %r10, %r10
  10053. adoxq %rax, %r9
  10054. adcxq %r11, %r11
  10055. adoxq %rcx, %r10
  10056. # A[2] * A[2]
  10057. movq 16(%rsi), %rdx
  10058. mulxq %rdx, %rax, %rcx
  10059. adcxq %r12, %r12
  10060. adoxq %rbx, %r11
  10061. adcxq %r13, %r13
  10062. adoxq %rax, %r12
  10063. # A[3] * A[3]
  10064. movq 24(%rsi), %rdx
  10065. mulxq %rdx, %rax, %rbx
  10066. adcxq %r14, %r14
  10067. adoxq %rcx, %r13
  10068. adcxq %r15, %r15
  10069. adoxq %rax, %r14
  10070. adoxq %rbx, %r15
  10071. # Reduce
  10072. movq $0x7fffffffffffffff, %rbx
  10073. xorq %rax, %rax
  10074. # Move top half into t4-t7 and remove top bit from t3 and double
  10075. shldq $3, %r15, %rax
  10076. shldq $2, %r14, %r15
  10077. shldq $2, %r13, %r14
  10078. shldq $2, %r12, %r13
  10079. shldq $2, %r11, %r12
  10080. shldq $0x01, %r10, %r11
  10081. shldq $0x01, %r9, %r10
  10082. shldq $0x01, %r8, %r9
  10083. shlq $0x01, %r8
  10084. andq %rbx, %r11
  10085. # Two out left, one in right
  10086. andq %rbx, %r15
  10087. # Multiply top bits by 19*19
  10088. imulq $0x169, %rax, %rcx
  10089. xorq %rbx, %rbx
  10090. # Multiply top half by 19
  10091. movq $19, %rdx
  10092. adoxq %rcx, %r8
  10093. mulxq %r12, %rax, %r12
  10094. adcxq %rax, %r8
  10095. adoxq %r12, %r9
  10096. mulxq %r13, %rax, %r13
  10097. adcxq %rax, %r9
  10098. adoxq %r13, %r10
  10099. mulxq %r14, %rax, %r14
  10100. adcxq %rax, %r10
  10101. adoxq %r14, %r11
  10102. mulxq %r15, %r15, %rdx
  10103. adcxq %r15, %r11
  10104. adoxq %rbx, %rdx
  10105. adcxq %rbx, %rdx
  10106. # Overflow
  10107. shldq $0x01, %r11, %rdx
  10108. movq $0x7fffffffffffffff, %rbx
  10109. imulq $19, %rdx, %rax
  10110. andq %rbx, %r11
  10111. addq %rax, %r8
  10112. adcq $0x00, %r9
  10113. adcq $0x00, %r10
  10114. adcq $0x00, %r11
  10115. # Reduce if top bit set
  10116. movq %r11, %rdx
  10117. sarq $63, %rdx
  10118. andq $19, %rdx
  10119. andq %rbx, %r11
  10120. addq %rdx, %r8
  10121. adcq $0x00, %r9
  10122. adcq $0x00, %r10
  10123. adcq $0x00, %r11
  10124. # Store
  10125. movq %r8, (%rdi)
  10126. movq %r9, 8(%rdi)
  10127. movq %r10, 16(%rdi)
  10128. movq %r11, 24(%rdi)
  10129. popq %r15
  10130. popq %r14
  10131. popq %r13
  10132. popq %r12
  10133. popq %rbx
  10134. repz retq
  10135. #ifndef __APPLE__
  10136. .size fe_sq2_avx2,.-fe_sq2_avx2
  10137. #endif /* __APPLE__ */
  10138. #ifndef __APPLE__
  10139. .text
  10140. .globl fe_invert_avx2
  10141. .type fe_invert_avx2,@function
  10142. .align 16
  10143. fe_invert_avx2:
  10144. #else
  10145. .section __TEXT,__text
  10146. .globl _fe_invert_avx2
  10147. .p2align 4
  10148. _fe_invert_avx2:
  10149. #endif /* __APPLE__ */
  10150. subq $0x90, %rsp
  10151. # Invert
  10152. movq %rdi, 128(%rsp)
  10153. movq %rsi, 136(%rsp)
  10154. movq %rsp, %rdi
  10155. movq 136(%rsp), %rsi
  10156. #ifndef __APPLE__
  10157. callq fe_sq_avx2@plt
  10158. #else
  10159. callq _fe_sq_avx2
  10160. #endif /* __APPLE__ */
  10161. leaq 32(%rsp), %rdi
  10162. movq %rsp, %rsi
  10163. #ifndef __APPLE__
  10164. callq fe_sq_avx2@plt
  10165. #else
  10166. callq _fe_sq_avx2
  10167. #endif /* __APPLE__ */
  10168. leaq 32(%rsp), %rdi
  10169. leaq 32(%rsp), %rsi
  10170. #ifndef __APPLE__
  10171. callq fe_sq_avx2@plt
  10172. #else
  10173. callq _fe_sq_avx2
  10174. #endif /* __APPLE__ */
  10175. leaq 32(%rsp), %rdi
  10176. movq 136(%rsp), %rsi
  10177. leaq 32(%rsp), %rdx
  10178. #ifndef __APPLE__
  10179. callq fe_mul_avx2@plt
  10180. #else
  10181. callq _fe_mul_avx2
  10182. #endif /* __APPLE__ */
  10183. movq %rsp, %rdi
  10184. movq %rsp, %rsi
  10185. leaq 32(%rsp), %rdx
  10186. #ifndef __APPLE__
  10187. callq fe_mul_avx2@plt
  10188. #else
  10189. callq _fe_mul_avx2
  10190. #endif /* __APPLE__ */
  10191. leaq 64(%rsp), %rdi
  10192. movq %rsp, %rsi
  10193. #ifndef __APPLE__
  10194. callq fe_sq_avx2@plt
  10195. #else
  10196. callq _fe_sq_avx2
  10197. #endif /* __APPLE__ */
  10198. leaq 32(%rsp), %rdi
  10199. leaq 32(%rsp), %rsi
  10200. leaq 64(%rsp), %rdx
  10201. #ifndef __APPLE__
  10202. callq fe_mul_avx2@plt
  10203. #else
  10204. callq _fe_mul_avx2
  10205. #endif /* __APPLE__ */
  10206. leaq 64(%rsp), %rdi
  10207. leaq 32(%rsp), %rsi
  10208. #ifndef __APPLE__
  10209. callq fe_sq_avx2@plt
  10210. #else
  10211. callq _fe_sq_avx2
  10212. #endif /* __APPLE__ */
  10213. leaq 64(%rsp), %rdi
  10214. leaq 64(%rsp), %rsi
  10215. movq $4, %rdx
  10216. #ifndef __APPLE__
  10217. callq fe_sq_n_avx2@plt
  10218. #else
  10219. callq _fe_sq_n_avx2
  10220. #endif /* __APPLE__ */
  10221. leaq 32(%rsp), %rdi
  10222. leaq 64(%rsp), %rsi
  10223. leaq 32(%rsp), %rdx
  10224. #ifndef __APPLE__
  10225. callq fe_mul_avx2@plt
  10226. #else
  10227. callq _fe_mul_avx2
  10228. #endif /* __APPLE__ */
  10229. leaq 64(%rsp), %rdi
  10230. leaq 32(%rsp), %rsi
  10231. #ifndef __APPLE__
  10232. callq fe_sq_avx2@plt
  10233. #else
  10234. callq _fe_sq_avx2
  10235. #endif /* __APPLE__ */
  10236. leaq 64(%rsp), %rdi
  10237. leaq 64(%rsp), %rsi
  10238. movq $9, %rdx
  10239. #ifndef __APPLE__
  10240. callq fe_sq_n_avx2@plt
  10241. #else
  10242. callq _fe_sq_n_avx2
  10243. #endif /* __APPLE__ */
  10244. leaq 64(%rsp), %rdi
  10245. leaq 64(%rsp), %rsi
  10246. leaq 32(%rsp), %rdx
  10247. #ifndef __APPLE__
  10248. callq fe_mul_avx2@plt
  10249. #else
  10250. callq _fe_mul_avx2
  10251. #endif /* __APPLE__ */
  10252. leaq 96(%rsp), %rdi
  10253. leaq 64(%rsp), %rsi
  10254. #ifndef __APPLE__
  10255. callq fe_sq_avx2@plt
  10256. #else
  10257. callq _fe_sq_avx2
  10258. #endif /* __APPLE__ */
  10259. leaq 96(%rsp), %rdi
  10260. leaq 96(%rsp), %rsi
  10261. movq $19, %rdx
  10262. #ifndef __APPLE__
  10263. callq fe_sq_n_avx2@plt
  10264. #else
  10265. callq _fe_sq_n_avx2
  10266. #endif /* __APPLE__ */
  10267. leaq 64(%rsp), %rdi
  10268. leaq 96(%rsp), %rsi
  10269. leaq 64(%rsp), %rdx
  10270. #ifndef __APPLE__
  10271. callq fe_mul_avx2@plt
  10272. #else
  10273. callq _fe_mul_avx2
  10274. #endif /* __APPLE__ */
  10275. leaq 64(%rsp), %rdi
  10276. leaq 64(%rsp), %rsi
  10277. #ifndef __APPLE__
  10278. callq fe_sq_avx2@plt
  10279. #else
  10280. callq _fe_sq_avx2
  10281. #endif /* __APPLE__ */
  10282. leaq 64(%rsp), %rdi
  10283. leaq 64(%rsp), %rsi
  10284. movq $9, %rdx
  10285. #ifndef __APPLE__
  10286. callq fe_sq_n_avx2@plt
  10287. #else
  10288. callq _fe_sq_n_avx2
  10289. #endif /* __APPLE__ */
  10290. leaq 32(%rsp), %rdi
  10291. leaq 64(%rsp), %rsi
  10292. leaq 32(%rsp), %rdx
  10293. #ifndef __APPLE__
  10294. callq fe_mul_avx2@plt
  10295. #else
  10296. callq _fe_mul_avx2
  10297. #endif /* __APPLE__ */
  10298. leaq 64(%rsp), %rdi
  10299. leaq 32(%rsp), %rsi
  10300. #ifndef __APPLE__
  10301. callq fe_sq_avx2@plt
  10302. #else
  10303. callq _fe_sq_avx2
  10304. #endif /* __APPLE__ */
  10305. leaq 64(%rsp), %rdi
  10306. leaq 64(%rsp), %rsi
  10307. movq $49, %rdx
  10308. #ifndef __APPLE__
  10309. callq fe_sq_n_avx2@plt
  10310. #else
  10311. callq _fe_sq_n_avx2
  10312. #endif /* __APPLE__ */
  10313. leaq 64(%rsp), %rdi
  10314. leaq 64(%rsp), %rsi
  10315. leaq 32(%rsp), %rdx
  10316. #ifndef __APPLE__
  10317. callq fe_mul_avx2@plt
  10318. #else
  10319. callq _fe_mul_avx2
  10320. #endif /* __APPLE__ */
  10321. leaq 96(%rsp), %rdi
  10322. leaq 64(%rsp), %rsi
  10323. #ifndef __APPLE__
  10324. callq fe_sq_avx2@plt
  10325. #else
  10326. callq _fe_sq_avx2
  10327. #endif /* __APPLE__ */
  10328. leaq 96(%rsp), %rdi
  10329. leaq 96(%rsp), %rsi
  10330. movq $0x63, %rdx
  10331. #ifndef __APPLE__
  10332. callq fe_sq_n_avx2@plt
  10333. #else
  10334. callq _fe_sq_n_avx2
  10335. #endif /* __APPLE__ */
  10336. leaq 64(%rsp), %rdi
  10337. leaq 96(%rsp), %rsi
  10338. leaq 64(%rsp), %rdx
  10339. #ifndef __APPLE__
  10340. callq fe_mul_avx2@plt
  10341. #else
  10342. callq _fe_mul_avx2
  10343. #endif /* __APPLE__ */
  10344. leaq 64(%rsp), %rdi
  10345. leaq 64(%rsp), %rsi
  10346. #ifndef __APPLE__
  10347. callq fe_sq_avx2@plt
  10348. #else
  10349. callq _fe_sq_avx2
  10350. #endif /* __APPLE__ */
  10351. leaq 64(%rsp), %rdi
  10352. leaq 64(%rsp), %rsi
  10353. movq $49, %rdx
  10354. #ifndef __APPLE__
  10355. callq fe_sq_n_avx2@plt
  10356. #else
  10357. callq _fe_sq_n_avx2
  10358. #endif /* __APPLE__ */
  10359. leaq 32(%rsp), %rdi
  10360. leaq 64(%rsp), %rsi
  10361. leaq 32(%rsp), %rdx
  10362. #ifndef __APPLE__
  10363. callq fe_mul_avx2@plt
  10364. #else
  10365. callq _fe_mul_avx2
  10366. #endif /* __APPLE__ */
  10367. leaq 32(%rsp), %rdi
  10368. leaq 32(%rsp), %rsi
  10369. #ifndef __APPLE__
  10370. callq fe_sq_avx2@plt
  10371. #else
  10372. callq _fe_sq_avx2
  10373. #endif /* __APPLE__ */
  10374. leaq 32(%rsp), %rdi
  10375. leaq 32(%rsp), %rsi
  10376. movq $4, %rdx
  10377. #ifndef __APPLE__
  10378. callq fe_sq_n_avx2@plt
  10379. #else
  10380. callq _fe_sq_n_avx2
  10381. #endif /* __APPLE__ */
  10382. movq 128(%rsp), %rdi
  10383. leaq 32(%rsp), %rsi
  10384. movq %rsp, %rdx
  10385. #ifndef __APPLE__
  10386. callq fe_mul_avx2@plt
  10387. #else
  10388. callq _fe_mul_avx2
  10389. #endif /* __APPLE__ */
  10390. movq 136(%rsp), %rsi
  10391. movq 128(%rsp), %rdi
  10392. addq $0x90, %rsp
  10393. repz retq
  10394. #ifndef __APPLE__
  10395. .text
  10396. .globl curve25519_avx2
  10397. .type curve25519_avx2,@function
  10398. .align 16
  10399. curve25519_avx2:
  10400. #else
  10401. .section __TEXT,__text
  10402. .globl _curve25519_avx2
  10403. .p2align 4
  10404. _curve25519_avx2:
  10405. #endif /* __APPLE__ */
  10406. pushq %rbx
  10407. pushq %r12
  10408. pushq %r13
  10409. pushq %r14
  10410. pushq %r15
  10411. pushq %rbp
  10412. movq %rdx, %r8
  10413. subq $0xc0, %rsp
  10414. movq $0x00, 184(%rsp)
  10415. movq %rdi, 176(%rsp)
  10416. # Set one
  10417. movq $0x01, (%rdi)
  10418. movq $0x00, 8(%rdi)
  10419. movq $0x00, 16(%rdi)
  10420. movq $0x00, 24(%rdi)
  10421. # Set zero
  10422. movq $0x00, (%rsp)
  10423. movq $0x00, 8(%rsp)
  10424. movq $0x00, 16(%rsp)
  10425. movq $0x00, 24(%rsp)
  10426. # Set one
  10427. movq $0x01, 32(%rsp)
  10428. movq $0x00, 40(%rsp)
  10429. movq $0x00, 48(%rsp)
  10430. movq $0x00, 56(%rsp)
  10431. # Copy
  10432. movq (%r8), %r9
  10433. movq 8(%r8), %r10
  10434. movq 16(%r8), %r11
  10435. movq 24(%r8), %r12
  10436. movq %r9, 64(%rsp)
  10437. movq %r10, 72(%rsp)
  10438. movq %r11, 80(%rsp)
  10439. movq %r12, 88(%rsp)
  10440. movb $62, 168(%rsp)
  10441. movq $3, 160(%rsp)
  10442. L_curve25519_avx2_words:
  10443. L_curve25519_avx2_bits:
  10444. movq 184(%rsp), %rbx
  10445. movq 160(%rsp), %r9
  10446. movb 168(%rsp), %cl
  10447. movq (%rsi,%r9,8), %rax
  10448. shrq %cl, %rax
  10449. andq $0x01, %rax
  10450. xorq %rax, %rbx
  10451. negq %rbx
  10452. # Conditional Swap
  10453. movq (%rdi), %r9
  10454. movq 8(%rdi), %r10
  10455. movq 16(%rdi), %r11
  10456. movq 24(%rdi), %r12
  10457. xorq 64(%rsp), %r9
  10458. xorq 72(%rsp), %r10
  10459. xorq 80(%rsp), %r11
  10460. xorq 88(%rsp), %r12
  10461. andq %rbx, %r9
  10462. andq %rbx, %r10
  10463. andq %rbx, %r11
  10464. andq %rbx, %r12
  10465. xorq %r9, (%rdi)
  10466. xorq %r10, 8(%rdi)
  10467. xorq %r11, 16(%rdi)
  10468. xorq %r12, 24(%rdi)
  10469. xorq %r9, 64(%rsp)
  10470. xorq %r10, 72(%rsp)
  10471. xorq %r11, 80(%rsp)
  10472. xorq %r12, 88(%rsp)
  10473. # Conditional Swap
  10474. movq (%rsp), %r9
  10475. movq 8(%rsp), %r10
  10476. movq 16(%rsp), %r11
  10477. movq 24(%rsp), %r12
  10478. xorq 32(%rsp), %r9
  10479. xorq 40(%rsp), %r10
  10480. xorq 48(%rsp), %r11
  10481. xorq 56(%rsp), %r12
  10482. andq %rbx, %r9
  10483. andq %rbx, %r10
  10484. andq %rbx, %r11
  10485. andq %rbx, %r12
  10486. xorq %r9, (%rsp)
  10487. xorq %r10, 8(%rsp)
  10488. xorq %r11, 16(%rsp)
  10489. xorq %r12, 24(%rsp)
  10490. xorq %r9, 32(%rsp)
  10491. xorq %r10, 40(%rsp)
  10492. xorq %r11, 48(%rsp)
  10493. xorq %r12, 56(%rsp)
  10494. movq %rax, 184(%rsp)
  10495. # Add
  10496. movq (%rdi), %r9
  10497. movq 8(%rdi), %r10
  10498. movq 16(%rdi), %r11
  10499. movq 24(%rdi), %rax
  10500. movq %r9, %r13
  10501. addq (%rsp), %r9
  10502. movq %r10, %r14
  10503. adcq 8(%rsp), %r10
  10504. movq %r11, %r15
  10505. adcq 16(%rsp), %r11
  10506. movq %rax, %rbp
  10507. adcq 24(%rsp), %rax
  10508. movq $-19, %rcx
  10509. movq %rax, %r12
  10510. movq $0x7fffffffffffffff, %rbx
  10511. sarq $63, %rax
  10512. # Mask the modulus
  10513. andq %rax, %rcx
  10514. andq %rax, %rbx
  10515. # Sub modulus (if overflow)
  10516. subq %rcx, %r9
  10517. sbbq %rax, %r10
  10518. sbbq %rax, %r11
  10519. sbbq %rbx, %r12
  10520. # Sub
  10521. subq (%rsp), %r13
  10522. movq $0x00, %rax
  10523. sbbq 8(%rsp), %r14
  10524. movq $-19, %rcx
  10525. sbbq 16(%rsp), %r15
  10526. movq $0x7fffffffffffffff, %rbx
  10527. sbbq 24(%rsp), %rbp
  10528. sbbq $0x00, %rax
  10529. # Mask the modulus
  10530. andq %rax, %rcx
  10531. andq %rax, %rbx
  10532. # Add modulus (if underflow)
  10533. addq %rcx, %r13
  10534. adcq %rax, %r14
  10535. adcq %rax, %r15
  10536. adcq %rbx, %rbp
  10537. movq %r9, (%rdi)
  10538. movq %r10, 8(%rdi)
  10539. movq %r11, 16(%rdi)
  10540. movq %r12, 24(%rdi)
  10541. movq %r13, 128(%rsp)
  10542. movq %r14, 136(%rsp)
  10543. movq %r15, 144(%rsp)
  10544. movq %rbp, 152(%rsp)
  10545. # Add
  10546. movq 64(%rsp), %r9
  10547. movq 72(%rsp), %r10
  10548. movq 80(%rsp), %r11
  10549. movq 88(%rsp), %rax
  10550. movq %r9, %r13
  10551. addq 32(%rsp), %r9
  10552. movq %r10, %r14
  10553. adcq 40(%rsp), %r10
  10554. movq %r11, %r15
  10555. adcq 48(%rsp), %r11
  10556. movq %rax, %rbp
  10557. adcq 56(%rsp), %rax
  10558. movq $-19, %rcx
  10559. movq %rax, %r12
  10560. movq $0x7fffffffffffffff, %rbx
  10561. sarq $63, %rax
  10562. # Mask the modulus
  10563. andq %rax, %rcx
  10564. andq %rax, %rbx
  10565. # Sub modulus (if overflow)
  10566. subq %rcx, %r9
  10567. sbbq %rax, %r10
  10568. sbbq %rax, %r11
  10569. sbbq %rbx, %r12
  10570. # Sub
  10571. subq 32(%rsp), %r13
  10572. movq $0x00, %rax
  10573. sbbq 40(%rsp), %r14
  10574. movq $-19, %rcx
  10575. sbbq 48(%rsp), %r15
  10576. movq $0x7fffffffffffffff, %rbx
  10577. sbbq 56(%rsp), %rbp
  10578. sbbq $0x00, %rax
  10579. # Mask the modulus
  10580. andq %rax, %rcx
  10581. andq %rax, %rbx
  10582. # Add modulus (if underflow)
  10583. addq %rcx, %r13
  10584. adcq %rax, %r14
  10585. adcq %rax, %r15
  10586. adcq %rbx, %rbp
  10587. movq %r9, (%rsp)
  10588. movq %r10, 8(%rsp)
  10589. movq %r11, 16(%rsp)
  10590. movq %r12, 24(%rsp)
  10591. movq %r13, 96(%rsp)
  10592. movq %r14, 104(%rsp)
  10593. movq %r15, 112(%rsp)
  10594. movq %rbp, 120(%rsp)
  10595. # Multiply
  10596. # A[0] * B[0]
  10597. movq (%rdi), %rdx
  10598. mulxq 96(%rsp), %r9, %r10
  10599. # A[2] * B[0]
  10600. mulxq 112(%rsp), %r11, %r12
  10601. # A[1] * B[0]
  10602. mulxq 104(%rsp), %rcx, %rbx
  10603. xorq %rbp, %rbp
  10604. adcxq %rcx, %r10
  10605. # A[1] * B[3]
  10606. movq 24(%rdi), %rdx
  10607. mulxq 104(%rsp), %r13, %r14
  10608. adcxq %rbx, %r11
  10609. # A[0] * B[1]
  10610. movq 8(%rdi), %rdx
  10611. mulxq 96(%rsp), %rcx, %rbx
  10612. adoxq %rcx, %r10
  10613. # A[2] * B[1]
  10614. mulxq 112(%rsp), %rcx, %r15
  10615. adoxq %rbx, %r11
  10616. adcxq %rcx, %r12
  10617. # A[1] * B[2]
  10618. movq 16(%rdi), %rdx
  10619. mulxq 104(%rsp), %rcx, %rbx
  10620. adcxq %r15, %r13
  10621. adoxq %rcx, %r12
  10622. adcxq %rbp, %r14
  10623. adoxq %rbx, %r13
  10624. # A[0] * B[2]
  10625. mulxq 96(%rsp), %rcx, %rbx
  10626. adoxq %rbp, %r14
  10627. xorq %r15, %r15
  10628. adcxq %rcx, %r11
  10629. # A[1] * B[1]
  10630. movq 8(%rdi), %rdx
  10631. mulxq 104(%rsp), %rdx, %rcx
  10632. adcxq %rbx, %r12
  10633. adoxq %rdx, %r11
  10634. # A[3] * B[1]
  10635. movq 8(%rdi), %rdx
  10636. adoxq %rcx, %r12
  10637. mulxq 120(%rsp), %rcx, %rbx
  10638. adcxq %rcx, %r13
  10639. # A[2] * B[2]
  10640. movq 16(%rdi), %rdx
  10641. mulxq 112(%rsp), %rdx, %rcx
  10642. adcxq %rbx, %r14
  10643. adoxq %rdx, %r13
  10644. # A[3] * B[3]
  10645. movq 24(%rdi), %rdx
  10646. adoxq %rcx, %r14
  10647. mulxq 120(%rsp), %rcx, %rbx
  10648. adoxq %rbp, %r15
  10649. adcxq %rcx, %r15
  10650. # A[0] * B[3]
  10651. mulxq 96(%rsp), %rdx, %rcx
  10652. adcxq %rbx, %rbp
  10653. xorq %rbx, %rbx
  10654. adcxq %rdx, %r12
  10655. # A[3] * B[0]
  10656. movq (%rdi), %rdx
  10657. adcxq %rcx, %r13
  10658. mulxq 120(%rsp), %rdx, %rcx
  10659. adoxq %rdx, %r12
  10660. adoxq %rcx, %r13
  10661. # A[2] * B[3]
  10662. movq 24(%rdi), %rdx
  10663. mulxq 112(%rsp), %rdx, %rcx
  10664. adcxq %rdx, %r14
  10665. # A[3] * B[2]
  10666. movq 16(%rdi), %rdx
  10667. adcxq %rcx, %r15
  10668. mulxq 120(%rsp), %rcx, %rdx
  10669. adcxq %rbx, %rbp
  10670. adoxq %rcx, %r14
  10671. adoxq %rdx, %r15
  10672. adoxq %rbx, %rbp
  10673. # Reduce
  10674. movq $0x7fffffffffffffff, %rbx
  10675. # Move top half into t4-t7 and remove top bit from t3
  10676. shldq $0x01, %r15, %rbp
  10677. shldq $0x01, %r14, %r15
  10678. shldq $0x01, %r13, %r14
  10679. shldq $0x01, %r12, %r13
  10680. andq %rbx, %r12
  10681. # Multiply top half by 19
  10682. movq $19, %rdx
  10683. xorq %rbx, %rbx
  10684. mulxq %r13, %rcx, %r13
  10685. adcxq %rcx, %r9
  10686. adoxq %r13, %r10
  10687. mulxq %r14, %rcx, %r14
  10688. adcxq %rcx, %r10
  10689. adoxq %r14, %r11
  10690. mulxq %r15, %rcx, %r15
  10691. adcxq %rcx, %r11
  10692. adoxq %r15, %r12
  10693. mulxq %rbp, %rbp, %rdx
  10694. adcxq %rbp, %r12
  10695. adoxq %rbx, %rdx
  10696. adcxq %rbx, %rdx
  10697. # Overflow
  10698. shldq $0x01, %r12, %rdx
  10699. movq $0x7fffffffffffffff, %rbx
  10700. imulq $19, %rdx, %rcx
  10701. andq %rbx, %r12
  10702. addq %rcx, %r9
  10703. adcq $0x00, %r10
  10704. adcq $0x00, %r11
  10705. adcq $0x00, %r12
  10706. # Reduce if top bit set
  10707. movq %r12, %rdx
  10708. sarq $63, %rdx
  10709. andq $19, %rdx
  10710. andq %rbx, %r12
  10711. addq %rdx, %r9
  10712. adcq $0x00, %r10
  10713. adcq $0x00, %r11
  10714. adcq $0x00, %r12
  10715. # Store
  10716. movq %r9, 32(%rsp)
  10717. movq %r10, 40(%rsp)
  10718. movq %r11, 48(%rsp)
  10719. movq %r12, 56(%rsp)
  10720. # Multiply
  10721. # A[0] * B[0]
  10722. movq 128(%rsp), %rdx
  10723. mulxq (%rsp), %r9, %r10
  10724. # A[2] * B[0]
  10725. mulxq 16(%rsp), %r11, %r12
  10726. # A[1] * B[0]
  10727. mulxq 8(%rsp), %rcx, %rbx
  10728. xorq %rbp, %rbp
  10729. adcxq %rcx, %r10
  10730. # A[1] * B[3]
  10731. movq 152(%rsp), %rdx
  10732. mulxq 8(%rsp), %r13, %r14
  10733. adcxq %rbx, %r11
  10734. # A[0] * B[1]
  10735. movq 136(%rsp), %rdx
  10736. mulxq (%rsp), %rcx, %rbx
  10737. adoxq %rcx, %r10
  10738. # A[2] * B[1]
  10739. mulxq 16(%rsp), %rcx, %r15
  10740. adoxq %rbx, %r11
  10741. adcxq %rcx, %r12
  10742. # A[1] * B[2]
  10743. movq 144(%rsp), %rdx
  10744. mulxq 8(%rsp), %rcx, %rbx
  10745. adcxq %r15, %r13
  10746. adoxq %rcx, %r12
  10747. adcxq %rbp, %r14
  10748. adoxq %rbx, %r13
  10749. # A[0] * B[2]
  10750. mulxq (%rsp), %rcx, %rbx
  10751. adoxq %rbp, %r14
  10752. xorq %r15, %r15
  10753. adcxq %rcx, %r11
  10754. # A[1] * B[1]
  10755. movq 136(%rsp), %rdx
  10756. mulxq 8(%rsp), %rdx, %rcx
  10757. adcxq %rbx, %r12
  10758. adoxq %rdx, %r11
  10759. # A[3] * B[1]
  10760. movq 136(%rsp), %rdx
  10761. adoxq %rcx, %r12
  10762. mulxq 24(%rsp), %rcx, %rbx
  10763. adcxq %rcx, %r13
  10764. # A[2] * B[2]
  10765. movq 144(%rsp), %rdx
  10766. mulxq 16(%rsp), %rdx, %rcx
  10767. adcxq %rbx, %r14
  10768. adoxq %rdx, %r13
  10769. # A[3] * B[3]
  10770. movq 152(%rsp), %rdx
  10771. adoxq %rcx, %r14
  10772. mulxq 24(%rsp), %rcx, %rbx
  10773. adoxq %rbp, %r15
  10774. adcxq %rcx, %r15
  10775. # A[0] * B[3]
  10776. mulxq (%rsp), %rdx, %rcx
  10777. adcxq %rbx, %rbp
  10778. xorq %rbx, %rbx
  10779. adcxq %rdx, %r12
  10780. # A[3] * B[0]
  10781. movq 128(%rsp), %rdx
  10782. adcxq %rcx, %r13
  10783. mulxq 24(%rsp), %rdx, %rcx
  10784. adoxq %rdx, %r12
  10785. adoxq %rcx, %r13
  10786. # A[2] * B[3]
  10787. movq 152(%rsp), %rdx
  10788. mulxq 16(%rsp), %rdx, %rcx
  10789. adcxq %rdx, %r14
  10790. # A[3] * B[2]
  10791. movq 144(%rsp), %rdx
  10792. adcxq %rcx, %r15
  10793. mulxq 24(%rsp), %rcx, %rdx
  10794. adcxq %rbx, %rbp
  10795. adoxq %rcx, %r14
  10796. adoxq %rdx, %r15
  10797. adoxq %rbx, %rbp
  10798. # Reduce
  10799. movq $0x7fffffffffffffff, %rbx
  10800. # Move top half into t4-t7 and remove top bit from t3
  10801. shldq $0x01, %r15, %rbp
  10802. shldq $0x01, %r14, %r15
  10803. shldq $0x01, %r13, %r14
  10804. shldq $0x01, %r12, %r13
  10805. andq %rbx, %r12
  10806. # Multiply top half by 19
  10807. movq $19, %rdx
  10808. xorq %rbx, %rbx
  10809. mulxq %r13, %rcx, %r13
  10810. adcxq %rcx, %r9
  10811. adoxq %r13, %r10
  10812. mulxq %r14, %rcx, %r14
  10813. adcxq %rcx, %r10
  10814. adoxq %r14, %r11
  10815. mulxq %r15, %rcx, %r15
  10816. adcxq %rcx, %r11
  10817. adoxq %r15, %r12
  10818. mulxq %rbp, %rbp, %rdx
  10819. adcxq %rbp, %r12
  10820. adoxq %rbx, %rdx
  10821. adcxq %rbx, %rdx
  10822. # Overflow
  10823. shldq $0x01, %r12, %rdx
  10824. movq $0x7fffffffffffffff, %rbx
  10825. imulq $19, %rdx, %rcx
  10826. andq %rbx, %r12
  10827. addq %rcx, %r9
  10828. adcq $0x00, %r10
  10829. adcq $0x00, %r11
  10830. adcq $0x00, %r12
  10831. # Reduce if top bit set
  10832. movq %r12, %rdx
  10833. sarq $63, %rdx
  10834. andq $19, %rdx
  10835. andq %rbx, %r12
  10836. addq %rdx, %r9
  10837. adcq $0x00, %r10
  10838. adcq $0x00, %r11
  10839. adcq $0x00, %r12
  10840. # Store
  10841. movq %r9, (%rsp)
  10842. movq %r10, 8(%rsp)
  10843. movq %r11, 16(%rsp)
  10844. movq %r12, 24(%rsp)
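# Note: the squaring below uses the same mulx/adcx/adox scheme but computes
# each off-diagonal product A[i]*A[j] (i < j) only once; the
# "Double with Carry Flag" step then doubles those terms on the CF chain
# while the diagonal squares A[i]*A[i] are accumulated on the OF chain,
# before the usual 2^255 == 19 (mod p) reduction.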
  10845. # Square
  10846. # A[0] * A[1]
  10847. movq 128(%rsp), %rdx
  10848. mulxq 136(%rsp), %r10, %r11
  10849. # A[0] * A[3]
  10850. mulxq 152(%rsp), %r12, %r13
  10851. # A[2] * A[1]
  10852. movq 144(%rsp), %rdx
  10853. mulxq 136(%rsp), %rcx, %rbx
  10854. xorq %rbp, %rbp
  10855. adoxq %rcx, %r12
  10856. # A[2] * A[3]
  10857. mulxq 152(%rsp), %r14, %r15
  10858. adoxq %rbx, %r13
  10859. # A[2] * A[0]
  10860. mulxq 128(%rsp), %rcx, %rbx
  10861. adoxq %rbp, %r14
  10862. adcxq %rcx, %r11
  10863. adoxq %rbp, %r15
  10864. # A[1] * A[3]
  10865. movq 136(%rsp), %rdx
  10866. mulxq 152(%rsp), %rax, %r9
  10867. adcxq %rbx, %r12
  10868. adcxq %rax, %r13
  10869. adcxq %r9, %r14
  10870. adcxq %rbp, %r15
  10871. # Double with Carry Flag
  10872. xorq %rbp, %rbp
  10873. # A[0] * A[0]
  10874. movq 128(%rsp), %rdx
  10875. mulxq %rdx, %r9, %rax
  10876. adcxq %r10, %r10
  10877. # A[1] * A[1]
  10878. movq 136(%rsp), %rdx
  10879. mulxq %rdx, %rcx, %rbx
  10880. adcxq %r11, %r11
  10881. adoxq %rax, %r10
  10882. adcxq %r12, %r12
  10883. adoxq %rcx, %r11
  10884. # A[2] * A[2]
  10885. movq 144(%rsp), %rdx
  10886. mulxq %rdx, %rax, %rcx
  10887. adcxq %r13, %r13
  10888. adoxq %rbx, %r12
  10889. adcxq %r14, %r14
  10890. adoxq %rax, %r13
  10891. # A[3] * A[3]
  10892. movq 152(%rsp), %rdx
  10893. mulxq %rdx, %rax, %rbx
  10894. adcxq %r15, %r15
  10895. adoxq %rcx, %r14
  10896. adcxq %rbp, %rbp
  10897. adoxq %rax, %r15
  10898. adoxq %rbx, %rbp
  10899. # Reduce
  10900. movq $0x7fffffffffffffff, %rcx
  10901. # Move top half into t4-t7 and remove top bit from t3
  10902. shldq $0x01, %r15, %rbp
  10903. shldq $0x01, %r14, %r15
  10904. shldq $0x01, %r13, %r14
  10905. shldq $0x01, %r12, %r13
  10906. andq %rcx, %r12
  10907. # Multiply top half by 19
  10908. movq $19, %rdx
  10909. xorq %rcx, %rcx
  10910. mulxq %r13, %rax, %r13
  10911. adcxq %rax, %r9
  10912. adoxq %r13, %r10
  10913. mulxq %r14, %rax, %r14
  10914. adcxq %rax, %r10
  10915. adoxq %r14, %r11
  10916. mulxq %r15, %rax, %r15
  10917. adcxq %rax, %r11
  10918. adoxq %r15, %r12
  10919. mulxq %rbp, %rbp, %rdx
  10920. adcxq %rbp, %r12
  10921. adoxq %rcx, %rdx
  10922. adcxq %rcx, %rdx
  10923. # Overflow
  10924. shldq $0x01, %r12, %rdx
  10925. movq $0x7fffffffffffffff, %rcx
  10926. imulq $19, %rdx, %rax
  10927. andq %rcx, %r12
  10928. addq %rax, %r9
  10929. adcq $0x00, %r10
  10930. adcq $0x00, %r11
  10931. adcq $0x00, %r12
  10932. # Reduce if top bit set
  10933. movq %r12, %rdx
  10934. sarq $63, %rdx
  10935. andq $19, %rdx
  10936. andq %rcx, %r12
  10937. addq %rdx, %r9
  10938. adcq $0x00, %r10
  10939. adcq $0x00, %r11
  10940. adcq $0x00, %r12
  10941. # Store
  10942. movq %r9, 96(%rsp)
  10943. movq %r10, 104(%rsp)
  10944. movq %r11, 112(%rsp)
  10945. movq %r12, 120(%rsp)
  10946. # Square
  10947. # A[0] * A[1]
  10948. movq (%rdi), %rdx
  10949. mulxq 8(%rdi), %r10, %r11
  10950. # A[0] * A[3]
  10951. mulxq 24(%rdi), %r12, %r13
  10952. # A[2] * A[1]
  10953. movq 16(%rdi), %rdx
  10954. mulxq 8(%rdi), %rcx, %rbx
  10955. xorq %rbp, %rbp
  10956. adoxq %rcx, %r12
  10957. # A[2] * A[3]
  10958. mulxq 24(%rdi), %r14, %r15
  10959. adoxq %rbx, %r13
  10960. # A[2] * A[0]
  10961. mulxq (%rdi), %rcx, %rbx
  10962. adoxq %rbp, %r14
  10963. adcxq %rcx, %r11
  10964. adoxq %rbp, %r15
  10965. # A[1] * A[3]
  10966. movq 8(%rdi), %rdx
  10967. mulxq 24(%rdi), %rax, %r9
  10968. adcxq %rbx, %r12
  10969. adcxq %rax, %r13
  10970. adcxq %r9, %r14
  10971. adcxq %rbp, %r15
  10972. # Double with Carry Flag
  10973. xorq %rbp, %rbp
  10974. # A[0] * A[0]
  10975. movq (%rdi), %rdx
  10976. mulxq %rdx, %r9, %rax
  10977. adcxq %r10, %r10
  10978. # A[1] * A[1]
  10979. movq 8(%rdi), %rdx
  10980. mulxq %rdx, %rcx, %rbx
  10981. adcxq %r11, %r11
  10982. adoxq %rax, %r10
  10983. adcxq %r12, %r12
  10984. adoxq %rcx, %r11
  10985. # A[2] * A[2]
  10986. movq 16(%rdi), %rdx
  10987. mulxq %rdx, %rax, %rcx
  10988. adcxq %r13, %r13
  10989. adoxq %rbx, %r12
  10990. adcxq %r14, %r14
  10991. adoxq %rax, %r13
  10992. # A[3] * A[3]
  10993. movq 24(%rdi), %rdx
  10994. mulxq %rdx, %rax, %rbx
  10995. adcxq %r15, %r15
  10996. adoxq %rcx, %r14
  10997. adcxq %rbp, %rbp
  10998. adoxq %rax, %r15
  10999. adoxq %rbx, %rbp
  11000. # Reduce
  11001. movq $0x7fffffffffffffff, %rcx
  11002. # Move top half into t4-t7 and remove top bit from t3
  11003. shldq $0x01, %r15, %rbp
  11004. shldq $0x01, %r14, %r15
  11005. shldq $0x01, %r13, %r14
  11006. shldq $0x01, %r12, %r13
  11007. andq %rcx, %r12
  11008. # Multiply top half by 19
  11009. movq $19, %rdx
  11010. xorq %rcx, %rcx
  11011. mulxq %r13, %rax, %r13
  11012. adcxq %rax, %r9
  11013. adoxq %r13, %r10
  11014. mulxq %r14, %rax, %r14
  11015. adcxq %rax, %r10
  11016. adoxq %r14, %r11
  11017. mulxq %r15, %rax, %r15
  11018. adcxq %rax, %r11
  11019. adoxq %r15, %r12
  11020. mulxq %rbp, %rbp, %rdx
  11021. adcxq %rbp, %r12
  11022. adoxq %rcx, %rdx
  11023. adcxq %rcx, %rdx
  11024. # Overflow
  11025. shldq $0x01, %r12, %rdx
  11026. movq $0x7fffffffffffffff, %rcx
  11027. imulq $19, %rdx, %rax
  11028. andq %rcx, %r12
  11029. addq %rax, %r9
  11030. adcq $0x00, %r10
  11031. adcq $0x00, %r11
  11032. adcq $0x00, %r12
  11033. # Reduce if top bit set
  11034. movq %r12, %rdx
  11035. sarq $63, %rdx
  11036. andq $19, %rdx
  11037. andq %rcx, %r12
  11038. addq %rdx, %r9
  11039. adcq $0x00, %r10
  11040. adcq $0x00, %r11
  11041. adcq $0x00, %r12
  11042. # Store
  11043. movq %r9, 128(%rsp)
  11044. movq %r10, 136(%rsp)
  11045. movq %r11, 144(%rsp)
  11046. movq %r12, 152(%rsp)
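# Note: the combined add/sub below produces both the sum and the difference
# of two field elements without branching.  After the raw add, the sign bit
# of the top limb (sarq $63) forms a mask so that p = 2^255 - 19 is
# subtracted only when bit 255 is set; after the raw sub, the borrow forms
# the same mask so that p is added back only on underflow.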
  11047. # Add
  11048. movq 32(%rsp), %r9
  11049. movq 40(%rsp), %r10
  11050. movq 48(%rsp), %r11
  11051. movq 56(%rsp), %rax
  11052. movq %r9, %r13
  11053. addq (%rsp), %r9
  11054. movq %r10, %r14
  11055. adcq 8(%rsp), %r10
  11056. movq %r11, %r15
  11057. adcq 16(%rsp), %r11
  11058. movq %rax, %rbp
  11059. adcq 24(%rsp), %rax
  11060. movq $-19, %rcx
  11061. movq %rax, %r12
  11062. movq $0x7fffffffffffffff, %rbx
  11063. sarq $63, %rax
  11064. # Mask the modulus
  11065. andq %rax, %rcx
  11066. andq %rax, %rbx
  11067. # Sub modulus (if overflow)
  11068. subq %rcx, %r9
  11069. sbbq %rax, %r10
  11070. sbbq %rax, %r11
  11071. sbbq %rbx, %r12
  11072. # Sub
  11073. subq (%rsp), %r13
  11074. movq $0x00, %rax
  11075. sbbq 8(%rsp), %r14
  11076. movq $-19, %rcx
  11077. sbbq 16(%rsp), %r15
  11078. movq $0x7fffffffffffffff, %rbx
  11079. sbbq 24(%rsp), %rbp
  11080. sbbq $0x00, %rax
  11081. # Mask the modulus
  11082. andq %rax, %rcx
  11083. andq %rax, %rbx
  11084. # Add modulus (if underflow)
  11085. addq %rcx, %r13
  11086. adcq %rax, %r14
  11087. adcq %rax, %r15
  11088. adcq %rbx, %rbp
  11089. movq %r9, 64(%rsp)
  11090. movq %r10, 72(%rsp)
  11091. movq %r11, 80(%rsp)
  11092. movq %r12, 88(%rsp)
  11093. movq %r13, (%rsp)
  11094. movq %r14, 8(%rsp)
  11095. movq %r15, 16(%rsp)
  11096. movq %rbp, 24(%rsp)
  11097. # Multiply
  11098. # A[0] * B[0]
  11099. movq 96(%rsp), %rdx
  11100. mulxq 128(%rsp), %r9, %r10
  11101. # A[2] * B[0]
  11102. mulxq 144(%rsp), %r11, %r12
  11103. # A[1] * B[0]
  11104. mulxq 136(%rsp), %rcx, %rbx
  11105. xorq %rbp, %rbp
  11106. adcxq %rcx, %r10
  11107. # A[1] * B[3]
  11108. movq 120(%rsp), %rdx
  11109. mulxq 136(%rsp), %r13, %r14
  11110. adcxq %rbx, %r11
  11111. # A[0] * B[1]
  11112. movq 104(%rsp), %rdx
  11113. mulxq 128(%rsp), %rcx, %rbx
  11114. adoxq %rcx, %r10
  11115. # A[2] * B[1]
  11116. mulxq 144(%rsp), %rcx, %r15
  11117. adoxq %rbx, %r11
  11118. adcxq %rcx, %r12
  11119. # A[1] * B[2]
  11120. movq 112(%rsp), %rdx
  11121. mulxq 136(%rsp), %rcx, %rbx
  11122. adcxq %r15, %r13
  11123. adoxq %rcx, %r12
  11124. adcxq %rbp, %r14
  11125. adoxq %rbx, %r13
  11126. # A[0] * B[2]
  11127. mulxq 128(%rsp), %rcx, %rbx
  11128. adoxq %rbp, %r14
  11129. xorq %r15, %r15
  11130. adcxq %rcx, %r11
  11131. # A[1] * B[1]
  11132. movq 104(%rsp), %rdx
  11133. mulxq 136(%rsp), %rdx, %rcx
  11134. adcxq %rbx, %r12
  11135. adoxq %rdx, %r11
  11136. # A[3] * B[1]
  11137. movq 104(%rsp), %rdx
  11138. adoxq %rcx, %r12
  11139. mulxq 152(%rsp), %rcx, %rbx
  11140. adcxq %rcx, %r13
  11141. # A[2] * B[2]
  11142. movq 112(%rsp), %rdx
  11143. mulxq 144(%rsp), %rdx, %rcx
  11144. adcxq %rbx, %r14
  11145. adoxq %rdx, %r13
  11146. # A[3] * B[3]
  11147. movq 120(%rsp), %rdx
  11148. adoxq %rcx, %r14
  11149. mulxq 152(%rsp), %rcx, %rbx
  11150. adoxq %rbp, %r15
  11151. adcxq %rcx, %r15
  11152. # A[0] * B[3]
  11153. mulxq 128(%rsp), %rdx, %rcx
  11154. adcxq %rbx, %rbp
  11155. xorq %rbx, %rbx
  11156. adcxq %rdx, %r12
  11157. # A[3] * B[0]
  11158. movq 96(%rsp), %rdx
  11159. adcxq %rcx, %r13
  11160. mulxq 152(%rsp), %rdx, %rcx
  11161. adoxq %rdx, %r12
  11162. adoxq %rcx, %r13
  11163. # A[2] * B[3]
  11164. movq 120(%rsp), %rdx
  11165. mulxq 144(%rsp), %rdx, %rcx
  11166. adcxq %rdx, %r14
  11167. # A[3] * B[2]
  11168. movq 112(%rsp), %rdx
  11169. adcxq %rcx, %r15
  11170. mulxq 152(%rsp), %rcx, %rdx
  11171. adcxq %rbx, %rbp
  11172. adoxq %rcx, %r14
  11173. adoxq %rdx, %r15
  11174. adoxq %rbx, %rbp
  11175. # Reduce
  11176. movq $0x7fffffffffffffff, %rbx
  11177. # Move top half into t4-t7 and remove top bit from t3
  11178. shldq $0x01, %r15, %rbp
  11179. shldq $0x01, %r14, %r15
  11180. shldq $0x01, %r13, %r14
  11181. shldq $0x01, %r12, %r13
  11182. andq %rbx, %r12
  11183. # Multiply top half by 19
  11184. movq $19, %rdx
  11185. xorq %rbx, %rbx
  11186. mulxq %r13, %rcx, %r13
  11187. adcxq %rcx, %r9
  11188. adoxq %r13, %r10
  11189. mulxq %r14, %rcx, %r14
  11190. adcxq %rcx, %r10
  11191. adoxq %r14, %r11
  11192. mulxq %r15, %rcx, %r15
  11193. adcxq %rcx, %r11
  11194. adoxq %r15, %r12
  11195. mulxq %rbp, %rbp, %rdx
  11196. adcxq %rbp, %r12
  11197. adoxq %rbx, %rdx
  11198. adcxq %rbx, %rdx
  11199. # Overflow
  11200. shldq $0x01, %r12, %rdx
  11201. movq $0x7fffffffffffffff, %rbx
  11202. imulq $19, %rdx, %rcx
  11203. andq %rbx, %r12
  11204. addq %rcx, %r9
  11205. adcq $0x00, %r10
  11206. adcq $0x00, %r11
  11207. adcq $0x00, %r12
  11208. # Reduce if top bit set
  11209. movq %r12, %rdx
  11210. sarq $63, %rdx
  11211. andq $19, %rdx
  11212. andq %rbx, %r12
  11213. addq %rdx, %r9
  11214. adcq $0x00, %r10
  11215. adcq $0x00, %r11
  11216. adcq $0x00, %r12
  11217. # Store
  11218. movq %r9, (%rdi)
  11219. movq %r10, 8(%rdi)
  11220. movq %r11, 16(%rdi)
  11221. movq %r12, 24(%rdi)
  11222. # Sub
  11223. movq 128(%rsp), %r9
  11224. movq 136(%rsp), %r10
  11225. movq 144(%rsp), %r11
  11226. movq 152(%rsp), %r12
  11227. subq 96(%rsp), %r9
  11228. movq $0x00, %rax
  11229. sbbq 104(%rsp), %r10
  11230. movq $-19, %rcx
  11231. sbbq 112(%rsp), %r11
  11232. movq $0x7fffffffffffffff, %rbx
  11233. sbbq 120(%rsp), %r12
  11234. sbbq $0x00, %rax
  11235. # Mask the modulus
  11236. andq %rax, %rcx
  11237. andq %rax, %rbx
  11238. # Add modulus (if underflow)
  11239. addq %rcx, %r9
  11240. adcq %rax, %r10
  11241. adcq %rax, %r11
  11242. adcq %rbx, %r12
  11243. movq %r9, 128(%rsp)
  11244. movq %r10, 136(%rsp)
  11245. movq %r11, 144(%rsp)
  11246. movq %r12, 152(%rsp)
  11247. # Square
  11248. # A[0] * A[1]
  11249. movq (%rsp), %rdx
  11250. mulxq 8(%rsp), %r10, %r11
  11251. # A[0] * A[3]
  11252. mulxq 24(%rsp), %r12, %r13
  11253. # A[2] * A[1]
  11254. movq 16(%rsp), %rdx
  11255. mulxq 8(%rsp), %rcx, %rbx
  11256. xorq %rbp, %rbp
  11257. adoxq %rcx, %r12
  11258. # A[2] * A[3]
  11259. mulxq 24(%rsp), %r14, %r15
  11260. adoxq %rbx, %r13
  11261. # A[2] * A[0]
  11262. mulxq (%rsp), %rcx, %rbx
  11263. adoxq %rbp, %r14
  11264. adcxq %rcx, %r11
  11265. adoxq %rbp, %r15
  11266. # A[1] * A[3]
  11267. movq 8(%rsp), %rdx
  11268. mulxq 24(%rsp), %rax, %r9
  11269. adcxq %rbx, %r12
  11270. adcxq %rax, %r13
  11271. adcxq %r9, %r14
  11272. adcxq %rbp, %r15
  11273. # Double with Carry Flag
  11274. xorq %rbp, %rbp
  11275. # A[0] * A[0]
  11276. movq (%rsp), %rdx
  11277. mulxq %rdx, %r9, %rax
  11278. adcxq %r10, %r10
  11279. # A[1] * A[1]
  11280. movq 8(%rsp), %rdx
  11281. mulxq %rdx, %rcx, %rbx
  11282. adcxq %r11, %r11
  11283. adoxq %rax, %r10
  11284. adcxq %r12, %r12
  11285. adoxq %rcx, %r11
  11286. # A[2] * A[2]
  11287. movq 16(%rsp), %rdx
  11288. mulxq %rdx, %rax, %rcx
  11289. adcxq %r13, %r13
  11290. adoxq %rbx, %r12
  11291. adcxq %r14, %r14
  11292. adoxq %rax, %r13
  11293. # A[3] * A[3]
  11294. movq 24(%rsp), %rdx
  11295. mulxq %rdx, %rax, %rbx
  11296. adcxq %r15, %r15
  11297. adoxq %rcx, %r14
  11298. adcxq %rbp, %rbp
  11299. adoxq %rax, %r15
  11300. adoxq %rbx, %rbp
  11301. # Reduce
  11302. movq $0x7fffffffffffffff, %rcx
  11303. # Move top half into t4-t7 and remove top bit from t3
  11304. shldq $0x01, %r15, %rbp
  11305. shldq $0x01, %r14, %r15
  11306. shldq $0x01, %r13, %r14
  11307. shldq $0x01, %r12, %r13
  11308. andq %rcx, %r12
  11309. # Multiply top half by 19
  11310. movq $19, %rdx
  11311. xorq %rcx, %rcx
  11312. mulxq %r13, %rax, %r13
  11313. adcxq %rax, %r9
  11314. adoxq %r13, %r10
  11315. mulxq %r14, %rax, %r14
  11316. adcxq %rax, %r10
  11317. adoxq %r14, %r11
  11318. mulxq %r15, %rax, %r15
  11319. adcxq %rax, %r11
  11320. adoxq %r15, %r12
  11321. mulxq %rbp, %rbp, %rdx
  11322. adcxq %rbp, %r12
  11323. adoxq %rcx, %rdx
  11324. adcxq %rcx, %rdx
  11325. # Overflow
  11326. shldq $0x01, %r12, %rdx
  11327. movq $0x7fffffffffffffff, %rcx
  11328. imulq $19, %rdx, %rax
  11329. andq %rcx, %r12
  11330. addq %rax, %r9
  11331. adcq $0x00, %r10
  11332. adcq $0x00, %r11
  11333. adcq $0x00, %r12
  11334. # Reduce if top bit set
  11335. movq %r12, %rdx
  11336. sarq $63, %rdx
  11337. andq $19, %rdx
  11338. andq %rcx, %r12
  11339. addq %rdx, %r9
  11340. adcq $0x00, %r10
  11341. adcq $0x00, %r11
  11342. adcq $0x00, %r12
  11343. # Store
  11344. movq %r9, (%rsp)
  11345. movq %r10, 8(%rsp)
  11346. movq %r11, 16(%rsp)
  11347. movq %r12, 24(%rsp)
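# Note: 0x1db42 = 121666 = (486662 + 2) / 4, the a24 constant of the
# curve25519 Montgomery ladder.  The block below multiplies a field element
# by this small constant with four mulx instructions and folds the single
# overflow limb back in via 2^255 == 19, which is cheaper than a full
# 4x4 multiply and reduce.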
  11348. movq $0x1db42, %rdx
  11349. mulxq 128(%rsp), %r9, %rbp
  11350. mulxq 136(%rsp), %r10, %r15
  11351. mulxq 144(%rsp), %r11, %r14
  11352. mulxq 152(%rsp), %r12, %r13
  11353. addq %rbp, %r10
  11354. adcq %r15, %r11
  11355. adcq %r14, %r12
  11356. adcq $0x00, %r13
  11357. movq $0x7fffffffffffffff, %rbp
  11358. shldq $0x01, %r12, %r13
  11359. andq %rbp, %r12
  11360. imulq $19, %r13, %r13
  11361. addq %r13, %r9
  11362. adcq $0x00, %r10
  11363. adcq $0x00, %r11
  11364. adcq $0x00, %r12
  11365. movq %r9, 32(%rsp)
  11366. movq %r10, 40(%rsp)
  11367. movq %r11, 48(%rsp)
  11368. movq %r12, 56(%rsp)
  11369. # Square
  11370. # A[0] * A[1]
  11371. movq 64(%rsp), %rdx
  11372. mulxq 72(%rsp), %r10, %r11
  11373. # A[0] * A[3]
  11374. mulxq 88(%rsp), %r12, %r13
  11375. # A[2] * A[1]
  11376. movq 80(%rsp), %rdx
  11377. mulxq 72(%rsp), %rcx, %rbx
  11378. xorq %rbp, %rbp
  11379. adoxq %rcx, %r12
  11380. # A[2] * A[3]
  11381. mulxq 88(%rsp), %r14, %r15
  11382. adoxq %rbx, %r13
  11383. # A[2] * A[0]
  11384. mulxq 64(%rsp), %rcx, %rbx
  11385. adoxq %rbp, %r14
  11386. adcxq %rcx, %r11
  11387. adoxq %rbp, %r15
  11388. # A[1] * A[3]
  11389. movq 72(%rsp), %rdx
  11390. mulxq 88(%rsp), %rax, %r9
  11391. adcxq %rbx, %r12
  11392. adcxq %rax, %r13
  11393. adcxq %r9, %r14
  11394. adcxq %rbp, %r15
  11395. # Double with Carry Flag
  11396. xorq %rbp, %rbp
  11397. # A[0] * A[0]
  11398. movq 64(%rsp), %rdx
  11399. mulxq %rdx, %r9, %rax
  11400. adcxq %r10, %r10
  11401. # A[1] * A[1]
  11402. movq 72(%rsp), %rdx
  11403. mulxq %rdx, %rcx, %rbx
  11404. adcxq %r11, %r11
  11405. adoxq %rax, %r10
  11406. adcxq %r12, %r12
  11407. adoxq %rcx, %r11
  11408. # A[2] * A[2]
  11409. movq 80(%rsp), %rdx
  11410. mulxq %rdx, %rax, %rcx
  11411. adcxq %r13, %r13
  11412. adoxq %rbx, %r12
  11413. adcxq %r14, %r14
  11414. adoxq %rax, %r13
  11415. # A[3] * A[3]
  11416. movq 88(%rsp), %rdx
  11417. mulxq %rdx, %rax, %rbx
  11418. adcxq %r15, %r15
  11419. adoxq %rcx, %r14
  11420. adcxq %rbp, %rbp
  11421. adoxq %rax, %r15
  11422. adoxq %rbx, %rbp
  11423. # Reduce
  11424. movq $0x7fffffffffffffff, %rcx
  11425. # Move top half into t4-t7 and remove top bit from t3
  11426. shldq $0x01, %r15, %rbp
  11427. shldq $0x01, %r14, %r15
  11428. shldq $0x01, %r13, %r14
  11429. shldq $0x01, %r12, %r13
  11430. andq %rcx, %r12
  11431. # Multiply top half by 19
  11432. movq $19, %rdx
  11433. xorq %rcx, %rcx
  11434. mulxq %r13, %rax, %r13
  11435. adcxq %rax, %r9
  11436. adoxq %r13, %r10
  11437. mulxq %r14, %rax, %r14
  11438. adcxq %rax, %r10
  11439. adoxq %r14, %r11
  11440. mulxq %r15, %rax, %r15
  11441. adcxq %rax, %r11
  11442. adoxq %r15, %r12
  11443. mulxq %rbp, %rbp, %rdx
  11444. adcxq %rbp, %r12
  11445. adoxq %rcx, %rdx
  11446. adcxq %rcx, %rdx
  11447. # Overflow
  11448. shldq $0x01, %r12, %rdx
  11449. movq $0x7fffffffffffffff, %rcx
  11450. imulq $19, %rdx, %rax
  11451. andq %rcx, %r12
  11452. addq %rax, %r9
  11453. adcq $0x00, %r10
  11454. adcq $0x00, %r11
  11455. adcq $0x00, %r12
  11456. # Reduce if top bit set
  11457. movq %r12, %rdx
  11458. sarq $63, %rdx
  11459. andq $19, %rdx
  11460. andq %rcx, %r12
  11461. addq %rdx, %r9
  11462. adcq $0x00, %r10
  11463. adcq $0x00, %r11
  11464. adcq $0x00, %r12
  11465. # Store
  11466. movq %r9, 64(%rsp)
  11467. movq %r10, 72(%rsp)
  11468. movq %r11, 80(%rsp)
  11469. movq %r12, 88(%rsp)
  11470. # Add
  11471. movq 96(%rsp), %r9
  11472. movq 104(%rsp), %r10
  11473. addq 32(%rsp), %r9
  11474. movq 112(%rsp), %r11
  11475. adcq 40(%rsp), %r10
  11476. movq 120(%rsp), %rax
  11477. adcq 48(%rsp), %r11
  11478. movq $-19, %rcx
  11479. adcq 56(%rsp), %rax
  11480. movq $0x7fffffffffffffff, %rbx
  11481. movq %rax, %r12
  11482. sarq $63, %rax
  11483. # Mask the modulus
  11484. andq %rax, %rcx
  11485. andq %rax, %rbx
  11486. # Sub modulus (if overflow)
  11487. subq %rcx, %r9
  11488. sbbq %rax, %r10
  11489. sbbq %rax, %r11
  11490. sbbq %rbx, %r12
  11491. movq %r9, 96(%rsp)
  11492. movq %r10, 104(%rsp)
  11493. movq %r11, 112(%rsp)
  11494. movq %r12, 120(%rsp)
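# Note (assumption): (%r8) appears to hold a pointer to the base point's
# x-coordinate (x1), kept in a register across the ladder loop; the
# Montgomery ladder multiplies the squared difference term by x1 when
# recomputing the z-coordinate of the second running point.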
  11495. # Multiply
  11496. # A[0] * B[0]
  11497. movq (%rsp), %rdx
  11498. mulxq (%r8), %r9, %r10
  11499. # A[2] * B[0]
  11500. mulxq 16(%r8), %r11, %r12
  11501. # A[1] * B[0]
  11502. mulxq 8(%r8), %rcx, %rbx
  11503. xorq %rbp, %rbp
  11504. adcxq %rcx, %r10
  11505. # A[1] * B[3]
  11506. movq 24(%rsp), %rdx
  11507. mulxq 8(%r8), %r13, %r14
  11508. adcxq %rbx, %r11
  11509. # A[0] * B[1]
  11510. movq 8(%rsp), %rdx
  11511. mulxq (%r8), %rcx, %rbx
  11512. adoxq %rcx, %r10
  11513. # A[2] * B[1]
  11514. mulxq 16(%r8), %rcx, %r15
  11515. adoxq %rbx, %r11
  11516. adcxq %rcx, %r12
  11517. # A[1] * B[2]
  11518. movq 16(%rsp), %rdx
  11519. mulxq 8(%r8), %rcx, %rbx
  11520. adcxq %r15, %r13
  11521. adoxq %rcx, %r12
  11522. adcxq %rbp, %r14
  11523. adoxq %rbx, %r13
  11524. # A[0] * B[2]
  11525. mulxq (%r8), %rcx, %rbx
  11526. adoxq %rbp, %r14
  11527. xorq %r15, %r15
  11528. adcxq %rcx, %r11
  11529. # A[1] * B[1]
  11530. movq 8(%rsp), %rdx
  11531. mulxq 8(%r8), %rdx, %rcx
  11532. adcxq %rbx, %r12
  11533. adoxq %rdx, %r11
  11534. # A[3] * B[1]
  11535. movq 8(%rsp), %rdx
  11536. adoxq %rcx, %r12
  11537. mulxq 24(%r8), %rcx, %rbx
  11538. adcxq %rcx, %r13
  11539. # A[2] * B[2]
  11540. movq 16(%rsp), %rdx
  11541. mulxq 16(%r8), %rdx, %rcx
  11542. adcxq %rbx, %r14
  11543. adoxq %rdx, %r13
  11544. # A[3] * B[3]
  11545. movq 24(%rsp), %rdx
  11546. adoxq %rcx, %r14
  11547. mulxq 24(%r8), %rcx, %rbx
  11548. adoxq %rbp, %r15
  11549. adcxq %rcx, %r15
  11550. # A[0] * B[3]
  11551. mulxq (%r8), %rdx, %rcx
  11552. adcxq %rbx, %rbp
  11553. xorq %rbx, %rbx
  11554. adcxq %rdx, %r12
  11555. # A[3] * B[0]
  11556. movq (%rsp), %rdx
  11557. adcxq %rcx, %r13
  11558. mulxq 24(%r8), %rdx, %rcx
  11559. adoxq %rdx, %r12
  11560. adoxq %rcx, %r13
  11561. # A[2] * B[3]
  11562. movq 24(%rsp), %rdx
  11563. mulxq 16(%r8), %rdx, %rcx
  11564. adcxq %rdx, %r14
  11565. # A[3] * B[2]
  11566. movq 16(%rsp), %rdx
  11567. adcxq %rcx, %r15
  11568. mulxq 24(%r8), %rcx, %rdx
  11569. adcxq %rbx, %rbp
  11570. adoxq %rcx, %r14
  11571. adoxq %rdx, %r15
  11572. adoxq %rbx, %rbp
  11573. # Reduce
  11574. movq $0x7fffffffffffffff, %rbx
  11575. # Move top half into t4-t7 and remove top bit from t3
  11576. shldq $0x01, %r15, %rbp
  11577. shldq $0x01, %r14, %r15
  11578. shldq $0x01, %r13, %r14
  11579. shldq $0x01, %r12, %r13
  11580. andq %rbx, %r12
  11581. # Multiply top half by 19
  11582. movq $19, %rdx
  11583. xorq %rbx, %rbx
  11584. mulxq %r13, %rcx, %r13
  11585. adcxq %rcx, %r9
  11586. adoxq %r13, %r10
  11587. mulxq %r14, %rcx, %r14
  11588. adcxq %rcx, %r10
  11589. adoxq %r14, %r11
  11590. mulxq %r15, %rcx, %r15
  11591. adcxq %rcx, %r11
  11592. adoxq %r15, %r12
  11593. mulxq %rbp, %rbp, %rdx
  11594. adcxq %rbp, %r12
  11595. adoxq %rbx, %rdx
  11596. adcxq %rbx, %rdx
  11597. # Overflow
  11598. shldq $0x01, %r12, %rdx
  11599. movq $0x7fffffffffffffff, %rbx
  11600. imulq $19, %rdx, %rcx
  11601. andq %rbx, %r12
  11602. addq %rcx, %r9
  11603. adcq $0x00, %r10
  11604. adcq $0x00, %r11
  11605. adcq $0x00, %r12
  11606. # Reduce if top bit set
  11607. movq %r12, %rdx
  11608. sarq $63, %rdx
  11609. andq $19, %rdx
  11610. andq %rbx, %r12
  11611. addq %rdx, %r9
  11612. adcq $0x00, %r10
  11613. adcq $0x00, %r11
  11614. adcq $0x00, %r12
  11615. # Store
  11616. movq %r9, 32(%rsp)
  11617. movq %r10, 40(%rsp)
  11618. movq %r11, 48(%rsp)
  11619. movq %r12, 56(%rsp)
  11620. # Multiply
  11621. # A[0] * B[0]
  11622. movq 96(%rsp), %rdx
  11623. mulxq 128(%rsp), %r9, %r10
  11624. # A[2] * B[0]
  11625. mulxq 144(%rsp), %r11, %r12
  11626. # A[1] * B[0]
  11627. mulxq 136(%rsp), %rcx, %rbx
  11628. xorq %rbp, %rbp
  11629. adcxq %rcx, %r10
  11630. # A[1] * B[3]
  11631. movq 120(%rsp), %rdx
  11632. mulxq 136(%rsp), %r13, %r14
  11633. adcxq %rbx, %r11
  11634. # A[0] * B[1]
  11635. movq 104(%rsp), %rdx
  11636. mulxq 128(%rsp), %rcx, %rbx
  11637. adoxq %rcx, %r10
  11638. # A[2] * B[1]
  11639. mulxq 144(%rsp), %rcx, %r15
  11640. adoxq %rbx, %r11
  11641. adcxq %rcx, %r12
  11642. # A[1] * B[2]
  11643. movq 112(%rsp), %rdx
  11644. mulxq 136(%rsp), %rcx, %rbx
  11645. adcxq %r15, %r13
  11646. adoxq %rcx, %r12
  11647. adcxq %rbp, %r14
  11648. adoxq %rbx, %r13
  11649. # A[0] * B[2]
  11650. mulxq 128(%rsp), %rcx, %rbx
  11651. adoxq %rbp, %r14
  11652. xorq %r15, %r15
  11653. adcxq %rcx, %r11
  11654. # A[1] * B[1]
  11655. movq 104(%rsp), %rdx
  11656. mulxq 136(%rsp), %rdx, %rcx
  11657. adcxq %rbx, %r12
  11658. adoxq %rdx, %r11
  11659. # A[3] * B[1]
  11660. movq 104(%rsp), %rdx
  11661. adoxq %rcx, %r12
  11662. mulxq 152(%rsp), %rcx, %rbx
  11663. adcxq %rcx, %r13
  11664. # A[2] * B[2]
  11665. movq 112(%rsp), %rdx
  11666. mulxq 144(%rsp), %rdx, %rcx
  11667. adcxq %rbx, %r14
  11668. adoxq %rdx, %r13
  11669. # A[3] * B[3]
  11670. movq 120(%rsp), %rdx
  11671. adoxq %rcx, %r14
  11672. mulxq 152(%rsp), %rcx, %rbx
  11673. adoxq %rbp, %r15
  11674. adcxq %rcx, %r15
  11675. # A[0] * B[3]
  11676. mulxq 128(%rsp), %rdx, %rcx
  11677. adcxq %rbx, %rbp
  11678. xorq %rbx, %rbx
  11679. adcxq %rdx, %r12
  11680. # A[3] * B[0]
  11681. movq 96(%rsp), %rdx
  11682. adcxq %rcx, %r13
  11683. mulxq 152(%rsp), %rdx, %rcx
  11684. adoxq %rdx, %r12
  11685. adoxq %rcx, %r13
  11686. # A[2] * B[3]
  11687. movq 120(%rsp), %rdx
  11688. mulxq 144(%rsp), %rdx, %rcx
  11689. adcxq %rdx, %r14
  11690. # A[3] * B[2]
  11691. movq 112(%rsp), %rdx
  11692. adcxq %rcx, %r15
  11693. mulxq 152(%rsp), %rcx, %rdx
  11694. adcxq %rbx, %rbp
  11695. adoxq %rcx, %r14
  11696. adoxq %rdx, %r15
  11697. adoxq %rbx, %rbp
  11698. # Reduce
  11699. movq $0x7fffffffffffffff, %rbx
  11700. # Move top half into t4-t7 and remove top bit from t3
  11701. shldq $0x01, %r15, %rbp
  11702. shldq $0x01, %r14, %r15
  11703. shldq $0x01, %r13, %r14
  11704. shldq $0x01, %r12, %r13
  11705. andq %rbx, %r12
  11706. # Multiply top half by 19
  11707. movq $19, %rdx
  11708. xorq %rbx, %rbx
  11709. mulxq %r13, %rcx, %r13
  11710. adcxq %rcx, %r9
  11711. adoxq %r13, %r10
  11712. mulxq %r14, %rcx, %r14
  11713. adcxq %rcx, %r10
  11714. adoxq %r14, %r11
  11715. mulxq %r15, %rcx, %r15
  11716. adcxq %rcx, %r11
  11717. adoxq %r15, %r12
  11718. mulxq %rbp, %rbp, %rdx
  11719. adcxq %rbp, %r12
  11720. adoxq %rbx, %rdx
  11721. adcxq %rbx, %rdx
  11722. # Overflow
  11723. shldq $0x01, %r12, %rdx
  11724. movq $0x7fffffffffffffff, %rbx
  11725. imulq $19, %rdx, %rcx
  11726. andq %rbx, %r12
  11727. addq %rcx, %r9
  11728. adcq $0x00, %r10
  11729. adcq $0x00, %r11
  11730. adcq $0x00, %r12
  11731. # Reduce if top bit set
  11732. movq %r12, %rdx
  11733. sarq $63, %rdx
  11734. andq $19, %rdx
  11735. andq %rbx, %r12
  11736. addq %rdx, %r9
  11737. adcq $0x00, %r10
  11738. adcq $0x00, %r11
  11739. adcq $0x00, %r12
  11740. # Store
  11741. movq %r9, (%rsp)
  11742. movq %r10, 8(%rsp)
  11743. movq %r11, 16(%rsp)
  11744. movq %r12, 24(%rsp)
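# Note (assumption): 168(%rsp) appears to count the remaining bits of the
# current 64-bit scalar word (reset to 63 below) and 160(%rsp) the remaining
# words, so the two decb/jge pairs iterate the ladder step above over every
# bit of the scalar.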
  11745. decb 168(%rsp)
  11746. jge L_curve25519_avx2_bits
  11747. movq $63, 168(%rsp)
  11748. decb 160(%rsp)
  11749. jge L_curve25519_avx2_words
  11750. # Invert
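# Note: this computes z^(p-2) mod p (Fermat's little theorem) for
# p = 2^255 - 19 using a fixed addition chain for the exponent 2^255 - 21:
# 254 squarings (fe_sq_avx2 / fe_sq_n_avx2, where the extra argument of
# fe_sq_n_avx2 is the number of repeated squarings) and 11 multiplications
# (fe_mul_avx2), all on 32-byte temporaries in this stack frame.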
  11751. leaq 32(%rsp), %rdi
  11752. movq %rsp, %rsi
  11753. #ifndef __APPLE__
  11754. callq fe_sq_avx2@plt
  11755. #else
  11756. callq _fe_sq_avx2
  11757. #endif /* __APPLE__ */
  11758. leaq 64(%rsp), %rdi
  11759. leaq 32(%rsp), %rsi
  11760. #ifndef __APPLE__
  11761. callq fe_sq_avx2@plt
  11762. #else
  11763. callq _fe_sq_avx2
  11764. #endif /* __APPLE__ */
  11765. leaq 64(%rsp), %rdi
  11766. leaq 64(%rsp), %rsi
  11767. #ifndef __APPLE__
  11768. callq fe_sq_avx2@plt
  11769. #else
  11770. callq _fe_sq_avx2
  11771. #endif /* __APPLE__ */
  11772. leaq 64(%rsp), %rdi
  11773. movq %rsp, %rsi
  11774. leaq 64(%rsp), %rdx
  11775. #ifndef __APPLE__
  11776. callq fe_mul_avx2@plt
  11777. #else
  11778. callq _fe_mul_avx2
  11779. #endif /* __APPLE__ */
  11780. leaq 32(%rsp), %rdi
  11781. leaq 32(%rsp), %rsi
  11782. leaq 64(%rsp), %rdx
  11783. #ifndef __APPLE__
  11784. callq fe_mul_avx2@plt
  11785. #else
  11786. callq _fe_mul_avx2
  11787. #endif /* __APPLE__ */
  11788. leaq 96(%rsp), %rdi
  11789. leaq 32(%rsp), %rsi
  11790. #ifndef __APPLE__
  11791. callq fe_sq_avx2@plt
  11792. #else
  11793. callq _fe_sq_avx2
  11794. #endif /* __APPLE__ */
  11795. leaq 64(%rsp), %rdi
  11796. leaq 64(%rsp), %rsi
  11797. leaq 96(%rsp), %rdx
  11798. #ifndef __APPLE__
  11799. callq fe_mul_avx2@plt
  11800. #else
  11801. callq _fe_mul_avx2
  11802. #endif /* __APPLE__ */
  11803. leaq 96(%rsp), %rdi
  11804. leaq 64(%rsp), %rsi
  11805. #ifndef __APPLE__
  11806. callq fe_sq_avx2@plt
  11807. #else
  11808. callq _fe_sq_avx2
  11809. #endif /* __APPLE__ */
  11810. leaq 96(%rsp), %rdi
  11811. leaq 96(%rsp), %rsi
  11812. movq $4, %rdx
  11813. #ifndef __APPLE__
  11814. callq fe_sq_n_avx2@plt
  11815. #else
  11816. callq _fe_sq_n_avx2
  11817. #endif /* __APPLE__ */
  11818. leaq 64(%rsp), %rdi
  11819. leaq 96(%rsp), %rsi
  11820. leaq 64(%rsp), %rdx
  11821. #ifndef __APPLE__
  11822. callq fe_mul_avx2@plt
  11823. #else
  11824. callq _fe_mul_avx2
  11825. #endif /* __APPLE__ */
  11826. leaq 96(%rsp), %rdi
  11827. leaq 64(%rsp), %rsi
  11828. #ifndef __APPLE__
  11829. callq fe_sq_avx2@plt
  11830. #else
  11831. callq _fe_sq_avx2
  11832. #endif /* __APPLE__ */
  11833. leaq 96(%rsp), %rdi
  11834. leaq 96(%rsp), %rsi
  11835. movq $9, %rdx
  11836. #ifndef __APPLE__
  11837. callq fe_sq_n_avx2@plt
  11838. #else
  11839. callq _fe_sq_n_avx2
  11840. #endif /* __APPLE__ */
  11841. leaq 96(%rsp), %rdi
  11842. leaq 96(%rsp), %rsi
  11843. leaq 64(%rsp), %rdx
  11844. #ifndef __APPLE__
  11845. callq fe_mul_avx2@plt
  11846. #else
  11847. callq _fe_mul_avx2
  11848. #endif /* __APPLE__ */
  11849. leaq 128(%rsp), %rdi
  11850. leaq 96(%rsp), %rsi
  11851. #ifndef __APPLE__
  11852. callq fe_sq_avx2@plt
  11853. #else
  11854. callq _fe_sq_avx2
  11855. #endif /* __APPLE__ */
  11856. leaq 128(%rsp), %rdi
  11857. leaq 128(%rsp), %rsi
  11858. movq $19, %rdx
  11859. #ifndef __APPLE__
  11860. callq fe_sq_n_avx2@plt
  11861. #else
  11862. callq _fe_sq_n_avx2
  11863. #endif /* __APPLE__ */
  11864. leaq 96(%rsp), %rdi
  11865. leaq 128(%rsp), %rsi
  11866. leaq 96(%rsp), %rdx
  11867. #ifndef __APPLE__
  11868. callq fe_mul_avx2@plt
  11869. #else
  11870. callq _fe_mul_avx2
  11871. #endif /* __APPLE__ */
  11872. leaq 96(%rsp), %rdi
  11873. leaq 96(%rsp), %rsi
  11874. #ifndef __APPLE__
  11875. callq fe_sq_avx2@plt
  11876. #else
  11877. callq _fe_sq_avx2
  11878. #endif /* __APPLE__ */
  11879. leaq 96(%rsp), %rdi
  11880. leaq 96(%rsp), %rsi
  11881. movq $9, %rdx
  11882. #ifndef __APPLE__
  11883. callq fe_sq_n_avx2@plt
  11884. #else
  11885. callq _fe_sq_n_avx2
  11886. #endif /* __APPLE__ */
  11887. leaq 64(%rsp), %rdi
  11888. leaq 96(%rsp), %rsi
  11889. leaq 64(%rsp), %rdx
  11890. #ifndef __APPLE__
  11891. callq fe_mul_avx2@plt
  11892. #else
  11893. callq _fe_mul_avx2
  11894. #endif /* __APPLE__ */
  11895. leaq 96(%rsp), %rdi
  11896. leaq 64(%rsp), %rsi
  11897. #ifndef __APPLE__
  11898. callq fe_sq_avx2@plt
  11899. #else
  11900. callq _fe_sq_avx2
  11901. #endif /* __APPLE__ */
  11902. leaq 96(%rsp), %rdi
  11903. leaq 96(%rsp), %rsi
  11904. movq $49, %rdx
  11905. #ifndef __APPLE__
  11906. callq fe_sq_n_avx2@plt
  11907. #else
  11908. callq _fe_sq_n_avx2
  11909. #endif /* __APPLE__ */
  11910. leaq 96(%rsp), %rdi
  11911. leaq 96(%rsp), %rsi
  11912. leaq 64(%rsp), %rdx
  11913. #ifndef __APPLE__
  11914. callq fe_mul_avx2@plt
  11915. #else
  11916. callq _fe_mul_avx2
  11917. #endif /* __APPLE__ */
  11918. leaq 128(%rsp), %rdi
  11919. leaq 96(%rsp), %rsi
  11920. #ifndef __APPLE__
  11921. callq fe_sq_avx2@plt
  11922. #else
  11923. callq _fe_sq_avx2
  11924. #endif /* __APPLE__ */
  11925. leaq 128(%rsp), %rdi
  11926. leaq 128(%rsp), %rsi
  11927. movq $0x63, %rdx
  11928. #ifndef __APPLE__
  11929. callq fe_sq_n_avx2@plt
  11930. #else
  11931. callq _fe_sq_n_avx2
  11932. #endif /* __APPLE__ */
  11933. leaq 96(%rsp), %rdi
  11934. leaq 128(%rsp), %rsi
  11935. leaq 96(%rsp), %rdx
  11936. #ifndef __APPLE__
  11937. callq fe_mul_avx2@plt
  11938. #else
  11939. callq _fe_mul_avx2
  11940. #endif /* __APPLE__ */
  11941. leaq 96(%rsp), %rdi
  11942. leaq 96(%rsp), %rsi
  11943. #ifndef __APPLE__
  11944. callq fe_sq_avx2@plt
  11945. #else
  11946. callq _fe_sq_avx2
  11947. #endif /* __APPLE__ */
  11948. leaq 96(%rsp), %rdi
  11949. leaq 96(%rsp), %rsi
  11950. movq $49, %rdx
  11951. #ifndef __APPLE__
  11952. callq fe_sq_n_avx2@plt
  11953. #else
  11954. callq _fe_sq_n_avx2
  11955. #endif /* __APPLE__ */
  11956. leaq 64(%rsp), %rdi
  11957. leaq 96(%rsp), %rsi
  11958. leaq 64(%rsp), %rdx
  11959. #ifndef __APPLE__
  11960. callq fe_mul_avx2@plt
  11961. #else
  11962. callq _fe_mul_avx2
  11963. #endif /* __APPLE__ */
  11964. leaq 64(%rsp), %rdi
  11965. leaq 64(%rsp), %rsi
  11966. #ifndef __APPLE__
  11967. callq fe_sq_avx2@plt
  11968. #else
  11969. callq _fe_sq_avx2
  11970. #endif /* __APPLE__ */
  11971. leaq 64(%rsp), %rdi
  11972. leaq 64(%rsp), %rsi
  11973. movq $4, %rdx
  11974. #ifndef __APPLE__
  11975. callq fe_sq_n_avx2@plt
  11976. #else
  11977. callq _fe_sq_n_avx2
  11978. #endif /* __APPLE__ */
  11979. movq %rsp, %rdi
  11980. leaq 64(%rsp), %rsi
  11981. leaq 32(%rsp), %rdx
  11982. #ifndef __APPLE__
  11983. callq fe_mul_avx2@plt
  11984. #else
  11985. callq _fe_mul_avx2
  11986. #endif /* __APPLE__ */
  11987. movq 176(%rsp), %rdi
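# Note (assumption): (%rsp) now holds the inverted z-coordinate and %rdi,
# reloaded from 176(%rsp) above, appears to be the output buffer into which
# x2 was accumulated during the ladder, so the multiply below forms the
# affine result x2 * z2^-1 before it is made canonical and stored.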
  11988. # Multiply
  11989. # A[0] * B[0]
  11990. movq (%rsp), %rdx
  11991. mulxq (%rdi), %r9, %r10
  11992. # A[2] * B[0]
  11993. mulxq 16(%rdi), %r11, %r12
  11994. # A[1] * B[0]
  11995. mulxq 8(%rdi), %rcx, %rbx
  11996. xorq %rbp, %rbp
  11997. adcxq %rcx, %r10
  11998. # A[1] * B[3]
  11999. movq 24(%rsp), %rdx
  12000. mulxq 8(%rdi), %r13, %r14
  12001. adcxq %rbx, %r11
  12002. # A[0] * B[1]
  12003. movq 8(%rsp), %rdx
  12004. mulxq (%rdi), %rcx, %rbx
  12005. adoxq %rcx, %r10
  12006. # A[2] * B[1]
  12007. mulxq 16(%rdi), %rcx, %r15
  12008. adoxq %rbx, %r11
  12009. adcxq %rcx, %r12
  12010. # A[1] * B[2]
  12011. movq 16(%rsp), %rdx
  12012. mulxq 8(%rdi), %rcx, %rbx
  12013. adcxq %r15, %r13
  12014. adoxq %rcx, %r12
  12015. adcxq %rbp, %r14
  12016. adoxq %rbx, %r13
  12017. # A[0] * B[2]
  12018. mulxq (%rdi), %rcx, %rbx
  12019. adoxq %rbp, %r14
  12020. xorq %r15, %r15
  12021. adcxq %rcx, %r11
  12022. # A[1] * B[1]
  12023. movq 8(%rsp), %rdx
  12024. mulxq 8(%rdi), %rdx, %rcx
  12025. adcxq %rbx, %r12
  12026. adoxq %rdx, %r11
  12027. # A[3] * B[1]
  12028. movq 8(%rsp), %rdx
  12029. adoxq %rcx, %r12
  12030. mulxq 24(%rdi), %rcx, %rbx
  12031. adcxq %rcx, %r13
  12032. # A[2] * B[2]
  12033. movq 16(%rsp), %rdx
  12034. mulxq 16(%rdi), %rdx, %rcx
  12035. adcxq %rbx, %r14
  12036. adoxq %rdx, %r13
  12037. # A[3] * B[3]
  12038. movq 24(%rsp), %rdx
  12039. adoxq %rcx, %r14
  12040. mulxq 24(%rdi), %rcx, %rbx
  12041. adoxq %rbp, %r15
  12042. adcxq %rcx, %r15
  12043. # A[0] * B[3]
  12044. mulxq (%rdi), %rdx, %rcx
  12045. adcxq %rbx, %rbp
  12046. xorq %rbx, %rbx
  12047. adcxq %rdx, %r12
  12048. # A[3] * B[0]
  12049. movq (%rsp), %rdx
  12050. adcxq %rcx, %r13
  12051. mulxq 24(%rdi), %rdx, %rcx
  12052. adoxq %rdx, %r12
  12053. adoxq %rcx, %r13
  12054. # A[2] * B[3]
  12055. movq 24(%rsp), %rdx
  12056. mulxq 16(%rdi), %rdx, %rcx
  12057. adcxq %rdx, %r14
  12058. # A[3] * B[2]
  12059. movq 16(%rsp), %rdx
  12060. adcxq %rcx, %r15
  12061. mulxq 24(%rdi), %rcx, %rdx
  12062. adcxq %rbx, %rbp
  12063. adoxq %rcx, %r14
  12064. adoxq %rdx, %r15
  12065. adoxq %rbx, %rbp
  12066. # Reduce
  12067. movq $0x7fffffffffffffff, %rbx
  12068. # Move top half into t4-t7 and remove top bit from t3
  12069. shldq $0x01, %r15, %rbp
  12070. shldq $0x01, %r14, %r15
  12071. shldq $0x01, %r13, %r14
  12072. shldq $0x01, %r12, %r13
  12073. andq %rbx, %r12
  12074. # Multiply top half by 19
  12075. movq $19, %rdx
  12076. xorq %rbx, %rbx
  12077. mulxq %r13, %rcx, %r13
  12078. adcxq %rcx, %r9
  12079. adoxq %r13, %r10
  12080. mulxq %r14, %rcx, %r14
  12081. adcxq %rcx, %r10
  12082. adoxq %r14, %r11
  12083. mulxq %r15, %rcx, %r15
  12084. adcxq %rcx, %r11
  12085. adoxq %r15, %r12
  12086. mulxq %rbp, %rbp, %rdx
  12087. adcxq %rbp, %r12
  12088. adoxq %rbx, %rdx
  12089. adcxq %rbx, %rdx
  12090. # Overflow
  12091. shldq $0x01, %r12, %rdx
  12092. movq $0x7fffffffffffffff, %rbx
  12093. imulq $19, %rdx, %rcx
  12094. andq %rbx, %r12
  12095. addq %rcx, %r9
  12096. adcq $0x00, %r10
  12097. adcq $0x00, %r11
  12098. adcq $0x00, %r12
  12099. # Reduce if top bit set
  12100. movq %r12, %rdx
  12101. sarq $63, %rdx
  12102. andq $19, %rdx
  12103. andq %rbx, %r12
  12104. addq %rdx, %r9
  12105. adcq $0x00, %r10
  12106. adcq $0x00, %r11
  12107. adcq $0x00, %r12
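# Note: unlike the partial reductions above, the sequence below makes the
# result canonical (fully reduced below p = 2^255 - 19).  It trial-adds 19
# through a scratch register to test whether value + 19 carries into
# bit 255; if it does, adding 19 and clearing bit 255 is equivalent to
# subtracting p.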
  12108. movq $0x7fffffffffffffff, %rbx
  12109. movq %r9, %rdx
  12110. addq $19, %rdx
  12111. movq %r10, %rdx
  12112. adcq $0x00, %rdx
  12113. movq %r11, %rdx
  12114. adcq $0x00, %rdx
  12115. movq %r12, %rdx
  12116. adcq $0x00, %rdx
  12117. sarq $63, %rdx
  12118. andq $19, %rdx
  12119. addq %rdx, %r9
  12120. adcq $0x00, %r10
  12121. adcq $0x00, %r11
  12122. adcq $0x00, %r12
  12123. andq %rbx, %r12
  12124. # Store
  12125. movq %r9, (%rdi)
  12126. movq %r10, 8(%rdi)
  12127. movq %r11, 16(%rdi)
  12128. movq %r12, 24(%rdi)
  12129. xorq %rax, %rax
  12130. addq $0xc0, %rsp
  12131. popq %rbp
  12132. popq %r15
  12133. popq %r14
  12134. popq %r13
  12135. popq %r12
  12136. popq %rbx
  12137. repz retq
  12138. #ifndef __APPLE__
  12139. .size curve25519_avx2,.-curve25519_avx2
  12140. #endif /* __APPLE__ */
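# fe_pow22523_avx2 raises a field element to the power
# (p - 5) / 8 = 2^252 - 3, typically used for the square-root computation in
# point decompression.  The call chain below follows the same
# square-and-multiply structure as the inversion in curve25519_avx2, ending
# with two squarings and a final multiply by the input.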
  12141. #ifndef __APPLE__
  12142. .text
  12143. .globl fe_pow22523_avx2
  12144. .type fe_pow22523_avx2,@function
  12145. .align 16
  12146. fe_pow22523_avx2:
  12147. #else
  12148. .section __TEXT,__text
  12149. .globl _fe_pow22523_avx2
  12150. .p2align 4
  12151. _fe_pow22523_avx2:
  12152. #endif /* __APPLE__ */
  12153. subq $0x70, %rsp
  12154. # pow22523
  12155. movq %rdi, 96(%rsp)
  12156. movq %rsi, 104(%rsp)
  12157. movq %rsp, %rdi
  12158. movq 104(%rsp), %rsi
  12159. #ifndef __APPLE__
  12160. callq fe_sq_avx2@plt
  12161. #else
  12162. callq _fe_sq_avx2
  12163. #endif /* __APPLE__ */
  12164. leaq 32(%rsp), %rdi
  12165. movq %rsp, %rsi
  12166. #ifndef __APPLE__
  12167. callq fe_sq_avx2@plt
  12168. #else
  12169. callq _fe_sq_avx2
  12170. #endif /* __APPLE__ */
  12171. leaq 32(%rsp), %rdi
  12172. leaq 32(%rsp), %rsi
  12173. #ifndef __APPLE__
  12174. callq fe_sq_avx2@plt
  12175. #else
  12176. callq _fe_sq_avx2
  12177. #endif /* __APPLE__ */
  12178. leaq 32(%rsp), %rdi
  12179. movq 104(%rsp), %rsi
  12180. leaq 32(%rsp), %rdx
  12181. #ifndef __APPLE__
  12182. callq fe_mul_avx2@plt
  12183. #else
  12184. callq _fe_mul_avx2
  12185. #endif /* __APPLE__ */
  12186. movq %rsp, %rdi
  12187. movq %rsp, %rsi
  12188. leaq 32(%rsp), %rdx
  12189. #ifndef __APPLE__
  12190. callq fe_mul_avx2@plt
  12191. #else
  12192. callq _fe_mul_avx2
  12193. #endif /* __APPLE__ */
  12194. movq %rsp, %rdi
  12195. movq %rsp, %rsi
  12196. #ifndef __APPLE__
  12197. callq fe_sq_avx2@plt
  12198. #else
  12199. callq _fe_sq_avx2
  12200. #endif /* __APPLE__ */
  12201. movq %rsp, %rdi
  12202. leaq 32(%rsp), %rsi
  12203. movq %rsp, %rdx
  12204. #ifndef __APPLE__
  12205. callq fe_mul_avx2@plt
  12206. #else
  12207. callq _fe_mul_avx2
  12208. #endif /* __APPLE__ */
  12209. leaq 32(%rsp), %rdi
  12210. movq %rsp, %rsi
  12211. #ifndef __APPLE__
  12212. callq fe_sq_avx2@plt
  12213. #else
  12214. callq _fe_sq_avx2
  12215. #endif /* __APPLE__ */
  12216. leaq 32(%rsp), %rdi
  12217. leaq 32(%rsp), %rsi
  12218. movb $4, %dl
  12219. #ifndef __APPLE__
  12220. callq fe_sq_n_avx2@plt
  12221. #else
  12222. callq _fe_sq_n_avx2
  12223. #endif /* __APPLE__ */
  12224. movq %rsp, %rdi
  12225. leaq 32(%rsp), %rsi
  12226. movq %rsp, %rdx
  12227. #ifndef __APPLE__
  12228. callq fe_mul_avx2@plt
  12229. #else
  12230. callq _fe_mul_avx2
  12231. #endif /* __APPLE__ */
  12232. leaq 32(%rsp), %rdi
  12233. movq %rsp, %rsi
  12234. #ifndef __APPLE__
  12235. callq fe_sq_avx2@plt
  12236. #else
  12237. callq _fe_sq_avx2
  12238. #endif /* __APPLE__ */
  12239. leaq 32(%rsp), %rdi
  12240. leaq 32(%rsp), %rsi
  12241. movb $9, %dl
  12242. #ifndef __APPLE__
  12243. callq fe_sq_n_avx2@plt
  12244. #else
  12245. callq _fe_sq_n_avx2
  12246. #endif /* __APPLE__ */
  12247. leaq 32(%rsp), %rdi
  12248. leaq 32(%rsp), %rsi
  12249. movq %rsp, %rdx
  12250. #ifndef __APPLE__
  12251. callq fe_mul_avx2@plt
  12252. #else
  12253. callq _fe_mul_avx2
  12254. #endif /* __APPLE__ */
  12255. leaq 64(%rsp), %rdi
  12256. leaq 32(%rsp), %rsi
  12257. #ifndef __APPLE__
  12258. callq fe_sq_avx2@plt
  12259. #else
  12260. callq _fe_sq_avx2
  12261. #endif /* __APPLE__ */
  12262. leaq 64(%rsp), %rdi
  12263. leaq 64(%rsp), %rsi
  12264. movb $19, %dl
  12265. #ifndef __APPLE__
  12266. callq fe_sq_n_avx2@plt
  12267. #else
  12268. callq _fe_sq_n_avx2
  12269. #endif /* __APPLE__ */
  12270. leaq 32(%rsp), %rdi
  12271. leaq 64(%rsp), %rsi
  12272. leaq 32(%rsp), %rdx
  12273. #ifndef __APPLE__
  12274. callq fe_mul_avx2@plt
  12275. #else
  12276. callq _fe_mul_avx2
  12277. #endif /* __APPLE__ */
  12278. leaq 32(%rsp), %rdi
  12279. leaq 32(%rsp), %rsi
  12280. #ifndef __APPLE__
  12281. callq fe_sq_avx2@plt
  12282. #else
  12283. callq _fe_sq_avx2
  12284. #endif /* __APPLE__ */
  12285. leaq 32(%rsp), %rdi
  12286. leaq 32(%rsp), %rsi
  12287. movb $9, %dl
  12288. #ifndef __APPLE__
  12289. callq fe_sq_n_avx2@plt
  12290. #else
  12291. callq _fe_sq_n_avx2
  12292. #endif /* __APPLE__ */
  12293. movq %rsp, %rdi
  12294. leaq 32(%rsp), %rsi
  12295. movq %rsp, %rdx
  12296. #ifndef __APPLE__
  12297. callq fe_mul_avx2@plt
  12298. #else
  12299. callq _fe_mul_avx2
  12300. #endif /* __APPLE__ */
  12301. leaq 32(%rsp), %rdi
  12302. movq %rsp, %rsi
  12303. #ifndef __APPLE__
  12304. callq fe_sq_avx2@plt
  12305. #else
  12306. callq _fe_sq_avx2
  12307. #endif /* __APPLE__ */
  12308. leaq 32(%rsp), %rdi
  12309. leaq 32(%rsp), %rsi
  12310. movb $49, %dl
  12311. #ifndef __APPLE__
  12312. callq fe_sq_n_avx2@plt
  12313. #else
  12314. callq _fe_sq_n_avx2
  12315. #endif /* __APPLE__ */
  12316. leaq 32(%rsp), %rdi
  12317. leaq 32(%rsp), %rsi
  12318. movq %rsp, %rdx
  12319. #ifndef __APPLE__
  12320. callq fe_mul_avx2@plt
  12321. #else
  12322. callq _fe_mul_avx2
  12323. #endif /* __APPLE__ */
  12324. leaq 64(%rsp), %rdi
  12325. leaq 32(%rsp), %rsi
  12326. #ifndef __APPLE__
  12327. callq fe_sq_avx2@plt
  12328. #else
  12329. callq _fe_sq_avx2
  12330. #endif /* __APPLE__ */
  12331. leaq 64(%rsp), %rdi
  12332. leaq 64(%rsp), %rsi
  12333. movb $0x63, %dl
  12334. #ifndef __APPLE__
  12335. callq fe_sq_n_avx2@plt
  12336. #else
  12337. callq _fe_sq_n_avx2
  12338. #endif /* __APPLE__ */
  12339. leaq 32(%rsp), %rdi
  12340. leaq 64(%rsp), %rsi
  12341. leaq 32(%rsp), %rdx
  12342. #ifndef __APPLE__
  12343. callq fe_mul_avx2@plt
  12344. #else
  12345. callq _fe_mul_avx2
  12346. #endif /* __APPLE__ */
  12347. leaq 32(%rsp), %rdi
  12348. leaq 32(%rsp), %rsi
  12349. #ifndef __APPLE__
  12350. callq fe_sq_avx2@plt
  12351. #else
  12352. callq _fe_sq_avx2
  12353. #endif /* __APPLE__ */
  12354. leaq 32(%rsp), %rdi
  12355. leaq 32(%rsp), %rsi
  12356. movb $49, %dl
  12357. #ifndef __APPLE__
  12358. callq fe_sq_n_avx2@plt
  12359. #else
  12360. callq _fe_sq_n_avx2
  12361. #endif /* __APPLE__ */
  12362. movq %rsp, %rdi
  12363. leaq 32(%rsp), %rsi
  12364. movq %rsp, %rdx
  12365. #ifndef __APPLE__
  12366. callq fe_mul_avx2@plt
  12367. #else
  12368. callq _fe_mul_avx2
  12369. #endif /* __APPLE__ */
  12370. movq %rsp, %rdi
  12371. movq %rsp, %rsi
  12372. #ifndef __APPLE__
  12373. callq fe_sq_avx2@plt
  12374. #else
  12375. callq _fe_sq_avx2
  12376. #endif /* __APPLE__ */
  12377. movq %rsp, %rdi
  12378. movq %rsp, %rsi
  12379. #ifndef __APPLE__
  12380. callq fe_sq_avx2@plt
  12381. #else
  12382. callq _fe_sq_avx2
  12383. #endif /* __APPLE__ */
  12384. movq 96(%rsp), %rdi
  12385. movq %rsp, %rsi
  12386. movq 104(%rsp), %rdx
  12387. #ifndef __APPLE__
  12388. callq fe_mul_avx2@plt
  12389. #else
  12390. callq _fe_mul_avx2
  12391. #endif /* __APPLE__ */
  12392. movq 104(%rsp), %rsi
  12393. movq 96(%rsp), %rdi
  12394. addq $0x70, %rsp
  12395. repz retq
#ifndef __APPLE__
.size fe_pow22523_avx2,.-fe_pow22523_avx2
#endif /* __APPLE__ */
#ifndef __APPLE__
  12397. .text
  12398. .globl fe_ge_to_p2_avx2
  12399. .type fe_ge_to_p2_avx2,@function
  12400. .align 16
  12401. fe_ge_to_p2_avx2:
  12402. #else
  12403. .section __TEXT,__text
  12404. .globl _fe_ge_to_p2_avx2
  12405. .p2align 4
  12406. _fe_ge_to_p2_avx2:
  12407. #endif /* __APPLE__ */
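# fe_ge_to_p2_avx2 appears to convert a group element to projective
# (X:Y:Z) form with three field multiplications; with the arguments named
# (rx, ry, rz, px, py, pz, pt) for description only, it computes
#   rx = px * pt,  ry = py * pz,  rz = pz * pt.
# The first six pointers arrive in rdi/rsi/rdx/rcx/r8/r9 per the SysV ABI
# and the seventh at 88(%rsp) once the prologue below has run.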
  12408. pushq %rbx
  12409. pushq %r12
  12410. pushq %r13
  12411. pushq %r14
  12412. pushq %r15
  12413. subq $40, %rsp
  12414. movq %rsi, (%rsp)
  12415. movq %rdx, 8(%rsp)
  12416. movq %rcx, 16(%rsp)
  12417. movq %r8, 24(%rsp)
  12418. movq %r9, 32(%rsp)
  12419. movq 16(%rsp), %rsi
  12420. movq 88(%rsp), %rbx
  12421. # Multiply
  12422. # A[0] * B[0]
  12423. movq (%rbx), %rdx
  12424. mulxq (%rsi), %r8, %r9
  12425. # A[2] * B[0]
  12426. mulxq 16(%rsi), %r10, %r11
  12427. # A[1] * B[0]
  12428. mulxq 8(%rsi), %rcx, %rax
  12429. xorq %r15, %r15
  12430. adcxq %rcx, %r9
  12431. # A[1] * B[3]
  12432. movq 24(%rbx), %rdx
  12433. mulxq 8(%rsi), %r12, %r13
  12434. adcxq %rax, %r10
  12435. # A[0] * B[1]
  12436. movq 8(%rbx), %rdx
  12437. mulxq (%rsi), %rcx, %rax
  12438. adoxq %rcx, %r9
  12439. # A[2] * B[1]
  12440. mulxq 16(%rsi), %rcx, %r14
  12441. adoxq %rax, %r10
  12442. adcxq %rcx, %r11
  12443. # A[1] * B[2]
  12444. movq 16(%rbx), %rdx
  12445. mulxq 8(%rsi), %rcx, %rax
  12446. adcxq %r14, %r12
  12447. adoxq %rcx, %r11
  12448. adcxq %r15, %r13
  12449. adoxq %rax, %r12
  12450. # A[0] * B[2]
  12451. mulxq (%rsi), %rcx, %rax
  12452. adoxq %r15, %r13
  12453. xorq %r14, %r14
  12454. adcxq %rcx, %r10
  12455. # A[1] * B[1]
  12456. movq 8(%rbx), %rdx
  12457. mulxq 8(%rsi), %rdx, %rcx
  12458. adcxq %rax, %r11
  12459. adoxq %rdx, %r10
  12460. # A[3] * B[1]
  12461. movq 8(%rbx), %rdx
  12462. adoxq %rcx, %r11
  12463. mulxq 24(%rsi), %rcx, %rax
  12464. adcxq %rcx, %r12
  12465. # A[2] * B[2]
  12466. movq 16(%rbx), %rdx
  12467. mulxq 16(%rsi), %rdx, %rcx
  12468. adcxq %rax, %r13
  12469. adoxq %rdx, %r12
  12470. # A[3] * B[3]
  12471. movq 24(%rbx), %rdx
  12472. adoxq %rcx, %r13
  12473. mulxq 24(%rsi), %rcx, %rax
  12474. adoxq %r15, %r14
  12475. adcxq %rcx, %r14
  12476. # A[0] * B[3]
  12477. mulxq (%rsi), %rdx, %rcx
  12478. adcxq %rax, %r15
  12479. xorq %rax, %rax
  12480. adcxq %rdx, %r11
  12481. # A[3] * B[0]
  12482. movq (%rbx), %rdx
  12483. adcxq %rcx, %r12
  12484. mulxq 24(%rsi), %rdx, %rcx
  12485. adoxq %rdx, %r11
  12486. adoxq %rcx, %r12
  12487. # A[2] * B[3]
  12488. movq 24(%rbx), %rdx
  12489. mulxq 16(%rsi), %rdx, %rcx
  12490. adcxq %rdx, %r13
  12491. # A[3] * B[2]
  12492. movq 16(%rbx), %rdx
  12493. adcxq %rcx, %r14
  12494. mulxq 24(%rsi), %rcx, %rdx
  12495. adcxq %rax, %r15
  12496. adoxq %rcx, %r13
  12497. adoxq %rdx, %r14
  12498. adoxq %rax, %r15
  12499. # Reduce
  12500. movq $0x7fffffffffffffff, %rax
  12501. # Move top half into t4-t7 and remove top bit from t3
  12502. shldq $0x01, %r14, %r15
  12503. shldq $0x01, %r13, %r14
  12504. shldq $0x01, %r12, %r13
  12505. shldq $0x01, %r11, %r12
  12506. andq %rax, %r11
  12507. # Multiply top half by 19
  12508. movq $19, %rdx
  12509. xorq %rax, %rax
  12510. mulxq %r12, %rcx, %r12
  12511. adcxq %rcx, %r8
  12512. adoxq %r12, %r9
  12513. mulxq %r13, %rcx, %r13
  12514. adcxq %rcx, %r9
  12515. adoxq %r13, %r10
  12516. mulxq %r14, %rcx, %r14
  12517. adcxq %rcx, %r10
  12518. adoxq %r14, %r11
  12519. mulxq %r15, %r15, %rdx
  12520. adcxq %r15, %r11
  12521. adoxq %rax, %rdx
  12522. adcxq %rax, %rdx
  12523. # Overflow
  12524. shldq $0x01, %r11, %rdx
  12525. movq $0x7fffffffffffffff, %rax
  12526. imulq $19, %rdx, %rcx
  12527. andq %rax, %r11
  12528. addq %rcx, %r8
  12529. adcq $0x00, %r9
  12530. adcq $0x00, %r10
  12531. adcq $0x00, %r11
  12532. # Reduce if top bit set
  12533. movq %r11, %rdx
  12534. sarq $63, %rdx
  12535. andq $19, %rdx
  12536. andq %rax, %r11
  12537. addq %rdx, %r8
  12538. adcq $0x00, %r9
  12539. adcq $0x00, %r10
  12540. adcq $0x00, %r11
  12541. # Store
  12542. movq %r8, (%rdi)
  12543. movq %r9, 8(%rdi)
  12544. movq %r10, 16(%rdi)
  12545. movq %r11, 24(%rdi)
  12546. movq (%rsp), %rdi
  12547. movq 24(%rsp), %rsi
  12548. movq 32(%rsp), %rbx
  12549. # Multiply
  12550. # A[0] * B[0]
  12551. movq (%rbx), %rdx
  12552. mulxq (%rsi), %r8, %r9
  12553. # A[2] * B[0]
  12554. mulxq 16(%rsi), %r10, %r11
  12555. # A[1] * B[0]
  12556. mulxq 8(%rsi), %rcx, %rax
  12557. xorq %r15, %r15
  12558. adcxq %rcx, %r9
  12559. # A[1] * B[3]
  12560. movq 24(%rbx), %rdx
  12561. mulxq 8(%rsi), %r12, %r13
  12562. adcxq %rax, %r10
  12563. # A[0] * B[1]
  12564. movq 8(%rbx), %rdx
  12565. mulxq (%rsi), %rcx, %rax
  12566. adoxq %rcx, %r9
  12567. # A[2] * B[1]
  12568. mulxq 16(%rsi), %rcx, %r14
  12569. adoxq %rax, %r10
  12570. adcxq %rcx, %r11
  12571. # A[1] * B[2]
  12572. movq 16(%rbx), %rdx
  12573. mulxq 8(%rsi), %rcx, %rax
  12574. adcxq %r14, %r12
  12575. adoxq %rcx, %r11
  12576. adcxq %r15, %r13
  12577. adoxq %rax, %r12
  12578. # A[0] * B[2]
  12579. mulxq (%rsi), %rcx, %rax
  12580. adoxq %r15, %r13
  12581. xorq %r14, %r14
  12582. adcxq %rcx, %r10
  12583. # A[1] * B[1]
  12584. movq 8(%rbx), %rdx
  12585. mulxq 8(%rsi), %rdx, %rcx
  12586. adcxq %rax, %r11
  12587. adoxq %rdx, %r10
  12588. # A[3] * B[1]
  12589. movq 8(%rbx), %rdx
  12590. adoxq %rcx, %r11
  12591. mulxq 24(%rsi), %rcx, %rax
  12592. adcxq %rcx, %r12
  12593. # A[2] * B[2]
  12594. movq 16(%rbx), %rdx
  12595. mulxq 16(%rsi), %rdx, %rcx
  12596. adcxq %rax, %r13
  12597. adoxq %rdx, %r12
  12598. # A[3] * B[3]
  12599. movq 24(%rbx), %rdx
  12600. adoxq %rcx, %r13
  12601. mulxq 24(%rsi), %rcx, %rax
  12602. adoxq %r15, %r14
  12603. adcxq %rcx, %r14
  12604. # A[0] * B[3]
  12605. mulxq (%rsi), %rdx, %rcx
  12606. adcxq %rax, %r15
  12607. xorq %rax, %rax
  12608. adcxq %rdx, %r11
  12609. # A[3] * B[0]
  12610. movq (%rbx), %rdx
  12611. adcxq %rcx, %r12
  12612. mulxq 24(%rsi), %rdx, %rcx
  12613. adoxq %rdx, %r11
  12614. adoxq %rcx, %r12
  12615. # A[2] * B[3]
  12616. movq 24(%rbx), %rdx
  12617. mulxq 16(%rsi), %rdx, %rcx
  12618. adcxq %rdx, %r13
  12619. # A[3] * B[2]
  12620. movq 16(%rbx), %rdx
  12621. adcxq %rcx, %r14
  12622. mulxq 24(%rsi), %rcx, %rdx
  12623. adcxq %rax, %r15
  12624. adoxq %rcx, %r13
  12625. adoxq %rdx, %r14
  12626. adoxq %rax, %r15
  12627. # Reduce
  12628. movq $0x7fffffffffffffff, %rax
  12629. # Move top half into t4-t7 and remove top bit from t3
  12630. shldq $0x01, %r14, %r15
  12631. shldq $0x01, %r13, %r14
  12632. shldq $0x01, %r12, %r13
  12633. shldq $0x01, %r11, %r12
  12634. andq %rax, %r11
  12635. # Multiply top half by 19
  12636. movq $19, %rdx
  12637. xorq %rax, %rax
  12638. mulxq %r12, %rcx, %r12
  12639. adcxq %rcx, %r8
  12640. adoxq %r12, %r9
  12641. mulxq %r13, %rcx, %r13
  12642. adcxq %rcx, %r9
  12643. adoxq %r13, %r10
  12644. mulxq %r14, %rcx, %r14
  12645. adcxq %rcx, %r10
  12646. adoxq %r14, %r11
  12647. mulxq %r15, %r15, %rdx
  12648. adcxq %r15, %r11
  12649. adoxq %rax, %rdx
  12650. adcxq %rax, %rdx
  12651. # Overflow
  12652. shldq $0x01, %r11, %rdx
  12653. movq $0x7fffffffffffffff, %rax
  12654. imulq $19, %rdx, %rcx
  12655. andq %rax, %r11
  12656. addq %rcx, %r8
  12657. adcq $0x00, %r9
  12658. adcq $0x00, %r10
  12659. adcq $0x00, %r11
  12660. # Reduce if top bit set
  12661. movq %r11, %rdx
  12662. sarq $63, %rdx
  12663. andq $19, %rdx
  12664. andq %rax, %r11
  12665. addq %rdx, %r8
  12666. adcq $0x00, %r9
  12667. adcq $0x00, %r10
  12668. adcq $0x00, %r11
  12669. # Store
  12670. movq %r8, (%rdi)
  12671. movq %r9, 8(%rdi)
  12672. movq %r10, 16(%rdi)
  12673. movq %r11, 24(%rdi)
  12674. movq 8(%rsp), %rdi
  12675. movq 88(%rsp), %rsi
  12676. # Multiply
  12677. # A[0] * B[0]
  12678. movq (%rsi), %rdx
  12679. mulxq (%rbx), %r8, %r9
  12680. # A[2] * B[0]
  12681. mulxq 16(%rbx), %r10, %r11
  12682. # A[1] * B[0]
  12683. mulxq 8(%rbx), %rcx, %rax
  12684. xorq %r15, %r15
  12685. adcxq %rcx, %r9
  12686. # A[1] * B[3]
  12687. movq 24(%rsi), %rdx
  12688. mulxq 8(%rbx), %r12, %r13
  12689. adcxq %rax, %r10
  12690. # A[0] * B[1]
  12691. movq 8(%rsi), %rdx
  12692. mulxq (%rbx), %rcx, %rax
  12693. adoxq %rcx, %r9
  12694. # A[2] * B[1]
  12695. mulxq 16(%rbx), %rcx, %r14
  12696. adoxq %rax, %r10
  12697. adcxq %rcx, %r11
  12698. # A[1] * B[2]
  12699. movq 16(%rsi), %rdx
  12700. mulxq 8(%rbx), %rcx, %rax
  12701. adcxq %r14, %r12
  12702. adoxq %rcx, %r11
  12703. adcxq %r15, %r13
  12704. adoxq %rax, %r12
  12705. # A[0] * B[2]
  12706. mulxq (%rbx), %rcx, %rax
  12707. adoxq %r15, %r13
  12708. xorq %r14, %r14
  12709. adcxq %rcx, %r10
  12710. # A[1] * B[1]
  12711. movq 8(%rsi), %rdx
  12712. mulxq 8(%rbx), %rdx, %rcx
  12713. adcxq %rax, %r11
  12714. adoxq %rdx, %r10
  12715. # A[3] * B[1]
  12716. movq 8(%rsi), %rdx
  12717. adoxq %rcx, %r11
  12718. mulxq 24(%rbx), %rcx, %rax
  12719. adcxq %rcx, %r12
  12720. # A[2] * B[2]
  12721. movq 16(%rsi), %rdx
  12722. mulxq 16(%rbx), %rdx, %rcx
  12723. adcxq %rax, %r13
  12724. adoxq %rdx, %r12
  12725. # A[3] * B[3]
  12726. movq 24(%rsi), %rdx
  12727. adoxq %rcx, %r13
  12728. mulxq 24(%rbx), %rcx, %rax
  12729. adoxq %r15, %r14
  12730. adcxq %rcx, %r14
  12731. # A[0] * B[3]
  12732. mulxq (%rbx), %rdx, %rcx
  12733. adcxq %rax, %r15
  12734. xorq %rax, %rax
  12735. adcxq %rdx, %r11
  12736. # A[3] * B[0]
  12737. movq (%rsi), %rdx
  12738. adcxq %rcx, %r12
  12739. mulxq 24(%rbx), %rdx, %rcx
  12740. adoxq %rdx, %r11
  12741. adoxq %rcx, %r12
  12742. # A[2] * B[3]
  12743. movq 24(%rsi), %rdx
  12744. mulxq 16(%rbx), %rdx, %rcx
  12745. adcxq %rdx, %r13
  12746. # A[3] * B[2]
  12747. movq 16(%rsi), %rdx
  12748. adcxq %rcx, %r14
  12749. mulxq 24(%rbx), %rcx, %rdx
  12750. adcxq %rax, %r15
  12751. adoxq %rcx, %r13
  12752. adoxq %rdx, %r14
  12753. adoxq %rax, %r15
  12754. # Reduce
  12755. movq $0x7fffffffffffffff, %rax
  12756. # Move top half into t4-t7 and remove top bit from t3
  12757. shldq $0x01, %r14, %r15
  12758. shldq $0x01, %r13, %r14
  12759. shldq $0x01, %r12, %r13
  12760. shldq $0x01, %r11, %r12
  12761. andq %rax, %r11
  12762. # Multiply top half by 19
  12763. movq $19, %rdx
  12764. xorq %rax, %rax
  12765. mulxq %r12, %rcx, %r12
  12766. adcxq %rcx, %r8
  12767. adoxq %r12, %r9
  12768. mulxq %r13, %rcx, %r13
  12769. adcxq %rcx, %r9
  12770. adoxq %r13, %r10
  12771. mulxq %r14, %rcx, %r14
  12772. adcxq %rcx, %r10
  12773. adoxq %r14, %r11
  12774. mulxq %r15, %r15, %rdx
  12775. adcxq %r15, %r11
  12776. adoxq %rax, %rdx
  12777. adcxq %rax, %rdx
  12778. # Overflow
  12779. shldq $0x01, %r11, %rdx
  12780. movq $0x7fffffffffffffff, %rax
  12781. imulq $19, %rdx, %rcx
  12782. andq %rax, %r11
  12783. addq %rcx, %r8
  12784. adcq $0x00, %r9
  12785. adcq $0x00, %r10
  12786. adcq $0x00, %r11
  12787. # Reduce if top bit set
  12788. movq %r11, %rdx
  12789. sarq $63, %rdx
  12790. andq $19, %rdx
  12791. andq %rax, %r11
  12792. addq %rdx, %r8
  12793. adcq $0x00, %r9
  12794. adcq $0x00, %r10
  12795. adcq $0x00, %r11
  12796. # Store
  12797. movq %r8, (%rdi)
  12798. movq %r9, 8(%rdi)
  12799. movq %r10, 16(%rdi)
  12800. movq %r11, 24(%rdi)
  12801. addq $40, %rsp
  12802. popq %r15
  12803. popq %r14
  12804. popq %r13
  12805. popq %r12
  12806. popq %rbx
  12807. repz retq
  12808. #ifndef __APPLE__
  12809. .size fe_ge_to_p2_avx2,.-fe_ge_to_p2_avx2
  12810. #endif /* __APPLE__ */
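# fe_ge_to_p3_avx2: in the usual ref10-style naming this converts a
# point from the p1p1 (completed) representation to the p3 (extended)
# representation.  At this level it is four field multiplications mod
# 2^255-19, all using the same mulx/adcx/adox pattern; only the
# source and destination pointers differ between the four blocks.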
  12811. #ifndef __APPLE__
  12812. .text
  12813. .globl fe_ge_to_p3_avx2
  12814. .type fe_ge_to_p3_avx2,@function
  12815. .align 16
  12816. fe_ge_to_p3_avx2:
  12817. #else
  12818. .section __TEXT,__text
  12819. .globl _fe_ge_to_p3_avx2
  12820. .p2align 4
  12821. _fe_ge_to_p3_avx2:
  12822. #endif /* __APPLE__ */
  12823. pushq %rbx
  12824. pushq %r12
  12825. pushq %r13
  12826. pushq %r14
  12827. pushq %r15
  12828. subq $40, %rsp
  12829. movq %rsi, (%rsp)
  12830. movq %rdx, 8(%rsp)
  12831. movq %rcx, 16(%rsp)
  12832. movq %r8, 24(%rsp)
  12833. movq %r9, 32(%rsp)
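# The pointer arguments passed in rsi..r9 were spilled above so those
# registers can be reused as scratch.  After the five pushes and the
# 40-byte local area the return address sits at 80(%rsp), so operands
# passed on the caller's stack are read from 88(%rsp) and 96(%rsp).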
  12834. movq 24(%rsp), %rsi
  12835. movq 96(%rsp), %rbx
  12836. # Multiply
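# 4x4-limb schoolbook multiply: rdx is loaded with one limb of one
# operand at a time, mulx forms the 128-bit partial products, and
# adcx/adox run two independent carry chains (CF and OF) so the
# additions can be interleaved without serializing on a single flags
# dependency.  The xorq that zeroes r15 also clears CF and OF to
# start both chains.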
  12837. # A[0] * B[0]
  12838. movq (%rbx), %rdx
  12839. mulxq (%rsi), %r8, %r9
  12840. # A[2] * B[0]
  12841. mulxq 16(%rsi), %r10, %r11
  12842. # A[1] * B[0]
  12843. mulxq 8(%rsi), %rcx, %rax
  12844. xorq %r15, %r15
  12845. adcxq %rcx, %r9
  12846. # A[1] * B[3]
  12847. movq 24(%rbx), %rdx
  12848. mulxq 8(%rsi), %r12, %r13
  12849. adcxq %rax, %r10
  12850. # A[0] * B[1]
  12851. movq 8(%rbx), %rdx
  12852. mulxq (%rsi), %rcx, %rax
  12853. adoxq %rcx, %r9
  12854. # A[2] * B[1]
  12855. mulxq 16(%rsi), %rcx, %r14
  12856. adoxq %rax, %r10
  12857. adcxq %rcx, %r11
  12858. # A[1] * B[2]
  12859. movq 16(%rbx), %rdx
  12860. mulxq 8(%rsi), %rcx, %rax
  12861. adcxq %r14, %r12
  12862. adoxq %rcx, %r11
  12863. adcxq %r15, %r13
  12864. adoxq %rax, %r12
  12865. # A[0] * B[2]
  12866. mulxq (%rsi), %rcx, %rax
  12867. adoxq %r15, %r13
  12868. xorq %r14, %r14
  12869. adcxq %rcx, %r10
  12870. # A[1] * B[1]
  12871. movq 8(%rbx), %rdx
  12872. mulxq 8(%rsi), %rdx, %rcx
  12873. adcxq %rax, %r11
  12874. adoxq %rdx, %r10
  12875. # A[3] * B[1]
  12876. movq 8(%rbx), %rdx
  12877. adoxq %rcx, %r11
  12878. mulxq 24(%rsi), %rcx, %rax
  12879. adcxq %rcx, %r12
  12880. # A[2] * B[2]
  12881. movq 16(%rbx), %rdx
  12882. mulxq 16(%rsi), %rdx, %rcx
  12883. adcxq %rax, %r13
  12884. adoxq %rdx, %r12
  12885. # A[3] * B[3]
  12886. movq 24(%rbx), %rdx
  12887. adoxq %rcx, %r13
  12888. mulxq 24(%rsi), %rcx, %rax
  12889. adoxq %r15, %r14
  12890. adcxq %rcx, %r14
  12891. # A[0] * B[3]
  12892. mulxq (%rsi), %rdx, %rcx
  12893. adcxq %rax, %r15
  12894. xorq %rax, %rax
  12895. adcxq %rdx, %r11
  12896. # A[3] * B[0]
  12897. movq (%rbx), %rdx
  12898. adcxq %rcx, %r12
  12899. mulxq 24(%rsi), %rdx, %rcx
  12900. adoxq %rdx, %r11
  12901. adoxq %rcx, %r12
  12902. # A[2] * B[3]
  12903. movq 24(%rbx), %rdx
  12904. mulxq 16(%rsi), %rdx, %rcx
  12905. adcxq %rdx, %r13
  12906. # A[3] * B[2]
  12907. movq 16(%rbx), %rdx
  12908. adcxq %rcx, %r14
  12909. mulxq 24(%rsi), %rcx, %rdx
  12910. adcxq %rax, %r15
  12911. adoxq %rcx, %r13
  12912. adoxq %rdx, %r14
  12913. adoxq %rax, %r15
  12914. # Reduce
  12915. movq $0x7fffffffffffffff, %rax
  12916. # Move top half into t4-t7 and remove top bit from t3
  12917. shldq $0x01, %r14, %r15
  12918. shldq $0x01, %r13, %r14
  12919. shldq $0x01, %r12, %r13
  12920. shldq $0x01, %r11, %r12
  12921. andq %rax, %r11
  12922. # Multiply top half by 19
  12923. movq $19, %rdx
  12924. xorq %rax, %rax
  12925. mulxq %r12, %rcx, %r12
  12926. adcxq %rcx, %r8
  12927. adoxq %r12, %r9
  12928. mulxq %r13, %rcx, %r13
  12929. adcxq %rcx, %r9
  12930. adoxq %r13, %r10
  12931. mulxq %r14, %rcx, %r14
  12932. adcxq %rcx, %r10
  12933. adoxq %r14, %r11
  12934. mulxq %r15, %r15, %rdx
  12935. adcxq %r15, %r11
  12936. adoxq %rax, %rdx
  12937. adcxq %rax, %rdx
  12938. # Overflow
  12939. shldq $0x01, %r11, %rdx
  12940. movq $0x7fffffffffffffff, %rax
  12941. imulq $19, %rdx, %rcx
  12942. andq %rax, %r11
  12943. addq %rcx, %r8
  12944. adcq $0x00, %r9
  12945. adcq $0x00, %r10
  12946. adcq $0x00, %r11
  12947. # Reduce if top bit set
  12948. movq %r11, %rdx
  12949. sarq $63, %rdx
  12950. andq $19, %rdx
  12951. andq %rax, %r11
  12952. addq %rdx, %r8
  12953. adcq $0x00, %r9
  12954. adcq $0x00, %r10
  12955. adcq $0x00, %r11
  12956. # Store
  12957. movq %r8, (%rdi)
  12958. movq %r9, 8(%rdi)
  12959. movq %r10, 16(%rdi)
  12960. movq %r11, 24(%rdi)
  12961. movq (%rsp), %rdi
  12962. movq 32(%rsp), %rsi
  12963. movq 88(%rsp), %rbx
  12964. # Multiply
  12965. # A[0] * B[0]
  12966. movq (%rbx), %rdx
  12967. mulxq (%rsi), %r8, %r9
  12968. # A[2] * B[0]
  12969. mulxq 16(%rsi), %r10, %r11
  12970. # A[1] * B[0]
  12971. mulxq 8(%rsi), %rcx, %rax
  12972. xorq %r15, %r15
  12973. adcxq %rcx, %r9
  12974. # A[1] * B[3]
  12975. movq 24(%rbx), %rdx
  12976. mulxq 8(%rsi), %r12, %r13
  12977. adcxq %rax, %r10
  12978. # A[0] * B[1]
  12979. movq 8(%rbx), %rdx
  12980. mulxq (%rsi), %rcx, %rax
  12981. adoxq %rcx, %r9
  12982. # A[2] * B[1]
  12983. mulxq 16(%rsi), %rcx, %r14
  12984. adoxq %rax, %r10
  12985. adcxq %rcx, %r11
  12986. # A[1] * B[2]
  12987. movq 16(%rbx), %rdx
  12988. mulxq 8(%rsi), %rcx, %rax
  12989. adcxq %r14, %r12
  12990. adoxq %rcx, %r11
  12991. adcxq %r15, %r13
  12992. adoxq %rax, %r12
  12993. # A[0] * B[2]
  12994. mulxq (%rsi), %rcx, %rax
  12995. adoxq %r15, %r13
  12996. xorq %r14, %r14
  12997. adcxq %rcx, %r10
  12998. # A[1] * B[1]
  12999. movq 8(%rbx), %rdx
  13000. mulxq 8(%rsi), %rdx, %rcx
  13001. adcxq %rax, %r11
  13002. adoxq %rdx, %r10
  13003. # A[3] * B[1]
  13004. movq 8(%rbx), %rdx
  13005. adoxq %rcx, %r11
  13006. mulxq 24(%rsi), %rcx, %rax
  13007. adcxq %rcx, %r12
  13008. # A[2] * B[2]
  13009. movq 16(%rbx), %rdx
  13010. mulxq 16(%rsi), %rdx, %rcx
  13011. adcxq %rax, %r13
  13012. adoxq %rdx, %r12
  13013. # A[3] * B[3]
  13014. movq 24(%rbx), %rdx
  13015. adoxq %rcx, %r13
  13016. mulxq 24(%rsi), %rcx, %rax
  13017. adoxq %r15, %r14
  13018. adcxq %rcx, %r14
  13019. # A[0] * B[3]
  13020. mulxq (%rsi), %rdx, %rcx
  13021. adcxq %rax, %r15
  13022. xorq %rax, %rax
  13023. adcxq %rdx, %r11
  13024. # A[3] * B[0]
  13025. movq (%rbx), %rdx
  13026. adcxq %rcx, %r12
  13027. mulxq 24(%rsi), %rdx, %rcx
  13028. adoxq %rdx, %r11
  13029. adoxq %rcx, %r12
  13030. # A[2] * B[3]
  13031. movq 24(%rbx), %rdx
  13032. mulxq 16(%rsi), %rdx, %rcx
  13033. adcxq %rdx, %r13
  13034. # A[3] * B[2]
  13035. movq 16(%rbx), %rdx
  13036. adcxq %rcx, %r14
  13037. mulxq 24(%rsi), %rcx, %rdx
  13038. adcxq %rax, %r15
  13039. adoxq %rcx, %r13
  13040. adoxq %rdx, %r14
  13041. adoxq %rax, %r15
  13042. # Reduce
  13043. movq $0x7fffffffffffffff, %rax
  13044. # Move top half into t4-t7 and remove top bit from t3
  13045. shldq $0x01, %r14, %r15
  13046. shldq $0x01, %r13, %r14
  13047. shldq $0x01, %r12, %r13
  13048. shldq $0x01, %r11, %r12
  13049. andq %rax, %r11
  13050. # Multiply top half by 19
  13051. movq $19, %rdx
  13052. xorq %rax, %rax
  13053. mulxq %r12, %rcx, %r12
  13054. adcxq %rcx, %r8
  13055. adoxq %r12, %r9
  13056. mulxq %r13, %rcx, %r13
  13057. adcxq %rcx, %r9
  13058. adoxq %r13, %r10
  13059. mulxq %r14, %rcx, %r14
  13060. adcxq %rcx, %r10
  13061. adoxq %r14, %r11
  13062. mulxq %r15, %r15, %rdx
  13063. adcxq %r15, %r11
  13064. adoxq %rax, %rdx
  13065. adcxq %rax, %rdx
  13066. # Overflow
  13067. shldq $0x01, %r11, %rdx
  13068. movq $0x7fffffffffffffff, %rax
  13069. imulq $19, %rdx, %rcx
  13070. andq %rax, %r11
  13071. addq %rcx, %r8
  13072. adcq $0x00, %r9
  13073. adcq $0x00, %r10
  13074. adcq $0x00, %r11
  13075. # Reduce if top bit set
  13076. movq %r11, %rdx
  13077. sarq $63, %rdx
  13078. andq $19, %rdx
  13079. andq %rax, %r11
  13080. addq %rdx, %r8
  13081. adcq $0x00, %r9
  13082. adcq $0x00, %r10
  13083. adcq $0x00, %r11
  13084. # Store
  13085. movq %r8, (%rdi)
  13086. movq %r9, 8(%rdi)
  13087. movq %r10, 16(%rdi)
  13088. movq %r11, 24(%rdi)
  13089. movq 8(%rsp), %rdi
  13090. movq 96(%rsp), %rsi
  13091. # Multiply
  13092. # A[0] * B[0]
  13093. movq (%rsi), %rdx
  13094. mulxq (%rbx), %r8, %r9
  13095. # A[2] * B[0]
  13096. mulxq 16(%rbx), %r10, %r11
  13097. # A[1] * B[0]
  13098. mulxq 8(%rbx), %rcx, %rax
  13099. xorq %r15, %r15
  13100. adcxq %rcx, %r9
  13101. # A[1] * B[3]
  13102. movq 24(%rsi), %rdx
  13103. mulxq 8(%rbx), %r12, %r13
  13104. adcxq %rax, %r10
  13105. # A[0] * B[1]
  13106. movq 8(%rsi), %rdx
  13107. mulxq (%rbx), %rcx, %rax
  13108. adoxq %rcx, %r9
  13109. # A[2] * B[1]
  13110. mulxq 16(%rbx), %rcx, %r14
  13111. adoxq %rax, %r10
  13112. adcxq %rcx, %r11
  13113. # A[1] * B[2]
  13114. movq 16(%rsi), %rdx
  13115. mulxq 8(%rbx), %rcx, %rax
  13116. adcxq %r14, %r12
  13117. adoxq %rcx, %r11
  13118. adcxq %r15, %r13
  13119. adoxq %rax, %r12
  13120. # A[0] * B[2]
  13121. mulxq (%rbx), %rcx, %rax
  13122. adoxq %r15, %r13
  13123. xorq %r14, %r14
  13124. adcxq %rcx, %r10
  13125. # A[1] * B[1]
  13126. movq 8(%rsi), %rdx
  13127. mulxq 8(%rbx), %rdx, %rcx
  13128. adcxq %rax, %r11
  13129. adoxq %rdx, %r10
  13130. # A[3] * B[1]
  13131. movq 8(%rsi), %rdx
  13132. adoxq %rcx, %r11
  13133. mulxq 24(%rbx), %rcx, %rax
  13134. adcxq %rcx, %r12
  13135. # A[2] * B[2]
  13136. movq 16(%rsi), %rdx
  13137. mulxq 16(%rbx), %rdx, %rcx
  13138. adcxq %rax, %r13
  13139. adoxq %rdx, %r12
  13140. # A[3] * B[3]
  13141. movq 24(%rsi), %rdx
  13142. adoxq %rcx, %r13
  13143. mulxq 24(%rbx), %rcx, %rax
  13144. adoxq %r15, %r14
  13145. adcxq %rcx, %r14
  13146. # A[0] * B[3]
  13147. mulxq (%rbx), %rdx, %rcx
  13148. adcxq %rax, %r15
  13149. xorq %rax, %rax
  13150. adcxq %rdx, %r11
  13151. # A[3] * B[0]
  13152. movq (%rsi), %rdx
  13153. adcxq %rcx, %r12
  13154. mulxq 24(%rbx), %rdx, %rcx
  13155. adoxq %rdx, %r11
  13156. adoxq %rcx, %r12
  13157. # A[2] * B[3]
  13158. movq 24(%rsi), %rdx
  13159. mulxq 16(%rbx), %rdx, %rcx
  13160. adcxq %rdx, %r13
  13161. # A[3] * B[2]
  13162. movq 16(%rsi), %rdx
  13163. adcxq %rcx, %r14
  13164. mulxq 24(%rbx), %rcx, %rdx
  13165. adcxq %rax, %r15
  13166. adoxq %rcx, %r13
  13167. adoxq %rdx, %r14
  13168. adoxq %rax, %r15
  13169. # Reduce
  13170. movq $0x7fffffffffffffff, %rax
  13171. # Move top half into t4-t7 and remove top bit from t3
  13172. shldq $0x01, %r14, %r15
  13173. shldq $0x01, %r13, %r14
  13174. shldq $0x01, %r12, %r13
  13175. shldq $0x01, %r11, %r12
  13176. andq %rax, %r11
  13177. # Multiply top half by 19
  13178. movq $19, %rdx
  13179. xorq %rax, %rax
  13180. mulxq %r12, %rcx, %r12
  13181. adcxq %rcx, %r8
  13182. adoxq %r12, %r9
  13183. mulxq %r13, %rcx, %r13
  13184. adcxq %rcx, %r9
  13185. adoxq %r13, %r10
  13186. mulxq %r14, %rcx, %r14
  13187. adcxq %rcx, %r10
  13188. adoxq %r14, %r11
  13189. mulxq %r15, %r15, %rdx
  13190. adcxq %r15, %r11
  13191. adoxq %rax, %rdx
  13192. adcxq %rax, %rdx
  13193. # Overflow
  13194. shldq $0x01, %r11, %rdx
  13195. movq $0x7fffffffffffffff, %rax
  13196. imulq $19, %rdx, %rcx
  13197. andq %rax, %r11
  13198. addq %rcx, %r8
  13199. adcq $0x00, %r9
  13200. adcq $0x00, %r10
  13201. adcq $0x00, %r11
  13202. # Reduce if top bit set
  13203. movq %r11, %rdx
  13204. sarq $63, %rdx
  13205. andq $19, %rdx
  13206. andq %rax, %r11
  13207. addq %rdx, %r8
  13208. adcq $0x00, %r9
  13209. adcq $0x00, %r10
  13210. adcq $0x00, %r11
  13211. # Store
  13212. movq %r8, (%rdi)
  13213. movq %r9, 8(%rdi)
  13214. movq %r10, 16(%rdi)
  13215. movq %r11, 24(%rdi)
  13216. movq 16(%rsp), %rdi
  13217. movq 24(%rsp), %rsi
  13218. movq 32(%rsp), %rbx
  13219. # Multiply
  13220. # A[0] * B[0]
  13221. movq (%rbx), %rdx
  13222. mulxq (%rsi), %r8, %r9
  13223. # A[2] * B[0]
  13224. mulxq 16(%rsi), %r10, %r11
  13225. # A[1] * B[0]
  13226. mulxq 8(%rsi), %rcx, %rax
  13227. xorq %r15, %r15
  13228. adcxq %rcx, %r9
  13229. # A[1] * B[3]
  13230. movq 24(%rbx), %rdx
  13231. mulxq 8(%rsi), %r12, %r13
  13232. adcxq %rax, %r10
  13233. # A[0] * B[1]
  13234. movq 8(%rbx), %rdx
  13235. mulxq (%rsi), %rcx, %rax
  13236. adoxq %rcx, %r9
  13237. # A[2] * B[1]
  13238. mulxq 16(%rsi), %rcx, %r14
  13239. adoxq %rax, %r10
  13240. adcxq %rcx, %r11
  13241. # A[1] * B[2]
  13242. movq 16(%rbx), %rdx
  13243. mulxq 8(%rsi), %rcx, %rax
  13244. adcxq %r14, %r12
  13245. adoxq %rcx, %r11
  13246. adcxq %r15, %r13
  13247. adoxq %rax, %r12
  13248. # A[0] * B[2]
  13249. mulxq (%rsi), %rcx, %rax
  13250. adoxq %r15, %r13
  13251. xorq %r14, %r14
  13252. adcxq %rcx, %r10
  13253. # A[1] * B[1]
  13254. movq 8(%rbx), %rdx
  13255. mulxq 8(%rsi), %rdx, %rcx
  13256. adcxq %rax, %r11
  13257. adoxq %rdx, %r10
  13258. # A[3] * B[1]
  13259. movq 8(%rbx), %rdx
  13260. adoxq %rcx, %r11
  13261. mulxq 24(%rsi), %rcx, %rax
  13262. adcxq %rcx, %r12
  13263. # A[2] * B[2]
  13264. movq 16(%rbx), %rdx
  13265. mulxq 16(%rsi), %rdx, %rcx
  13266. adcxq %rax, %r13
  13267. adoxq %rdx, %r12
  13268. # A[3] * B[3]
  13269. movq 24(%rbx), %rdx
  13270. adoxq %rcx, %r13
  13271. mulxq 24(%rsi), %rcx, %rax
  13272. adoxq %r15, %r14
  13273. adcxq %rcx, %r14
  13274. # A[0] * B[3]
  13275. mulxq (%rsi), %rdx, %rcx
  13276. adcxq %rax, %r15
  13277. xorq %rax, %rax
  13278. adcxq %rdx, %r11
  13279. # A[3] * B[0]
  13280. movq (%rbx), %rdx
  13281. adcxq %rcx, %r12
  13282. mulxq 24(%rsi), %rdx, %rcx
  13283. adoxq %rdx, %r11
  13284. adoxq %rcx, %r12
  13285. # A[2] * B[3]
  13286. movq 24(%rbx), %rdx
  13287. mulxq 16(%rsi), %rdx, %rcx
  13288. adcxq %rdx, %r13
  13289. # A[3] * B[2]
  13290. movq 16(%rbx), %rdx
  13291. adcxq %rcx, %r14
  13292. mulxq 24(%rsi), %rcx, %rdx
  13293. adcxq %rax, %r15
  13294. adoxq %rcx, %r13
  13295. adoxq %rdx, %r14
  13296. adoxq %rax, %r15
  13297. # Reduce
  13298. movq $0x7fffffffffffffff, %rax
  13299. # Move top half into t4-t7 and remove top bit from t3
  13300. shldq $0x01, %r14, %r15
  13301. shldq $0x01, %r13, %r14
  13302. shldq $0x01, %r12, %r13
  13303. shldq $0x01, %r11, %r12
  13304. andq %rax, %r11
  13305. # Multiply top half by 19
  13306. movq $19, %rdx
  13307. xorq %rax, %rax
  13308. mulxq %r12, %rcx, %r12
  13309. adcxq %rcx, %r8
  13310. adoxq %r12, %r9
  13311. mulxq %r13, %rcx, %r13
  13312. adcxq %rcx, %r9
  13313. adoxq %r13, %r10
  13314. mulxq %r14, %rcx, %r14
  13315. adcxq %rcx, %r10
  13316. adoxq %r14, %r11
  13317. mulxq %r15, %r15, %rdx
  13318. adcxq %r15, %r11
  13319. adoxq %rax, %rdx
  13320. adcxq %rax, %rdx
  13321. # Overflow
  13322. shldq $0x01, %r11, %rdx
  13323. movq $0x7fffffffffffffff, %rax
  13324. imulq $19, %rdx, %rcx
  13325. andq %rax, %r11
  13326. addq %rcx, %r8
  13327. adcq $0x00, %r9
  13328. adcq $0x00, %r10
  13329. adcq $0x00, %r11
  13330. # Reduce if top bit set
  13331. movq %r11, %rdx
  13332. sarq $63, %rdx
  13333. andq $19, %rdx
  13334. andq %rax, %r11
  13335. addq %rdx, %r8
  13336. adcq $0x00, %r9
  13337. adcq $0x00, %r10
  13338. adcq $0x00, %r11
  13339. # Store
  13340. movq %r8, (%rdi)
  13341. movq %r9, 8(%rdi)
  13342. movq %r10, 16(%rdi)
  13343. movq %r11, 24(%rdi)
  13344. addq $40, %rsp
  13345. popq %r15
  13346. popq %r14
  13347. popq %r13
  13348. popq %r12
  13349. popq %rbx
  13350. repz retq
  13351. #ifndef __APPLE__
  13352. .size fe_ge_to_p3_avx2,.-fe_ge_to_p3_avx2
  13353. #endif /* __APPLE__ */
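# fe_ge_dbl_avx2: group-element doubling.  The body is built from the
# field primitives that follow: three squarings, one doubled squaring
# ("Square * 2"), and several additions/subtractions mod 2^255-19,
# arranged as in the usual extended-coordinate doubling formulas.
# The operand beyond the six register arguments is read from
# 104(%rsp) (six pushes plus a 48-byte local area).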
  13354. #ifndef __APPLE__
  13355. .text
  13356. .globl fe_ge_dbl_avx2
  13357. .type fe_ge_dbl_avx2,@function
  13358. .align 16
  13359. fe_ge_dbl_avx2:
  13360. #else
  13361. .section __TEXT,__text
  13362. .globl _fe_ge_dbl_avx2
  13363. .p2align 4
  13364. _fe_ge_dbl_avx2:
  13365. #endif /* __APPLE__ */
  13366. pushq %rbp
  13367. pushq %rbx
  13368. pushq %r12
  13369. pushq %r13
  13370. pushq %r14
  13371. pushq %r15
  13372. subq $48, %rsp
  13373. movq %rdi, (%rsp)
  13374. movq %rsi, 8(%rsp)
  13375. movq %rdx, 16(%rsp)
  13376. movq %rcx, 24(%rsp)
  13377. movq %r8, 32(%rsp)
  13378. movq %r9, 40(%rsp)
  13379. movq 32(%rsp), %rsi
  13380. # Square
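# Squaring: only the off-diagonal products A[i]*A[j] with i < j are
# formed; they are doubled below and the diagonal squares A[i]*A[i]
# are added in, after which the result is reduced exactly as in the
# multiply routine.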
  13381. # A[0] * A[1]
  13382. movq (%rsi), %rdx
  13383. mulxq 8(%rsi), %r9, %r10
  13384. # A[0] * A[3]
  13385. mulxq 24(%rsi), %r11, %r12
  13386. # A[2] * A[1]
  13387. movq 16(%rsi), %rdx
  13388. mulxq 8(%rsi), %rcx, %rax
  13389. xorq %r15, %r15
  13390. adoxq %rcx, %r11
  13391. # A[2] * A[3]
  13392. mulxq 24(%rsi), %r13, %r14
  13393. adoxq %rax, %r12
  13394. # A[2] * A[0]
  13395. mulxq (%rsi), %rcx, %rax
  13396. adoxq %r15, %r13
  13397. adcxq %rcx, %r10
  13398. adoxq %r15, %r14
  13399. # A[1] * A[3]
  13400. movq 8(%rsi), %rdx
  13401. mulxq 24(%rsi), %rbp, %r8
  13402. adcxq %rax, %r11
  13403. adcxq %rbp, %r12
  13404. adcxq %r8, %r13
  13405. adcxq %r15, %r14
  13406. # Double with Carry Flag
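# The doubling and the diagonal terms are merged: adcx adds each
# accumulator to itself (doubling the off-diagonal sum through the
# carry flag) while adox folds in the words of A[i]*A[i] on the
# overflow flag, so the two chains never touch the same flag.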
  13407. xorq %r15, %r15
  13408. # A[0] * A[0]
  13409. movq (%rsi), %rdx
  13410. mulxq %rdx, %r8, %rbp
  13411. adcxq %r9, %r9
  13412. # A[1] * A[1]
  13413. movq 8(%rsi), %rdx
  13414. mulxq %rdx, %rcx, %rax
  13415. adcxq %r10, %r10
  13416. adoxq %rbp, %r9
  13417. adcxq %r11, %r11
  13418. adoxq %rcx, %r10
  13419. # A[2] * A[2]
  13420. movq 16(%rsi), %rdx
  13421. mulxq %rdx, %rbp, %rcx
  13422. adcxq %r12, %r12
  13423. adoxq %rax, %r11
  13424. adcxq %r13, %r13
  13425. adoxq %rbp, %r12
  13426. # A[3] * A[3]
  13427. movq 24(%rsi), %rdx
  13428. mulxq %rdx, %rbp, %rax
  13429. adcxq %r14, %r14
  13430. adoxq %rcx, %r13
  13431. adcxq %r15, %r15
  13432. adoxq %rbp, %r14
  13433. adoxq %rax, %r15
  13434. # Reduce
  13435. movq $0x7fffffffffffffff, %rcx
  13436. # Move top half into t4-t7 and remove top bit from t3
  13437. shldq $0x01, %r14, %r15
  13438. shldq $0x01, %r13, %r14
  13439. shldq $0x01, %r12, %r13
  13440. shldq $0x01, %r11, %r12
  13441. andq %rcx, %r11
  13442. # Multiply top half by 19
  13443. movq $19, %rdx
  13444. xorq %rcx, %rcx
  13445. mulxq %r12, %rbp, %r12
  13446. adcxq %rbp, %r8
  13447. adoxq %r12, %r9
  13448. mulxq %r13, %rbp, %r13
  13449. adcxq %rbp, %r9
  13450. adoxq %r13, %r10
  13451. mulxq %r14, %rbp, %r14
  13452. adcxq %rbp, %r10
  13453. adoxq %r14, %r11
  13454. mulxq %r15, %r15, %rdx
  13455. adcxq %r15, %r11
  13456. adoxq %rcx, %rdx
  13457. adcxq %rcx, %rdx
  13458. # Overflow
  13459. shldq $0x01, %r11, %rdx
  13460. movq $0x7fffffffffffffff, %rcx
  13461. imulq $19, %rdx, %rbp
  13462. andq %rcx, %r11
  13463. addq %rbp, %r8
  13464. adcq $0x00, %r9
  13465. adcq $0x00, %r10
  13466. adcq $0x00, %r11
  13467. # Reduce if top bit set
  13468. movq %r11, %rdx
  13469. sarq $63, %rdx
  13470. andq $19, %rdx
  13471. andq %rcx, %r11
  13472. addq %rdx, %r8
  13473. adcq $0x00, %r9
  13474. adcq $0x00, %r10
  13475. adcq $0x00, %r11
  13476. # Store
  13477. movq %r8, (%rdi)
  13478. movq %r9, 8(%rdi)
  13479. movq %r10, 16(%rdi)
  13480. movq %r11, 24(%rdi)
  13481. movq 16(%rsp), %rdi
  13482. movq 40(%rsp), %rbx
  13483. # Square
  13484. # A[0] * A[1]
  13485. movq (%rbx), %rdx
  13486. mulxq 8(%rbx), %r9, %r10
  13487. # A[0] * A[3]
  13488. mulxq 24(%rbx), %r11, %r12
  13489. # A[2] * A[1]
  13490. movq 16(%rbx), %rdx
  13491. mulxq 8(%rbx), %rcx, %rax
  13492. xorq %r15, %r15
  13493. adoxq %rcx, %r11
  13494. # A[2] * A[3]
  13495. mulxq 24(%rbx), %r13, %r14
  13496. adoxq %rax, %r12
  13497. # A[2] * A[0]
  13498. mulxq (%rbx), %rcx, %rax
  13499. adoxq %r15, %r13
  13500. adcxq %rcx, %r10
  13501. adoxq %r15, %r14
  13502. # A[1] * A[3]
  13503. movq 8(%rbx), %rdx
  13504. mulxq 24(%rbx), %rbp, %r8
  13505. adcxq %rax, %r11
  13506. adcxq %rbp, %r12
  13507. adcxq %r8, %r13
  13508. adcxq %r15, %r14
  13509. # Double with Carry Flag
  13510. xorq %r15, %r15
  13511. # A[0] * A[0]
  13512. movq (%rbx), %rdx
  13513. mulxq %rdx, %r8, %rbp
  13514. adcxq %r9, %r9
  13515. # A[1] * A[1]
  13516. movq 8(%rbx), %rdx
  13517. mulxq %rdx, %rcx, %rax
  13518. adcxq %r10, %r10
  13519. adoxq %rbp, %r9
  13520. adcxq %r11, %r11
  13521. adoxq %rcx, %r10
  13522. # A[2] * A[2]
  13523. movq 16(%rbx), %rdx
  13524. mulxq %rdx, %rbp, %rcx
  13525. adcxq %r12, %r12
  13526. adoxq %rax, %r11
  13527. adcxq %r13, %r13
  13528. adoxq %rbp, %r12
  13529. # A[3] * A[3]
  13530. movq 24(%rbx), %rdx
  13531. mulxq %rdx, %rbp, %rax
  13532. adcxq %r14, %r14
  13533. adoxq %rcx, %r13
  13534. adcxq %r15, %r15
  13535. adoxq %rbp, %r14
  13536. adoxq %rax, %r15
  13537. # Reduce
  13538. movq $0x7fffffffffffffff, %rcx
  13539. # Move top half into t4-t7 and remove top bit from t3
  13540. shldq $0x01, %r14, %r15
  13541. shldq $0x01, %r13, %r14
  13542. shldq $0x01, %r12, %r13
  13543. shldq $0x01, %r11, %r12
  13544. andq %rcx, %r11
  13545. # Multiply top half by 19
  13546. movq $19, %rdx
  13547. xorq %rcx, %rcx
  13548. mulxq %r12, %rbp, %r12
  13549. adcxq %rbp, %r8
  13550. adoxq %r12, %r9
  13551. mulxq %r13, %rbp, %r13
  13552. adcxq %rbp, %r9
  13553. adoxq %r13, %r10
  13554. mulxq %r14, %rbp, %r14
  13555. adcxq %rbp, %r10
  13556. adoxq %r14, %r11
  13557. mulxq %r15, %r15, %rdx
  13558. adcxq %r15, %r11
  13559. adoxq %rcx, %rdx
  13560. adcxq %rcx, %rdx
  13561. # Overflow
  13562. shldq $0x01, %r11, %rdx
  13563. movq $0x7fffffffffffffff, %rcx
  13564. imulq $19, %rdx, %rbp
  13565. andq %rcx, %r11
  13566. addq %rbp, %r8
  13567. adcq $0x00, %r9
  13568. adcq $0x00, %r10
  13569. adcq $0x00, %r11
  13570. # Reduce if top bit set
  13571. movq %r11, %rdx
  13572. sarq $63, %rdx
  13573. andq $19, %rdx
  13574. andq %rcx, %r11
  13575. addq %rdx, %r8
  13576. adcq $0x00, %r9
  13577. adcq $0x00, %r10
  13578. adcq $0x00, %r11
  13579. # Store
  13580. movq %r8, (%rdi)
  13581. movq %r9, 8(%rdi)
  13582. movq %r10, 16(%rdi)
  13583. movq %r11, 24(%rdi)
  13584. movq 8(%rsp), %rdi
  13585. # Add
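# Field addition: add the 4-limb values, then conditionally subtract
# p = 2^255-19 when bit 255 of the raw sum is set.  sarq $63 turns
# that bit into a 0/-1 mask over the limbs of p (-19, -1, -1,
# 0x7fffffffffffffff), so the correction is branch-free.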
  13586. movq (%rsi), %r8
  13587. movq 8(%rsi), %r9
  13588. addq (%rbx), %r8
  13589. movq 16(%rsi), %r10
  13590. adcq 8(%rbx), %r9
  13591. movq 24(%rsi), %rdx
  13592. adcq 16(%rbx), %r10
  13593. movq $-19, %rcx
  13594. adcq 24(%rbx), %rdx
  13595. movq $0x7fffffffffffffff, %rax
  13596. movq %rdx, %r11
  13597. sarq $63, %rdx
  13598. # Mask the modulus
  13599. andq %rdx, %rcx
  13600. andq %rdx, %rax
  13601. # Sub modulus (if overflow)
  13602. subq %rcx, %r8
  13603. sbbq %rdx, %r9
  13604. sbbq %rdx, %r10
  13605. sbbq %rax, %r11
  13606. movq %r8, (%rdi)
  13607. movq %r9, 8(%rdi)
  13608. movq %r10, 16(%rdi)
  13609. movq %r11, 24(%rdi)
  13610. movq 24(%rsp), %rsi
  13611. # Square
  13612. # A[0] * A[1]
  13613. movq (%rdi), %rdx
  13614. mulxq 8(%rdi), %r9, %r10
  13615. # A[0] * A[3]
  13616. mulxq 24(%rdi), %r11, %r12
  13617. # A[2] * A[1]
  13618. movq 16(%rdi), %rdx
  13619. mulxq 8(%rdi), %rcx, %rax
  13620. xorq %r15, %r15
  13621. adoxq %rcx, %r11
  13622. # A[2] * A[3]
  13623. mulxq 24(%rdi), %r13, %r14
  13624. adoxq %rax, %r12
  13625. # A[2] * A[0]
  13626. mulxq (%rdi), %rcx, %rax
  13627. adoxq %r15, %r13
  13628. adcxq %rcx, %r10
  13629. adoxq %r15, %r14
  13630. # A[1] * A[3]
  13631. movq 8(%rdi), %rdx
  13632. mulxq 24(%rdi), %rbp, %r8
  13633. adcxq %rax, %r11
  13634. adcxq %rbp, %r12
  13635. adcxq %r8, %r13
  13636. adcxq %r15, %r14
  13637. # Double with Carry Flag
  13638. xorq %r15, %r15
  13639. # A[0] * A[0]
  13640. movq (%rdi), %rdx
  13641. mulxq %rdx, %r8, %rbp
  13642. adcxq %r9, %r9
  13643. # A[1] * A[1]
  13644. movq 8(%rdi), %rdx
  13645. mulxq %rdx, %rcx, %rax
  13646. adcxq %r10, %r10
  13647. adoxq %rbp, %r9
  13648. adcxq %r11, %r11
  13649. adoxq %rcx, %r10
  13650. # A[2] * A[2]
  13651. movq 16(%rdi), %rdx
  13652. mulxq %rdx, %rbp, %rcx
  13653. adcxq %r12, %r12
  13654. adoxq %rax, %r11
  13655. adcxq %r13, %r13
  13656. adoxq %rbp, %r12
  13657. # A[3] * A[3]
  13658. movq 24(%rdi), %rdx
  13659. mulxq %rdx, %rbp, %rax
  13660. adcxq %r14, %r14
  13661. adoxq %rcx, %r13
  13662. adcxq %r15, %r15
  13663. adoxq %rbp, %r14
  13664. adoxq %rax, %r15
  13665. # Reduce
  13666. movq $0x7fffffffffffffff, %rcx
  13667. # Move top half into t4-t7 and remove top bit from t3
  13668. shldq $0x01, %r14, %r15
  13669. shldq $0x01, %r13, %r14
  13670. shldq $0x01, %r12, %r13
  13671. shldq $0x01, %r11, %r12
  13672. andq %rcx, %r11
  13673. # Multiply top half by 19
  13674. movq $19, %rdx
  13675. xorq %rcx, %rcx
  13676. mulxq %r12, %rbp, %r12
  13677. adcxq %rbp, %r8
  13678. adoxq %r12, %r9
  13679. mulxq %r13, %rbp, %r13
  13680. adcxq %rbp, %r9
  13681. adoxq %r13, %r10
  13682. mulxq %r14, %rbp, %r14
  13683. adcxq %rbp, %r10
  13684. adoxq %r14, %r11
  13685. mulxq %r15, %r15, %rdx
  13686. adcxq %r15, %r11
  13687. adoxq %rcx, %rdx
  13688. adcxq %rcx, %rdx
  13689. # Overflow
  13690. shldq $0x01, %r11, %rdx
  13691. movq $0x7fffffffffffffff, %rcx
  13692. imulq $19, %rdx, %rbp
  13693. andq %rcx, %r11
  13694. addq %rbp, %r8
  13695. adcq $0x00, %r9
  13696. adcq $0x00, %r10
  13697. adcq $0x00, %r11
  13698. # Reduce if top bit set
  13699. movq %r11, %rdx
  13700. sarq $63, %rdx
  13701. andq $19, %rdx
  13702. andq %rcx, %r11
  13703. addq %rdx, %r8
  13704. adcq $0x00, %r9
  13705. adcq $0x00, %r10
  13706. adcq $0x00, %r11
  13707. # Store
  13708. movq %r8, (%rsi)
  13709. movq %r9, 8(%rsi)
  13710. movq %r10, 16(%rsi)
  13711. movq %r11, 24(%rsi)
  13712. movq 16(%rsp), %rsi
  13713. movq (%rsp), %rbx
  13714. # Add
  13715. movq (%rsi), %r8
  13716. movq 8(%rsi), %r9
  13717. movq 16(%rsi), %r10
  13718. movq 24(%rsi), %rdx
  13719. movq %r8, %r12
  13720. addq (%rbx), %r8
  13721. movq %r9, %r13
  13722. adcq 8(%rbx), %r9
  13723. movq %r10, %r14
  13724. adcq 16(%rbx), %r10
  13725. movq %rdx, %r15
  13726. adcq 24(%rbx), %rdx
  13727. movq $-19, %rcx
  13728. movq %rdx, %r11
  13729. movq $0x7fffffffffffffff, %rax
  13730. sarq $63, %rdx
  13731. # Mask the modulus
  13732. andq %rdx, %rcx
  13733. andq %rdx, %rax
  13734. # Sub modulus (if overflow)
  13735. subq %rcx, %r8
  13736. sbbq %rdx, %r9
  13737. sbbq %rdx, %r10
  13738. sbbq %rax, %r11
  13739. # Sub
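# Field subtraction: subtract limb-wise, capture the final borrow in
# rdx as 0 or -1, and add back p = 2^255-19 masked by that borrow so
# a negative intermediate wraps into range without branching.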
  13740. subq (%rbx), %r12
  13741. movq $0x00, %rdx
  13742. sbbq 8(%rbx), %r13
  13743. movq $-19, %rcx
  13744. sbbq 16(%rbx), %r14
  13745. movq $0x7fffffffffffffff, %rax
  13746. sbbq 24(%rbx), %r15
  13747. sbbq $0x00, %rdx
  13748. # Mask the modulus
  13749. andq %rdx, %rcx
  13750. andq %rdx, %rax
  13751. # Add modulus (if underflow)
  13752. addq %rcx, %r12
  13753. adcq %rdx, %r13
  13754. adcq %rdx, %r14
  13755. adcq %rax, %r15
  13756. movq %r8, (%rdi)
  13757. movq %r9, 8(%rdi)
  13758. movq %r10, 16(%rdi)
  13759. movq %r11, 24(%rdi)
  13760. movq %r12, (%rsi)
  13761. movq %r13, 8(%rsi)
  13762. movq %r14, 16(%rsi)
  13763. movq %r15, 24(%rsi)
  13764. movq 24(%rsp), %rsi
  13765. # Sub
  13766. movq (%rsi), %r8
  13767. movq 8(%rsi), %r9
  13768. movq 16(%rsi), %r10
  13769. movq 24(%rsi), %r11
  13770. subq (%rdi), %r8
  13771. movq $0x00, %rdx
  13772. sbbq 8(%rdi), %r9
  13773. movq $-19, %rcx
  13774. sbbq 16(%rdi), %r10
  13775. movq $0x7fffffffffffffff, %rax
  13776. sbbq 24(%rdi), %r11
  13777. sbbq $0x00, %rdx
  13778. # Mask the modulus
  13779. andq %rdx, %rcx
  13780. andq %rdx, %rax
  13781. # Add modulus (if underflow)
  13782. addq %rcx, %r8
  13783. adcq %rdx, %r9
  13784. adcq %rdx, %r10
  13785. adcq %rax, %r11
  13786. movq %r8, (%rbx)
  13787. movq %r9, 8(%rbx)
  13788. movq %r10, 16(%rbx)
  13789. movq %r11, 24(%rbx)
  13790. movq 104(%rsp), %rdi
  13791. # Square * 2
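# Computes 2*A^2 in one pass: the squaring proceeds as above and the
# doubling is folded into the reduction shifts (the high half is
# moved down with shldq by 2 instead of 1, the low half is doubled
# with shldq/shlq by 1).  The bits of the doubled square at weight
# 2^510 and above end up in rbp; since 2^510 == 19*19 (mod p) they
# are scaled by 0x169 = 361 before being added back in.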
  13792. # A[0] * A[1]
  13793. movq (%rdi), %rdx
  13794. mulxq 8(%rdi), %r9, %r10
  13795. # A[0] * A[3]
  13796. mulxq 24(%rdi), %r11, %r12
  13797. # A[2] * A[1]
  13798. movq 16(%rdi), %rdx
  13799. mulxq 8(%rdi), %rcx, %rax
  13800. xorq %r15, %r15
  13801. adoxq %rcx, %r11
  13802. # A[2] * A[3]
  13803. mulxq 24(%rdi), %r13, %r14
  13804. adoxq %rax, %r12
  13805. # A[2] * A[0]
  13806. mulxq (%rdi), %rcx, %rax
  13807. adoxq %r15, %r13
  13808. adcxq %rcx, %r10
  13809. adoxq %r15, %r14
  13810. # A[1] * A[3]
  13811. movq 8(%rdi), %rdx
  13812. mulxq 24(%rdi), %rbp, %r8
  13813. adcxq %rax, %r11
  13814. adcxq %rbp, %r12
  13815. adcxq %r8, %r13
  13816. adcxq %r15, %r14
  13817. # Double with Carry Flag
  13818. xorq %r15, %r15
  13819. # A[0] * A[0]
  13820. movq (%rdi), %rdx
  13821. mulxq %rdx, %r8, %rbp
  13822. adcxq %r9, %r9
  13823. # A[1] * A[1]
  13824. movq 8(%rdi), %rdx
  13825. mulxq %rdx, %rcx, %rax
  13826. adcxq %r10, %r10
  13827. adoxq %rbp, %r9
  13828. adcxq %r11, %r11
  13829. adoxq %rcx, %r10
  13830. # A[2] * A[2]
  13831. movq 16(%rdi), %rdx
  13832. mulxq %rdx, %rbp, %rcx
  13833. adcxq %r12, %r12
  13834. adoxq %rax, %r11
  13835. adcxq %r13, %r13
  13836. adoxq %rbp, %r12
  13837. # A[3] * A[3]
  13838. movq 24(%rdi), %rdx
  13839. mulxq %rdx, %rbp, %rax
  13840. adcxq %r14, %r14
  13841. adoxq %rcx, %r13
  13842. adcxq %r15, %r15
  13843. adoxq %rbp, %r14
  13844. adoxq %rax, %r15
  13845. # Reduce
  13846. movq $0x7fffffffffffffff, %rax
  13847. xorq %rbp, %rbp
  13848. # Move top half into t4-t7 and remove top bit from t3 and double
  13849. shldq $3, %r15, %rbp
  13850. shldq $2, %r14, %r15
  13851. shldq $2, %r13, %r14
  13852. shldq $2, %r12, %r13
  13853. shldq $2, %r11, %r12
  13854. shldq $0x01, %r10, %r11
  13855. shldq $0x01, %r9, %r10
  13856. shldq $0x01, %r8, %r9
  13857. shlq $0x01, %r8
  13858. andq %rax, %r11
  13859. # Two out left, one in right
  13860. andq %rax, %r15
  13861. # Multiply top bits by 19*19
  13862. imulq $0x169, %rbp, %rcx
  13863. xorq %rax, %rax
  13864. # Multiply top half by 19
  13865. movq $19, %rdx
  13866. adoxq %rcx, %r8
  13867. mulxq %r12, %rbp, %r12
  13868. adcxq %rbp, %r8
  13869. adoxq %r12, %r9
  13870. mulxq %r13, %rbp, %r13
  13871. adcxq %rbp, %r9
  13872. adoxq %r13, %r10
  13873. mulxq %r14, %rbp, %r14
  13874. adcxq %rbp, %r10
  13875. adoxq %r14, %r11
  13876. mulxq %r15, %r15, %rdx
  13877. adcxq %r15, %r11
  13878. adoxq %rax, %rdx
  13879. adcxq %rax, %rdx
  13880. # Overflow
  13881. shldq $0x01, %r11, %rdx
  13882. movq $0x7fffffffffffffff, %rax
  13883. imulq $19, %rdx, %rbp
  13884. andq %rax, %r11
  13885. addq %rbp, %r8
  13886. adcq $0x00, %r9
  13887. adcq $0x00, %r10
  13888. adcq $0x00, %r11
  13889. # Reduce if top bit set
  13890. movq %r11, %rdx
  13891. sarq $63, %rdx
  13892. andq $19, %rdx
  13893. andq %rax, %r11
  13894. addq %rdx, %r8
  13895. adcq $0x00, %r9
  13896. adcq $0x00, %r10
  13897. adcq $0x00, %r11
  13898. # Store
  13899. movq %r8, (%rsi)
  13900. movq %r9, 8(%rsi)
  13901. movq %r10, 16(%rsi)
  13902. movq %r11, 24(%rsi)
  13903. movq 16(%rsp), %rdi
  13904. # Sub
  13905. movq (%rsi), %r8
  13906. movq 8(%rsi), %r9
  13907. movq 16(%rsi), %r10
  13908. movq 24(%rsi), %r11
  13909. subq (%rdi), %r8
  13910. movq $0x00, %rdx
  13911. sbbq 8(%rdi), %r9
  13912. movq $-19, %rcx
  13913. sbbq 16(%rdi), %r10
  13914. movq $0x7fffffffffffffff, %rax
  13915. sbbq 24(%rdi), %r11
  13916. sbbq $0x00, %rdx
  13917. # Mask the modulus
  13918. andq %rdx, %rcx
  13919. andq %rdx, %rax
  13920. # Add modulus (if underflow)
  13921. addq %rcx, %r8
  13922. adcq %rdx, %r9
  13923. adcq %rdx, %r10
  13924. adcq %rax, %r11
  13925. movq %r8, (%rsi)
  13926. movq %r9, 8(%rsi)
  13927. movq %r10, 16(%rsi)
  13928. movq %r11, 24(%rsi)
  13929. addq $48, %rsp
  13930. popq %r15
  13931. popq %r14
  13932. popq %r13
  13933. popq %r12
  13934. popq %rbx
  13935. popq %rbp
  13936. repz retq
  13937. #ifndef __APPLE__
  13938. .size fe_ge_dbl_avx2,.-fe_ge_dbl_avx2
  13939. #endif /* __APPLE__ */
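# fe_ge_madd_avx2: mixed addition of a group element and a
# precomputed point, which in ref10-style code is stored in the
# (y+x, y-x, 2*d*x*y) form.  At this level the routine is the
# add/sub, multiply and double primitives seen above applied to its
# operands; arguments beyond the six register parameters are read
# from 104(%rsp) through 136(%rsp) (six pushes plus 48-byte locals).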
  13940. #ifndef __APPLE__
  13941. .text
  13942. .globl fe_ge_madd_avx2
  13943. .type fe_ge_madd_avx2,@function
  13944. .align 16
  13945. fe_ge_madd_avx2:
  13946. #else
  13947. .section __TEXT,__text
  13948. .globl _fe_ge_madd_avx2
  13949. .p2align 4
  13950. _fe_ge_madd_avx2:
  13951. #endif /* __APPLE__ */
  13952. pushq %rbp
  13953. pushq %rbx
  13954. pushq %r12
  13955. pushq %r13
  13956. pushq %r14
  13957. pushq %r15
  13958. subq $48, %rsp
  13959. movq %rdi, (%rsp)
  13960. movq %rsi, 8(%rsp)
  13961. movq %rdx, 16(%rsp)
  13962. movq %rcx, 24(%rsp)
  13963. movq %r8, 32(%rsp)
  13964. movq %r9, 40(%rsp)
  13965. movq 8(%rsp), %rsi
  13966. movq 40(%rsp), %rbx
  13967. movq 32(%rsp), %rbp
  13968. # Add
  13969. movq (%rbx), %r8
  13970. movq 8(%rbx), %r9
  13971. movq 16(%rbx), %r10
  13972. movq 24(%rbx), %rdx
  13973. movq %r8, %r12
  13974. addq (%rbp), %r8
  13975. movq %r9, %r13
  13976. adcq 8(%rbp), %r9
  13977. movq %r10, %r14
  13978. adcq 16(%rbp), %r10
  13979. movq %rdx, %r15
  13980. adcq 24(%rbp), %rdx
  13981. movq $-19, %rcx
  13982. movq %rdx, %r11
  13983. movq $0x7fffffffffffffff, %rax
  13984. sarq $63, %rdx
  13985. # Mask the modulus
  13986. andq %rdx, %rcx
  13987. andq %rdx, %rax
  13988. # Sub modulus (if overflow)
  13989. subq %rcx, %r8
  13990. sbbq %rdx, %r9
  13991. sbbq %rdx, %r10
  13992. sbbq %rax, %r11
  13993. # Sub
  13994. subq (%rbp), %r12
  13995. movq $0x00, %rdx
  13996. sbbq 8(%rbp), %r13
  13997. movq $-19, %rcx
  13998. sbbq 16(%rbp), %r14
  13999. movq $0x7fffffffffffffff, %rax
  14000. sbbq 24(%rbp), %r15
  14001. sbbq $0x00, %rdx
  14002. # Mask the modulus
  14003. andq %rdx, %rcx
  14004. andq %rdx, %rax
  14005. # Add modulus (if underflow)
  14006. addq %rcx, %r12
  14007. adcq %rdx, %r13
  14008. adcq %rdx, %r14
  14009. adcq %rax, %r15
  14010. movq %r8, (%rdi)
  14011. movq %r9, 8(%rdi)
  14012. movq %r10, 16(%rdi)
  14013. movq %r11, 24(%rdi)
  14014. movq %r12, (%rsi)
  14015. movq %r13, 8(%rsi)
  14016. movq %r14, 16(%rsi)
  14017. movq %r15, 24(%rsi)
  14018. movq 16(%rsp), %rbx
  14019. movq 128(%rsp), %rbp
  14020. # Multiply
  14021. # A[0] * B[0]
  14022. movq (%rbp), %rdx
  14023. mulxq (%rdi), %r8, %r9
  14024. # A[2] * B[0]
  14025. mulxq 16(%rdi), %r10, %r11
  14026. # A[1] * B[0]
  14027. mulxq 8(%rdi), %rcx, %rax
  14028. xorq %r15, %r15
  14029. adcxq %rcx, %r9
  14030. # A[1] * B[3]
  14031. movq 24(%rbp), %rdx
  14032. mulxq 8(%rdi), %r12, %r13
  14033. adcxq %rax, %r10
  14034. # A[0] * B[1]
  14035. movq 8(%rbp), %rdx
  14036. mulxq (%rdi), %rcx, %rax
  14037. adoxq %rcx, %r9
  14038. # A[2] * B[1]
  14039. mulxq 16(%rdi), %rcx, %r14
  14040. adoxq %rax, %r10
  14041. adcxq %rcx, %r11
  14042. # A[1] * B[2]
  14043. movq 16(%rbp), %rdx
  14044. mulxq 8(%rdi), %rcx, %rax
  14045. adcxq %r14, %r12
  14046. adoxq %rcx, %r11
  14047. adcxq %r15, %r13
  14048. adoxq %rax, %r12
  14049. # A[0] * B[2]
  14050. mulxq (%rdi), %rcx, %rax
  14051. adoxq %r15, %r13
  14052. xorq %r14, %r14
  14053. adcxq %rcx, %r10
  14054. # A[1] * B[1]
  14055. movq 8(%rbp), %rdx
  14056. mulxq 8(%rdi), %rdx, %rcx
  14057. adcxq %rax, %r11
  14058. adoxq %rdx, %r10
  14059. # A[3] * B[1]
  14060. movq 8(%rbp), %rdx
  14061. adoxq %rcx, %r11
  14062. mulxq 24(%rdi), %rcx, %rax
  14063. adcxq %rcx, %r12
  14064. # A[2] * B[2]
  14065. movq 16(%rbp), %rdx
  14066. mulxq 16(%rdi), %rdx, %rcx
  14067. adcxq %rax, %r13
  14068. adoxq %rdx, %r12
  14069. # A[3] * B[3]
  14070. movq 24(%rbp), %rdx
  14071. adoxq %rcx, %r13
  14072. mulxq 24(%rdi), %rcx, %rax
  14073. adoxq %r15, %r14
  14074. adcxq %rcx, %r14
  14075. # A[0] * B[3]
  14076. mulxq (%rdi), %rdx, %rcx
  14077. adcxq %rax, %r15
  14078. xorq %rax, %rax
  14079. adcxq %rdx, %r11
  14080. # A[3] * B[0]
  14081. movq (%rbp), %rdx
  14082. adcxq %rcx, %r12
  14083. mulxq 24(%rdi), %rdx, %rcx
  14084. adoxq %rdx, %r11
  14085. adoxq %rcx, %r12
  14086. # A[2] * B[3]
  14087. movq 24(%rbp), %rdx
  14088. mulxq 16(%rdi), %rdx, %rcx
  14089. adcxq %rdx, %r13
  14090. # A[3] * B[2]
  14091. movq 16(%rbp), %rdx
  14092. adcxq %rcx, %r14
  14093. mulxq 24(%rdi), %rcx, %rdx
  14094. adcxq %rax, %r15
  14095. adoxq %rcx, %r13
  14096. adoxq %rdx, %r14
  14097. adoxq %rax, %r15
  14098. # Reduce
  14099. movq $0x7fffffffffffffff, %rax
  14100. # Move top half into t4-t7 and remove top bit from t3
  14101. shldq $0x01, %r14, %r15
  14102. shldq $0x01, %r13, %r14
  14103. shldq $0x01, %r12, %r13
  14104. shldq $0x01, %r11, %r12
  14105. andq %rax, %r11
  14106. # Multiply top half by 19
  14107. movq $19, %rdx
  14108. xorq %rax, %rax
  14109. mulxq %r12, %rcx, %r12
  14110. adcxq %rcx, %r8
  14111. adoxq %r12, %r9
  14112. mulxq %r13, %rcx, %r13
  14113. adcxq %rcx, %r9
  14114. adoxq %r13, %r10
  14115. mulxq %r14, %rcx, %r14
  14116. adcxq %rcx, %r10
  14117. adoxq %r14, %r11
  14118. mulxq %r15, %r15, %rdx
  14119. adcxq %r15, %r11
  14120. adoxq %rax, %rdx
  14121. adcxq %rax, %rdx
  14122. # Overflow
  14123. shldq $0x01, %r11, %rdx
  14124. movq $0x7fffffffffffffff, %rax
  14125. imulq $19, %rdx, %rcx
  14126. andq %rax, %r11
  14127. addq %rcx, %r8
  14128. adcq $0x00, %r9
  14129. adcq $0x00, %r10
  14130. adcq $0x00, %r11
  14131. # Reduce if top bit set
  14132. movq %r11, %rdx
  14133. sarq $63, %rdx
  14134. andq $19, %rdx
  14135. andq %rax, %r11
  14136. addq %rdx, %r8
  14137. adcq $0x00, %r9
  14138. adcq $0x00, %r10
  14139. adcq $0x00, %r11
  14140. # Store
  14141. movq %r8, (%rbx)
  14142. movq %r9, 8(%rbx)
  14143. movq %r10, 16(%rbx)
  14144. movq %r11, 24(%rbx)
  14145. movq 136(%rsp), %rdi
  14146. # Multiply
  14147. # A[0] * B[0]
  14148. movq (%rdi), %rdx
  14149. mulxq (%rsi), %r8, %r9
  14150. # A[2] * B[0]
  14151. mulxq 16(%rsi), %r10, %r11
  14152. # A[1] * B[0]
  14153. mulxq 8(%rsi), %rcx, %rax
  14154. xorq %r15, %r15
  14155. adcxq %rcx, %r9
  14156. # A[1] * B[3]
  14157. movq 24(%rdi), %rdx
  14158. mulxq 8(%rsi), %r12, %r13
  14159. adcxq %rax, %r10
  14160. # A[0] * B[1]
  14161. movq 8(%rdi), %rdx
  14162. mulxq (%rsi), %rcx, %rax
  14163. adoxq %rcx, %r9
  14164. # A[2] * B[1]
  14165. mulxq 16(%rsi), %rcx, %r14
  14166. adoxq %rax, %r10
  14167. adcxq %rcx, %r11
  14168. # A[1] * B[2]
  14169. movq 16(%rdi), %rdx
  14170. mulxq 8(%rsi), %rcx, %rax
  14171. adcxq %r14, %r12
  14172. adoxq %rcx, %r11
  14173. adcxq %r15, %r13
  14174. adoxq %rax, %r12
  14175. # A[0] * B[2]
  14176. mulxq (%rsi), %rcx, %rax
  14177. adoxq %r15, %r13
  14178. xorq %r14, %r14
  14179. adcxq %rcx, %r10
  14180. # A[1] * B[1]
  14181. movq 8(%rdi), %rdx
  14182. mulxq 8(%rsi), %rdx, %rcx
  14183. adcxq %rax, %r11
  14184. adoxq %rdx, %r10
  14185. # A[3] * B[1]
  14186. movq 8(%rdi), %rdx
  14187. adoxq %rcx, %r11
  14188. mulxq 24(%rsi), %rcx, %rax
  14189. adcxq %rcx, %r12
  14190. # A[2] * B[2]
  14191. movq 16(%rdi), %rdx
  14192. mulxq 16(%rsi), %rdx, %rcx
  14193. adcxq %rax, %r13
  14194. adoxq %rdx, %r12
  14195. # A[3] * B[3]
  14196. movq 24(%rdi), %rdx
  14197. adoxq %rcx, %r13
  14198. mulxq 24(%rsi), %rcx, %rax
  14199. adoxq %r15, %r14
  14200. adcxq %rcx, %r14
  14201. # A[0] * B[3]
  14202. mulxq (%rsi), %rdx, %rcx
  14203. adcxq %rax, %r15
  14204. xorq %rax, %rax
  14205. adcxq %rdx, %r11
  14206. # A[3] * B[0]
  14207. movq (%rdi), %rdx
  14208. adcxq %rcx, %r12
  14209. mulxq 24(%rsi), %rdx, %rcx
  14210. adoxq %rdx, %r11
  14211. adoxq %rcx, %r12
  14212. # A[2] * B[3]
  14213. movq 24(%rdi), %rdx
  14214. mulxq 16(%rsi), %rdx, %rcx
  14215. adcxq %rdx, %r13
  14216. # A[3] * B[2]
  14217. movq 16(%rdi), %rdx
  14218. adcxq %rcx, %r14
  14219. mulxq 24(%rsi), %rcx, %rdx
  14220. adcxq %rax, %r15
  14221. adoxq %rcx, %r13
  14222. adoxq %rdx, %r14
  14223. adoxq %rax, %r15
  14224. # Reduce
  14225. movq $0x7fffffffffffffff, %rax
  14226. # Move top half into t4-t7 and remove top bit from t3
  14227. shldq $0x01, %r14, %r15
  14228. shldq $0x01, %r13, %r14
  14229. shldq $0x01, %r12, %r13
  14230. shldq $0x01, %r11, %r12
  14231. andq %rax, %r11
  14232. # Multiply top half by 19
  14233. movq $19, %rdx
  14234. xorq %rax, %rax
  14235. mulxq %r12, %rcx, %r12
  14236. adcxq %rcx, %r8
  14237. adoxq %r12, %r9
  14238. mulxq %r13, %rcx, %r13
  14239. adcxq %rcx, %r9
  14240. adoxq %r13, %r10
  14241. mulxq %r14, %rcx, %r14
  14242. adcxq %rcx, %r10
  14243. adoxq %r14, %r11
  14244. mulxq %r15, %r15, %rdx
  14245. adcxq %r15, %r11
  14246. adoxq %rax, %rdx
  14247. adcxq %rax, %rdx
  14248. # Overflow
  14249. shldq $0x01, %r11, %rdx
  14250. movq $0x7fffffffffffffff, %rax
  14251. imulq $19, %rdx, %rcx
  14252. andq %rax, %r11
  14253. addq %rcx, %r8
  14254. adcq $0x00, %r9
  14255. adcq $0x00, %r10
  14256. adcq $0x00, %r11
  14257. # Reduce if top bit set
  14258. movq %r11, %rdx
  14259. sarq $63, %rdx
  14260. andq $19, %rdx
  14261. andq %rax, %r11
  14262. addq %rdx, %r8
  14263. adcq $0x00, %r9
  14264. adcq $0x00, %r10
  14265. adcq $0x00, %r11
  14266. # Store
  14267. movq %r8, (%rsi)
  14268. movq %r9, 8(%rsi)
  14269. movq %r10, 16(%rsi)
  14270. movq %r11, 24(%rsi)
  14271. movq 24(%rsp), %rdi
  14272. movq 120(%rsp), %rsi
  14273. movq 112(%rsp), %rbp
  14274. # Multiply
  14275. # A[0] * B[0]
  14276. movq (%rbp), %rdx
  14277. mulxq (%rsi), %r8, %r9
  14278. # A[2] * B[0]
  14279. mulxq 16(%rsi), %r10, %r11
  14280. # A[1] * B[0]
  14281. mulxq 8(%rsi), %rcx, %rax
  14282. xorq %r15, %r15
  14283. adcxq %rcx, %r9
  14284. # A[1] * B[3]
  14285. movq 24(%rbp), %rdx
  14286. mulxq 8(%rsi), %r12, %r13
  14287. adcxq %rax, %r10
  14288. # A[0] * B[1]
  14289. movq 8(%rbp), %rdx
  14290. mulxq (%rsi), %rcx, %rax
  14291. adoxq %rcx, %r9
  14292. # A[2] * B[1]
  14293. mulxq 16(%rsi), %rcx, %r14
  14294. adoxq %rax, %r10
  14295. adcxq %rcx, %r11
  14296. # A[1] * B[2]
  14297. movq 16(%rbp), %rdx
  14298. mulxq 8(%rsi), %rcx, %rax
  14299. adcxq %r14, %r12
  14300. adoxq %rcx, %r11
  14301. adcxq %r15, %r13
  14302. adoxq %rax, %r12
  14303. # A[0] * B[2]
  14304. mulxq (%rsi), %rcx, %rax
  14305. adoxq %r15, %r13
  14306. xorq %r14, %r14
  14307. adcxq %rcx, %r10
  14308. # A[1] * B[1]
  14309. movq 8(%rbp), %rdx
  14310. mulxq 8(%rsi), %rdx, %rcx
  14311. adcxq %rax, %r11
  14312. adoxq %rdx, %r10
  14313. # A[3] * B[1]
  14314. movq 8(%rbp), %rdx
  14315. adoxq %rcx, %r11
  14316. mulxq 24(%rsi), %rcx, %rax
  14317. adcxq %rcx, %r12
  14318. # A[2] * B[2]
  14319. movq 16(%rbp), %rdx
  14320. mulxq 16(%rsi), %rdx, %rcx
  14321. adcxq %rax, %r13
  14322. adoxq %rdx, %r12
  14323. # A[3] * B[3]
  14324. movq 24(%rbp), %rdx
  14325. adoxq %rcx, %r13
  14326. mulxq 24(%rsi), %rcx, %rax
  14327. adoxq %r15, %r14
  14328. adcxq %rcx, %r14
  14329. # A[0] * B[3]
  14330. mulxq (%rsi), %rdx, %rcx
  14331. adcxq %rax, %r15
  14332. xorq %rax, %rax
  14333. adcxq %rdx, %r11
  14334. # A[3] * B[0]
  14335. movq (%rbp), %rdx
  14336. adcxq %rcx, %r12
  14337. mulxq 24(%rsi), %rdx, %rcx
  14338. adoxq %rdx, %r11
  14339. adoxq %rcx, %r12
  14340. # A[2] * B[3]
  14341. movq 24(%rbp), %rdx
  14342. mulxq 16(%rsi), %rdx, %rcx
  14343. adcxq %rdx, %r13
  14344. # A[3] * B[2]
  14345. movq 16(%rbp), %rdx
  14346. adcxq %rcx, %r14
  14347. mulxq 24(%rsi), %rcx, %rdx
  14348. adcxq %rax, %r15
  14349. adoxq %rcx, %r13
  14350. adoxq %rdx, %r14
  14351. adoxq %rax, %r15
  14352. # Reduce
  14353. movq $0x7fffffffffffffff, %rax
  14354. # Move top half into t4-t7 and remove top bit from t3
  14355. shldq $0x01, %r14, %r15
  14356. shldq $0x01, %r13, %r14
  14357. shldq $0x01, %r12, %r13
  14358. shldq $0x01, %r11, %r12
  14359. andq %rax, %r11
  14360. # Multiply top half by 19
  14361. movq $19, %rdx
  14362. xorq %rax, %rax
  14363. mulxq %r12, %rcx, %r12
  14364. adcxq %rcx, %r8
  14365. adoxq %r12, %r9
  14366. mulxq %r13, %rcx, %r13
  14367. adcxq %rcx, %r9
  14368. adoxq %r13, %r10
  14369. mulxq %r14, %rcx, %r14
  14370. adcxq %rcx, %r10
  14371. adoxq %r14, %r11
  14372. mulxq %r15, %r15, %rdx
  14373. adcxq %r15, %r11
  14374. adoxq %rax, %rdx
  14375. adcxq %rax, %rdx
  14376. # Overflow
  14377. shldq $0x01, %r11, %rdx
  14378. movq $0x7fffffffffffffff, %rax
  14379. imulq $19, %rdx, %rcx
  14380. andq %rax, %r11
  14381. addq %rcx, %r8
  14382. adcq $0x00, %r9
  14383. adcq $0x00, %r10
  14384. adcq $0x00, %r11
  14385. # Reduce if top bit set
  14386. movq %r11, %rdx
  14387. sarq $63, %rdx
  14388. andq $19, %rdx
  14389. andq %rax, %r11
  14390. addq %rdx, %r8
  14391. adcq $0x00, %r9
  14392. adcq $0x00, %r10
  14393. adcq $0x00, %r11
  14394. # Store
  14395. movq %r8, (%rdi)
  14396. movq %r9, 8(%rdi)
  14397. movq %r10, 16(%rdi)
  14398. movq %r11, 24(%rdi)
  14399. movq 8(%rsp), %rdi
  14400. movq (%rsp), %rsi
  14401. # Add
  14402. movq (%rbx), %r8
  14403. movq 8(%rbx), %r9
  14404. movq 16(%rbx), %r10
  14405. movq 24(%rbx), %rdx
  14406. movq %r8, %r12
  14407. addq (%rdi), %r8
  14408. movq %r9, %r13
  14409. adcq 8(%rdi), %r9
  14410. movq %r10, %r14
  14411. adcq 16(%rdi), %r10
  14412. movq %rdx, %r15
  14413. adcq 24(%rdi), %rdx
  14414. movq $-19, %rcx
  14415. movq %rdx, %r11
  14416. movq $0x7fffffffffffffff, %rax
  14417. sarq $63, %rdx
  14418. # Mask the modulus
  14419. andq %rdx, %rcx
  14420. andq %rdx, %rax
  14421. # Sub modulus (if overflow)
  14422. subq %rcx, %r8
  14423. sbbq %rdx, %r9
  14424. sbbq %rdx, %r10
  14425. sbbq %rax, %r11
  14426. # Sub
  14427. subq (%rdi), %r12
  14428. movq $0x00, %rdx
  14429. sbbq 8(%rdi), %r13
  14430. movq $-19, %rcx
  14431. sbbq 16(%rdi), %r14
  14432. movq $0x7fffffffffffffff, %rax
  14433. sbbq 24(%rdi), %r15
  14434. sbbq $0x00, %rdx
  14435. # Mask the modulus
  14436. andq %rdx, %rcx
  14437. andq %rdx, %rax
  14438. # Add modulus (if underflow)
  14439. addq %rcx, %r12
  14440. adcq %rdx, %r13
  14441. adcq %rdx, %r14
  14442. adcq %rax, %r15
  14443. movq %r8, (%rdi)
  14444. movq %r9, 8(%rdi)
  14445. movq %r10, 16(%rdi)
  14446. movq %r11, 24(%rdi)
  14447. movq %r12, (%rsi)
  14448. movq %r13, 8(%rsi)
  14449. movq %r14, 16(%rsi)
  14450. movq %r15, 24(%rsi)
  14451. movq 104(%rsp), %rdi
  14452. # Double
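# Doubling a field element: add it to itself and apply the same
# branch-free conditional subtraction of p as in the additions above.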
  14453. movq (%rdi), %r8
  14454. movq 8(%rdi), %r9
  14455. addq %r8, %r8
  14456. movq 16(%rdi), %r10
  14457. adcq %r9, %r9
  14458. movq 24(%rdi), %rdx
  14459. adcq %r10, %r10
  14460. movq $-19, %rcx
  14461. adcq %rdx, %rdx
  14462. movq $0x7fffffffffffffff, %rax
  14463. movq %rdx, %r11
  14464. sarq $63, %rdx
  14465. # Mask the modulus
  14466. andq %rdx, %rcx
  14467. andq %rdx, %rax
  14468. # Sub modulus (if overflow)
  14469. subq %rcx, %r8
  14470. sbbq %rdx, %r9
  14471. sbbq %rdx, %r10
  14472. sbbq %rax, %r11
  14473. movq %r8, (%rbx)
  14474. movq %r9, 8(%rbx)
  14475. movq %r10, 16(%rbx)
  14476. movq %r11, 24(%rbx)
  14477. movq 24(%rsp), %rdi
  14478. # Add
  14479. movq (%rbx), %r8
  14480. movq 8(%rbx), %r9
  14481. movq 16(%rbx), %r10
  14482. movq 24(%rbx), %rdx
  14483. movq %r8, %r12
  14484. addq (%rdi), %r8
  14485. movq %r9, %r13
  14486. adcq 8(%rdi), %r9
  14487. movq %r10, %r14
  14488. adcq 16(%rdi), %r10
  14489. movq %rdx, %r15
  14490. adcq 24(%rdi), %rdx
  14491. movq $-19, %rcx
  14492. movq %rdx, %r11
  14493. movq $0x7fffffffffffffff, %rax
  14494. sarq $63, %rdx
  14495. # Mask the modulus
  14496. andq %rdx, %rcx
  14497. andq %rdx, %rax
  14498. # Sub modulus (if overflow)
  14499. subq %rcx, %r8
  14500. sbbq %rdx, %r9
  14501. sbbq %rdx, %r10
  14502. sbbq %rax, %r11
  14503. # Sub
  14504. subq (%rdi), %r12
  14505. movq $0x00, %rdx
  14506. sbbq 8(%rdi), %r13
  14507. movq $-19, %rcx
  14508. sbbq 16(%rdi), %r14
  14509. movq $0x7fffffffffffffff, %rax
  14510. sbbq 24(%rdi), %r15
  14511. sbbq $0x00, %rdx
  14512. # Mask the modulus
  14513. andq %rdx, %rcx
  14514. andq %rdx, %rax
  14515. # Add modulus (if underflow)
  14516. addq %rcx, %r12
  14517. adcq %rdx, %r13
  14518. adcq %rdx, %r14
  14519. adcq %rax, %r15
  14520. movq %r8, (%rbx)
  14521. movq %r9, 8(%rbx)
  14522. movq %r10, 16(%rbx)
  14523. movq %r11, 24(%rbx)
  14524. movq %r12, (%rdi)
  14525. movq %r13, 8(%rdi)
  14526. movq %r14, 16(%rdi)
  14527. movq %r15, 24(%rdi)
  14528. addq $48, %rsp
  14529. popq %r15
  14530. popq %r14
  14531. popq %r13
  14532. popq %r12
  14533. popq %rbx
  14534. popq %rbp
  14535. repz retq
  14536. #ifndef __APPLE__
  14537. .size fe_ge_madd_avx2,.-fe_ge_madd_avx2
  14538. #endif /* __APPLE__ */
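# fe_ge_msub_avx2: the subtraction counterpart of fe_ge_madd_avx2.
# The visible structure mirrors fe_ge_madd_avx2, with the two
# precomputed multiplicands at 128(%rsp) and 136(%rsp) consumed in
# the opposite order relative to that routine.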
  14539. #ifndef __APPLE__
  14540. .text
  14541. .globl fe_ge_msub_avx2
  14542. .type fe_ge_msub_avx2,@function
  14543. .align 16
  14544. fe_ge_msub_avx2:
  14545. #else
  14546. .section __TEXT,__text
  14547. .globl _fe_ge_msub_avx2
  14548. .p2align 4
  14549. _fe_ge_msub_avx2:
  14550. #endif /* __APPLE__ */
  14551. pushq %rbp
  14552. pushq %rbx
  14553. pushq %r12
  14554. pushq %r13
  14555. pushq %r14
  14556. pushq %r15
  14557. subq $48, %rsp
  14558. movq %rdi, (%rsp)
  14559. movq %rsi, 8(%rsp)
  14560. movq %rdx, 16(%rsp)
  14561. movq %rcx, 24(%rsp)
  14562. movq %r8, 32(%rsp)
  14563. movq %r9, 40(%rsp)
  14564. movq 8(%rsp), %rsi
  14565. movq 40(%rsp), %rbx
  14566. movq 32(%rsp), %rbp
  14567. # Add
  14568. movq (%rbx), %r8
  14569. movq 8(%rbx), %r9
  14570. movq 16(%rbx), %r10
  14571. movq 24(%rbx), %rdx
  14572. movq %r8, %r12
  14573. addq (%rbp), %r8
  14574. movq %r9, %r13
  14575. adcq 8(%rbp), %r9
  14576. movq %r10, %r14
  14577. adcq 16(%rbp), %r10
  14578. movq %rdx, %r15
  14579. adcq 24(%rbp), %rdx
  14580. movq $-19, %rcx
  14581. movq %rdx, %r11
  14582. movq $0x7fffffffffffffff, %rax
  14583. sarq $63, %rdx
  14584. # Mask the modulus
  14585. andq %rdx, %rcx
  14586. andq %rdx, %rax
  14587. # Sub modulus (if overflow)
  14588. subq %rcx, %r8
  14589. sbbq %rdx, %r9
  14590. sbbq %rdx, %r10
  14591. sbbq %rax, %r11
  14592. # Sub
  14593. subq (%rbp), %r12
  14594. movq $0x00, %rdx
  14595. sbbq 8(%rbp), %r13
  14596. movq $-19, %rcx
  14597. sbbq 16(%rbp), %r14
  14598. movq $0x7fffffffffffffff, %rax
  14599. sbbq 24(%rbp), %r15
  14600. sbbq $0x00, %rdx
  14601. # Mask the modulus
  14602. andq %rdx, %rcx
  14603. andq %rdx, %rax
  14604. # Add modulus (if underflow)
  14605. addq %rcx, %r12
  14606. adcq %rdx, %r13
  14607. adcq %rdx, %r14
  14608. adcq %rax, %r15
  14609. movq %r8, (%rdi)
  14610. movq %r9, 8(%rdi)
  14611. movq %r10, 16(%rdi)
  14612. movq %r11, 24(%rdi)
  14613. movq %r12, (%rsi)
  14614. movq %r13, 8(%rsi)
  14615. movq %r14, 16(%rsi)
  14616. movq %r15, 24(%rsi)
  14617. movq 16(%rsp), %rbx
  14618. movq 136(%rsp), %rbp
  14619. # Multiply
  14620. # A[0] * B[0]
  14621. movq (%rbp), %rdx
  14622. mulxq (%rdi), %r8, %r9
  14623. # A[2] * B[0]
  14624. mulxq 16(%rdi), %r10, %r11
  14625. # A[1] * B[0]
  14626. mulxq 8(%rdi), %rcx, %rax
  14627. xorq %r15, %r15
  14628. adcxq %rcx, %r9
  14629. # A[1] * B[3]
  14630. movq 24(%rbp), %rdx
  14631. mulxq 8(%rdi), %r12, %r13
  14632. adcxq %rax, %r10
  14633. # A[0] * B[1]
  14634. movq 8(%rbp), %rdx
  14635. mulxq (%rdi), %rcx, %rax
  14636. adoxq %rcx, %r9
  14637. # A[2] * B[1]
  14638. mulxq 16(%rdi), %rcx, %r14
  14639. adoxq %rax, %r10
  14640. adcxq %rcx, %r11
  14641. # A[1] * B[2]
  14642. movq 16(%rbp), %rdx
  14643. mulxq 8(%rdi), %rcx, %rax
  14644. adcxq %r14, %r12
  14645. adoxq %rcx, %r11
  14646. adcxq %r15, %r13
  14647. adoxq %rax, %r12
  14648. # A[0] * B[2]
  14649. mulxq (%rdi), %rcx, %rax
  14650. adoxq %r15, %r13
  14651. xorq %r14, %r14
  14652. adcxq %rcx, %r10
  14653. # A[1] * B[1]
  14654. movq 8(%rbp), %rdx
  14655. mulxq 8(%rdi), %rdx, %rcx
  14656. adcxq %rax, %r11
  14657. adoxq %rdx, %r10
  14658. # A[3] * B[1]
  14659. movq 8(%rbp), %rdx
  14660. adoxq %rcx, %r11
  14661. mulxq 24(%rdi), %rcx, %rax
  14662. adcxq %rcx, %r12
  14663. # A[2] * B[2]
  14664. movq 16(%rbp), %rdx
  14665. mulxq 16(%rdi), %rdx, %rcx
  14666. adcxq %rax, %r13
  14667. adoxq %rdx, %r12
  14668. # A[3] * B[3]
  14669. movq 24(%rbp), %rdx
  14670. adoxq %rcx, %r13
  14671. mulxq 24(%rdi), %rcx, %rax
  14672. adoxq %r15, %r14
  14673. adcxq %rcx, %r14
  14674. # A[0] * B[3]
  14675. mulxq (%rdi), %rdx, %rcx
  14676. adcxq %rax, %r15
  14677. xorq %rax, %rax
  14678. adcxq %rdx, %r11
  14679. # A[3] * B[0]
  14680. movq (%rbp), %rdx
  14681. adcxq %rcx, %r12
  14682. mulxq 24(%rdi), %rdx, %rcx
  14683. adoxq %rdx, %r11
  14684. adoxq %rcx, %r12
  14685. # A[2] * B[3]
  14686. movq 24(%rbp), %rdx
  14687. mulxq 16(%rdi), %rdx, %rcx
  14688. adcxq %rdx, %r13
  14689. # A[3] * B[2]
  14690. movq 16(%rbp), %rdx
  14691. adcxq %rcx, %r14
  14692. mulxq 24(%rdi), %rcx, %rdx
  14693. adcxq %rax, %r15
  14694. adoxq %rcx, %r13
  14695. adoxq %rdx, %r14
  14696. adoxq %rax, %r15
  14697. # Reduce
  14698. movq $0x7fffffffffffffff, %rax
  14699. # Move top half into t4-t7 and remove top bit from t3
  14700. shldq $0x01, %r14, %r15
  14701. shldq $0x01, %r13, %r14
  14702. shldq $0x01, %r12, %r13
  14703. shldq $0x01, %r11, %r12
  14704. andq %rax, %r11
  14705. # Multiply top half by 19
  14706. movq $19, %rdx
  14707. xorq %rax, %rax
  14708. mulxq %r12, %rcx, %r12
  14709. adcxq %rcx, %r8
  14710. adoxq %r12, %r9
  14711. mulxq %r13, %rcx, %r13
  14712. adcxq %rcx, %r9
  14713. adoxq %r13, %r10
  14714. mulxq %r14, %rcx, %r14
  14715. adcxq %rcx, %r10
  14716. adoxq %r14, %r11
  14717. mulxq %r15, %r15, %rdx
  14718. adcxq %r15, %r11
  14719. adoxq %rax, %rdx
  14720. adcxq %rax, %rdx
  14721. # Overflow
  14722. shldq $0x01, %r11, %rdx
  14723. movq $0x7fffffffffffffff, %rax
  14724. imulq $19, %rdx, %rcx
  14725. andq %rax, %r11
  14726. addq %rcx, %r8
  14727. adcq $0x00, %r9
  14728. adcq $0x00, %r10
  14729. adcq $0x00, %r11
  14730. # Reduce if top bit set
  14731. movq %r11, %rdx
  14732. sarq $63, %rdx
  14733. andq $19, %rdx
  14734. andq %rax, %r11
  14735. addq %rdx, %r8
  14736. adcq $0x00, %r9
  14737. adcq $0x00, %r10
  14738. adcq $0x00, %r11
  14739. # Store
  14740. movq %r8, (%rbx)
  14741. movq %r9, 8(%rbx)
  14742. movq %r10, 16(%rbx)
  14743. movq %r11, 24(%rbx)
  14744. movq 128(%rsp), %rdi
  14745. # Multiply
  14746. # A[0] * B[0]
  14747. movq (%rdi), %rdx
  14748. mulxq (%rsi), %r8, %r9
  14749. # A[2] * B[0]
  14750. mulxq 16(%rsi), %r10, %r11
  14751. # A[1] * B[0]
  14752. mulxq 8(%rsi), %rcx, %rax
  14753. xorq %r15, %r15
  14754. adcxq %rcx, %r9
  14755. # A[1] * B[3]
  14756. movq 24(%rdi), %rdx
  14757. mulxq 8(%rsi), %r12, %r13
  14758. adcxq %rax, %r10
  14759. # A[0] * B[1]
  14760. movq 8(%rdi), %rdx
  14761. mulxq (%rsi), %rcx, %rax
  14762. adoxq %rcx, %r9
  14763. # A[2] * B[1]
  14764. mulxq 16(%rsi), %rcx, %r14
  14765. adoxq %rax, %r10
  14766. adcxq %rcx, %r11
  14767. # A[1] * B[2]
  14768. movq 16(%rdi), %rdx
  14769. mulxq 8(%rsi), %rcx, %rax
  14770. adcxq %r14, %r12
  14771. adoxq %rcx, %r11
  14772. adcxq %r15, %r13
  14773. adoxq %rax, %r12
  14774. # A[0] * B[2]
  14775. mulxq (%rsi), %rcx, %rax
  14776. adoxq %r15, %r13
  14777. xorq %r14, %r14
  14778. adcxq %rcx, %r10
  14779. # A[1] * B[1]
  14780. movq 8(%rdi), %rdx
  14781. mulxq 8(%rsi), %rdx, %rcx
  14782. adcxq %rax, %r11
  14783. adoxq %rdx, %r10
  14784. # A[3] * B[1]
  14785. movq 8(%rdi), %rdx
  14786. adoxq %rcx, %r11
  14787. mulxq 24(%rsi), %rcx, %rax
  14788. adcxq %rcx, %r12
  14789. # A[2] * B[2]
  14790. movq 16(%rdi), %rdx
  14791. mulxq 16(%rsi), %rdx, %rcx
  14792. adcxq %rax, %r13
  14793. adoxq %rdx, %r12
  14794. # A[3] * B[3]
  14795. movq 24(%rdi), %rdx
  14796. adoxq %rcx, %r13
  14797. mulxq 24(%rsi), %rcx, %rax
  14798. adoxq %r15, %r14
  14799. adcxq %rcx, %r14
  14800. # A[0] * B[3]
  14801. mulxq (%rsi), %rdx, %rcx
  14802. adcxq %rax, %r15
  14803. xorq %rax, %rax
  14804. adcxq %rdx, %r11
  14805. # A[3] * B[0]
  14806. movq (%rdi), %rdx
  14807. adcxq %rcx, %r12
  14808. mulxq 24(%rsi), %rdx, %rcx
  14809. adoxq %rdx, %r11
  14810. adoxq %rcx, %r12
  14811. # A[2] * B[3]
  14812. movq 24(%rdi), %rdx
  14813. mulxq 16(%rsi), %rdx, %rcx
  14814. adcxq %rdx, %r13
  14815. # A[3] * B[2]
  14816. movq 16(%rdi), %rdx
  14817. adcxq %rcx, %r14
  14818. mulxq 24(%rsi), %rcx, %rdx
  14819. adcxq %rax, %r15
  14820. adoxq %rcx, %r13
  14821. adoxq %rdx, %r14
  14822. adoxq %rax, %r15
  14823. # Reduce
  14824. movq $0x7fffffffffffffff, %rax
  14825. # Move top half into t4-t7 and remove top bit from t3
  14826. shldq $0x01, %r14, %r15
  14827. shldq $0x01, %r13, %r14
  14828. shldq $0x01, %r12, %r13
  14829. shldq $0x01, %r11, %r12
  14830. andq %rax, %r11
  14831. # Multiply top half by 19
  14832. movq $19, %rdx
  14833. xorq %rax, %rax
  14834. mulxq %r12, %rcx, %r12
  14835. adcxq %rcx, %r8
  14836. adoxq %r12, %r9
  14837. mulxq %r13, %rcx, %r13
  14838. adcxq %rcx, %r9
  14839. adoxq %r13, %r10
  14840. mulxq %r14, %rcx, %r14
  14841. adcxq %rcx, %r10
  14842. adoxq %r14, %r11
  14843. mulxq %r15, %r15, %rdx
  14844. adcxq %r15, %r11
  14845. adoxq %rax, %rdx
  14846. adcxq %rax, %rdx
  14847. # Overflow
  14848. shldq $0x01, %r11, %rdx
  14849. movq $0x7fffffffffffffff, %rax
  14850. imulq $19, %rdx, %rcx
  14851. andq %rax, %r11
  14852. addq %rcx, %r8
  14853. adcq $0x00, %r9
  14854. adcq $0x00, %r10
  14855. adcq $0x00, %r11
  14856. # Reduce if top bit set
  14857. movq %r11, %rdx
  14858. sarq $63, %rdx
  14859. andq $19, %rdx
  14860. andq %rax, %r11
  14861. addq %rdx, %r8
  14862. adcq $0x00, %r9
  14863. adcq $0x00, %r10
  14864. adcq $0x00, %r11
  14865. # Store
  14866. movq %r8, (%rsi)
  14867. movq %r9, 8(%rsi)
  14868. movq %r10, 16(%rsi)
  14869. movq %r11, 24(%rsi)
  14870. movq 24(%rsp), %rdi
  14871. movq 120(%rsp), %rsi
  14872. movq 112(%rsp), %rbp
  14873. # Multiply
  14874. # A[0] * B[0]
  14875. movq (%rbp), %rdx
  14876. mulxq (%rsi), %r8, %r9
  14877. # A[2] * B[0]
  14878. mulxq 16(%rsi), %r10, %r11
  14879. # A[1] * B[0]
  14880. mulxq 8(%rsi), %rcx, %rax
  14881. xorq %r15, %r15
  14882. adcxq %rcx, %r9
  14883. # A[1] * B[3]
  14884. movq 24(%rbp), %rdx
  14885. mulxq 8(%rsi), %r12, %r13
  14886. adcxq %rax, %r10
  14887. # A[0] * B[1]
  14888. movq 8(%rbp), %rdx
  14889. mulxq (%rsi), %rcx, %rax
  14890. adoxq %rcx, %r9
  14891. # A[2] * B[1]
  14892. mulxq 16(%rsi), %rcx, %r14
  14893. adoxq %rax, %r10
  14894. adcxq %rcx, %r11
  14895. # A[1] * B[2]
  14896. movq 16(%rbp), %rdx
  14897. mulxq 8(%rsi), %rcx, %rax
  14898. adcxq %r14, %r12
  14899. adoxq %rcx, %r11
  14900. adcxq %r15, %r13
  14901. adoxq %rax, %r12
  14902. # A[0] * B[2]
  14903. mulxq (%rsi), %rcx, %rax
  14904. adoxq %r15, %r13
  14905. xorq %r14, %r14
  14906. adcxq %rcx, %r10
  14907. # A[1] * B[1]
  14908. movq 8(%rbp), %rdx
  14909. mulxq 8(%rsi), %rdx, %rcx
  14910. adcxq %rax, %r11
  14911. adoxq %rdx, %r10
  14912. # A[3] * B[1]
  14913. movq 8(%rbp), %rdx
  14914. adoxq %rcx, %r11
  14915. mulxq 24(%rsi), %rcx, %rax
  14916. adcxq %rcx, %r12
  14917. # A[2] * B[2]
  14918. movq 16(%rbp), %rdx
  14919. mulxq 16(%rsi), %rdx, %rcx
  14920. adcxq %rax, %r13
  14921. adoxq %rdx, %r12
  14922. # A[3] * B[3]
  14923. movq 24(%rbp), %rdx
  14924. adoxq %rcx, %r13
  14925. mulxq 24(%rsi), %rcx, %rax
  14926. adoxq %r15, %r14
  14927. adcxq %rcx, %r14
  14928. # A[0] * B[3]
  14929. mulxq (%rsi), %rdx, %rcx
  14930. adcxq %rax, %r15
  14931. xorq %rax, %rax
  14932. adcxq %rdx, %r11
  14933. # A[3] * B[0]
  14934. movq (%rbp), %rdx
  14935. adcxq %rcx, %r12
  14936. mulxq 24(%rsi), %rdx, %rcx
  14937. adoxq %rdx, %r11
  14938. adoxq %rcx, %r12
  14939. # A[2] * B[3]
  14940. movq 24(%rbp), %rdx
  14941. mulxq 16(%rsi), %rdx, %rcx
  14942. adcxq %rdx, %r13
  14943. # A[3] * B[2]
  14944. movq 16(%rbp), %rdx
  14945. adcxq %rcx, %r14
  14946. mulxq 24(%rsi), %rcx, %rdx
  14947. adcxq %rax, %r15
  14948. adoxq %rcx, %r13
  14949. adoxq %rdx, %r14
  14950. adoxq %rax, %r15
  14951. # Reduce
  14952. movq $0x7fffffffffffffff, %rax
  14953. # Move top half into t4-t7 and remove top bit from t3
  14954. shldq $0x01, %r14, %r15
  14955. shldq $0x01, %r13, %r14
  14956. shldq $0x01, %r12, %r13
  14957. shldq $0x01, %r11, %r12
  14958. andq %rax, %r11
  14959. # Multiply top half by 19
  14960. movq $19, %rdx
  14961. xorq %rax, %rax
  14962. mulxq %r12, %rcx, %r12
  14963. adcxq %rcx, %r8
  14964. adoxq %r12, %r9
  14965. mulxq %r13, %rcx, %r13
  14966. adcxq %rcx, %r9
  14967. adoxq %r13, %r10
  14968. mulxq %r14, %rcx, %r14
  14969. adcxq %rcx, %r10
  14970. adoxq %r14, %r11
  14971. mulxq %r15, %r15, %rdx
  14972. adcxq %r15, %r11
  14973. adoxq %rax, %rdx
  14974. adcxq %rax, %rdx
  14975. # Overflow
  14976. shldq $0x01, %r11, %rdx
  14977. movq $0x7fffffffffffffff, %rax
  14978. imulq $19, %rdx, %rcx
  14979. andq %rax, %r11
  14980. addq %rcx, %r8
  14981. adcq $0x00, %r9
  14982. adcq $0x00, %r10
  14983. adcq $0x00, %r11
  14984. # Reduce if top bit set
  14985. movq %r11, %rdx
  14986. sarq $63, %rdx
  14987. andq $19, %rdx
  14988. andq %rax, %r11
  14989. addq %rdx, %r8
  14990. adcq $0x00, %r9
  14991. adcq $0x00, %r10
  14992. adcq $0x00, %r11
  14993. # Store
  14994. movq %r8, (%rdi)
  14995. movq %r9, 8(%rdi)
  14996. movq %r10, 16(%rdi)
  14997. movq %r11, 24(%rdi)
  14998. movq 8(%rsp), %rsi
  14999. movq (%rsp), %rbp
  15000. # Add
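# Fused add/sub: one pass computes both the sum (r8..r11) and the difference
# (r12..r15) of the two inputs, each with a branch-free correction that
# subtracts or adds p = 2^255 - 19 on overflow or underflow (the sarq/sbbq
# result is used as an all-ones mask on the modulus limbs).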
  15001. movq (%rbx), %r8
  15002. movq 8(%rbx), %r9
  15003. movq 16(%rbx), %r10
  15004. movq 24(%rbx), %rdx
  15005. movq %r8, %r12
  15006. addq (%rsi), %r8
  15007. movq %r9, %r13
  15008. adcq 8(%rsi), %r9
  15009. movq %r10, %r14
  15010. adcq 16(%rsi), %r10
  15011. movq %rdx, %r15
  15012. adcq 24(%rsi), %rdx
  15013. movq $-19, %rcx
  15014. movq %rdx, %r11
  15015. movq $0x7fffffffffffffff, %rax
  15016. sarq $63, %rdx
  15017. # Mask the modulus
  15018. andq %rdx, %rcx
  15019. andq %rdx, %rax
  15020. # Sub modulus (if overflow)
  15021. subq %rcx, %r8
  15022. sbbq %rdx, %r9
  15023. sbbq %rdx, %r10
  15024. sbbq %rax, %r11
  15025. # Sub
  15026. subq (%rsi), %r12
  15027. movq $0x00, %rdx
  15028. sbbq 8(%rsi), %r13
  15029. movq $-19, %rcx
  15030. sbbq 16(%rsi), %r14
  15031. movq $0x7fffffffffffffff, %rax
  15032. sbbq 24(%rsi), %r15
  15033. sbbq $0x00, %rdx
  15034. # Mask the modulus
  15035. andq %rdx, %rcx
  15036. andq %rdx, %rax
  15037. # Add modulus (if underflow)
  15038. addq %rcx, %r12
  15039. adcq %rdx, %r13
  15040. adcq %rdx, %r14
  15041. adcq %rax, %r15
  15042. movq %r8, (%rsi)
  15043. movq %r9, 8(%rsi)
  15044. movq %r10, 16(%rsi)
  15045. movq %r11, 24(%rsi)
  15046. movq %r12, (%rbp)
  15047. movq %r13, 8(%rbp)
  15048. movq %r14, 16(%rbp)
  15049. movq %r15, 24(%rbp)
  15050. movq 104(%rsp), %rsi
  15051. # Double
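# Double: 2*a computed as a+a, with the same conditional subtraction of p
# if the sum reaches 2^255.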
  15052. movq (%rsi), %r8
  15053. movq 8(%rsi), %r9
  15054. addq %r8, %r8
  15055. movq 16(%rsi), %r10
  15056. adcq %r9, %r9
  15057. movq 24(%rsi), %rdx
  15058. adcq %r10, %r10
  15059. movq $-19, %rcx
  15060. adcq %rdx, %rdx
  15061. movq $0x7fffffffffffffff, %rax
  15062. movq %rdx, %r11
  15063. sarq $63, %rdx
  15064. # Mask the modulus
  15065. andq %rdx, %rcx
  15066. andq %rdx, %rax
  15067. # Sub modulus (if overflow)
  15068. subq %rcx, %r8
  15069. sbbq %rdx, %r9
  15070. sbbq %rdx, %r10
  15071. sbbq %rax, %r11
  15072. movq %r8, (%rbx)
  15073. movq %r9, 8(%rbx)
  15074. movq %r10, 16(%rbx)
  15075. movq %r11, 24(%rbx)
  15076. # Add
  15077. movq (%rbx), %r8
  15078. movq 8(%rbx), %r9
  15079. movq 16(%rbx), %r10
  15080. movq 24(%rbx), %rdx
  15081. movq %r8, %r12
  15082. addq (%rdi), %r8
  15083. movq %r9, %r13
  15084. adcq 8(%rdi), %r9
  15085. movq %r10, %r14
  15086. adcq 16(%rdi), %r10
  15087. movq %rdx, %r15
  15088. adcq 24(%rdi), %rdx
  15089. movq $-19, %rcx
  15090. movq %rdx, %r11
  15091. movq $0x7fffffffffffffff, %rax
  15092. sarq $63, %rdx
  15093. # Mask the modulus
  15094. andq %rdx, %rcx
  15095. andq %rdx, %rax
  15096. # Sub modulus (if overflow)
  15097. subq %rcx, %r8
  15098. sbbq %rdx, %r9
  15099. sbbq %rdx, %r10
  15100. sbbq %rax, %r11
  15101. # Sub
  15102. subq (%rdi), %r12
  15103. movq $0x00, %rdx
  15104. sbbq 8(%rdi), %r13
  15105. movq $-19, %rcx
  15106. sbbq 16(%rdi), %r14
  15107. movq $0x7fffffffffffffff, %rax
  15108. sbbq 24(%rdi), %r15
  15109. sbbq $0x00, %rdx
  15110. # Mask the modulus
  15111. andq %rdx, %rcx
  15112. andq %rdx, %rax
  15113. # Add modulus (if underflow)
  15114. addq %rcx, %r12
  15115. adcq %rdx, %r13
  15116. adcq %rdx, %r14
  15117. adcq %rax, %r15
  15118. movq %r8, (%rdi)
  15119. movq %r9, 8(%rdi)
  15120. movq %r10, 16(%rdi)
  15121. movq %r11, 24(%rdi)
  15122. movq %r12, (%rbx)
  15123. movq %r13, 8(%rbx)
  15124. movq %r14, 16(%rbx)
  15125. movq %r15, 24(%rbx)
  15126. addq $48, %rsp
  15127. popq %r15
  15128. popq %r14
  15129. popq %r13
  15130. popq %r12
  15131. popq %rbx
  15132. popq %rbp
  15133. repz retq
  15134. #ifndef __APPLE__
  15135. .size fe_ge_msub_avx2,.-fe_ge_msub_avx2
  15136. #endif /* __APPLE__ */
  15137. #ifndef __APPLE__
  15138. .text
  15139. .globl fe_ge_add_avx2
  15140. .type fe_ge_add_avx2,@function
  15141. .align 16
  15142. fe_ge_add_avx2:
  15143. #else
  15144. .section __TEXT,__text
  15145. .globl _fe_ge_add_avx2
  15146. .p2align 4
  15147. _fe_ge_add_avx2:
  15148. #endif /* __APPLE__ */
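# fe_ge_add_avx2 takes twelve pointer arguments: six in rdi..r9 (spilled to
# 0..40(%rsp) below) and six on the caller's stack, read here as
# 136(%rsp)..176(%rsp) once the 0x50-byte frame and six saved registers are in
# place. It presumably implements the Ed25519 extended-coordinate point
# addition out of the field add/sub/mul blocks that follow.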
  15149. pushq %rbx
  15150. pushq %rbp
  15151. pushq %r12
  15152. pushq %r13
  15153. pushq %r14
  15154. pushq %r15
  15155. subq $0x50, %rsp
  15156. movq %rdi, (%rsp)
  15157. movq %rsi, 8(%rsp)
  15158. movq %rdx, 16(%rsp)
  15159. movq %rcx, 24(%rsp)
  15160. movq %r8, 32(%rsp)
  15161. movq %r9, 40(%rsp)
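# Incoming pointer arguments are parked on the local frame so rdi..r9 can be
# reused as scratch; 48(%rsp) onwards serves as a spare field-element temporary.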
  15162. movq 8(%rsp), %rsi
  15163. movq 40(%rsp), %rbx
  15164. movq 32(%rsp), %rbp
  15165. # Add
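# This fused pass writes the sum of the two source field elements to the first
# output argument and their difference to the second.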
  15166. movq (%rbx), %r8
  15167. movq 8(%rbx), %r9
  15168. movq 16(%rbx), %r10
  15169. movq 24(%rbx), %rdx
  15170. movq %r8, %r12
  15171. addq (%rbp), %r8
  15172. movq %r9, %r13
  15173. adcq 8(%rbp), %r9
  15174. movq %r10, %r14
  15175. adcq 16(%rbp), %r10
  15176. movq %rdx, %r15
  15177. adcq 24(%rbp), %rdx
  15178. movq $-19, %rcx
  15179. movq %rdx, %r11
  15180. movq $0x7fffffffffffffff, %rax
  15181. sarq $63, %rdx
  15182. # Mask the modulus
  15183. andq %rdx, %rcx
  15184. andq %rdx, %rax
  15185. # Sub modulus (if overflow)
  15186. subq %rcx, %r8
  15187. sbbq %rdx, %r9
  15188. sbbq %rdx, %r10
  15189. sbbq %rax, %r11
  15190. # Sub
  15191. subq (%rbp), %r12
  15192. movq $0x00, %rdx
  15193. sbbq 8(%rbp), %r13
  15194. movq $-19, %rcx
  15195. sbbq 16(%rbp), %r14
  15196. movq $0x7fffffffffffffff, %rax
  15197. sbbq 24(%rbp), %r15
  15198. sbbq $0x00, %rdx
  15199. # Mask the modulus
  15200. andq %rdx, %rcx
  15201. andq %rdx, %rax
  15202. # Add modulus (if underflow)
  15203. addq %rcx, %r12
  15204. adcq %rdx, %r13
  15205. adcq %rdx, %r14
  15206. adcq %rax, %r15
  15207. movq %r8, (%rdi)
  15208. movq %r9, 8(%rdi)
  15209. movq %r10, 16(%rdi)
  15210. movq %r11, 24(%rdi)
  15211. movq %r12, (%rsi)
  15212. movq %r13, 8(%rsi)
  15213. movq %r14, 16(%rsi)
  15214. movq %r15, 24(%rsi)
  15215. movq 16(%rsp), %rbx
  15216. movq 168(%rsp), %rbp
  15217. # Multiply
  15218. # A[0] * B[0]
  15219. movq (%rbp), %rdx
  15220. mulxq (%rdi), %r8, %r9
  15221. # A[2] * B[0]
  15222. mulxq 16(%rdi), %r10, %r11
  15223. # A[1] * B[0]
  15224. mulxq 8(%rdi), %rcx, %rax
  15225. xorq %r15, %r15
  15226. adcxq %rcx, %r9
  15227. # A[1] * B[3]
  15228. movq 24(%rbp), %rdx
  15229. mulxq 8(%rdi), %r12, %r13
  15230. adcxq %rax, %r10
  15231. # A[0] * B[1]
  15232. movq 8(%rbp), %rdx
  15233. mulxq (%rdi), %rcx, %rax
  15234. adoxq %rcx, %r9
  15235. # A[2] * B[1]
  15236. mulxq 16(%rdi), %rcx, %r14
  15237. adoxq %rax, %r10
  15238. adcxq %rcx, %r11
  15239. # A[1] * B[2]
  15240. movq 16(%rbp), %rdx
  15241. mulxq 8(%rdi), %rcx, %rax
  15242. adcxq %r14, %r12
  15243. adoxq %rcx, %r11
  15244. adcxq %r15, %r13
  15245. adoxq %rax, %r12
  15246. # A[0] * B[2]
  15247. mulxq (%rdi), %rcx, %rax
  15248. adoxq %r15, %r13
  15249. xorq %r14, %r14
  15250. adcxq %rcx, %r10
  15251. # A[1] * B[1]
  15252. movq 8(%rbp), %rdx
  15253. mulxq 8(%rdi), %rdx, %rcx
  15254. adcxq %rax, %r11
  15255. adoxq %rdx, %r10
  15256. # A[3] * B[1]
  15257. movq 8(%rbp), %rdx
  15258. adoxq %rcx, %r11
  15259. mulxq 24(%rdi), %rcx, %rax
  15260. adcxq %rcx, %r12
  15261. # A[2] * B[2]
  15262. movq 16(%rbp), %rdx
  15263. mulxq 16(%rdi), %rdx, %rcx
  15264. adcxq %rax, %r13
  15265. adoxq %rdx, %r12
  15266. # A[3] * B[3]
  15267. movq 24(%rbp), %rdx
  15268. adoxq %rcx, %r13
  15269. mulxq 24(%rdi), %rcx, %rax
  15270. adoxq %r15, %r14
  15271. adcxq %rcx, %r14
  15272. # A[0] * B[3]
  15273. mulxq (%rdi), %rdx, %rcx
  15274. adcxq %rax, %r15
  15275. xorq %rax, %rax
  15276. adcxq %rdx, %r11
  15277. # A[3] * B[0]
  15278. movq (%rbp), %rdx
  15279. adcxq %rcx, %r12
  15280. mulxq 24(%rdi), %rdx, %rcx
  15281. adoxq %rdx, %r11
  15282. adoxq %rcx, %r12
  15283. # A[2] * B[3]
  15284. movq 24(%rbp), %rdx
  15285. mulxq 16(%rdi), %rdx, %rcx
  15286. adcxq %rdx, %r13
  15287. # A[3] * B[2]
  15288. movq 16(%rbp), %rdx
  15289. adcxq %rcx, %r14
  15290. mulxq 24(%rdi), %rcx, %rdx
  15291. adcxq %rax, %r15
  15292. adoxq %rcx, %r13
  15293. adoxq %rdx, %r14
  15294. adoxq %rax, %r15
  15295. # Reduce
  15296. movq $0x7fffffffffffffff, %rax
  15297. # Move top half into t4-t7 and remove top bit from t3
  15298. shldq $0x01, %r14, %r15
  15299. shldq $0x01, %r13, %r14
  15300. shldq $0x01, %r12, %r13
  15301. shldq $0x01, %r11, %r12
  15302. andq %rax, %r11
  15303. # Multiply top half by 19
  15304. movq $19, %rdx
  15305. xorq %rax, %rax
  15306. mulxq %r12, %rcx, %r12
  15307. adcxq %rcx, %r8
  15308. adoxq %r12, %r9
  15309. mulxq %r13, %rcx, %r13
  15310. adcxq %rcx, %r9
  15311. adoxq %r13, %r10
  15312. mulxq %r14, %rcx, %r14
  15313. adcxq %rcx, %r10
  15314. adoxq %r14, %r11
  15315. mulxq %r15, %r15, %rdx
  15316. adcxq %r15, %r11
  15317. adoxq %rax, %rdx
  15318. adcxq %rax, %rdx
  15319. # Overflow
  15320. shldq $0x01, %r11, %rdx
  15321. movq $0x7fffffffffffffff, %rax
  15322. imulq $19, %rdx, %rcx
  15323. andq %rax, %r11
  15324. addq %rcx, %r8
  15325. adcq $0x00, %r9
  15326. adcq $0x00, %r10
  15327. adcq $0x00, %r11
  15328. # Reduce if top bit set
  15329. movq %r11, %rdx
  15330. sarq $63, %rdx
  15331. andq $19, %rdx
  15332. andq %rax, %r11
  15333. addq %rdx, %r8
  15334. adcq $0x00, %r9
  15335. adcq $0x00, %r10
  15336. adcq $0x00, %r11
  15337. # Store
  15338. movq %r8, (%rbx)
  15339. movq %r9, 8(%rbx)
  15340. movq %r10, 16(%rbx)
  15341. movq %r11, 24(%rbx)
  15342. movq 176(%rsp), %rbx
  15343. # Multiply
  15344. # A[0] * B[0]
  15345. movq (%rbx), %rdx
  15346. mulxq (%rsi), %r8, %r9
  15347. # A[2] * B[0]
  15348. mulxq 16(%rsi), %r10, %r11
  15349. # A[1] * B[0]
  15350. mulxq 8(%rsi), %rcx, %rax
  15351. xorq %r15, %r15
  15352. adcxq %rcx, %r9
  15353. # A[1] * B[3]
  15354. movq 24(%rbx), %rdx
  15355. mulxq 8(%rsi), %r12, %r13
  15356. adcxq %rax, %r10
  15357. # A[0] * B[1]
  15358. movq 8(%rbx), %rdx
  15359. mulxq (%rsi), %rcx, %rax
  15360. adoxq %rcx, %r9
  15361. # A[2] * B[1]
  15362. mulxq 16(%rsi), %rcx, %r14
  15363. adoxq %rax, %r10
  15364. adcxq %rcx, %r11
  15365. # A[1] * B[2]
  15366. movq 16(%rbx), %rdx
  15367. mulxq 8(%rsi), %rcx, %rax
  15368. adcxq %r14, %r12
  15369. adoxq %rcx, %r11
  15370. adcxq %r15, %r13
  15371. adoxq %rax, %r12
  15372. # A[0] * B[2]
  15373. mulxq (%rsi), %rcx, %rax
  15374. adoxq %r15, %r13
  15375. xorq %r14, %r14
  15376. adcxq %rcx, %r10
  15377. # A[1] * B[1]
  15378. movq 8(%rbx), %rdx
  15379. mulxq 8(%rsi), %rdx, %rcx
  15380. adcxq %rax, %r11
  15381. adoxq %rdx, %r10
  15382. # A[3] * B[1]
  15383. movq 8(%rbx), %rdx
  15384. adoxq %rcx, %r11
  15385. mulxq 24(%rsi), %rcx, %rax
  15386. adcxq %rcx, %r12
  15387. # A[2] * B[2]
  15388. movq 16(%rbx), %rdx
  15389. mulxq 16(%rsi), %rdx, %rcx
  15390. adcxq %rax, %r13
  15391. adoxq %rdx, %r12
  15392. # A[3] * B[3]
  15393. movq 24(%rbx), %rdx
  15394. adoxq %rcx, %r13
  15395. mulxq 24(%rsi), %rcx, %rax
  15396. adoxq %r15, %r14
  15397. adcxq %rcx, %r14
  15398. # A[0] * B[3]
  15399. mulxq (%rsi), %rdx, %rcx
  15400. adcxq %rax, %r15
  15401. xorq %rax, %rax
  15402. adcxq %rdx, %r11
  15403. # A[3] * B[0]
  15404. movq (%rbx), %rdx
  15405. adcxq %rcx, %r12
  15406. mulxq 24(%rsi), %rdx, %rcx
  15407. adoxq %rdx, %r11
  15408. adoxq %rcx, %r12
  15409. # A[2] * B[3]
  15410. movq 24(%rbx), %rdx
  15411. mulxq 16(%rsi), %rdx, %rcx
  15412. adcxq %rdx, %r13
  15413. # A[3] * B[2]
  15414. movq 16(%rbx), %rdx
  15415. adcxq %rcx, %r14
  15416. mulxq 24(%rsi), %rcx, %rdx
  15417. adcxq %rax, %r15
  15418. adoxq %rcx, %r13
  15419. adoxq %rdx, %r14
  15420. adoxq %rax, %r15
  15421. # Reduce
  15422. movq $0x7fffffffffffffff, %rax
  15423. # Move top half into t4-t7 and remove top bit from t3
  15424. shldq $0x01, %r14, %r15
  15425. shldq $0x01, %r13, %r14
  15426. shldq $0x01, %r12, %r13
  15427. shldq $0x01, %r11, %r12
  15428. andq %rax, %r11
  15429. # Multiply top half by 19
  15430. movq $19, %rdx
  15431. xorq %rax, %rax
  15432. mulxq %r12, %rcx, %r12
  15433. adcxq %rcx, %r8
  15434. adoxq %r12, %r9
  15435. mulxq %r13, %rcx, %r13
  15436. adcxq %rcx, %r9
  15437. adoxq %r13, %r10
  15438. mulxq %r14, %rcx, %r14
  15439. adcxq %rcx, %r10
  15440. adoxq %r14, %r11
  15441. mulxq %r15, %r15, %rdx
  15442. adcxq %r15, %r11
  15443. adoxq %rax, %rdx
  15444. adcxq %rax, %rdx
  15445. # Overflow
  15446. shldq $0x01, %r11, %rdx
  15447. movq $0x7fffffffffffffff, %rax
  15448. imulq $19, %rdx, %rcx
  15449. andq %rax, %r11
  15450. addq %rcx, %r8
  15451. adcq $0x00, %r9
  15452. adcq $0x00, %r10
  15453. adcq $0x00, %r11
  15454. # Reduce if top bit set
  15455. movq %r11, %rdx
  15456. sarq $63, %rdx
  15457. andq $19, %rdx
  15458. andq %rax, %r11
  15459. addq %rdx, %r8
  15460. adcq $0x00, %r9
  15461. adcq $0x00, %r10
  15462. adcq $0x00, %r11
  15463. # Store
  15464. movq %r8, (%rsi)
  15465. movq %r9, 8(%rsi)
  15466. movq %r10, 16(%rsi)
  15467. movq %r11, 24(%rsi)
  15468. movq 24(%rsp), %rsi
  15469. movq 160(%rsp), %rbx
  15470. movq 144(%rsp), %rbp
  15471. # Multiply
  15472. # A[0] * B[0]
  15473. movq (%rbp), %rdx
  15474. mulxq (%rbx), %r8, %r9
  15475. # A[2] * B[0]
  15476. mulxq 16(%rbx), %r10, %r11
  15477. # A[1] * B[0]
  15478. mulxq 8(%rbx), %rcx, %rax
  15479. xorq %r15, %r15
  15480. adcxq %rcx, %r9
  15481. # A[1] * B[3]
  15482. movq 24(%rbp), %rdx
  15483. mulxq 8(%rbx), %r12, %r13
  15484. adcxq %rax, %r10
  15485. # A[0] * B[1]
  15486. movq 8(%rbp), %rdx
  15487. mulxq (%rbx), %rcx, %rax
  15488. adoxq %rcx, %r9
  15489. # A[2] * B[1]
  15490. mulxq 16(%rbx), %rcx, %r14
  15491. adoxq %rax, %r10
  15492. adcxq %rcx, %r11
  15493. # A[1] * B[2]
  15494. movq 16(%rbp), %rdx
  15495. mulxq 8(%rbx), %rcx, %rax
  15496. adcxq %r14, %r12
  15497. adoxq %rcx, %r11
  15498. adcxq %r15, %r13
  15499. adoxq %rax, %r12
  15500. # A[0] * B[2]
  15501. mulxq (%rbx), %rcx, %rax
  15502. adoxq %r15, %r13
  15503. xorq %r14, %r14
  15504. adcxq %rcx, %r10
  15505. # A[1] * B[1]
  15506. movq 8(%rbp), %rdx
  15507. mulxq 8(%rbx), %rdx, %rcx
  15508. adcxq %rax, %r11
  15509. adoxq %rdx, %r10
  15510. # A[3] * B[1]
  15511. movq 8(%rbp), %rdx
  15512. adoxq %rcx, %r11
  15513. mulxq 24(%rbx), %rcx, %rax
  15514. adcxq %rcx, %r12
  15515. # A[2] * B[2]
  15516. movq 16(%rbp), %rdx
  15517. mulxq 16(%rbx), %rdx, %rcx
  15518. adcxq %rax, %r13
  15519. adoxq %rdx, %r12
  15520. # A[3] * B[3]
  15521. movq 24(%rbp), %rdx
  15522. adoxq %rcx, %r13
  15523. mulxq 24(%rbx), %rcx, %rax
  15524. adoxq %r15, %r14
  15525. adcxq %rcx, %r14
  15526. # A[0] * B[3]
  15527. mulxq (%rbx), %rdx, %rcx
  15528. adcxq %rax, %r15
  15529. xorq %rax, %rax
  15530. adcxq %rdx, %r11
  15531. # A[3] * B[0]
  15532. movq (%rbp), %rdx
  15533. adcxq %rcx, %r12
  15534. mulxq 24(%rbx), %rdx, %rcx
  15535. adoxq %rdx, %r11
  15536. adoxq %rcx, %r12
  15537. # A[2] * B[3]
  15538. movq 24(%rbp), %rdx
  15539. mulxq 16(%rbx), %rdx, %rcx
  15540. adcxq %rdx, %r13
  15541. # A[3] * B[2]
  15542. movq 16(%rbp), %rdx
  15543. adcxq %rcx, %r14
  15544. mulxq 24(%rbx), %rcx, %rdx
  15545. adcxq %rax, %r15
  15546. adoxq %rcx, %r13
  15547. adoxq %rdx, %r14
  15548. adoxq %rax, %r15
  15549. # Reduce
  15550. movq $0x7fffffffffffffff, %rax
  15551. # Move top half into t4-t7 and remove top bit from t3
  15552. shldq $0x01, %r14, %r15
  15553. shldq $0x01, %r13, %r14
  15554. shldq $0x01, %r12, %r13
  15555. shldq $0x01, %r11, %r12
  15556. andq %rax, %r11
  15557. # Multiply top half by 19
  15558. movq $19, %rdx
  15559. xorq %rax, %rax
  15560. mulxq %r12, %rcx, %r12
  15561. adcxq %rcx, %r8
  15562. adoxq %r12, %r9
  15563. mulxq %r13, %rcx, %r13
  15564. adcxq %rcx, %r9
  15565. adoxq %r13, %r10
  15566. mulxq %r14, %rcx, %r14
  15567. adcxq %rcx, %r10
  15568. adoxq %r14, %r11
  15569. mulxq %r15, %r15, %rdx
  15570. adcxq %r15, %r11
  15571. adoxq %rax, %rdx
  15572. adcxq %rax, %rdx
  15573. # Overflow
  15574. shldq $0x01, %r11, %rdx
  15575. movq $0x7fffffffffffffff, %rax
  15576. imulq $19, %rdx, %rcx
  15577. andq %rax, %r11
  15578. addq %rcx, %r8
  15579. adcq $0x00, %r9
  15580. adcq $0x00, %r10
  15581. adcq $0x00, %r11
  15582. # Reduce if top bit set
  15583. movq %r11, %rdx
  15584. sarq $63, %rdx
  15585. andq $19, %rdx
  15586. andq %rax, %r11
  15587. addq %rdx, %r8
  15588. adcq $0x00, %r9
  15589. adcq $0x00, %r10
  15590. adcq $0x00, %r11
  15591. # Store
  15592. movq %r8, (%rsi)
  15593. movq %r9, 8(%rsi)
  15594. movq %r10, 16(%rsi)
  15595. movq %r11, 24(%rsi)
  15596. movq 136(%rsp), %rsi
  15597. movq 152(%rsp), %rbx
  15598. # Multiply
  15599. # A[0] * B[0]
  15600. movq (%rbx), %rdx
  15601. mulxq (%rsi), %r8, %r9
  15602. # A[2] * B[0]
  15603. mulxq 16(%rsi), %r10, %r11
  15604. # A[1] * B[0]
  15605. mulxq 8(%rsi), %rcx, %rax
  15606. xorq %r15, %r15
  15607. adcxq %rcx, %r9
  15608. # A[1] * B[3]
  15609. movq 24(%rbx), %rdx
  15610. mulxq 8(%rsi), %r12, %r13
  15611. adcxq %rax, %r10
  15612. # A[0] * B[1]
  15613. movq 8(%rbx), %rdx
  15614. mulxq (%rsi), %rcx, %rax
  15615. adoxq %rcx, %r9
  15616. # A[2] * B[1]
  15617. mulxq 16(%rsi), %rcx, %r14
  15618. adoxq %rax, %r10
  15619. adcxq %rcx, %r11
  15620. # A[1] * B[2]
  15621. movq 16(%rbx), %rdx
  15622. mulxq 8(%rsi), %rcx, %rax
  15623. adcxq %r14, %r12
  15624. adoxq %rcx, %r11
  15625. adcxq %r15, %r13
  15626. adoxq %rax, %r12
  15627. # A[0] * B[2]
  15628. mulxq (%rsi), %rcx, %rax
  15629. adoxq %r15, %r13
  15630. xorq %r14, %r14
  15631. adcxq %rcx, %r10
  15632. # A[1] * B[1]
  15633. movq 8(%rbx), %rdx
  15634. mulxq 8(%rsi), %rdx, %rcx
  15635. adcxq %rax, %r11
  15636. adoxq %rdx, %r10
  15637. # A[3] * B[1]
  15638. movq 8(%rbx), %rdx
  15639. adoxq %rcx, %r11
  15640. mulxq 24(%rsi), %rcx, %rax
  15641. adcxq %rcx, %r12
  15642. # A[2] * B[2]
  15643. movq 16(%rbx), %rdx
  15644. mulxq 16(%rsi), %rdx, %rcx
  15645. adcxq %rax, %r13
  15646. adoxq %rdx, %r12
  15647. # A[3] * B[3]
  15648. movq 24(%rbx), %rdx
  15649. adoxq %rcx, %r13
  15650. mulxq 24(%rsi), %rcx, %rax
  15651. adoxq %r15, %r14
  15652. adcxq %rcx, %r14
  15653. # A[0] * B[3]
  15654. mulxq (%rsi), %rdx, %rcx
  15655. adcxq %rax, %r15
  15656. xorq %rax, %rax
  15657. adcxq %rdx, %r11
  15658. # A[3] * B[0]
  15659. movq (%rbx), %rdx
  15660. adcxq %rcx, %r12
  15661. mulxq 24(%rsi), %rdx, %rcx
  15662. adoxq %rdx, %r11
  15663. adoxq %rcx, %r12
  15664. # A[2] * B[3]
  15665. movq 24(%rbx), %rdx
  15666. mulxq 16(%rsi), %rdx, %rcx
  15667. adcxq %rdx, %r13
  15668. # A[3] * B[2]
  15669. movq 16(%rbx), %rdx
  15670. adcxq %rcx, %r14
  15671. mulxq 24(%rsi), %rcx, %rdx
  15672. adcxq %rax, %r15
  15673. adoxq %rcx, %r13
  15674. adoxq %rdx, %r14
  15675. adoxq %rax, %r15
  15676. # Reduce
  15677. movq $0x7fffffffffffffff, %rax
  15678. # Move top half into t4-t7 and remove top bit from t3
  15679. shldq $0x01, %r14, %r15
  15680. shldq $0x01, %r13, %r14
  15681. shldq $0x01, %r12, %r13
  15682. shldq $0x01, %r11, %r12
  15683. andq %rax, %r11
  15684. # Multiply top half by 19
  15685. movq $19, %rdx
  15686. xorq %rax, %rax
  15687. mulxq %r12, %rcx, %r12
  15688. adcxq %rcx, %r8
  15689. adoxq %r12, %r9
  15690. mulxq %r13, %rcx, %r13
  15691. adcxq %rcx, %r9
  15692. adoxq %r13, %r10
  15693. mulxq %r14, %rcx, %r14
  15694. adcxq %rcx, %r10
  15695. adoxq %r14, %r11
  15696. mulxq %r15, %r15, %rdx
  15697. adcxq %r15, %r11
  15698. adoxq %rax, %rdx
  15699. adcxq %rax, %rdx
  15700. # Overflow
  15701. shldq $0x01, %r11, %rdx
  15702. movq $0x7fffffffffffffff, %rax
  15703. imulq $19, %rdx, %rcx
  15704. andq %rax, %r11
  15705. addq %rcx, %r8
  15706. adcq $0x00, %r9
  15707. adcq $0x00, %r10
  15708. adcq $0x00, %r11
  15709. # Reduce if top bit set
  15710. movq %r11, %rdx
  15711. sarq $63, %rdx
  15712. andq $19, %rdx
  15713. andq %rax, %r11
  15714. addq %rdx, %r8
  15715. adcq $0x00, %r9
  15716. adcq $0x00, %r10
  15717. adcq $0x00, %r11
  15718. # Store
  15719. movq %r8, (%rdi)
  15720. movq %r9, 8(%rdi)
  15721. movq %r10, 16(%rdi)
  15722. movq %r11, 24(%rdi)
  15723. leaq 48(%rsp), %rsi
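# rsi now points at the 32-byte temporary inside the local frame; the doubled
# value is kept there until the final add/sub pass.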
  15724. # Double
  15725. movq (%rdi), %r8
  15726. movq 8(%rdi), %r9
  15727. addq %r8, %r8
  15728. movq 16(%rdi), %r10
  15729. adcq %r9, %r9
  15730. movq 24(%rdi), %rdx
  15731. adcq %r10, %r10
  15732. movq $-19, %rcx
  15733. adcq %rdx, %rdx
  15734. movq $0x7fffffffffffffff, %rax
  15735. movq %rdx, %r11
  15736. sarq $63, %rdx
  15737. # Mask the modulus
  15738. andq %rdx, %rcx
  15739. andq %rdx, %rax
  15740. # Sub modulus (if overflow)
  15741. subq %rcx, %r8
  15742. sbbq %rdx, %r9
  15743. sbbq %rdx, %r10
  15744. sbbq %rax, %r11
  15745. movq %r8, (%rsi)
  15746. movq %r9, 8(%rsi)
  15747. movq %r10, 16(%rsi)
  15748. movq %r11, 24(%rsi)
  15749. movq 8(%rsp), %rbx
  15750. movq 16(%rsp), %rbp
  15751. # Add
  15752. movq (%rbp), %r8
  15753. movq 8(%rbp), %r9
  15754. movq 16(%rbp), %r10
  15755. movq 24(%rbp), %rdx
  15756. movq %r8, %r12
  15757. addq (%rbx), %r8
  15758. movq %r9, %r13
  15759. adcq 8(%rbx), %r9
  15760. movq %r10, %r14
  15761. adcq 16(%rbx), %r10
  15762. movq %rdx, %r15
  15763. adcq 24(%rbx), %rdx
  15764. movq $-19, %rcx
  15765. movq %rdx, %r11
  15766. movq $0x7fffffffffffffff, %rax
  15767. sarq $63, %rdx
  15768. # Mask the modulus
  15769. andq %rdx, %rcx
  15770. andq %rdx, %rax
  15771. # Sub modulus (if overflow)
  15772. subq %rcx, %r8
  15773. sbbq %rdx, %r9
  15774. sbbq %rdx, %r10
  15775. sbbq %rax, %r11
  15776. # Sub
  15777. subq (%rbx), %r12
  15778. movq $0x00, %rdx
  15779. sbbq 8(%rbx), %r13
  15780. movq $-19, %rcx
  15781. sbbq 16(%rbx), %r14
  15782. movq $0x7fffffffffffffff, %rax
  15783. sbbq 24(%rbx), %r15
  15784. sbbq $0x00, %rdx
  15785. # Mask the modulus
  15786. andq %rdx, %rcx
  15787. andq %rdx, %rax
  15788. # Add modulus (if underflow)
  15789. addq %rcx, %r12
  15790. adcq %rdx, %r13
  15791. adcq %rdx, %r14
  15792. adcq %rax, %r15
  15793. movq %r8, (%rbx)
  15794. movq %r9, 8(%rbx)
  15795. movq %r10, 16(%rbx)
  15796. movq %r11, 24(%rbx)
  15797. movq %r12, (%rdi)
  15798. movq %r13, 8(%rdi)
  15799. movq %r14, 16(%rdi)
  15800. movq %r15, 24(%rdi)
  15801. movq 24(%rsp), %rdi
  15802. # Add
  15803. movq (%rsi), %r8
  15804. movq 8(%rsi), %r9
  15805. movq 16(%rsi), %r10
  15806. movq 24(%rsi), %rdx
  15807. movq %r8, %r12
  15808. addq (%rdi), %r8
  15809. movq %r9, %r13
  15810. adcq 8(%rdi), %r9
  15811. movq %r10, %r14
  15812. adcq 16(%rdi), %r10
  15813. movq %rdx, %r15
  15814. adcq 24(%rdi), %rdx
  15815. movq $-19, %rcx
  15816. movq %rdx, %r11
  15817. movq $0x7fffffffffffffff, %rax
  15818. sarq $63, %rdx
  15819. # Mask the modulus
  15820. andq %rdx, %rcx
  15821. andq %rdx, %rax
  15822. # Sub modulus (if overflow)
  15823. subq %rcx, %r8
  15824. sbbq %rdx, %r9
  15825. sbbq %rdx, %r10
  15826. sbbq %rax, %r11
  15827. # Sub
  15828. subq (%rdi), %r12
  15829. movq $0x00, %rdx
  15830. sbbq 8(%rdi), %r13
  15831. movq $-19, %rcx
  15832. sbbq 16(%rdi), %r14
  15833. movq $0x7fffffffffffffff, %rax
  15834. sbbq 24(%rdi), %r15
  15835. sbbq $0x00, %rdx
  15836. # Mask the modulus
  15837. andq %rdx, %rcx
  15838. andq %rdx, %rax
  15839. # Add modulus (if underflow)
  15840. addq %rcx, %r12
  15841. adcq %rdx, %r13
  15842. adcq %rdx, %r14
  15843. adcq %rax, %r15
  15844. movq %r8, (%rbp)
  15845. movq %r9, 8(%rbp)
  15846. movq %r10, 16(%rbp)
  15847. movq %r11, 24(%rbp)
  15848. movq %r12, (%rdi)
  15849. movq %r13, 8(%rdi)
  15850. movq %r14, 16(%rdi)
  15851. movq %r15, 24(%rdi)
  15852. addq $0x50, %rsp
  15853. popq %r15
  15854. popq %r14
  15855. popq %r13
  15856. popq %r12
  15857. popq %rbp
  15858. popq %rbx
  15859. repz retq
  15860. #ifndef __APPLE__
  15861. .size fe_ge_add_avx2,.-fe_ge_add_avx2
  15862. #endif /* __APPLE__ */
  15863. #ifndef __APPLE__
  15864. .text
  15865. .globl fe_ge_sub_avx2
  15866. .type fe_ge_sub_avx2,@function
  15867. .align 16
  15868. fe_ge_sub_avx2:
  15869. #else
  15870. .section __TEXT,__text
  15871. .globl _fe_ge_sub_avx2
  15872. .p2align 4
  15873. _fe_ge_sub_avx2:
  15874. #endif /* __APPLE__ */
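# fe_ge_sub_avx2 mirrors fe_ge_add_avx2 above; the visible difference is that
# the cached multiplier arguments at 168(%rsp) and 176(%rsp) are consumed in
# the opposite order, which presumably realises the point subtraction.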
  15875. pushq %rbx
  15876. pushq %rbp
  15877. pushq %r12
  15878. pushq %r13
  15879. pushq %r14
  15880. pushq %r15
  15881. subq $0x50, %rsp
  15882. movq %rdi, (%rsp)
  15883. movq %rsi, 8(%rsp)
  15884. movq %rdx, 16(%rsp)
  15885. movq %rcx, 24(%rsp)
  15886. movq %r8, 32(%rsp)
  15887. movq %r9, 40(%rsp)
  15888. movq 8(%rsp), %rsi
  15889. movq 40(%rsp), %rbx
  15890. movq 32(%rsp), %rbp
  15891. # Add
  15892. movq (%rbx), %r8
  15893. movq 8(%rbx), %r9
  15894. movq 16(%rbx), %r10
  15895. movq 24(%rbx), %rdx
  15896. movq %r8, %r12
  15897. addq (%rbp), %r8
  15898. movq %r9, %r13
  15899. adcq 8(%rbp), %r9
  15900. movq %r10, %r14
  15901. adcq 16(%rbp), %r10
  15902. movq %rdx, %r15
  15903. adcq 24(%rbp), %rdx
  15904. movq $-19, %rcx
  15905. movq %rdx, %r11
  15906. movq $0x7fffffffffffffff, %rax
  15907. sarq $63, %rdx
  15908. # Mask the modulus
  15909. andq %rdx, %rcx
  15910. andq %rdx, %rax
  15911. # Sub modulus (if overflow)
  15912. subq %rcx, %r8
  15913. sbbq %rdx, %r9
  15914. sbbq %rdx, %r10
  15915. sbbq %rax, %r11
  15916. # Sub
  15917. subq (%rbp), %r12
  15918. movq $0x00, %rdx
  15919. sbbq 8(%rbp), %r13
  15920. movq $-19, %rcx
  15921. sbbq 16(%rbp), %r14
  15922. movq $0x7fffffffffffffff, %rax
  15923. sbbq 24(%rbp), %r15
  15924. sbbq $0x00, %rdx
  15925. # Mask the modulus
  15926. andq %rdx, %rcx
  15927. andq %rdx, %rax
  15928. # Add modulus (if underflow)
  15929. addq %rcx, %r12
  15930. adcq %rdx, %r13
  15931. adcq %rdx, %r14
  15932. adcq %rax, %r15
  15933. movq %r8, (%rdi)
  15934. movq %r9, 8(%rdi)
  15935. movq %r10, 16(%rdi)
  15936. movq %r11, 24(%rdi)
  15937. movq %r12, (%rsi)
  15938. movq %r13, 8(%rsi)
  15939. movq %r14, 16(%rsi)
  15940. movq %r15, 24(%rsi)
  15941. movq 16(%rsp), %rbx
  15942. movq 176(%rsp), %rbp
  15943. # Multiply
  15944. # A[0] * B[0]
  15945. movq (%rbp), %rdx
  15946. mulxq (%rdi), %r8, %r9
  15947. # A[2] * B[0]
  15948. mulxq 16(%rdi), %r10, %r11
  15949. # A[1] * B[0]
  15950. mulxq 8(%rdi), %rcx, %rax
  15951. xorq %r15, %r15
  15952. adcxq %rcx, %r9
  15953. # A[1] * B[3]
  15954. movq 24(%rbp), %rdx
  15955. mulxq 8(%rdi), %r12, %r13
  15956. adcxq %rax, %r10
  15957. # A[0] * B[1]
  15958. movq 8(%rbp), %rdx
  15959. mulxq (%rdi), %rcx, %rax
  15960. adoxq %rcx, %r9
  15961. # A[2] * B[1]
  15962. mulxq 16(%rdi), %rcx, %r14
  15963. adoxq %rax, %r10
  15964. adcxq %rcx, %r11
  15965. # A[1] * B[2]
  15966. movq 16(%rbp), %rdx
  15967. mulxq 8(%rdi), %rcx, %rax
  15968. adcxq %r14, %r12
  15969. adoxq %rcx, %r11
  15970. adcxq %r15, %r13
  15971. adoxq %rax, %r12
  15972. # A[0] * B[2]
  15973. mulxq (%rdi), %rcx, %rax
  15974. adoxq %r15, %r13
  15975. xorq %r14, %r14
  15976. adcxq %rcx, %r10
  15977. # A[1] * B[1]
  15978. movq 8(%rbp), %rdx
  15979. mulxq 8(%rdi), %rdx, %rcx
  15980. adcxq %rax, %r11
  15981. adoxq %rdx, %r10
  15982. # A[3] * B[1]
  15983. movq 8(%rbp), %rdx
  15984. adoxq %rcx, %r11
  15985. mulxq 24(%rdi), %rcx, %rax
  15986. adcxq %rcx, %r12
  15987. # A[2] * B[2]
  15988. movq 16(%rbp), %rdx
  15989. mulxq 16(%rdi), %rdx, %rcx
  15990. adcxq %rax, %r13
  15991. adoxq %rdx, %r12
  15992. # A[3] * B[3]
  15993. movq 24(%rbp), %rdx
  15994. adoxq %rcx, %r13
  15995. mulxq 24(%rdi), %rcx, %rax
  15996. adoxq %r15, %r14
  15997. adcxq %rcx, %r14
  15998. # A[0] * B[3]
  15999. mulxq (%rdi), %rdx, %rcx
  16000. adcxq %rax, %r15
  16001. xorq %rax, %rax
  16002. adcxq %rdx, %r11
  16003. # A[3] * B[0]
  16004. movq (%rbp), %rdx
  16005. adcxq %rcx, %r12
  16006. mulxq 24(%rdi), %rdx, %rcx
  16007. adoxq %rdx, %r11
  16008. adoxq %rcx, %r12
  16009. # A[2] * B[3]
  16010. movq 24(%rbp), %rdx
  16011. mulxq 16(%rdi), %rdx, %rcx
  16012. adcxq %rdx, %r13
  16013. # A[3] * B[2]
  16014. movq 16(%rbp), %rdx
  16015. adcxq %rcx, %r14
  16016. mulxq 24(%rdi), %rcx, %rdx
  16017. adcxq %rax, %r15
  16018. adoxq %rcx, %r13
  16019. adoxq %rdx, %r14
  16020. adoxq %rax, %r15
  16021. # Reduce
  16022. movq $0x7fffffffffffffff, %rax
  16023. # Move top half into t4-t7 and remove top bit from t3
  16024. shldq $0x01, %r14, %r15
  16025. shldq $0x01, %r13, %r14
  16026. shldq $0x01, %r12, %r13
  16027. shldq $0x01, %r11, %r12
  16028. andq %rax, %r11
  16029. # Multiply top half by 19
  16030. movq $19, %rdx
  16031. xorq %rax, %rax
  16032. mulxq %r12, %rcx, %r12
  16033. adcxq %rcx, %r8
  16034. adoxq %r12, %r9
  16035. mulxq %r13, %rcx, %r13
  16036. adcxq %rcx, %r9
  16037. adoxq %r13, %r10
  16038. mulxq %r14, %rcx, %r14
  16039. adcxq %rcx, %r10
  16040. adoxq %r14, %r11
  16041. mulxq %r15, %r15, %rdx
  16042. adcxq %r15, %r11
  16043. adoxq %rax, %rdx
  16044. adcxq %rax, %rdx
  16045. # Overflow
  16046. shldq $0x01, %r11, %rdx
  16047. movq $0x7fffffffffffffff, %rax
  16048. imulq $19, %rdx, %rcx
  16049. andq %rax, %r11
  16050. addq %rcx, %r8
  16051. adcq $0x00, %r9
  16052. adcq $0x00, %r10
  16053. adcq $0x00, %r11
  16054. # Reduce if top bit set
  16055. movq %r11, %rdx
  16056. sarq $63, %rdx
  16057. andq $19, %rdx
  16058. andq %rax, %r11
  16059. addq %rdx, %r8
  16060. adcq $0x00, %r9
  16061. adcq $0x00, %r10
  16062. adcq $0x00, %r11
  16063. # Store
  16064. movq %r8, (%rbx)
  16065. movq %r9, 8(%rbx)
  16066. movq %r10, 16(%rbx)
  16067. movq %r11, 24(%rbx)
  16068. movq 168(%rsp), %rbx
  16069. # Multiply
  16070. # A[0] * B[0]
  16071. movq (%rbx), %rdx
  16072. mulxq (%rsi), %r8, %r9
  16073. # A[2] * B[0]
  16074. mulxq 16(%rsi), %r10, %r11
  16075. # A[1] * B[0]
  16076. mulxq 8(%rsi), %rcx, %rax
  16077. xorq %r15, %r15
  16078. adcxq %rcx, %r9
  16079. # A[1] * B[3]
  16080. movq 24(%rbx), %rdx
  16081. mulxq 8(%rsi), %r12, %r13
  16082. adcxq %rax, %r10
  16083. # A[0] * B[1]
  16084. movq 8(%rbx), %rdx
  16085. mulxq (%rsi), %rcx, %rax
  16086. adoxq %rcx, %r9
  16087. # A[2] * B[1]
  16088. mulxq 16(%rsi), %rcx, %r14
  16089. adoxq %rax, %r10
  16090. adcxq %rcx, %r11
  16091. # A[1] * B[2]
  16092. movq 16(%rbx), %rdx
  16093. mulxq 8(%rsi), %rcx, %rax
  16094. adcxq %r14, %r12
  16095. adoxq %rcx, %r11
  16096. adcxq %r15, %r13
  16097. adoxq %rax, %r12
  16098. # A[0] * B[2]
  16099. mulxq (%rsi), %rcx, %rax
  16100. adoxq %r15, %r13
  16101. xorq %r14, %r14
  16102. adcxq %rcx, %r10
  16103. # A[1] * B[1]
  16104. movq 8(%rbx), %rdx
  16105. mulxq 8(%rsi), %rdx, %rcx
  16106. adcxq %rax, %r11
  16107. adoxq %rdx, %r10
  16108. # A[3] * B[1]
  16109. movq 8(%rbx), %rdx
  16110. adoxq %rcx, %r11
  16111. mulxq 24(%rsi), %rcx, %rax
  16112. adcxq %rcx, %r12
  16113. # A[2] * B[2]
  16114. movq 16(%rbx), %rdx
  16115. mulxq 16(%rsi), %rdx, %rcx
  16116. adcxq %rax, %r13
  16117. adoxq %rdx, %r12
  16118. # A[3] * B[3]
  16119. movq 24(%rbx), %rdx
  16120. adoxq %rcx, %r13
  16121. mulxq 24(%rsi), %rcx, %rax
  16122. adoxq %r15, %r14
  16123. adcxq %rcx, %r14
  16124. # A[0] * B[3]
  16125. mulxq (%rsi), %rdx, %rcx
  16126. adcxq %rax, %r15
  16127. xorq %rax, %rax
  16128. adcxq %rdx, %r11
  16129. # A[3] * B[0]
  16130. movq (%rbx), %rdx
  16131. adcxq %rcx, %r12
  16132. mulxq 24(%rsi), %rdx, %rcx
  16133. adoxq %rdx, %r11
  16134. adoxq %rcx, %r12
  16135. # A[2] * B[3]
  16136. movq 24(%rbx), %rdx
  16137. mulxq 16(%rsi), %rdx, %rcx
  16138. adcxq %rdx, %r13
  16139. # A[3] * B[2]
  16140. movq 16(%rbx), %rdx
  16141. adcxq %rcx, %r14
  16142. mulxq 24(%rsi), %rcx, %rdx
  16143. adcxq %rax, %r15
  16144. adoxq %rcx, %r13
  16145. adoxq %rdx, %r14
  16146. adoxq %rax, %r15
  16147. # Reduce
  16148. movq $0x7fffffffffffffff, %rax
  16149. # Move top half into t4-t7 and remove top bit from t3
  16150. shldq $0x01, %r14, %r15
  16151. shldq $0x01, %r13, %r14
  16152. shldq $0x01, %r12, %r13
  16153. shldq $0x01, %r11, %r12
  16154. andq %rax, %r11
  16155. # Multiply top half by 19
  16156. movq $19, %rdx
  16157. xorq %rax, %rax
  16158. mulxq %r12, %rcx, %r12
  16159. adcxq %rcx, %r8
  16160. adoxq %r12, %r9
  16161. mulxq %r13, %rcx, %r13
  16162. adcxq %rcx, %r9
  16163. adoxq %r13, %r10
  16164. mulxq %r14, %rcx, %r14
  16165. adcxq %rcx, %r10
  16166. adoxq %r14, %r11
  16167. mulxq %r15, %r15, %rdx
  16168. adcxq %r15, %r11
  16169. adoxq %rax, %rdx
  16170. adcxq %rax, %rdx
  16171. # Overflow
  16172. shldq $0x01, %r11, %rdx
  16173. movq $0x7fffffffffffffff, %rax
  16174. imulq $19, %rdx, %rcx
  16175. andq %rax, %r11
  16176. addq %rcx, %r8
  16177. adcq $0x00, %r9
  16178. adcq $0x00, %r10
  16179. adcq $0x00, %r11
  16180. # Reduce if top bit set
  16181. movq %r11, %rdx
  16182. sarq $63, %rdx
  16183. andq $19, %rdx
  16184. andq %rax, %r11
  16185. addq %rdx, %r8
  16186. adcq $0x00, %r9
  16187. adcq $0x00, %r10
  16188. adcq $0x00, %r11
  16189. # Store
  16190. movq %r8, (%rsi)
  16191. movq %r9, 8(%rsi)
  16192. movq %r10, 16(%rsi)
  16193. movq %r11, 24(%rsi)
  16194. movq 24(%rsp), %rsi
  16195. movq 160(%rsp), %rbx
  16196. movq 144(%rsp), %rbp
  16197. # Multiply
  16198. # A[0] * B[0]
  16199. movq (%rbp), %rdx
  16200. mulxq (%rbx), %r8, %r9
  16201. # A[2] * B[0]
  16202. mulxq 16(%rbx), %r10, %r11
  16203. # A[1] * B[0]
  16204. mulxq 8(%rbx), %rcx, %rax
  16205. xorq %r15, %r15
  16206. adcxq %rcx, %r9
  16207. # A[1] * B[3]
  16208. movq 24(%rbp), %rdx
  16209. mulxq 8(%rbx), %r12, %r13
  16210. adcxq %rax, %r10
  16211. # A[0] * B[1]
  16212. movq 8(%rbp), %rdx
  16213. mulxq (%rbx), %rcx, %rax
  16214. adoxq %rcx, %r9
  16215. # A[2] * B[1]
  16216. mulxq 16(%rbx), %rcx, %r14
  16217. adoxq %rax, %r10
  16218. adcxq %rcx, %r11
  16219. # A[1] * B[2]
  16220. movq 16(%rbp), %rdx
  16221. mulxq 8(%rbx), %rcx, %rax
  16222. adcxq %r14, %r12
  16223. adoxq %rcx, %r11
  16224. adcxq %r15, %r13
  16225. adoxq %rax, %r12
  16226. # A[0] * B[2]
  16227. mulxq (%rbx), %rcx, %rax
  16228. adoxq %r15, %r13
  16229. xorq %r14, %r14
  16230. adcxq %rcx, %r10
  16231. # A[1] * B[1]
  16232. movq 8(%rbp), %rdx
  16233. mulxq 8(%rbx), %rdx, %rcx
  16234. adcxq %rax, %r11
  16235. adoxq %rdx, %r10
  16236. # A[3] * B[1]
  16237. movq 8(%rbp), %rdx
  16238. adoxq %rcx, %r11
  16239. mulxq 24(%rbx), %rcx, %rax
  16240. adcxq %rcx, %r12
  16241. # A[2] * B[2]
  16242. movq 16(%rbp), %rdx
  16243. mulxq 16(%rbx), %rdx, %rcx
  16244. adcxq %rax, %r13
  16245. adoxq %rdx, %r12
  16246. # A[3] * B[3]
  16247. movq 24(%rbp), %rdx
  16248. adoxq %rcx, %r13
  16249. mulxq 24(%rbx), %rcx, %rax
  16250. adoxq %r15, %r14
  16251. adcxq %rcx, %r14
  16252. # A[0] * B[3]
  16253. mulxq (%rbx), %rdx, %rcx
  16254. adcxq %rax, %r15
  16255. xorq %rax, %rax
  16256. adcxq %rdx, %r11
  16257. # A[3] * B[0]
  16258. movq (%rbp), %rdx
  16259. adcxq %rcx, %r12
  16260. mulxq 24(%rbx), %rdx, %rcx
  16261. adoxq %rdx, %r11
  16262. adoxq %rcx, %r12
  16263. # A[2] * B[3]
  16264. movq 24(%rbp), %rdx
  16265. mulxq 16(%rbx), %rdx, %rcx
  16266. adcxq %rdx, %r13
  16267. # A[3] * B[2]
  16268. movq 16(%rbp), %rdx
  16269. adcxq %rcx, %r14
  16270. mulxq 24(%rbx), %rcx, %rdx
  16271. adcxq %rax, %r15
  16272. adoxq %rcx, %r13
  16273. adoxq %rdx, %r14
  16274. adoxq %rax, %r15
  16275. # Reduce
  16276. movq $0x7fffffffffffffff, %rax
  16277. # Move top half into t4-t7 and remove top bit from t3
  16278. shldq $0x01, %r14, %r15
  16279. shldq $0x01, %r13, %r14
  16280. shldq $0x01, %r12, %r13
  16281. shldq $0x01, %r11, %r12
  16282. andq %rax, %r11
  16283. # Multiply top half by 19
  16284. movq $19, %rdx
  16285. xorq %rax, %rax
  16286. mulxq %r12, %rcx, %r12
  16287. adcxq %rcx, %r8
  16288. adoxq %r12, %r9
  16289. mulxq %r13, %rcx, %r13
  16290. adcxq %rcx, %r9
  16291. adoxq %r13, %r10
  16292. mulxq %r14, %rcx, %r14
  16293. adcxq %rcx, %r10
  16294. adoxq %r14, %r11
  16295. mulxq %r15, %r15, %rdx
  16296. adcxq %r15, %r11
  16297. adoxq %rax, %rdx
  16298. adcxq %rax, %rdx
  16299. # Overflow
  16300. shldq $0x01, %r11, %rdx
  16301. movq $0x7fffffffffffffff, %rax
  16302. imulq $19, %rdx, %rcx
  16303. andq %rax, %r11
  16304. addq %rcx, %r8
  16305. adcq $0x00, %r9
  16306. adcq $0x00, %r10
  16307. adcq $0x00, %r11
  16308. # Reduce if top bit set
  16309. movq %r11, %rdx
  16310. sarq $63, %rdx
  16311. andq $19, %rdx
  16312. andq %rax, %r11
  16313. addq %rdx, %r8
  16314. adcq $0x00, %r9
  16315. adcq $0x00, %r10
  16316. adcq $0x00, %r11
  16317. # Store
  16318. movq %r8, (%rsi)
  16319. movq %r9, 8(%rsi)
  16320. movq %r10, 16(%rsi)
  16321. movq %r11, 24(%rsi)
  16322. movq 136(%rsp), %rsi
  16323. movq 152(%rsp), %rbx
  16324. # Multiply
  16325. # A[0] * B[0]
  16326. movq (%rbx), %rdx
  16327. mulxq (%rsi), %r8, %r9
  16328. # A[2] * B[0]
  16329. mulxq 16(%rsi), %r10, %r11
  16330. # A[1] * B[0]
  16331. mulxq 8(%rsi), %rcx, %rax
  16332. xorq %r15, %r15
  16333. adcxq %rcx, %r9
  16334. # A[1] * B[3]
  16335. movq 24(%rbx), %rdx
  16336. mulxq 8(%rsi), %r12, %r13
  16337. adcxq %rax, %r10
  16338. # A[0] * B[1]
  16339. movq 8(%rbx), %rdx
  16340. mulxq (%rsi), %rcx, %rax
  16341. adoxq %rcx, %r9
  16342. # A[2] * B[1]
  16343. mulxq 16(%rsi), %rcx, %r14
  16344. adoxq %rax, %r10
  16345. adcxq %rcx, %r11
  16346. # A[1] * B[2]
  16347. movq 16(%rbx), %rdx
  16348. mulxq 8(%rsi), %rcx, %rax
  16349. adcxq %r14, %r12
  16350. adoxq %rcx, %r11
  16351. adcxq %r15, %r13
  16352. adoxq %rax, %r12
  16353. # A[0] * B[2]
  16354. mulxq (%rsi), %rcx, %rax
  16355. adoxq %r15, %r13
  16356. xorq %r14, %r14
  16357. adcxq %rcx, %r10
  16358. # A[1] * B[1]
  16359. movq 8(%rbx), %rdx
  16360. mulxq 8(%rsi), %rdx, %rcx
  16361. adcxq %rax, %r11
  16362. adoxq %rdx, %r10
  16363. # A[3] * B[1]
  16364. movq 8(%rbx), %rdx
  16365. adoxq %rcx, %r11
  16366. mulxq 24(%rsi), %rcx, %rax
  16367. adcxq %rcx, %r12
  16368. # A[2] * B[2]
  16369. movq 16(%rbx), %rdx
  16370. mulxq 16(%rsi), %rdx, %rcx
  16371. adcxq %rax, %r13
  16372. adoxq %rdx, %r12
  16373. # A[3] * B[3]
  16374. movq 24(%rbx), %rdx
  16375. adoxq %rcx, %r13
  16376. mulxq 24(%rsi), %rcx, %rax
  16377. adoxq %r15, %r14
  16378. adcxq %rcx, %r14
  16379. # A[0] * B[3]
  16380. mulxq (%rsi), %rdx, %rcx
  16381. adcxq %rax, %r15
  16382. xorq %rax, %rax
  16383. adcxq %rdx, %r11
  16384. # A[3] * B[0]
  16385. movq (%rbx), %rdx
  16386. adcxq %rcx, %r12
  16387. mulxq 24(%rsi), %rdx, %rcx
  16388. adoxq %rdx, %r11
  16389. adoxq %rcx, %r12
  16390. # A[2] * B[3]
  16391. movq 24(%rbx), %rdx
  16392. mulxq 16(%rsi), %rdx, %rcx
  16393. adcxq %rdx, %r13
  16394. # A[3] * B[2]
  16395. movq 16(%rbx), %rdx
  16396. adcxq %rcx, %r14
  16397. mulxq 24(%rsi), %rcx, %rdx
  16398. adcxq %rax, %r15
  16399. adoxq %rcx, %r13
  16400. adoxq %rdx, %r14
  16401. adoxq %rax, %r15
  16402. # Reduce
  16403. movq $0x7fffffffffffffff, %rax
  16404. # Move top half into t4-t7 and remove top bit from t3
  16405. shldq $0x01, %r14, %r15
  16406. shldq $0x01, %r13, %r14
  16407. shldq $0x01, %r12, %r13
  16408. shldq $0x01, %r11, %r12
  16409. andq %rax, %r11
  16410. # Multiply top half by 19
  16411. movq $19, %rdx
  16412. xorq %rax, %rax
  16413. mulxq %r12, %rcx, %r12
  16414. adcxq %rcx, %r8
  16415. adoxq %r12, %r9
  16416. mulxq %r13, %rcx, %r13
  16417. adcxq %rcx, %r9
  16418. adoxq %r13, %r10
  16419. mulxq %r14, %rcx, %r14
  16420. adcxq %rcx, %r10
  16421. adoxq %r14, %r11
  16422. mulxq %r15, %r15, %rdx
  16423. adcxq %r15, %r11
  16424. adoxq %rax, %rdx
  16425. adcxq %rax, %rdx
  16426. # Overflow
  16427. shldq $0x01, %r11, %rdx
  16428. movq $0x7fffffffffffffff, %rax
  16429. imulq $19, %rdx, %rcx
  16430. andq %rax, %r11
  16431. addq %rcx, %r8
  16432. adcq $0x00, %r9
  16433. adcq $0x00, %r10
  16434. adcq $0x00, %r11
  16435. # Reduce if top bit set
  16436. movq %r11, %rdx
  16437. sarq $63, %rdx
  16438. andq $19, %rdx
  16439. andq %rax, %r11
  16440. addq %rdx, %r8
  16441. adcq $0x00, %r9
  16442. adcq $0x00, %r10
  16443. adcq $0x00, %r11
  16444. # Store
  16445. movq %r8, (%rdi)
  16446. movq %r9, 8(%rdi)
  16447. movq %r10, 16(%rdi)
  16448. movq %r11, 24(%rdi)
  16449. leaq 48(%rsp), %rsi
  16450. # Double
  16451. movq (%rdi), %r8
  16452. movq 8(%rdi), %r9
  16453. addq %r8, %r8
  16454. movq 16(%rdi), %r10
  16455. adcq %r9, %r9
  16456. movq 24(%rdi), %rdx
  16457. adcq %r10, %r10
  16458. movq $-19, %rcx
  16459. adcq %rdx, %rdx
  16460. movq $0x7fffffffffffffff, %rax
  16461. movq %rdx, %r11
  16462. sarq $63, %rdx
  16463. # Mask the modulus
  16464. andq %rdx, %rcx
  16465. andq %rdx, %rax
  16466. # Sub modulus (if overflow)
  16467. subq %rcx, %r8
  16468. sbbq %rdx, %r9
  16469. sbbq %rdx, %r10
  16470. sbbq %rax, %r11
  16471. movq %r8, (%rsi)
  16472. movq %r9, 8(%rsi)
  16473. movq %r10, 16(%rsi)
  16474. movq %r11, 24(%rsi)
  16475. movq 8(%rsp), %rbx
  16476. movq 16(%rsp), %rbp
  16477. # Add
  16478. movq (%rbp), %r8
  16479. movq 8(%rbp), %r9
  16480. movq 16(%rbp), %r10
  16481. movq 24(%rbp), %rdx
  16482. movq %r8, %r12
  16483. addq (%rbx), %r8
  16484. movq %r9, %r13
  16485. adcq 8(%rbx), %r9
  16486. movq %r10, %r14
  16487. adcq 16(%rbx), %r10
  16488. movq %rdx, %r15
  16489. adcq 24(%rbx), %rdx
  16490. movq $-19, %rcx
  16491. movq %rdx, %r11
  16492. movq $0x7fffffffffffffff, %rax
  16493. sarq $63, %rdx
  16494. # Mask the modulus
  16495. andq %rdx, %rcx
  16496. andq %rdx, %rax
  16497. # Sub modulus (if overflow)
  16498. subq %rcx, %r8
  16499. sbbq %rdx, %r9
  16500. sbbq %rdx, %r10
  16501. sbbq %rax, %r11
  16502. # Sub
  16503. subq (%rbx), %r12
  16504. movq $0x00, %rdx
  16505. sbbq 8(%rbx), %r13
  16506. movq $-19, %rcx
  16507. sbbq 16(%rbx), %r14
  16508. movq $0x7fffffffffffffff, %rax
  16509. sbbq 24(%rbx), %r15
  16510. sbbq $0x00, %rdx
  16511. # Mask the modulus
  16512. andq %rdx, %rcx
  16513. andq %rdx, %rax
  16514. # Add modulus (if underflow)
  16515. addq %rcx, %r12
  16516. adcq %rdx, %r13
  16517. adcq %rdx, %r14
  16518. adcq %rax, %r15
  16519. movq %r8, (%rbx)
  16520. movq %r9, 8(%rbx)
  16521. movq %r10, 16(%rbx)
  16522. movq %r11, 24(%rbx)
  16523. movq %r12, (%rdi)
  16524. movq %r13, 8(%rdi)
  16525. movq %r14, 16(%rdi)
  16526. movq %r15, 24(%rdi)
  16527. movq 24(%rsp), %rdi
  16528. # Add
  16529. movq (%rsi), %r8
  16530. movq 8(%rsi), %r9
  16531. movq 16(%rsi), %r10
  16532. movq 24(%rsi), %rdx
  16533. movq %r8, %r12
  16534. addq (%rdi), %r8
  16535. movq %r9, %r13
  16536. adcq 8(%rdi), %r9
  16537. movq %r10, %r14
  16538. adcq 16(%rdi), %r10
  16539. movq %rdx, %r15
  16540. adcq 24(%rdi), %rdx
  16541. movq $-19, %rcx
  16542. movq %rdx, %r11
  16543. movq $0x7fffffffffffffff, %rax
  16544. sarq $63, %rdx
  16545. # Mask the modulus
  16546. andq %rdx, %rcx
  16547. andq %rdx, %rax
  16548. # Sub modulus (if overflow)
  16549. subq %rcx, %r8
  16550. sbbq %rdx, %r9
  16551. sbbq %rdx, %r10
  16552. sbbq %rax, %r11
  16553. # Sub
  16554. subq (%rdi), %r12
  16555. movq $0x00, %rdx
  16556. sbbq 8(%rdi), %r13
  16557. movq $-19, %rcx
  16558. sbbq 16(%rdi), %r14
  16559. movq $0x7fffffffffffffff, %rax
  16560. sbbq 24(%rdi), %r15
  16561. sbbq $0x00, %rdx
  16562. # Mask the modulus
  16563. andq %rdx, %rcx
  16564. andq %rdx, %rax
  16565. # Add modulus (if underflow)
  16566. addq %rcx, %r12
  16567. adcq %rdx, %r13
  16568. adcq %rdx, %r14
  16569. adcq %rax, %r15
  16570. movq %r8, (%rdi)
  16571. movq %r9, 8(%rdi)
  16572. movq %r10, 16(%rdi)
  16573. movq %r11, 24(%rdi)
  16574. movq %r12, (%rbp)
  16575. movq %r13, 8(%rbp)
  16576. movq %r14, 16(%rbp)
  16577. movq %r15, 24(%rbp)
  16578. addq $0x50, %rsp
  16579. popq %r15
  16580. popq %r14
  16581. popq %r13
  16582. popq %r12
  16583. popq %rbp
  16584. popq %rbx
  16585. repz retq
  16586. #ifndef __APPLE__
  16587. .size fe_ge_sub_avx2,.-fe_ge_sub_avx2
  16588. #endif /* __APPLE__ */
  16589. #endif /* HAVE_INTEL_AVX2 */
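# Mark the stack as non-executable for GNU/Linux ELF linkers.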
  16590. #if defined(__linux__) && defined(__ELF__)
  16591. .section .note.GNU-stack,"",%progbits
  16592. #endif