sp_int.c 646 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728572957305731573257335734573557365737573857395740574157425743574457455746574757485749575057515752575357545755575657575758575957605761576257635764576557665767576857695770577157725773577457755776577
75778577957805781578257835784578557865787578857895790579157925793579457955796579757985799580058015802580358045805580658075808580958105811581258135814581558165817581858195820582158225823582458255826582758285829583058315832583358345835583658375838583958405841584258435844584558465847584858495850585158525853585458555856585758585859586058615862586358645865586658675868586958705871587258735874587558765877587858795880588158825883588458855886588758885889589058915892589358945895589658975898589959005901590259035904590559065907590859095910591159125913591459155916591759185919592059215922592359245925592659275928592959305931593259335934593559365937593859395940594159425943594459455946594759485949595059515952595359545955595659575958595959605961596259635964596559665967596859695970597159725973597459755976597759785979598059815982598359845985598659875988598959905991599259935994599559965997599859996000600160026003600460056006600760086009601060116012601360146015601660176018601960206021602260236024602560266027602860296030603160326033603460356036603760386039604060416042604360446045604660476048604960506051605260536054605560566057605860596060606160626063606460656066606760686069607060716072607360746075607660776078607960806081608260836084608560866087608860896090609160926093609460956096609760986099610061016102610361046105610661076108610961106111611261136114611561166117611861196120612161226123612461256126612761286129613061316132613361346135613661376138613961406141614261436144614561466147614861496150615161526153615461556156615761586159616061616162616361646165616661676168616961706171617261736174617561766177617861796180618161826183618461856186618761886189619061916192619361946195619661976198619962006201620262036204620562066207620862096210621162126213621462156216621762186219622062216222622362246225622662276228622962306231623262336234623562366237623862396240624162426243624462456246624762486249625062516252625362546255625662576258625962606261626262636264626562666267626862696270627162726273627462756276627
76278627962806281628262836284628562866287628862896290629162926293629462956296629762986299630063016302630363046305630663076308630963106311631263136314631563166317631863196320632163226323632463256326632763286329633063316332633363346335633663376338633963406341634263436344634563466347634863496350635163526353635463556356635763586359636063616362636363646365636663676368636963706371637263736374637563766377637863796380638163826383638463856386638763886389639063916392639363946395639663976398639964006401640264036404640564066407640864096410641164126413641464156416641764186419642064216422642364246425642664276428642964306431643264336434643564366437643864396440644164426443644464456446644764486449645064516452645364546455645664576458645964606461646264636464646564666467646864696470647164726473647464756476647764786479648064816482648364846485648664876488648964906491649264936494649564966497649864996500650165026503650465056506650765086509651065116512651365146515651665176518651965206521652265236524652565266527652865296530653165326533653465356536653765386539654065416542654365446545654665476548654965506551655265536554655565566557655865596560656165626563656465656566656765686569657065716572657365746575657665776578657965806581658265836584658565866587658865896590659165926593659465956596659765986599660066016602660366046605660666076608660966106611661266136614661566166617661866196620662166226623662466256626662766286629663066316632663366346635663666376638663966406641664266436644664566466647664866496650665166526653665466556656665766586659666066616662666366646665666666676668666966706671667266736674667566766677667866796680668166826683668466856686668766886689669066916692669366946695669666976698669967006701670267036704670567066707670867096710671167126713671467156716671767186719672067216722672367246725672667276728672967306731673267336734673567366737673867396740674167426743674467456746674767486749675067516752675367546755675667576758675967606761676267636764676567666767676867696770677167726773677467756776677
76778677967806781678267836784678567866787678867896790679167926793679467956796679767986799680068016802680368046805680668076808680968106811681268136814681568166817681868196820682168226823682468256826682768286829683068316832683368346835683668376838683968406841684268436844684568466847684868496850685168526853685468556856685768586859686068616862686368646865686668676868686968706871687268736874687568766877687868796880688168826883688468856886688768886889689068916892689368946895689668976898689969006901690269036904690569066907690869096910691169126913691469156916691769186919692069216922692369246925692669276928692969306931693269336934693569366937693869396940694169426943694469456946694769486949695069516952695369546955695669576958695969606961696269636964696569666967696869696970697169726973697469756976697769786979698069816982698369846985698669876988698969906991699269936994699569966997699869997000700170027003700470057006700770087009701070117012701370147015701670177018701970207021702270237024702570267027702870297030703170327033703470357036703770387039704070417042704370447045704670477048704970507051705270537054705570567057705870597060706170627063706470657066706770687069707070717072707370747075707670777078707970807081708270837084708570867087708870897090709170927093709470957096709770987099710071017102710371047105710671077108710971107111711271137114711571167117711871197120712171227123712471257126712771287129713071317132713371347135713671377138713971407141714271437144714571467147714871497150715171527153715471557156715771587159716071617162716371647165716671677168716971707171717271737174717571767177717871797180718171827183718471857186718771887189719071917192719371947195719671977198719972007201720272037204720572067207720872097210721172127213721472157216721772187219722072217222722372247225722672277228722972307231723272337234723572367237723872397240724172427243724472457246724772487249725072517252725372547255725672577258725972607261726272637264726572667267726872697270727172727273727472757276727
77278727972807281728272837284728572867287728872897290729172927293729472957296729772987299730073017302730373047305730673077308730973107311731273137314731573167317731873197320732173227323732473257326732773287329733073317332733373347335733673377338733973407341734273437344734573467347734873497350735173527353735473557356735773587359736073617362736373647365736673677368736973707371737273737374737573767377737873797380738173827383738473857386738773887389739073917392739373947395739673977398739974007401740274037404740574067407740874097410741174127413741474157416741774187419742074217422742374247425742674277428742974307431743274337434743574367437743874397440744174427443744474457446744774487449745074517452745374547455745674577458745974607461746274637464746574667467746874697470747174727473747474757476747774787479748074817482748374847485748674877488748974907491749274937494749574967497749874997500750175027503750475057506750775087509751075117512751375147515751675177518751975207521752275237524752575267527752875297530753175327533753475357536753775387539754075417542754375447545754675477548754975507551755275537554755575567557755875597560756175627563756475657566756775687569757075717572757375747575757675777578757975807581758275837584758575867587758875897590759175927593759475957596759775987599760076017602760376047605760676077608760976107611761276137614761576167617761876197620762176227623762476257626762776287629763076317632763376347635763676377638763976407641764276437644764576467647764876497650765176527653765476557656765776587659766076617662766376647665766676677668766976707671767276737674767576767677767876797680768176827683768476857686768776887689769076917692769376947695769676977698769977007701770277037704770577067707770877097710771177127713771477157716771777187719772077217722772377247725772677277728772977307731773277337734773577367737773877397740774177427743774477457746774777487749775077517752775377547755775677577758775977607761776277637764776577667767776877697770777177727773777477757776777
77778777977807781778277837784778577867787778877897790779177927793779477957796779777987799780078017802780378047805780678077808780978107811781278137814781578167817781878197820782178227823782478257826782778287829783078317832783378347835783678377838783978407841784278437844784578467847784878497850785178527853785478557856785778587859786078617862786378647865786678677868786978707871787278737874787578767877787878797880788178827883788478857886788778887889789078917892789378947895789678977898789979007901790279037904790579067907790879097910791179127913791479157916791779187919792079217922792379247925792679277928792979307931793279337934793579367937793879397940794179427943794479457946794779487949795079517952795379547955795679577958795979607961796279637964796579667967796879697970797179727973797479757976797779787979798079817982798379847985798679877988798979907991799279937994799579967997799879998000800180028003800480058006800780088009801080118012801380148015801680178018801980208021802280238024802580268027802880298030803180328033803480358036803780388039804080418042804380448045804680478048804980508051805280538054805580568057805880598060806180628063806480658066806780688069807080718072807380748075807680778078807980808081808280838084808580868087808880898090809180928093809480958096809780988099810081018102810381048105810681078108810981108111811281138114811581168117811881198120812181228123812481258126812781288129813081318132813381348135813681378138813981408141814281438144814581468147814881498150815181528153815481558156815781588159816081618162816381648165816681678168816981708171817281738174817581768177817881798180818181828183818481858186818781888189819081918192819381948195819681978198819982008201820282038204820582068207820882098210821182128213821482158216821782188219822082218222822382248225822682278228822982308231823282338234823582368237823882398240824182428243824482458246824782488249825082518252825382548255825682578258825982608261826282638264826582668267826882698270827182728273827482758276827
78278827982808281828282838284828582868287828882898290829182928293829482958296829782988299830083018302830383048305830683078308830983108311831283138314831583168317831883198320832183228323832483258326832783288329833083318332833383348335833683378338833983408341834283438344834583468347834883498350835183528353835483558356835783588359836083618362836383648365836683678368836983708371837283738374837583768377837883798380838183828383838483858386838783888389839083918392839383948395839683978398839984008401840284038404840584068407840884098410841184128413841484158416841784188419842084218422842384248425842684278428842984308431843284338434843584368437843884398440844184428443844484458446844784488449845084518452845384548455845684578458845984608461846284638464846584668467846884698470847184728473847484758476847784788479848084818482848384848485848684878488848984908491849284938494849584968497849884998500850185028503850485058506850785088509851085118512851385148515851685178518851985208521852285238524852585268527852885298530853185328533853485358536853785388539854085418542854385448545854685478548854985508551855285538554855585568557855885598560856185628563856485658566856785688569857085718572857385748575857685778578857985808581858285838584858585868587858885898590859185928593859485958596859785988599860086018602860386048605860686078608860986108611861286138614861586168617861886198620862186228623862486258626862786288629863086318632863386348635863686378638863986408641864286438644864586468647864886498650865186528653865486558656865786588659866086618662866386648665866686678668866986708671867286738674867586768677867886798680868186828683868486858686868786888689869086918692869386948695869686978698869987008701870287038704870587068707870887098710871187128713871487158716871787188719872087218722872387248725872687278728872987308731873287338734873587368737873887398740874187428743874487458746874787488749875087518752875387548755875687578758875987608761876287638764876587668767876887698770877187728773877487758776877
78778877987808781878287838784878587868787878887898790879187928793879487958796879787988799880088018802880388048805880688078808880988108811881288138814881588168817881888198820882188228823882488258826882788288829883088318832883388348835883688378838883988408841884288438844884588468847884888498850885188528853885488558856885788588859886088618862886388648865886688678868886988708871887288738874887588768877887888798880888188828883888488858886888788888889889088918892889388948895889688978898889989008901890289038904890589068907890889098910891189128913891489158916891789188919892089218922892389248925892689278928892989308931893289338934893589368937893889398940894189428943894489458946894789488949895089518952895389548955895689578958895989608961896289638964896589668967896889698970897189728973897489758976897789788979898089818982898389848985898689878988898989908991899289938994899589968997899889999000900190029003900490059006900790089009901090119012901390149015901690179018901990209021902290239024902590269027902890299030903190329033903490359036903790389039904090419042904390449045904690479048904990509051905290539054905590569057905890599060906190629063906490659066906790689069907090719072907390749075907690779078907990809081908290839084908590869087908890899090909190929093909490959096909790989099910091019102910391049105910691079108910991109111911291139114911591169117911891199120912191229123912491259126912791289129913091319132913391349135913691379138913991409141914291439144914591469147914891499150915191529153915491559156915791589159916091619162916391649165916691679168916991709171917291739174917591769177917891799180918191829183918491859186918791889189919091919192919391949195919691979198919992009201920292039204920592069207920892099210921192129213921492159216921792189219922092219222922392249225922692279228922992309231923292339234923592369237923892399240924192429243924492459246924792489249925092519252925392549255925692579258925992609261926292639264926592669267926892699270927192729273927492759276927
79278927992809281928292839284928592869287928892899290929192929293929492959296929792989299930093019302930393049305930693079308930993109311931293139314931593169317931893199320932193229323932493259326932793289329933093319332933393349335933693379338933993409341934293439344934593469347934893499350935193529353935493559356935793589359936093619362936393649365936693679368936993709371937293739374937593769377937893799380938193829383938493859386938793889389939093919392939393949395939693979398939994009401940294039404940594069407940894099410941194129413941494159416941794189419942094219422942394249425942694279428942994309431943294339434943594369437943894399440944194429443944494459446944794489449945094519452945394549455945694579458945994609461946294639464946594669467946894699470947194729473947494759476947794789479948094819482948394849485948694879488948994909491949294939494949594969497949894999500950195029503950495059506950795089509951095119512951395149515951695179518951995209521952295239524952595269527952895299530953195329533953495359536953795389539954095419542954395449545954695479548954995509551955295539554955595569557955895599560956195629563956495659566956795689569957095719572957395749575957695779578957995809581958295839584958595869587958895899590959195929593959495959596959795989599960096019602960396049605960696079608960996109611961296139614961596169617961896199620962196229623962496259626962796289629963096319632963396349635963696379638963996409641964296439644964596469647964896499650965196529653965496559656965796589659966096619662966396649665966696679668966996709671967296739674967596769677967896799680968196829683968496859686968796889689969096919692969396949695969696979698969997009701970297039704970597069707970897099710971197129713971497159716971797189719972097219722972397249725972697279728972997309731973297339734973597369737973897399740974197429743974497459746974797489749975097519752975397549755975697579758975997609761976297639764976597669767976897699770977197729773977497759776977
79778977997809781978297839784978597869787978897899790979197929793979497959796979797989799980098019802980398049805980698079808980998109811981298139814981598169817981898199820982198229823982498259826982798289829983098319832983398349835983698379838983998409841984298439844984598469847984898499850985198529853985498559856985798589859986098619862986398649865986698679868986998709871987298739874987598769877987898799880988198829883988498859886988798889889989098919892989398949895989698979898989999009901990299039904990599069907990899099910991199129913991499159916991799189919992099219922992399249925992699279928992999309931993299339934993599369937993899399940994199429943994499459946994799489949995099519952995399549955995699579958995999609961996299639964996599669967996899699970997199729973997499759976997799789979998099819982998399849985998699879988998999909991999299939994999599969997999899991000010001100021000310004100051000610007100081000910010100111001210013100141001510016100171001810019100201002110022100231002410025100261002710028100291003010031100321003310034100351003610037100381003910040100411004210043100441004510046100471004810049100501005110052100531005410055100561005710058100591006010061100621006310064100651006610067100681006910070100711007210073100741007510076100771007810079100801008110082100831008410085100861008710088100891009010091100921009310094100951009610097100981009910100101011010210103101041010510106101071010810109101101011110112101131011410115101161011710118101191012010121101221012310124101251012610127101281012910130101311013210133101341013510136101371013810139101401014110142101431014410145101461014710148101491015010151101521015310154101551015610157101581015910160101611016210163101641016510166101671016810169101701017110172101731017410175101761017710178101791018010181101821018310184101851018610187101881018910190101911019210193101941019510196101971019810199102001020110202102031020410205102061020710208102091021010211102121021310214102151021610217102181021910220102211
02221022310224102251022610227102281022910230102311023210233102341023510236102371023810239102401024110242102431024410245102461024710248102491025010251102521025310254102551025610257102581025910260102611026210263102641026510266102671026810269102701027110272102731027410275102761027710278102791028010281102821028310284102851028610287102881028910290102911029210293102941029510296102971029810299103001030110302103031030410305103061030710308103091031010311103121031310314103151031610317103181031910320103211032210323103241032510326103271032810329103301033110332103331033410335103361033710338103391034010341103421034310344103451034610347103481034910350103511035210353103541035510356103571035810359103601036110362103631036410365103661036710368103691037010371103721037310374103751037610377103781037910380103811038210383103841038510386103871038810389103901039110392103931039410395103961039710398103991040010401104021040310404104051040610407104081040910410104111041210413104141041510416104171041810419104201042110422104231042410425104261042710428104291043010431104321043310434104351043610437104381043910440104411044210443104441044510446104471044810449104501045110452104531045410455104561045710458104591046010461104621046310464104651046610467104681046910470104711047210473104741047510476104771047810479104801048110482104831048410485104861048710488104891049010491104921049310494104951049610497104981049910500105011050210503105041050510506105071050810509105101051110512105131051410515105161051710518105191052010521105221052310524105251052610527105281052910530105311053210533105341053510536105371053810539105401054110542105431054410545105461054710548105491055010551105521055310554105551055610557105581055910560105611056210563105641056510566105671056810569105701057110572105731057410575105761057710578105791058010581105821058310584105851058610587105881058910590105911059210593105941059510596105971059810599106001060110602106031060410605106061060710608106091061010611106121061310614106151061610617106181061910620106211
06221062310624106251062610627106281062910630106311063210633106341063510636106371063810639106401064110642106431064410645106461064710648106491065010651106521065310654106551065610657106581065910660106611066210663106641066510666106671066810669106701067110672106731067410675106761067710678106791068010681106821068310684106851068610687106881068910690106911069210693106941069510696106971069810699107001070110702107031070410705107061070710708107091071010711107121071310714107151071610717107181071910720107211072210723107241072510726107271072810729107301073110732107331073410735107361073710738107391074010741107421074310744107451074610747107481074910750107511075210753107541075510756107571075810759107601076110762107631076410765107661076710768107691077010771107721077310774107751077610777107781077910780107811078210783107841078510786107871078810789107901079110792107931079410795107961079710798107991080010801108021080310804108051080610807108081080910810108111081210813108141081510816108171081810819108201082110822108231082410825108261082710828108291083010831108321083310834108351083610837108381083910840108411084210843108441084510846108471084810849108501085110852108531085410855108561085710858108591086010861108621086310864108651086610867108681086910870108711087210873108741087510876108771087810879108801088110882108831088410885108861088710888108891089010891108921089310894108951089610897108981089910900109011090210903109041090510906109071090810909109101091110912109131091410915109161091710918109191092010921109221092310924109251092610927109281092910930109311093210933109341093510936109371093810939109401094110942109431094410945109461094710948109491095010951109521095310954109551095610957109581095910960109611096210963109641096510966109671096810969109701097110972109731097410975109761097710978109791098010981109821098310984109851098610987109881098910990109911099210993109941099510996109971099810999110001100111002110031100411005110061100711008110091101011011110121101311014110151101611017110181101911020110211
10221102311024110251102611027110281102911030110311103211033110341103511036110371103811039110401104111042110431104411045110461104711048110491105011051110521105311054110551105611057110581105911060110611106211063110641106511066110671106811069110701107111072110731107411075110761107711078110791108011081110821108311084110851108611087110881108911090110911109211093110941109511096110971109811099111001110111102111031110411105111061110711108111091111011111111121111311114111151111611117111181111911120111211112211123111241112511126111271112811129111301113111132111331113411135111361113711138111391114011141111421114311144111451114611147111481114911150111511115211153111541115511156111571115811159111601116111162111631116411165111661116711168111691117011171111721117311174111751117611177111781117911180111811118211183111841118511186111871118811189111901119111192111931119411195111961119711198111991120011201112021120311204112051120611207112081120911210112111121211213112141121511216112171121811219112201122111222112231122411225112261122711228112291123011231112321123311234112351123611237112381123911240112411124211243112441124511246112471124811249112501125111252112531125411255112561125711258112591126011261112621126311264112651126611267112681126911270112711127211273112741127511276112771127811279112801128111282112831128411285112861128711288112891129011291112921129311294112951129611297112981129911300113011130211303113041130511306113071130811309113101131111312113131131411315113161131711318113191132011321113221132311324113251132611327113281132911330113311133211333113341133511336113371133811339113401134111342113431134411345113461134711348113491135011351113521135311354113551135611357113581135911360113611136211363113641136511366113671136811369113701137111372113731137411375113761137711378113791138011381113821138311384113851138611387113881138911390113911139211393113941139511396113971139811399114001140111402114031140411405114061140711408114091141011411114121141311414114151141611417114181141911420114211
14221142311424114251142611427114281142911430114311143211433114341143511436114371143811439114401144111442114431144411445114461144711448114491145011451114521145311454114551145611457114581145911460114611146211463114641146511466114671146811469114701147111472114731147411475114761147711478114791148011481114821148311484114851148611487114881148911490114911149211493114941149511496114971149811499115001150111502115031150411505115061150711508115091151011511115121151311514115151151611517115181151911520115211152211523115241152511526115271152811529115301153111532115331153411535115361153711538115391154011541115421154311544115451154611547115481154911550115511155211553115541155511556115571155811559115601156111562115631156411565115661156711568115691157011571115721157311574115751157611577115781157911580115811158211583115841158511586115871158811589115901159111592115931159411595115961159711598115991160011601116021160311604116051160611607116081160911610116111161211613116141161511616116171161811619116201162111622116231162411625116261162711628116291163011631116321163311634116351163611637116381163911640116411164211643116441164511646116471164811649116501165111652116531165411655116561165711658116591166011661116621166311664116651166611667116681166911670116711167211673116741167511676116771167811679116801168111682116831168411685116861168711688116891169011691116921169311694116951169611697116981169911700117011170211703117041170511706117071170811709117101171111712117131171411715117161171711718117191172011721117221172311724117251172611727117281172911730117311173211733117341173511736117371173811739117401174111742117431174411745117461174711748117491175011751117521175311754117551175611757117581175911760117611176211763117641176511766117671176811769117701177111772117731177411775117761177711778117791178011781117821178311784117851178611787117881178911790117911179211793117941179511796117971179811799118001180111802118031180411805118061180711808118091181011811118121181311814118151181611817118181181911820118211
18221182311824118251182611827118281182911830118311183211833118341183511836118371183811839118401184111842118431184411845118461184711848118491185011851118521185311854118551185611857118581185911860118611186211863118641186511866118671186811869118701187111872118731187411875118761187711878118791188011881118821188311884118851188611887118881188911890118911189211893118941189511896118971189811899119001190111902119031190411905119061190711908119091191011911119121191311914119151191611917119181191911920119211192211923119241192511926119271192811929119301193111932119331193411935119361193711938119391194011941119421194311944119451194611947119481194911950119511195211953119541195511956119571195811959119601196111962119631196411965119661196711968119691197011971119721197311974119751197611977119781197911980119811198211983119841198511986119871198811989119901199111992119931199411995119961199711998119991200012001120021200312004120051200612007120081200912010120111201212013120141201512016120171201812019120201202112022120231202412025120261202712028120291203012031120321203312034120351203612037120381203912040120411204212043120441204512046120471204812049120501205112052120531205412055120561205712058120591206012061120621206312064120651206612067120681206912070120711207212073120741207512076120771207812079120801208112082120831208412085120861208712088120891209012091120921209312094120951209612097120981209912100121011210212103121041210512106121071210812109121101211112112121131211412115121161211712118121191212012121121221212312124121251212612127121281212912130121311213212133121341213512136121371213812139121401214112142121431214412145121461214712148121491215012151121521215312154121551215612157121581215912160121611216212163121641216512166121671216812169121701217112172121731217412175121761217712178121791218012181121821218312184121851218612187121881218912190121911219212193121941219512196121971219812199122001220112202122031220412205122061220712208122091221012211122121221312214122151221612217122181221912220122211
22221222312224122251222612227122281222912230122311223212233122341223512236122371223812239122401224112242122431224412245122461224712248122491225012251122521225312254122551225612257122581225912260122611226212263122641226512266122671226812269122701227112272122731227412275122761227712278122791228012281122821228312284122851228612287122881228912290122911229212293122941229512296122971229812299123001230112302123031230412305123061230712308123091231012311123121231312314123151231612317123181231912320123211232212323123241232512326123271232812329123301233112332123331233412335123361233712338123391234012341123421234312344123451234612347123481234912350123511235212353123541235512356123571235812359123601236112362123631236412365123661236712368123691237012371123721237312374123751237612377123781237912380123811238212383123841238512386123871238812389123901239112392123931239412395123961239712398123991240012401124021240312404124051240612407124081240912410124111241212413124141241512416124171241812419124201242112422124231242412425124261242712428124291243012431124321243312434124351243612437124381243912440124411244212443124441244512446124471244812449124501245112452124531245412455124561245712458124591246012461124621246312464124651246612467124681246912470124711247212473124741247512476124771247812479124801248112482124831248412485124861248712488124891249012491124921249312494124951249612497124981249912500125011250212503125041250512506125071250812509125101251112512125131251412515125161251712518125191252012521125221252312524125251252612527125281252912530125311253212533125341253512536125371253812539125401254112542125431254412545125461254712548125491255012551125521255312554125551255612557125581255912560125611256212563125641256512566125671256812569125701257112572125731257412575125761257712578125791258012581125821258312584125851258612587125881258912590125911259212593125941259512596125971259812599126001260112602126031260412605126061260712608126091261012611126121261312614126151261612617126181261912620126211
26221262312624126251262612627126281262912630126311263212633126341263512636126371263812639126401264112642126431264412645126461264712648126491265012651126521265312654126551265612657126581265912660126611266212663126641266512666126671266812669126701267112672126731267412675126761267712678126791268012681126821268312684126851268612687126881268912690126911269212693126941269512696126971269812699127001270112702127031270412705127061270712708127091271012711127121271312714127151271612717127181271912720127211272212723127241272512726127271272812729127301273112732127331273412735127361273712738127391274012741127421274312744127451274612747127481274912750127511275212753127541275512756127571275812759127601276112762127631276412765127661276712768127691277012771127721277312774127751277612777127781277912780127811278212783127841278512786127871278812789127901279112792127931279412795127961279712798127991280012801128021280312804128051280612807128081280912810128111281212813128141281512816128171281812819128201282112822128231282412825128261282712828128291283012831128321283312834128351283612837128381283912840128411284212843128441284512846128471284812849128501285112852128531285412855128561285712858128591286012861128621286312864128651286612867128681286912870128711287212873128741287512876128771287812879128801288112882128831288412885128861288712888128891289012891128921289312894128951289612897128981289912900129011290212903129041290512906129071290812909129101291112912129131291412915129161291712918129191292012921129221292312924129251292612927129281292912930129311293212933129341293512936129371293812939129401294112942129431294412945129461294712948129491295012951129521295312954129551295612957129581295912960129611296212963129641296512966129671296812969129701297112972129731297412975129761297712978129791298012981129821298312984129851298612987129881298912990129911299212993129941299512996129971299812999130001300113002130031300413005130061300713008130091301013011130121301313014130151301613017130181301913020130211
30221302313024130251302613027130281302913030130311303213033130341303513036130371303813039130401304113042130431304413045130461304713048130491305013051130521305313054130551305613057130581305913060130611306213063130641306513066130671306813069130701307113072130731307413075130761307713078130791308013081130821308313084130851308613087130881308913090130911309213093130941309513096130971309813099131001310113102131031310413105131061310713108131091311013111131121311313114131151311613117131181311913120131211312213123131241312513126131271312813129131301313113132131331313413135131361313713138131391314013141131421314313144131451314613147131481314913150131511315213153131541315513156131571315813159131601316113162131631316413165131661316713168131691317013171131721317313174131751317613177131781317913180131811318213183131841318513186131871318813189131901319113192131931319413195131961319713198131991320013201132021320313204132051320613207132081320913210132111321213213132141321513216132171321813219132201322113222132231322413225132261322713228132291323013231132321323313234132351323613237132381323913240132411324213243132441324513246132471324813249132501325113252132531325413255132561325713258132591326013261132621326313264132651326613267132681326913270132711327213273132741327513276132771327813279132801328113282132831328413285132861328713288132891329013291132921329313294132951329613297132981329913300133011330213303133041330513306133071330813309133101331113312133131331413315133161331713318133191332013321133221332313324133251332613327133281332913330133311333213333133341333513336133371333813339133401334113342133431334413345133461334713348133491335013351133521335313354133551335613357133581335913360133611336213363133641336513366133671336813369133701337113372133731337413375133761337713378133791338013381133821338313384133851338613387133881338913390133911339213393133941339513396133971339813399134001340113402134031340413405134061340713408134091341013411134121341313414134151341613417134181341913420134211
34221342313424134251342613427134281342913430134311343213433134341343513436134371343813439134401344113442134431344413445134461344713448134491345013451134521345313454134551345613457134581345913460134611346213463134641346513466134671346813469134701347113472134731347413475134761347713478134791348013481134821348313484134851348613487134881348913490134911349213493134941349513496134971349813499135001350113502135031350413505135061350713508135091351013511135121351313514135151351613517135181351913520135211352213523135241352513526135271352813529135301353113532135331353413535135361353713538135391354013541135421354313544135451354613547135481354913550135511355213553135541355513556135571355813559135601356113562135631356413565135661356713568135691357013571135721357313574135751357613577135781357913580135811358213583135841358513586135871358813589135901359113592135931359413595135961359713598135991360013601136021360313604136051360613607136081360913610136111361213613136141361513616136171361813619136201362113622136231362413625136261362713628136291363013631136321363313634136351363613637136381363913640136411364213643136441364513646136471364813649136501365113652136531365413655136561365713658136591366013661136621366313664136651366613667136681366913670136711367213673136741367513676136771367813679136801368113682136831368413685136861368713688136891369013691136921369313694136951369613697136981369913700137011370213703137041370513706137071370813709137101371113712137131371413715137161371713718137191372013721137221372313724137251372613727137281372913730137311373213733137341373513736137371373813739137401374113742137431374413745137461374713748137491375013751137521375313754137551375613757137581375913760137611376213763137641376513766137671376813769137701377113772137731377413775137761377713778137791378013781137821378313784137851378613787137881378913790137911379213793137941379513796137971379813799138001380113802138031380413805138061380713808138091381013811138121381313814138151381613817138181381913820138211
38221382313824138251382613827138281382913830138311383213833138341383513836138371383813839138401384113842138431384413845138461384713848138491385013851138521385313854138551385613857138581385913860138611386213863138641386513866138671386813869138701387113872138731387413875138761387713878138791388013881138821388313884138851388613887138881388913890138911389213893138941389513896138971389813899139001390113902139031390413905139061390713908139091391013911139121391313914139151391613917139181391913920139211392213923139241392513926139271392813929139301393113932139331393413935139361393713938139391394013941139421394313944139451394613947139481394913950139511395213953139541395513956139571395813959139601396113962139631396413965139661396713968139691397013971139721397313974139751397613977139781397913980139811398213983139841398513986139871398813989139901399113992139931399413995139961399713998139991400014001140021400314004140051400614007140081400914010140111401214013140141401514016140171401814019140201402114022140231402414025140261402714028140291403014031140321403314034140351403614037140381403914040140411404214043140441404514046140471404814049140501405114052140531405414055140561405714058140591406014061140621406314064140651406614067140681406914070140711407214073140741407514076140771407814079140801408114082140831408414085140861408714088140891409014091140921409314094140951409614097140981409914100141011410214103141041410514106141071410814109141101411114112141131411414115141161411714118141191412014121141221412314124141251412614127141281412914130141311413214133141341413514136141371413814139141401414114142141431414414145141461414714148141491415014151141521415314154141551415614157141581415914160141611416214163141641416514166141671416814169141701417114172141731417414175141761417714178141791418014181141821418314184141851418614187141881418914190141911419214193141941419514196141971419814199142001420114202142031420414205142061420714208142091421014211142121421314214142151421614217142181421914220142211
42221422314224142251422614227142281422914230142311423214233142341423514236142371423814239142401424114242142431424414245142461424714248142491425014251142521425314254142551425614257142581425914260142611426214263142641426514266142671426814269142701427114272142731427414275142761427714278142791428014281142821428314284142851428614287142881428914290142911429214293142941429514296142971429814299143001430114302143031430414305143061430714308143091431014311143121431314314143151431614317143181431914320143211432214323143241432514326143271432814329143301433114332143331433414335143361433714338143391434014341143421434314344143451434614347143481434914350143511435214353143541435514356143571435814359143601436114362143631436414365143661436714368143691437014371143721437314374143751437614377143781437914380143811438214383143841438514386143871438814389143901439114392143931439414395143961439714398143991440014401144021440314404144051440614407144081440914410144111441214413144141441514416144171441814419144201442114422144231442414425144261442714428144291443014431144321443314434144351443614437144381443914440144411444214443144441444514446144471444814449144501445114452144531445414455144561445714458144591446014461144621446314464144651446614467144681446914470144711447214473144741447514476144771447814479144801448114482144831448414485144861448714488144891449014491144921449314494144951449614497144981449914500145011450214503145041450514506145071450814509145101451114512145131451414515145161451714518145191452014521145221452314524145251452614527145281452914530145311453214533145341453514536145371453814539145401454114542145431454414545145461454714548145491455014551145521455314554145551455614557145581455914560145611456214563145641456514566145671456814569145701457114572145731457414575145761457714578145791458014581145821458314584145851458614587145881458914590145911459214593145941459514596145971459814599146001460114602146031460414605146061460714608146091461014611146121461314614146151461614617146181461914620146211
46221462314624146251462614627146281462914630146311463214633146341463514636146371463814639146401464114642146431464414645146461464714648146491465014651146521465314654146551465614657146581465914660146611466214663146641466514666146671466814669146701467114672146731467414675146761467714678146791468014681146821468314684146851468614687146881468914690146911469214693146941469514696146971469814699147001470114702147031470414705147061470714708147091471014711147121471314714147151471614717147181471914720147211472214723147241472514726147271472814729147301473114732147331473414735147361473714738147391474014741147421474314744147451474614747147481474914750147511475214753147541475514756147571475814759147601476114762147631476414765147661476714768147691477014771147721477314774147751477614777147781477914780147811478214783147841478514786147871478814789147901479114792147931479414795147961479714798147991480014801148021480314804148051480614807148081480914810148111481214813148141481514816148171481814819148201482114822148231482414825148261482714828148291483014831148321483314834148351483614837148381483914840148411484214843148441484514846148471484814849148501485114852148531485414855148561485714858148591486014861148621486314864148651486614867148681486914870148711487214873148741487514876148771487814879148801488114882148831488414885148861488714888148891489014891148921489314894148951489614897148981489914900149011490214903149041490514906149071490814909149101491114912149131491414915149161491714918149191492014921149221492314924149251492614927149281492914930149311493214933149341493514936149371493814939149401494114942149431494414945149461494714948149491495014951149521495314954149551495614957149581495914960149611496214963149641496514966149671496814969149701497114972149731497414975149761497714978149791498014981149821498314984149851498614987149881498914990149911499214993149941499514996149971499814999150001500115002150031500415005150061500715008150091501015011150121501315014150151501615017150181501915020150211
50221502315024150251502615027150281502915030150311503215033150341503515036150371503815039150401504115042150431504415045150461504715048150491505015051150521505315054150551505615057150581505915060150611506215063150641506515066150671506815069150701507115072150731507415075150761507715078150791508015081150821508315084150851508615087150881508915090150911509215093150941509515096150971509815099151001510115102151031510415105151061510715108151091511015111151121511315114151151511615117151181511915120151211512215123151241512515126151271512815129151301513115132151331513415135151361513715138151391514015141151421514315144151451514615147151481514915150151511515215153151541515515156151571515815159151601516115162151631516415165151661516715168151691517015171151721517315174151751517615177151781517915180151811518215183151841518515186151871518815189151901519115192151931519415195151961519715198151991520015201152021520315204152051520615207152081520915210152111521215213152141521515216152171521815219152201522115222152231522415225152261522715228152291523015231152321523315234152351523615237152381523915240152411524215243152441524515246152471524815249152501525115252152531525415255152561525715258152591526015261152621526315264152651526615267152681526915270152711527215273152741527515276152771527815279152801528115282152831528415285152861528715288152891529015291152921529315294152951529615297152981529915300153011530215303153041530515306153071530815309153101531115312153131531415315153161531715318153191532015321153221532315324153251532615327153281532915330153311533215333153341533515336153371533815339153401534115342153431534415345153461534715348153491535015351153521535315354153551535615357153581535915360153611536215363153641536515366153671536815369153701537115372153731537415375153761537715378153791538015381153821538315384153851538615387153881538915390153911539215393153941539515396153971539815399154001540115402154031540415405154061540715408154091541015411154121541315414154151541615417154181541915420154211
54221542315424154251542615427154281542915430154311543215433154341543515436154371543815439154401544115442154431544415445154461544715448154491545015451154521545315454154551545615457154581545915460154611546215463154641546515466154671546815469154701547115472154731547415475154761547715478154791548015481154821548315484154851548615487154881548915490154911549215493154941549515496154971549815499155001550115502155031550415505155061550715508155091551015511155121551315514155151551615517155181551915520155211552215523155241552515526155271552815529155301553115532155331553415535155361553715538155391554015541155421554315544155451554615547155481554915550155511555215553155541555515556155571555815559155601556115562155631556415565155661556715568155691557015571155721557315574155751557615577155781557915580155811558215583155841558515586155871558815589155901559115592155931559415595155961559715598155991560015601156021560315604156051560615607156081560915610156111561215613156141561515616156171561815619156201562115622156231562415625156261562715628156291563015631156321563315634156351563615637156381563915640156411564215643156441564515646156471564815649156501565115652156531565415655156561565715658156591566015661156621566315664156651566615667156681566915670156711567215673156741567515676156771567815679156801568115682156831568415685156861568715688156891569015691156921569315694156951569615697156981569915700157011570215703157041570515706157071570815709157101571115712157131571415715157161571715718157191572015721157221572315724157251572615727157281572915730157311573215733157341573515736157371573815739157401574115742157431574415745157461574715748157491575015751157521575315754157551575615757157581575915760157611576215763157641576515766157671576815769157701577115772157731577415775157761577715778157791578015781157821578315784157851578615787157881578915790157911579215793157941579515796157971579815799158001580115802158031580415805158061580715808158091581015811158121581315814158151581615817158181581915820158211
58221582315824158251582615827158281582915830158311583215833158341583515836158371583815839158401584115842158431584415845158461584715848158491585015851158521585315854158551585615857158581585915860158611586215863158641586515866158671586815869158701587115872158731587415875158761587715878158791588015881158821588315884158851588615887158881588915890158911589215893158941589515896158971589815899159001590115902159031590415905159061590715908159091591015911159121591315914159151591615917159181591915920159211592215923159241592515926159271592815929159301593115932159331593415935159361593715938159391594015941159421594315944159451594615947159481594915950159511595215953159541595515956159571595815959159601596115962159631596415965159661596715968159691597015971159721597315974159751597615977159781597915980159811598215983159841598515986159871598815989159901599115992159931599415995159961599715998159991600016001160021600316004160051600616007160081600916010160111601216013160141601516016160171601816019160201602116022160231602416025160261602716028160291603016031160321603316034160351603616037160381603916040160411604216043160441604516046160471604816049160501605116052160531605416055160561605716058160591606016061160621606316064160651606616067160681606916070160711607216073160741607516076160771607816079160801608116082160831608416085160861608716088160891609016091160921609316094160951609616097160981609916100161011610216103161041610516106161071610816109161101611116112161131611416115161161611716118161191612016121161221612316124161251612616127161281612916130161311613216133161341613516136161371613816139161401614116142161431614416145161461614716148161491615016151161521615316154161551615616157161581615916160161611616216163161641616516166161671616816169161701617116172161731617416175161761617716178161791618016181161821618316184161851618616187161881618916190161911619216193161941619516196161971619816199162001620116202162031620416205162061620716208162091621016211162121621316214162151621616217162181621916220162211
62221622316224162251622616227162281622916230162311623216233162341623516236162371623816239162401624116242162431624416245162461624716248162491625016251162521625316254162551625616257162581625916260162611626216263162641626516266162671626816269162701627116272162731627416275162761627716278162791628016281162821628316284162851628616287162881628916290162911629216293162941629516296162971629816299163001630116302163031630416305163061630716308163091631016311163121631316314163151631616317163181631916320163211632216323163241632516326163271632816329163301633116332163331633416335163361633716338163391634016341163421634316344163451634616347163481634916350163511635216353163541635516356163571635816359163601636116362163631636416365163661636716368163691637016371163721637316374163751637616377163781637916380163811638216383163841638516386163871638816389163901639116392163931639416395163961639716398163991640016401164021640316404164051640616407164081640916410164111641216413164141641516416164171641816419164201642116422164231642416425164261642716428164291643016431164321643316434164351643616437164381643916440164411644216443164441644516446164471644816449164501645116452164531645416455164561645716458164591646016461164621646316464164651646616467164681646916470164711647216473164741647516476164771647816479164801648116482164831648416485164861648716488164891649016491164921649316494164951649616497164981649916500165011650216503165041650516506165071650816509165101651116512165131651416515165161651716518165191652016521165221652316524165251652616527165281652916530165311653216533165341653516536165371653816539165401654116542165431654416545165461654716548165491655016551165521655316554165551655616557165581655916560165611656216563165641656516566165671656816569165701657116572165731657416575165761657716578165791658016581165821658316584165851658616587165881658916590165911659216593165941659516596165971659816599166001660116602166031660416605166061660716608166091661016611166121661316614166151661616617166181661916620166211
66221662316624166251662616627166281662916630166311663216633166341663516636166371663816639166401664116642166431664416645166461664716648166491665016651166521665316654166551665616657166581665916660166611666216663166641666516666166671666816669166701667116672166731667416675166761667716678166791668016681166821668316684166851668616687166881668916690166911669216693166941669516696166971669816699167001670116702167031670416705167061670716708167091671016711167121671316714167151671616717167181671916720167211672216723167241672516726167271672816729167301673116732167331673416735167361673716738167391674016741167421674316744167451674616747167481674916750167511675216753167541675516756167571675816759167601676116762167631676416765167661676716768167691677016771167721677316774167751677616777167781677916780167811678216783167841678516786167871678816789167901679116792167931679416795167961679716798167991680016801168021680316804168051680616807168081680916810168111681216813168141681516816168171681816819168201682116822168231682416825168261682716828168291683016831168321683316834168351683616837168381683916840168411684216843168441684516846168471684816849168501685116852168531685416855168561685716858168591686016861168621686316864168651686616867168681686916870168711687216873168741687516876168771687816879168801688116882168831688416885168861688716888168891689016891168921689316894168951689616897168981689916900169011690216903169041690516906169071690816909169101691116912169131691416915169161691716918169191692016921169221692316924169251692616927169281692916930169311693216933169341693516936169371693816939169401694116942169431694416945169461694716948169491695016951169521695316954169551695616957169581695916960169611696216963169641696516966169671696816969169701697116972169731697416975169761697716978169791698016981169821698316984169851698616987169881698916990169911699216993169941699516996169971699816999170001700117002170031700417005170061700717008170091701017011170121701317014170151701617017170181701917020170211
70221702317024170251702617027170281702917030170311703217033170341703517036170371703817039170401704117042170431704417045170461704717048170491705017051170521705317054170551705617057170581705917060170611706217063170641706517066170671706817069170701707117072170731707417075170761707717078170791708017081170821708317084170851708617087170881708917090170911709217093170941709517096170971709817099171001710117102171031710417105171061710717108171091711017111171121711317114171151711617117171181711917120171211712217123171241712517126171271712817129171301713117132171331713417135171361713717138171391714017141171421714317144171451714617147171481714917150171511715217153171541715517156171571715817159171601716117162171631716417165171661716717168171691717017171171721717317174171751717617177171781717917180171811718217183171841718517186171871718817189171901719117192171931719417195171961719717198171991720017201172021720317204172051720617207172081720917210172111721217213172141721517216172171721817219172201722117222172231722417225172261722717228172291723017231172321723317234172351723617237172381723917240172411724217243172441724517246172471724817249172501725117252172531725417255172561725717258172591726017261172621726317264172651726617267172681726917270172711727217273172741727517276172771727817279172801728117282172831728417285172861728717288172891729017291172921729317294172951729617297172981729917300173011730217303173041730517306173071730817309173101731117312173131731417315173161731717318173191732017321173221732317324173251732617327173281732917330173311733217333173341733517336173371733817339173401734117342173431734417345173461734717348173491735017351173521735317354173551735617357173581735917360173611736217363173641736517366173671736817369173701737117372173731737417375173761737717378173791738017381173821738317384173851738617387173881738917390173911739217393173941739517396173971739817399174001740117402174031740417405174061740717408174091741017411174121741317414174151741617417174181741917420174211
74221742317424174251742617427174281742917430174311743217433174341743517436174371743817439174401744117442174431744417445174461744717448174491745017451174521745317454174551745617457174581745917460174611746217463174641746517466174671746817469174701747117472174731747417475174761747717478174791748017481174821748317484174851748617487174881748917490174911749217493174941749517496174971749817499175001750117502175031750417505175061750717508175091751017511175121751317514175151751617517175181751917520175211752217523175241752517526175271752817529175301753117532175331753417535175361753717538175391754017541175421754317544175451754617547175481754917550175511755217553175541755517556175571755817559175601756117562175631756417565175661756717568175691757017571175721757317574175751757617577175781757917580175811758217583175841758517586175871758817589175901759117592175931759417595175961759717598175991760017601176021760317604176051760617607176081760917610176111761217613176141761517616176171761817619176201762117622176231762417625176261762717628176291763017631176321763317634176351763617637176381763917640176411764217643176441764517646176471764817649176501765117652176531765417655176561765717658176591766017661176621766317664176651766617667176681766917670176711767217673176741767517676176771767817679176801768117682176831768417685176861768717688176891769017691176921769317694176951769617697176981769917700177011770217703177041770517706177071770817709177101771117712177131771417715177161771717718177191772017721177221772317724177251772617727177281772917730177311773217733177341773517736177371773817739177401774117742177431774417745177461774717748177491775017751177521775317754177551775617757177581775917760177611776217763177641776517766177671776817769177701777117772177731777417775177761777717778177791778017781177821778317784177851778617787177881778917790177911779217793177941779517796177971779817799178001780117802178031780417805178061780717808178091781017811178121781317814178151781617817178181781917820178211
78221782317824178251782617827178281782917830178311783217833178341783517836178371783817839178401784117842178431784417845178461784717848178491785017851178521785317854178551785617857178581785917860178611786217863178641786517866178671786817869178701787117872178731787417875178761787717878178791788017881178821788317884178851788617887178881788917890178911789217893178941789517896178971789817899179001790117902179031790417905179061790717908179091791017911179121791317914179151791617917179181791917920179211792217923179241792517926179271792817929179301793117932179331793417935179361793717938179391794017941179421794317944179451794617947179481794917950179511795217953179541795517956179571795817959179601796117962179631796417965179661796717968179691797017971179721797317974179751797617977179781797917980179811798217983179841798517986179871798817989179901799117992179931799417995179961799717998179991800018001180021800318004180051800618007180081800918010180111801218013180141801518016180171801818019180201802118022180231802418025180261802718028180291803018031180321803318034180351803618037180381803918040180411804218043180441804518046180471804818049180501805118052180531805418055180561805718058180591806018061180621806318064180651806618067180681806918070180711807218073180741807518076180771807818079180801808118082180831808418085180861808718088180891809018091180921809318094180951809618097180981809918100181011810218103181041810518106181071810818109181101811118112181131811418115181161811718118181191812018121181221812318124181251812618127181281812918130181311813218133181341813518136181371813818139181401814118142181431814418145181461814718148181491815018151181521815318154181551815618157181581815918160181611816218163181641816518166181671816818169181701817118172181731817418175181761817718178181791818018181181821818318184181851818618187181881818918190181911819218193181941819518196181971819818199182001820118202182031820418205182061820718208182091821018211182121821318214182151821618217182181821918220182211
82221822318224182251822618227182281822918230182311823218233182341823518236182371823818239182401824118242182431824418245182461824718248182491825018251182521825318254182551825618257182581825918260182611826218263182641826518266182671826818269182701827118272182731827418275182761827718278182791828018281182821828318284182851828618287182881828918290182911829218293182941829518296182971829818299183001830118302183031830418305183061830718308183091831018311183121831318314183151831618317183181831918320183211832218323183241832518326183271832818329183301833118332183331833418335183361833718338183391834018341183421834318344183451834618347183481834918350183511835218353183541835518356183571835818359183601836118362183631836418365183661836718368183691837018371183721837318374183751837618377183781837918380183811838218383183841838518386183871838818389183901839118392183931839418395183961839718398183991840018401184021840318404184051840618407184081840918410184111841218413184141841518416184171841818419184201842118422184231842418425184261842718428184291843018431184321843318434184351843618437184381843918440184411844218443184441844518446184471844818449184501845118452184531845418455184561845718458184591846018461184621846318464184651846618467184681846918470184711847218473184741847518476184771847818479184801848118482184831848418485184861848718488184891849018491184921849318494184951849618497184981849918500185011850218503185041850518506185071850818509185101851118512185131851418515185161851718518185191852018521185221852318524185251852618527185281852918530185311853218533185341853518536185371853818539185401854118542185431854418545185461854718548185491855018551185521855318554185551855618557185581855918560185611856218563185641856518566185671856818569185701857118572185731857418575185761857718578185791858018581185821858318584185851858618587185881858918590185911859218593185941859518596185971859818599186001860118602186031860418605186061860718608186091861018611186121861318614186151861618617186181861918620186211
86221862318624186251862618627186281862918630186311863218633186341863518636186371863818639186401864118642186431864418645186461864718648186491865018651186521865318654186551865618657186581865918660186611866218663186641866518666186671866818669186701867118672186731867418675186761867718678186791868018681186821868318684186851868618687186881868918690186911869218693186941869518696186971869818699187001870118702187031870418705187061870718708187091871018711187121871318714187151871618717187181871918720187211872218723187241872518726187271872818729187301873118732187331873418735187361873718738187391874018741187421874318744187451874618747187481874918750187511875218753187541875518756187571875818759187601876118762187631876418765187661876718768187691877018771187721877318774187751877618777187781877918780187811878218783187841878518786187871878818789187901879118792187931879418795187961879718798187991880018801188021880318804188051880618807188081880918810188111881218813188141881518816188171881818819188201882118822188231882418825188261882718828188291883018831188321883318834188351883618837188381883918840188411884218843188441884518846188471884818849188501885118852188531885418855188561885718858188591886018861188621886318864188651886618867188681886918870188711887218873188741887518876188771887818879188801888118882188831888418885188861888718888188891889018891188921889318894188951889618897188981889918900189011890218903189041890518906189071890818909189101891118912189131891418915189161891718918189191892018921189221892318924189251892618927189281892918930189311893218933189341893518936189371893818939189401894118942189431894418945189461894718948189491895018951189521895318954189551895618957189581895918960189611896218963189641896518966189671896818969189701897118972189731897418975189761897718978189791898018981189821898318984189851898618987189881898918990189911899218993189941899518996189971899818999190001900119002190031900419005190061900719008190091901019011190121901319014190151901619017190181901919020190211
902219023190241902519026190271902819029190301903119032190331903419035190361903719038190391904019041190421904319044190451904619047190481904919050190511905219053190541905519056190571905819059190601906119062190631906419065190661906719068190691907019071190721907319074190751907619077190781907919080190811908219083190841908519086190871908819089190901909119092190931909419095190961909719098190991910019101191021910319104191051910619107191081910919110191111911219113191141911519116191171911819119191201912119122191231912419125191261912719128191291913019131191321913319134191351913619137191381913919140191411914219143191441914519146191471914819149191501915119152191531915419155191561915719158191591916019161191621916319164191651916619167191681916919170191711917219173191741917519176191771917819179191801918119182191831918419185191861918719188191891919019191191921919319194191951919619197191981919919200192011920219203192041920519206192071920819209192101921119212192131921419215192161921719218192191922019221192221922319224192251922619227192281922919230192311923219233192341923519236192371923819239192401924119242192431924419245192461924719248192491925019251192521925319254192551925619257192581925919260192611926219263192641926519266192671926819269192701927119272192731927419275192761927719278192791928019281192821928319284192851928619287192881928919290192911929219293192941929519296192971929819299193001930119302193031930419305193061930719308193091931019311193121931319314193151931619317193181931919320193211932219323193241932519326193271932819329193301933119332193331933419335193361933719338193391934019341193421934319344193451934619347193481934919350193511935219353193541935519356193571935819359193601936119362
  1. /* sp_int.c
  2. *
  3. * Copyright (C) 2006-2023 wolfSSL Inc.
  4. *
  5. * This file is part of wolfSSL.
  6. *
  7. * wolfSSL is free software; you can redistribute it and/or modify
  8. * it under the terms of the GNU General Public License as published by
  9. * the Free Software Foundation; either version 2 of the License, or
  10. * (at your option) any later version.
  11. *
  12. * wolfSSL is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15. * GNU General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU General Public License
  18. * along with this program; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
  20. */
  21. /* Implementation by Sean Parkinson. */
  22. /*
  23. DESCRIPTION
  24. This library provides single precision (SP) integer math functions.
  25. */
  26. #ifdef HAVE_CONFIG_H
  27. #include <config.h>
  28. #endif
  29. #include <wolfssl/wolfcrypt/settings.h>
  30. #if defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)
  31. #if (!defined(WOLFSSL_SMALL_STACK) && !defined(SP_ALLOC)) || \
  32. defined(WOLFSSL_SP_NO_MALLOC)
  33. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
  34. !defined(WOLFSSL_SP_NO_DYN_STACK)
  35. #pragma GCC diagnostic push
  36. /* We are statically declaring a variable smaller than sp_int.
  37. * We track available memory in the 'size' field.
  38. * Disable warnings of sp_int being partly outside array bounds of variable.
  39. */
  40. #pragma GCC diagnostic ignored "-Warray-bounds"
  41. #endif
  42. #endif
  43. #ifdef NO_INLINE
  44. #include <wolfssl/wolfcrypt/misc.h>
  45. #else
  46. #define WOLFSSL_MISC_INCLUDED
  47. #include <wolfcrypt/src/misc.c>
  48. #endif
  49. /* SP Build Options:
  50. * WOLFSSL_HAVE_SP_RSA: Enable SP RSA support
  51. * WOLFSSL_HAVE_SP_DH: Enable SP DH support
  52. * WOLFSSL_HAVE_SP_ECC: Enable SP ECC support
  53. * WOLFSSL_SP_MATH: Use only single precision math and algorithms
  54. * it supports (no fastmath tfm.c or normal integer.c)
  55. * WOLFSSL_SP_MATH_ALL Implementation of all MP functions
  56. * (replacement for tfm.c and integer.c)
  57. * WOLFSSL_SP_SMALL: Use smaller version of code and avoid large
  58. * stack variables
  59. * WOLFSSL_SP_NO_MALLOC: Always use stack, no heap XMALLOC/XFREE allowed
  60. * WOLFSSL_SP_NO_2048: Disable RSA/DH 2048-bit support
  61. * WOLFSSL_SP_NO_3072: Disable RSA/DH 3072-bit support
  62. * WOLFSSL_SP_4096: Enable RSA/DH 4096-bit support
  63. * WOLFSSL_SP_NO_256 Disable ECC 256-bit SECP256R1 support
  64. * WOLFSSL_SP_384 Enable ECC 384-bit SECP384R1 support
  65. * WOLFSSL_SP_521 Enable ECC 521-bit SECP521R1 support
  66. * WOLFSSL_SP_ASM Enable assembly speedups (detect platform)
  67. * WOLFSSL_SP_X86_64_ASM Enable Intel x64 assembly implementation
  68. * WOLFSSL_SP_ARM32_ASM Enable Aarch32 assembly implementation
  69. * WOLFSSL_SP_ARM64_ASM Enable Aarch64 assembly implementation
  70. * WOLFSSL_SP_ARM_CORTEX_M_ASM Enable Cortex-M assembly implementation
  71. * WOLFSSL_SP_ARM_THUMB_ASM Enable ARM Thumb assembly implementation
  72. * (used with -mthumb)
  73. * WOLFSSL_SP_X86_64 Enable Intel x86 64-bit assembly speedups
  74. * WOLFSSL_SP_X86 Enable Intel x86 assembly speedups
  75. * WOLFSSL_SP_ARM64 Enable Aarch64 assembly speedups
  76. * WOLFSSL_SP_ARM32 Enable ARM32 assembly speedups
  77. * WOLFSSL_SP_ARM32_UDIV Enable word divide asm that uses UDIV instr
  78. * WOLFSSL_SP_ARM_THUMB Enable ARM Thumb assembly speedups
  79. * (explicitly uses register 'r7')
  80. * WOLFSSL_SP_PPC64 Enable PPC64 assembly speedups
  81. * WOLFSSL_SP_PPC Enable PPC assembly speedups
  82. * WOLFSSL_SP_MIPS64 Enable MIPS64 assembly speedups
  83. * WOLFSSL_SP_MIPS Enable MIPS assembly speedups
  84. * WOLFSSL_SP_RISCV64 Enable RISCV64 assembly speedups
  85. * WOLFSSL_SP_RISCV32 Enable RISCV32 assembly speedups
  86. * WOLFSSL_SP_S390X Enable S390X assembly speedups
  87. * SP_WORD_SIZE Force 32 or 64 bit mode
  88. * WOLFSSL_SP_NONBLOCK Enables "non blocking" mode for SP math, which
  89. * will return FP_WOULDBLOCK for long operations and function must be
  90. * called again until complete.
  91. * WOLFSSL_SP_FAST_NCT_EXPTMOD Enables the faster non-constant time modular
  92. * exponentiation implementation.
  93. * WOLFSSL_SP_INT_NEGATIVE Enables negative values to be used.
  94. * WOLFSSL_SP_INT_DIGIT_ALIGN Enable when unaligned access of sp_int_digit
  95. * pointer is not allowed.
  96. * WOLFSSL_SP_NO_DYN_STACK Disable use of dynamic stack items.
  97. * Dynamic arrays used when not small stack.
  98. * WOLFSSL_SP_FAST_MODEXP Allow fast mod_exp with small C code
  99. * WOLFSSL_SP_LOW_MEM Use algorithms that use less memory.
  100. */
  101. /* TODO: WOLFSSL_SP_SMALL is incompatible with clang-12+ -Os. */
  102. #if defined(__clang__) && defined(__clang_major__) && \
  103. (__clang_major__ >= 12) && defined(WOLFSSL_SP_SMALL)
  104. #undef WOLFSSL_SP_SMALL
  105. #endif
  106. #include <wolfssl/wolfcrypt/sp_int.h>
  107. /* DECL_SP_INT: Declare one variable of type 'sp_int'. */
  108. #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
  109. !defined(WOLFSSL_SP_NO_MALLOC)
  110. /* Declare a variable that will be assigned a value on XMALLOC. */
  111. #define DECL_SP_INT(n, s) \
  112. sp_int* n = NULL
  113. #else
  114. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
  115. !defined(WOLFSSL_SP_NO_DYN_STACK)
  116. /* Declare a variable on the stack with the required data size. */
  117. #define DECL_SP_INT(n, s) \
  118. byte n##d[MP_INT_SIZEOF(s)]; \
  119. sp_int* (n) = (sp_int*)n##d
  120. #else
  121. /* Declare a variable on the stack. */
  122. #define DECL_SP_INT(n, s) \
  123. sp_int n[1]
  124. #endif
  125. #endif
  126. /* ALLOC_SP_INT: Allocate an 'sp_int' of required size. */
  127. #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
  128. !defined(WOLFSSL_SP_NO_MALLOC)
  129. /* Dynamically allocate just enough data to support size. */
  130. #define ALLOC_SP_INT(n, s, err, h) \
  131. do { \
  132. if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) { \
  133. (err) = MP_VAL; \
  134. } \
  135. if ((err) == MP_OKAY) { \
  136. (n) = (sp_int*)XMALLOC(MP_INT_SIZEOF(s), (h), \
  137. DYNAMIC_TYPE_BIGINT); \
  138. if ((n) == NULL) { \
  139. (err) = MP_MEM; \
  140. } \
  141. } \
  142. } \
  143. while (0)
  144. /* Dynamically allocate just enough data to support size - and set size. */
  145. #define ALLOC_SP_INT_SIZE(n, s, err, h) \
  146. do { \
  147. ALLOC_SP_INT(n, s, err, h); \
  148. if ((err) == MP_OKAY) { \
  149. (n)->size = (s); \
  150. } \
  151. } \
  152. while (0)
  153. #else
  154. /* Array declared on stack - check size is valid. */
  155. #define ALLOC_SP_INT(n, s, err, h) \
  156. do { \
  157. if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) { \
  158. (err) = MP_VAL; \
  159. } \
  160. } \
  161. while (0)
  162. /* Array declared on stack - set the size field. */
  163. #define ALLOC_SP_INT_SIZE(n, s, err, h) \
  164. do { \
  165. ALLOC_SP_INT(n, s, err, h); \
  166. if ((err) == MP_OKAY) { \
  167. (n)->size = (unsigned int)(s); \
  168. } \
  169. } \
  170. while (0)
  171. #endif
  172. /* FREE_SP_INT: Free an 'sp_int' variable. */
  173. #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
  174. !defined(WOLFSSL_SP_NO_MALLOC)
  175. /* Free dynamically allocated data. */
  176. #define FREE_SP_INT(n, h) \
  177. do { \
  178. if ((n) != NULL) { \
  179. XFREE(n, h, DYNAMIC_TYPE_BIGINT); \
  180. } \
  181. } \
  182. while (0)
  183. #else
  184. /* Nothing to do as declared on stack. */
  185. #define FREE_SP_INT(n, h) WC_DO_NOTHING
  186. #endif
  187. /* Declare a variable that will be assigned a value on XMALLOC. */
  188. #define DECL_DYN_SP_INT_ARRAY(n, s, c) \
  189. sp_int* n##d = NULL; \
  190. sp_int* (n)[c] = { NULL, }
  191. /* DECL_SP_INT_ARRAY: Declare array of 'sp_int'. */
  192. #if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
  193. !defined(WOLFSSL_SP_NO_MALLOC)
  194. /* Declare a variable that will be assigned a value on XMALLOC. */
  195. #define DECL_SP_INT_ARRAY(n, s, c) \
  196. DECL_DYN_SP_INT_ARRAY(n, s, c)
  197. #else
  198. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
  199. !defined(WOLFSSL_SP_NO_DYN_STACK)
  200. /* Declare a variable on the stack with the required data size. */
  201. #define DECL_SP_INT_ARRAY(n, s, c) \
  202. byte n##d[MP_INT_SIZEOF(s) * (c)]; \
  203. sp_int* (n)[c] = { NULL, }
  204. #else
  205. /* Declare a variable on the stack. */
  206. #define DECL_SP_INT_ARRAY(n, s, c) \
  207. sp_int n##d[c]; \
  208. sp_int* (n)[c]
  209. #endif
  210. #endif
  211. /* Dynamically allocate just enough data to support multiple sp_ints of the
  212. * required size. Use pointers into data to make up array and set sizes.
  213. */
  214. #define ALLOC_DYN_SP_INT_ARRAY(n, s, c, err, h) \
  215. do { \
  216. if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) { \
  217. (err) = MP_VAL; \
  218. } \
  219. if ((err) == MP_OKAY) { \
  220. n##d = (sp_int*)XMALLOC(MP_INT_SIZEOF(s) * (c), (h), \
  221. DYNAMIC_TYPE_BIGINT); \
  222. if (n##d == NULL) { \
  223. (err) = MP_MEM; \
  224. } \
  225. else { \
  226. int n##ii; \
  227. (n)[0] = n##d; \
  228. (n)[0]->size = (s); \
  229. for (n##ii = 1; n##ii < (int)(c); n##ii++) { \
  230. (n)[n##ii] = MP_INT_NEXT((n)[n##ii-1], s); \
  231. (n)[n##ii]->size = (s); \
  232. } \
  233. } \
  234. } \
  235. } \
  236. while (0)
/* ALLOC_SP_INT_ARRAY: Allocate an array of 'sp_int's of required size.
 * Resolves to heap allocation, VLA-backed stack data, or a plain stack
 * array depending on build configuration (must match DECL_SP_INT_ARRAY).
 */
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
    !defined(WOLFSSL_SP_NO_MALLOC)
#define ALLOC_SP_INT_ARRAY(n, s, c, err, h) \
    ALLOC_DYN_SP_INT_ARRAY(n, s, c, err, h)
#else
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
/* Data declared on stack that supports multiple sp_ints of the
 * required size. Use pointers into data to make up array and set sizes.
 */
#define ALLOC_SP_INT_ARRAY(n, s, c, err, h) \
do { \
    if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) { \
        (err) = MP_VAL; \
    } \
    if ((err) == MP_OKAY) { \
        int n##ii; \
        /* n##d is a raw byte buffer here; carve sp_ints out of it.
         * sp_int_minimal is used as only the header needs setting. */ \
        (n)[0] = (sp_int*)n##d; \
        ((sp_int_minimal*)(n)[0])->size = (s); \
        for (n##ii = 1; n##ii < (int)(c); n##ii++) { \
            (n)[n##ii] = MP_INT_NEXT((n)[n##ii-1], s); \
            ((sp_int_minimal*)(n)[n##ii])->size = (s); \
        } \
    } \
} \
while (0)
#else
/* Data declared on stack that supports multiple sp_ints of the
 * required size. Set into array and set sizes.
 */
#define ALLOC_SP_INT_ARRAY(n, s, c, err, h) \
do { \
    if (((err) == MP_OKAY) && ((s) > SP_INT_DIGITS)) { \
        (err) = MP_VAL; \
    } \
    if ((err) == MP_OKAY) { \
        int n##ii; \
        /* n##d is a full-size sp_int array; just take addresses. */ \
        for (n##ii = 0; n##ii < (int)(c); n##ii++) { \
            (n)[n##ii] = &n##d[n##ii]; \
            (n)[n##ii]->size = (s); \
        } \
    } \
} \
while (0)
#endif
#endif
/* Free data variable that was dynamically allocated.
 * Safe to call when allocation failed: n##d is checked for NULL.
 */
#define FREE_DYN_SP_INT_ARRAY(n, h) \
do { \
    if (n##d != NULL) { \
        XFREE(n##d, h, DYNAMIC_TYPE_BIGINT); \
    } \
} \
while (0)
/* FREE_SP_INT_ARRAY: Free an array of 'sp_int'.
 * Counterpart of ALLOC_SP_INT_ARRAY; a no-op for stack-backed builds.
 */
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
    !defined(WOLFSSL_SP_NO_MALLOC)
#define FREE_SP_INT_ARRAY(n, h) \
    FREE_DYN_SP_INT_ARRAY(n, h)
#else
/* Nothing to do as data declared on stack. */
#define FREE_SP_INT_ARRAY(n, h) WC_DO_NOTHING
#endif
#ifndef WOLFSSL_NO_ASM
/* Map the GNU-style '__asm__'/'__volatile__' keywords onto the spellings
 * these compilers actually accept, so the macros below compile unchanged.
 */
#ifdef __IAR_SYSTEMS_ICC__
#define __asm__        asm
#define __volatile__   volatile
#endif /* __IAR_SYSTEMS_ICC__ */
#ifdef __KEIL__
#define __asm__        __asm
#define __volatile__   volatile
#endif
#if defined(WOLFSSL_SP_X86_64) && SP_WORD_SIZE == 64
/*
 * CPU: x86_64
 *
 * Primitive 64x64->128 multiply/accumulate operations used by the generic
 * mul/sqr/div code. Each macro operates on a multi-word accumulator made
 * of the caller's variables (vl = low word, vh = high word, vo = overflow
 * word). rax/rdx are clobbered because mulq writes its product there.
 */
#ifndef _MSC_VER
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "movq %%rax, %[l] \n\t" \
        "movq %%rdx, %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "m" (va), [b] "m" (vb) \
        : "memory", "%rax", "%rdx", "cc" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "movq $0 , %[o] \n\t" \
        "movq %%rax, %[l] \n\t" \
        "movq %%rdx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "%rax", "%rdx", "cc" \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "adcq $0 , %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "%rax", "%rdx", "cc" \
    )
/* Multiply va by vb and add double size result into: vh | vl */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va), [b] "m" (vb) \
        : "%rax", "%rdx", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "adcq $0 , %[o] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "adcq $0 , %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "%rax", "%rdx", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movq %[b], %%rax \n\t" \
        "mulq %[a] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "adcq $0 , %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "%rax", "%rdx", "cc" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "movq %[a], %%rax \n\t" \
        "mulq %%rax \n\t" \
        "movq %%rax, %[l] \n\t" \
        "movq %%rdx, %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "m" (va) \
        : "memory", "%rax", "%rdx", "cc" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "movq %[a], %%rax \n\t" \
        "mulq %%rax \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        "adcq $0 , %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "m" (va) \
        : "%rax", "%rdx", "cc" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "movq %[a], %%rax \n\t" \
        "mulq %%rax \n\t" \
        "addq %%rax, %[l] \n\t" \
        "adcq %%rdx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "%rax", "%rdx", "cc" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addq %[a], %[l] \n\t" \
        "adcq $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "cc" \
    )
/* Add va, variable in a register, into: vh | vl */
#define SP_ASM_ADDC_REG(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addq %[a], %[l] \n\t" \
        "adcq $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subq %[a], %[l] \n\t" \
        "sbbq $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl */
#define SP_ASM_SUBB_REG(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subq %[a], %[l] \n\t" \
        "sbbq $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "addq %[a], %[l] \n\t" \
        "adcq %[b], %[h] \n\t" \
        "adcq %[c], %[o] \n\t" \
        "addq %[a], %[l] \n\t" \
        "adcq %[b], %[h] \n\t" \
        "adcq %[c], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "cc" \
    )
/* Index of highest bit set.
 * NOTE: BSR's destination is undefined when va == 0 - callers must pass a
 * non-zero value.
 */
#define SP_ASM_HI_BIT_SET_IDX(va, vi) \
    __asm__ __volatile__ ( \
        "bsr %[a], %[i] \n\t" \
        : [i] "=r" (vi) \
        : [a] "r" (va) \
        : "cc" \
    )
#else
/* MSVC has no GCC-style inline asm on x64; use compiler intrinsics that
 * mirror the semantics of the asm macros above.
 */
#include <intrin.h>
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    vl = _umul128(va, vb, &vh)
/* Multiply va by vb and store double size result in: vo | vh | vl */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
do { \
    vl = _umul128(va, vb, &vh); \
    vo = 0; \
} \
while (0)
/* Multiply va by vb and add double size result into: vo | vh | vl */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, vb, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    c = _addcarry_u64(c, vh, vth, &vh); \
    _addcarry_u64(c, vo, 0, &vo); \
} \
while (0)
/* Multiply va by vb and add double size result into: vh | vl */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, vb, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    _addcarry_u64(c, vh, vth, &vh); \
} \
while (0)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, vb, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    c = _addcarry_u64(c, vh, vth, &vh); \
    _addcarry_u64(c, vo, 0, &vo); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    c = _addcarry_u64(c, vh, vth, &vh); \
    _addcarry_u64(c, vo, 0, &vo); \
} \
while (0)
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, vb, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    _addcarry_u64(c, vh, vth, &vh); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    c = _addcarry_u64(c, vh, vth, &vh); \
    _addcarry_u64(c, vo, 0, &vo); \
} \
while (0)
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    vl = _umul128(va, va, &vh)
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, va, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    c = _addcarry_u64(c, vh, vth, &vh); \
    _addcarry_u64(c, vo, 0, &vo); \
} \
while (0)
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
do { \
    unsigned __int64 vtl, vth; \
    unsigned char c; \
    vtl = _umul128(va, va, &vth); \
    c = _addcarry_u64(0, vl, vtl, &vl); \
    _addcarry_u64(c, vh, vth, &vh); \
} \
while (0)
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
do { \
    unsigned char c; \
    c = _addcarry_u64(0, vl, va, &vl); \
    _addcarry_u64(c, vh, 0, &vh); \
} \
while (0)
/* Add va, variable in a register, into: vh | vl
 * (Same as SP_ASM_ADDC - intrinsics make no register/memory distinction.)
 */
#define SP_ASM_ADDC_REG(vl, vh, va) \
do { \
    unsigned char c; \
    c = _addcarry_u64(0, vl, va, &vl); \
    _addcarry_u64(c, vh, 0, &vh); \
} \
while (0)
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
do { \
    unsigned char c; \
    c = _subborrow_u64(0, vl, va, &vl); \
    _subborrow_u64(c, vh, 0, &vh); \
} \
while (0)
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
do { \
    unsigned char c; \
    c = _addcarry_u64(0, vl, va, &vl); \
    c = _addcarry_u64(c, vh, vb, &vh); \
    _addcarry_u64(c, vo, vc, &vo); \
    c = _addcarry_u64(0, vl, va, &vl); \
    c = _addcarry_u64(c, vh, vb, &vh); \
    _addcarry_u64(c, vo, vc, &vo); \
} \
while (0)
/* Index of highest bit set. */
#define SP_ASM_HI_BIT_SET_IDX(va, vi) \
do { \
    unsigned long idx; \
    _BitScanReverse64(&idx, va); \
    vi = idx; \
} \
while (0)
#endif
#if !defined(WOLFSSL_SP_DIV_WORD_HALF) && (!defined(_MSC_VER) || \
    _MSC_VER >= 1920)
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * Using divq instruction on Intel x64.
 *
 * NOTE(review): divq faults (#DE) if the quotient does not fit in 64 bits -
 * callers are presumably required to guarantee hi < d; confirm at call sites.
 *
 * @param [in] hi  SP integer digit. High digit of the dividend.
 * @param [in] lo  SP integer digit. Lower digit of the dividend.
 * @param [in] d   SP integer digit. Number to divide by.
 * @return The division result.
 */
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
                                          sp_int_digit d)
{
#ifndef _MSC_VER
    __asm__ __volatile__ (
        "divq %2"
        : "+a" (lo)           /* rdx:rax / d -> quotient in rax */
        : "d" (hi), "r" (d)
        : "cc"
    );
    return lo;
#elif defined(_MSC_VER) && _MSC_VER >= 1920
    /* _udiv128 introduced in VS2019 (_MSC_VER 1920); remainder discarded. */
    return _udiv128(hi, lo, d, NULL);
#endif
}
#define SP_ASM_DIV_WORD
#endif
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_X86_64 && SP_WORD_SIZE == 64 */
#if defined(WOLFSSL_SP_X86) && SP_WORD_SIZE == 32
/*
 * CPU: x86
 *
 * 32-bit counterparts of the x86_64 macros: 32x32->64 multiplies via mull,
 * with eax/edx clobbered for the product.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "movl %%eax, %[l] \n\t" \
        "movl %%edx, %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "m" (va), [b] "m" (vb) \
        : "memory", "eax", "edx", "cc" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "movl $0 , %[o] \n\t" \
        "movl %%eax, %[l] \n\t" \
        "movl %%edx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "eax", "edx", "cc" \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl
 * "+rm" outputs relieve register pressure on 32-bit x86.
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "adcl $0 , %[o] \n\t" \
        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "eax", "edx", "cc" \
    )
/* Multiply va by vb and add double size result into: vh | vl */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va), [b] "m" (vb) \
        : "eax", "edx", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "adcl $0 , %[o] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "adcl $0 , %[o] \n\t" \
        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "eax", "edx", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "movl %[b], %%eax \n\t" \
        "mull %[a] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "adcl $0 , %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "m" (va), [b] "m" (vb) \
        : "eax", "edx", "cc" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "movl %[a], %%eax \n\t" \
        "mull %%eax \n\t" \
        "movl %%eax, %[l] \n\t" \
        "movl %%edx, %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "m" (va) \
        : "memory", "eax", "edx", "cc" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "movl %[a], %%eax \n\t" \
        "mull %%eax \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        "adcl $0 , %[o] \n\t" \
        : [l] "+rm" (vl), [h] "+rm" (vh), [o] "+rm" (vo) \
        : [a] "m" (va) \
        : "eax", "edx", "cc" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "movl %[a], %%eax \n\t" \
        "mull %%eax \n\t" \
        "addl %%eax, %[l] \n\t" \
        "adcl %%edx, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "eax", "edx", "cc" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addl %[a], %[l] \n\t" \
        "adcl $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "cc" \
    )
/* Add va, variable in a register, into: vh | vl */
#define SP_ASM_ADDC_REG(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addl %[a], %[l] \n\t" \
        "adcl $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subl %[a], %[l] \n\t" \
        "sbbl $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "m" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl */
#define SP_ASM_SUBB_REG(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subl %[a], %[l] \n\t" \
        "sbbl $0 , %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "addl %[a], %[l] \n\t" \
        "adcl %[b], %[h] \n\t" \
        "adcl %[c], %[o] \n\t" \
        "addl %[a], %[l] \n\t" \
        "adcl %[b], %[h] \n\t" \
        "adcl %[c], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "cc" \
    )
  808. /* Index of highest bit set. */
  809. #define SP_ASM_HI_BIT_SET_IDX(va, vi) \
  810. __asm__ __volatile__ ( \
  811. "bsr %[a], %[i] \n\t" \
  812. : [i] "=r" (vi) \
  813. : [a] "r" (va) \
  814. : "cC" \
  815. )
#ifndef WOLFSSL_SP_DIV_WORD_HALF
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * Using divl instruction on Intel x64.
 *
 * NOTE(review): divl faults (#DE) if the quotient does not fit in 32 bits -
 * callers are presumably required to guarantee hi < d; confirm at call sites.
 *
 * @param [in] hi  SP integer digit. High digit of the dividend.
 * @param [in] lo  SP integer digit. Lower digit of the dividend.
 * @param [in] d   SP integer digit. Number to divide by.
 * @return The division result.
 */
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
                                          sp_int_digit d)
{
    __asm__ __volatile__ (
        "divl %2"
        : "+a" (lo)           /* edx:eax / d -> quotient in eax */
        : "d" (hi), "r" (d)
        : "cc"
    );
    return lo;
}
#define SP_ASM_DIV_WORD
#endif
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_X86 && SP_WORD_SIZE == 32 */
#if defined(WOLFSSL_SP_ARM64) && SP_WORD_SIZE == 64
/*
 * CPU: Aarch64
 *
 * 64x64->128 multiplies are built from mul (low half) and umulh (high
 * half); temporaries use x8/x9 which are listed as clobbers.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mul %[l], %[a], %[b] \n\t" \
        "umulh %[h], %[a], %[b] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va), [b] "r" (vb) \
        : "memory", "cc" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[b] \n\t" \
        "umulh %[h], %[a], %[b] \n\t" \
        "mov %[l], x8 \n\t" \
        "mov %[o], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "x8" \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[b] \n\t" \
        "umulh x9, %[a], %[b] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adcs %[h], %[h], x9 \n\t" \
        "adc %[o], %[o], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "x8", "x9", "cc" \
    )
/* Multiply va by vb and add double size result into: vh | vl */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[b] \n\t" \
        "umulh x9, %[a], %[b] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adc %[h], %[h], x9 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va), [b] "r" (vb) \
        : "x8", "x9", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[b] \n\t" \
        "umulh x9, %[a], %[b] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adcs %[h], %[h], x9 \n\t" \
        "adc %[o], %[o], xzr \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adcs %[h], %[h], x9 \n\t" \
        "adc %[o], %[o], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "x8", "x9", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[b] \n\t" \
        "umulh x9, %[a], %[b] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adc %[h], %[h], x9 \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adcs %[h], %[h], x9 \n\t" \
        "adc %[o], %[o], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "x8", "x9", "cc" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mul %[l], %[a], %[a] \n\t" \
        "umulh %[h], %[a], %[a] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va) \
        : "memory" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[a] \n\t" \
        "umulh x9, %[a], %[a] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adcs %[h], %[h], x9 \n\t" \
        "adc %[o], %[o], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va) \
        : "x8", "x9", "cc" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mul x8, %[a], %[a] \n\t" \
        "umulh x9, %[a], %[a] \n\t" \
        "adds %[l], %[l], x8 \n\t" \
        "adc %[h], %[h], x9 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "x8", "x9", "cc" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "adds %[l], %[l], %[a] \n\t" \
        "adc %[h], %[h], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subs %[l], %[l], %[a] \n\t" \
        "sbc %[h], %[h], xzr \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "adds %[l], %[l], %[a] \n\t" \
        "adcs %[h], %[h], %[b] \n\t" \
        "adc %[o], %[o], %[c] \n\t" \
        "adds %[l], %[l], %[a] \n\t" \
        "adcs %[h], %[h], %[b] \n\t" \
        "adc %[o], %[o], %[c] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "cc" \
    )
/* Count leading zeros. */
#define SP_ASM_LZCNT(va, vn) \
    __asm__ __volatile__ ( \
        "clz %[n], %[a] \n\t" \
        : [n] "=r" (vn) \
        : [a] "r" (va) \
        : \
    )
#ifndef WOLFSSL_SP_DIV_WORD_HALF
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * Using udiv instruction on Aarch64.
 * Constant time.
 *
 * Normalises the divisor so its top bits are set, then computes the
 * quotient in 32-bit chunks with udiv against an over-estimate of the
 * divisor's top half, correcting the partial remainder after each step.
 *
 * @param [in] hi  SP integer digit. High digit of the dividend.
 * @param [in] lo  SP integer digit. Lower digit of the dividend.
 * @param [in] d   SP integer digit. Number to divide by.
 * @return The division result.
 */
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
                                          sp_int_digit d)
{
    __asm__ __volatile__ (
        /* Shift d (and the dividend) left by 16 when its top 16 bits are
         * clear, so that d's top bit region is populated. */
        "lsr x3, %[d], 48\n\t"
        "mov x5, 16\n\t"
        "cmp x3, 0\n\t"
        "mov x4, 63\n\t"
        "csel x3, x5, xzr, eq\n\t"
        "sub x4, x4, x3\n\t"
        "lsl %[d], %[d], x3\n\t"
        "lsl %[hi], %[hi], x3\n\t"
        "lsr x5, %[lo], x4\n\t"
        "lsl %[lo], %[lo], x3\n\t"
        "orr %[hi], %[hi], x5, lsr 1\n\t"
        /* x5 = (top 32 bits of d) + 1: over-estimate of divisor. */
        "lsr x5, %[d], 32\n\t"
        "add x5, x5, 1\n\t"
        /* First quotient estimate from the high word. */
        "udiv x3, %[hi], x5\n\t"
        "lsl x6, x3, 32\n\t"
        "mul x4, %[d], x6\n\t"
        "umulh x3, %[d], x6\n\t"
        "subs %[lo], %[lo], x4\n\t"
        "sbc %[hi], %[hi], x3\n\t"
        /* Second estimate refines the high 32 bits of the quotient. */
        "udiv x3, %[hi], x5\n\t"
        "lsl x3, x3, 32\n\t"
        "add x6, x6, x3\n\t"
        "mul x4, %[d], x3\n\t"
        "umulh x3, %[d], x3\n\t"
        "subs %[lo], %[lo], x4\n\t"
        "sbc %[hi], %[hi], x3\n\t"
        /* Two more steps produce the low 32 bits of the quotient. */
        "lsr x3, %[lo], 32\n\t"
        "orr x3, x3, %[hi], lsl 32\n\t"
        "udiv x3, x3, x5\n\t"
        "add x6, x6, x3\n\t"
        "mul x4, %[d], x3\n\t"
        "umulh x3, %[d], x3\n\t"
        "subs %[lo], %[lo], x4\n\t"
        "sbc %[hi], %[hi], x3\n\t"
        "lsr x3, %[lo], 32\n\t"
        "orr x3, x3, %[hi], lsl 32\n\t"
        "udiv x3, x3, x5\n\t"
        "add x6, x6, x3\n\t"
        "mul x4, %[d], x3\n\t"
        "sub %[lo], %[lo], x4\n\t"
        /* Final exact correction of the accumulated quotient in x6. */
        "udiv x3, %[lo], %[d]\n\t"
        "add %[hi], x6, x3\n\t"
        : [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
        :
        : "x3", "x4", "x5", "x6", "cc"
    );
    return hi;
}
#define SP_ASM_DIV_WORD
#endif
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_ARM64 && SP_WORD_SIZE == 64 */
  1057. #if (defined(WOLFSSL_SP_ARM32) || defined(WOLFSSL_SP_ARM_CORTEX_M)) && \
  1058. SP_WORD_SIZE == 32
  1059. /*
  1060. * CPU: ARM32 or Cortex-M4 and similar
  1061. */
  1062. /* Multiply va by vb and store double size result in: vh | vl */
  1063. #define SP_ASM_MUL(vl, vh, va, vb) \
  1064. __asm__ __volatile__ ( \
  1065. "umull %[l], %[h], %[a], %[b] \n\t" \
  1066. : [h] "+r" (vh), [l] "+r" (vl) \
  1067. : [a] "r" (va), [b] "r" (vb) \
  1068. : "memory" \
  1069. )
  1070. /* Multiply va by vb and store double size result in: vo | vh | vl */
  1071. #define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
  1072. __asm__ __volatile__ ( \
  1073. "umull %[l], %[h], %[a], %[b] \n\t" \
  1074. "mov %[o], #0 \n\t" \
  1075. : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
  1076. : [a] "r" (va), [b] "r" (vb) \
  1077. : \
  1078. )
  1079. /* Multiply va by vb and add double size result into: vo | vh | vl */
  1080. #define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
  1081. __asm__ __volatile__ ( \
  1082. "umull r8, r9, %[a], %[b] \n\t" \
  1083. "adds %[l], %[l], r8 \n\t" \
  1084. "adcs %[h], %[h], r9 \n\t" \
  1085. "adc %[o], %[o], #0 \n\t" \
  1086. : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
  1087. : [a] "r" (va), [b] "r" (vb) \
  1088. : "r8", "r9", "cc" \
  1089. )
  1090. /* Multiply va by vb and add double size result into: vh | vl */
  1091. #define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
  1092. __asm__ __volatile__ ( \
  1093. "umlal %[l], %[h], %[a], %[b] \n\t" \
  1094. : [l] "+r" (vl), [h] "+r" (vh) \
  1095. : [a] "r" (va), [b] "r" (vb) \
  1096. : \
  1097. )
  1098. /* Multiply va by vb and add double size result twice into: vo | vh | vl */
  1099. #define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
  1100. __asm__ __volatile__ ( \
  1101. "umull r8, r9, %[a], %[b] \n\t" \
  1102. "adds %[l], %[l], r8 \n\t" \
  1103. "adcs %[h], %[h], r9 \n\t" \
  1104. "adc %[o], %[o], #0 \n\t" \
  1105. "adds %[l], %[l], r8 \n\t" \
  1106. "adcs %[h], %[h], r9 \n\t" \
  1107. "adc %[o], %[o], #0 \n\t" \
  1108. : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
  1109. : [a] "r" (va), [b] "r" (vb) \
  1110. : "r8", "r9", "cc" \
  1111. )
  1112. /* Multiply va by vb and add double size result twice into: vo | vh | vl
  1113. * Assumes first add will not overflow vh | vl
  1114. */
/* Multiply va by vb and add double size result twice into: vo | vh | vl.
 * NOTE(review): the first accumulation ends with plain 'adc' into vh (no
 * carry propagated to vo), so - like the other *_NO variants in this file -
 * it assumes the first add cannot overflow vh | vl.  Clobbers r8, r9, flags.
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
"umull r8, r9, %[a], %[b] \n\t" \
"adds %[l], %[l], r8 \n\t" \
"adc %[h], %[h], r9 \n\t" \
"adds %[l], %[l], r8 \n\t" \
"adcs %[h], %[h], r9 \n\t" \
"adc %[o], %[o], #0 \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [a] "r" (va), [b] "r" (vb) \
: "r8", "r9", "cc" \
)
/* Square va and store double size result in: vh | vl */
/* Single umull: vl = low word, vh = high word of va * va. */
#define SP_ASM_SQR(vl, vh, va) \
__asm__ __volatile__ ( \
"umull %[l], %[h], %[a], %[a] \n\t" \
: [h] "+r" (vh), [l] "+r" (vl) \
: [a] "r" (va) \
: "memory" \
)
/* Square va and add double size result into: vo | vh | vl */
/* umull into scratch r8:r9, then 96-bit accumulate with carry chain. */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
__asm__ __volatile__ ( \
"umull r8, r9, %[a], %[a] \n\t" \
"adds %[l], %[l], r8 \n\t" \
"adcs %[h], %[h], r9 \n\t" \
"adc %[o], %[o], #0 \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [a] "r" (va) \
: "r8", "r9", "cc" \
)
/* Square va and add double size result into: vh | vl */
/* umlal multiplies and accumulates in one instruction; any carry out of
 * vh is discarded (caller guarantees no overflow). */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
__asm__ __volatile__ ( \
"umlal %[l], %[h], %[a], %[a] \n\t" \
: [l] "+r" (vl), [h] "+r" (vh) \
: [a] "r" (va) \
: "cc" \
)
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
__asm__ __volatile__ ( \
"adds %[l], %[l], %[a] \n\t" \
"adc %[h], %[h], #0 \n\t" \
: [l] "+r" (vl), [h] "+r" (vh) \
: [a] "r" (va) \
: "cc" \
)
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
__asm__ __volatile__ ( \
"subs %[l], %[l], %[a] \n\t" \
"sbc %[h], %[h], #0 \n\t" \
: [l] "+r" (vl), [h] "+r" (vh) \
: [a] "r" (va) \
: "cc" \
)
/* Add two times vc | vb | va into vo | vh | vl */
/* Two full 96-bit additions; carry out of the top word is discarded. */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
__asm__ __volatile__ ( \
"adds %[l], %[l], %[a] \n\t" \
"adcs %[h], %[h], %[b] \n\t" \
"adc %[o], %[o], %[c] \n\t" \
"adds %[l], %[l], %[a] \n\t" \
"adcs %[h], %[h], %[b] \n\t" \
"adc %[o], %[o], %[c] \n\t" \
: [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
: [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
: "cc" \
)
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH >= 7)
/* Count leading zeros - instruction only available on ARMv7 and newer. */
/* vn is a write-only ("=r") output: clz fully overwrites it. */
#define SP_ASM_LZCNT(va, vn) \
__asm__ __volatile__ ( \
"clz %[n], %[a] \n\t" \
: [n] "=r" (vn) \
: [a] "r" (va) \
: \
)
#endif
  1195. #ifndef WOLFSSL_SP_DIV_WORD_HALF
  1196. #ifndef WOLFSSL_SP_ARM32_UDIV
/* Divide a two digit number by a digit number and return. (hi | lo) / d
*
* No division instruction used - does operation bit by bit.
* Constant time.
*
* @param [in] hi SP integer digit. High digit of the dividend.
* @param [in] lo SP integer digit. Lower digit of the dividend.
* @param [in] d SP integer digit. Number to divide by.
* @return The division result.
*/
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
sp_int_digit d)
{
sp_int_digit r = 0;
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
/* De Bruijn sequence table used to emulate clz on pre-ARMv7 cores. */
static const char debruijn32[32] = {
0, 31, 9, 30, 3, 8, 13, 29, 2, 5, 7, 21, 12, 24, 28, 19,
1, 10, 4, 14, 6, 22, 25, 20, 11, 15, 23, 26, 16, 27, 17, 18
};
static const sp_uint32 debruijn32_mul = 0x076be629;
#endif
__asm__ __volatile__ (
/* Shift d so that top bit is set. */
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
/* Smear the top set bit down, +1 to isolate it, then de Bruijn
 * multiply + table lookup gives the leading-zero count in r5. */
"ldr r4, %[m]\n\t"
"mov r5, %[d]\n\t"
"orr r5, r5, r5, lsr #1\n\t"
"orr r5, r5, r5, lsr #2\n\t"
"orr r5, r5, r5, lsr #4\n\t"
"orr r5, r5, r5, lsr #8\n\t"
"orr r5, r5, r5, lsr #16\n\t"
"add r5, r5, #1\n\t"
"mul r5, r5, r4\n\t"
"lsr r5, r5, #27\n\t"
"ldrb r5, [%[t], r5]\n\t"
#else
"clz r5, %[d]\n\t"
#endif
/* Normalise: shift d, and (hi|lo), left by the leading-zero count. */
"rsb r6, r5, #31\n\t"
"lsl %[d], %[d], r5\n\t"
"lsl %[hi], %[hi], r5\n\t"
"lsr r9, %[lo], r6\n\t"
"lsl %[lo], %[lo], r5\n\t"
"orr %[hi], %[hi], r9, lsr #1\n\t"
/* r5 = (d >> 1) + 1: threshold for the branch-free compare below. */
"lsr r5, %[d], #1\n\t"
"add r5, r5, #1\n\t"
"mov r6, %[lo]\n\t"
"mov r9, %[hi]\n\t"
/* Do top 32 */
/* Branch-free conditional subtract: r8 = all-ones iff r9 >= r5. */
"subs r8, r5, r9\n\t"
"sbc r8, r8, r8\n\t"
"add %[r], %[r], %[r]\n\t"
"sub %[r], %[r], r8\n\t"
"and r8, r8, r5\n\t"
"subs r9, r9, r8\n\t"
/* Next 30 bits */
"mov r4, #29\n\t"
"\n1:\n\t"
"movs r6, r6, lsl #1\n\t"
"adc r9, r9, r9\n\t"
"subs r8, r5, r9\n\t"
"sbc r8, r8, r8\n\t"
"add %[r], %[r], %[r]\n\t"
"sub %[r], %[r], r8\n\t"
"and r8, r8, r5\n\t"
"subs r9, r9, r8\n\t"
"subs r4, r4, #1\n\t"
"bpl 1b\n\t"
"add %[r], %[r], %[r]\n\t"
"add %[r], %[r], #1\n\t"
/* Handle difference has hi word > 0. */
"umull r4, r5, %[r], %[d]\n\t"
"subs r4, %[lo], r4\n\t"
"sbc r5, %[hi], r5\n\t"
"add %[r], %[r], r5\n\t"
"umull r4, r5, %[r], %[d]\n\t"
"subs r4, %[lo], r4\n\t"
"sbc r5, %[hi], r5\n\t"
"add %[r], %[r], r5\n\t"
/* Add 1 to result if bottom half of difference is >= d. */
"mul r4, %[r], %[d]\n\t"
"subs r4, %[lo], r4\n\t"
"subs r9, %[d], r4\n\t"
"sbc r8, r8, r8\n\t"
"sub %[r], %[r], r8\n\t"
"subs r9, r9, #1\n\t"
"sbc r8, r8, r8\n\t"
"sub %[r], %[r], r8\n\t"
: [r] "+r" (r), [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
#if defined(WOLFSSL_SP_ARM_ARCH) && (WOLFSSL_SP_ARM_ARCH < 7)
: [t] "r" (debruijn32), [m] "m" (debruijn32_mul)
#else
:
#endif
: "r4", "r5", "r6", "r8", "r9", "cc"
);
return r;
}
  1295. #else
/* Divide a two digit number by a digit number and return. (hi | lo) / d
*
* Using udiv instruction on arm32
* Constant time.
* NOTE(review): udiv latency is operand-dependent on some cores - confirm
* the constant-time claim holds for the target CPU.
*
* @param [in] hi SP integer digit. High digit of the dividend.
* @param [in] lo SP integer digit. Lower digit of the dividend.
* @param [in] d SP integer digit. Number to divide by.
* @return The division result.
*/
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
sp_int_digit d)
{
__asm__ __volatile__ (
/* Normalise by 8 bits when d < 2^24 (top byte clear). */
"lsrs r3, %[d], #24\n\t"
"it eq\n\t"
"moveq r3, #8\n\t"
"it ne\n\t"
"movne r3, #0\n\t"
"rsb r4, r3, #31\n\t"
"lsl %[d], %[d], r3\n\t"
"lsl %[hi], %[hi], r3\n\t"
"lsr r5, %[lo], r4\n\t"
"lsl %[lo], %[lo], r3\n\t"
"orr %[hi], %[hi], r5, lsr #1\n\t"
/* r5 = (d >> 16) + 1: divisor estimate; quotient built 16 bits at a
 * time in r6, remainder reduced via umull/subs/sbc after each step. */
"lsr r5, %[d], 16\n\t"
"add r5, r5, 1\n\t"
"udiv r3, %[hi], r5\n\t"
"lsl r6, r3, 16\n\t"
"umull r4, r3, %[d], r6\n\t"
"subs %[lo], %[lo], r4\n\t"
"sbc %[hi], %[hi], r3\n\t"
"udiv r3, %[hi], r5\n\t"
"lsl r3, r3, 16\n\t"
"add r6, r6, r3\n\t"
"umull r4, r3, %[d], r3\n\t"
"subs %[lo], %[lo], r4\n\t"
"sbc %[hi], %[hi], r3\n\t"
"lsr r3, %[lo], 16\n\t"
"orr r3, r3, %[hi], lsl 16\n\t"
"udiv r3, r3, r5\n\t"
"add r6, r6, r3\n\t"
"umull r4, r3, %[d], r3\n\t"
"subs %[lo], %[lo], r4\n\t"
"sbc %[hi], %[hi], r3\n\t"
"lsr r3, %[lo], 16\n\t"
"orr r3, r3, %[hi], lsl 16\n\t"
"udiv r3, r3, r5\n\t"
"add r6, r6, r3\n\t"
"mul r4, %[d], r3\n\t"
"sub %[lo], %[lo], r4\n\t"
/* Final exact correction, then return quotient via %[hi]. */
"udiv r3, %[lo], %[d]\n\t"
"add %[hi], r6, r3\n\t"
: [hi] "+r" (hi), [lo] "+r" (lo), [d] "+r" (d)
:
: "r3", "r4", "r5", "r6", "cc"
);
return hi;
}
  1355. #endif
  1356. #define SP_ASM_DIV_WORD
  1357. #endif
  1358. #define SP_INT_ASM_AVAILABLE
  1359. #endif /* (WOLFSSL_SP_ARM32 || ARM_CORTEX_M) && SP_WORD_SIZE == 32 */
  1360. #if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
  1361. /*
  1362. * CPU: ARM Thumb (like Cortex-M0)
  1363. */
  1364. /* Compile with -fomit-frame-pointer, or similar, if compiler complains about
  1365. * usage of register 'r7'.
  1366. */
  1367. #if defined(__clang__)
/* Multiply va by vb and store double size result in: vh | vl */
/* Thumb-1 muls is 32x32->32, so the 64-bit product is built from four
 * 16x16 partial products extracted with uxth/lsrs. */
#define SP_ASM_MUL(vl, vh, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth %[l], %[b] \n\t" \
"muls %[l], r6 \n\t" \
/* al * bh */ \
"lsrs r4, %[b], #16 \n\t" \
"muls r6, r4 \n\t" \
"lsrs %[h], r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"muls r4, r6 \n\t" \
"adds %[h], %[h], r4 \n\t" \
/* ah * bl */ \
"uxth r4, %[b] \n\t" \
"muls r6, r4 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r4 \n\t" \
: [h] "+l" (vh), [l] "+l" (vl) \
: [a] "l" (va), [b] "l" (vb) \
: "r4", "r5", "r6", "cc" \
)
/* Multiply va by vb and store double size result in: vo | vh | vl */
/* Same 16x16 decomposition; vo is zeroed and doubles as the zero
 * register for the carry adds. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth %[l], %[b] \n\t" \
"muls %[l], r6 \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5 \n\t" \
"lsrs %[h], r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"movs %[o], #0 \n\t" \
"adcs %[h], %[o] \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"muls r5, r6 \n\t" \
"adds %[h], %[h], r5 \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "cc" \
)
  1428. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result into: vo | vh | vl */
/* Fast variant: uses r7 as extra scratch so r5 can stay zero for the
 * whole sequence (see file note about -fomit-frame-pointer and r7). */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r7, %[b] \n\t" \
"muls r7, r6 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r7, %[b], #16 \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r7, %[b], #16 \n\t" \
"muls r7, r6 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r7, %[b] \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r7", "cc" \
)
  1466. #else
/* Multiply va by vb and add double size result into: vo | vh | vl */
/* Small variant: avoids r7, so r5 is both product scratch and the zero
 * register - hence it is re-zeroed ("movs r5, #0") before each carry add. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r5, %[b] \n\t" \
"muls r5, r6 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r5, %[b], #16 \n\t" \
"muls r5, r6 \n\t" \
"adds %[h], %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "cc" \
)
  1507. #endif
/* Multiply va by vb and add double size result into: vh | vl */
/* Two-word accumulate only: carry out of vh is discarded (caller
 * guarantees the total fits in vh | vl). */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r4, %[b] \n\t" \
"muls r4, r6 \n\t" \
"adds %[l], %[l], r4 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
/* al * bh */ \
"lsrs r4, %[b], #16 \n\t" \
"muls r6, r4 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r4 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r4, %[b], #16 \n\t" \
"muls r4, r6 \n\t" \
"adds %[h], %[h], r4 \n\t" \
/* ah * bl */ \
"uxth r4, %[b] \n\t" \
"muls r6, r4 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r4 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh) \
: [a] "l" (va), [b] "l" (vb) \
: "r4", "r5", "r6", "cc" \
)
  1541. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
/* Each partial product is accumulated twice (product added 2x). */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r7, %[b] \n\t" \
"muls r7, r6 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r7, %[b], #16 \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r7, %[b], #16 \n\t" \
"muls r7, r6 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r7, %[b] \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r7", "cc" \
)
  1590. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
/* Small variant: %[a] is saved in (high) r8 so the %[a] register can be
 * reused as the zero register for the carry adds; it is restored before
 * the asm ends so the "+l"-style input stays valid. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
"movs r8, %[a] \n\t" \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r5, %[b] \n\t" \
"muls r5, r6 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"movs %[a], #0 \n\t" \
"adcs %[h], %[a] \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], %[a] \n\t" \
"adcs %[o], %[a] \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], %[a] \n\t" \
/* ah * bh */ \
"movs %[a], r8 \n\t" \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r5, %[b], #16 \n\t" \
"muls r5, r6 \n\t" \
"adds %[h], %[h], r5 \n\t" \
"movs %[a], #0 \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[h], %[h], r5 \n\t" \
"adcs %[o], %[a] \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
"adcs %[o], %[a] \n\t" \
"movs %[a], r8 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r8", "cc" \
)
  1643. #endif
  1644. #ifndef DEBUG
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
/* The al*bl and first al*bh additions deliberately omit the carry into
 * vo, relying on the caller's no-overflow guarantee. */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r7, %[b] \n\t" \
"muls r7, r6 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"adcs %[h], r5 \n\t" \
/* al * bh */ \
"lsrs r7, %[b], #16 \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r7, %[b], #16 \n\t" \
"muls r7, r6 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r7, %[b] \n\t" \
"muls r6, r7 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r7 \n\t" \
"adcs %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r7", "cc" \
)
  1692. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
/* DEBUG variant: avoids r7 (frame pointer) by parking %[a] in r8 and
 * reusing the %[a] register as the zero register; restored at the end. */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
"movs r8, %[a] \n\t" \
/* al * bl */ \
"uxth r5, %[a] \n\t" \
"uxth r6, %[b] \n\t" \
"muls r6, r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"movs %[a], #0 \n\t" \
"adcs %[h], %[a] \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[a] \n\t" \
/* al * bh */ \
"lsrs r6, %[b], #16 \n\t" \
"muls r5, r6 \n\t" \
"lsrs r6, r5, #16 \n\t" \
"lsls r5, r5, #16 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], r6 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], r6 \n\t" \
"adcs %[o], %[a] \n\t" \
/* ah * bh */ \
"movs %[a], r8 \n\t" \
"lsrs r5, %[a], #16 \n\t" \
"lsrs r6, %[b], #16 \n\t" \
"muls r6, r5 \n\t" \
"movs %[a], #0 \n\t" \
"adds %[h], %[h], r6 \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[h], %[h], r6 \n\t" \
"adcs %[o], %[a] \n\t" \
/* ah * bl */ \
"uxth r6, %[b] \n\t" \
"muls r5, r6 \n\t" \
"lsrs r6, r5, #16 \n\t" \
"lsls r5, r5, #16 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], r6 \n\t" \
"adcs %[o], %[a] \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], r6 \n\t" \
"adcs %[o], %[a] \n\t" \
"movs %[a], r8 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r8", "cc" \
)
  1744. #endif
/* Square va and store double size result in: vh | vl */
/* a^2 = al^2 + 2*al*ah*2^16 + ah^2*2^32; the cross term is doubled by
 * shifting the 32-bit product left 17 / right 15 instead of adding twice. */
#define SP_ASM_SQR(vl, vh, va) \
__asm__ __volatile__ ( \
"lsrs r5, %[a], #16 \n\t" \
"uxth r6, %[a] \n\t" \
"mov %[l], r6 \n\t" \
"mov %[h], r5 \n\t" \
/* al * al */ \
"muls %[l], %[l] \n\t" \
/* ah * ah */ \
"muls %[h], %[h] \n\t" \
/* 2 * al * ah */ \
"muls r6, r5 \n\t" \
"lsrs r5, r6, #15 \n\t" \
"lsls r6, r6, #17 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r5 \n\t" \
: [h] "+l" (vh), [l] "+l" (vl) \
: [a] "l" (va) \
: "r5", "r6", "cc" \
)
/* Square va and add double size result into: vo | vh | vl */
/* r4 holds the high half (ah), r6 the low half (al); both are reloaded
 * before the doubled cross term because muls destroys them. */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
__asm__ __volatile__ ( \
"lsrs r4, %[a], #16 \n\t" \
"uxth r6, %[a] \n\t" \
/* al * al */ \
"muls r6, r6 \n\t" \
/* ah * ah */ \
"muls r4, r4 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r4 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], r5 \n\t" \
"lsrs r4, %[a], #16 \n\t" \
"uxth r6, %[a] \n\t" \
/* 2 * al * ah */ \
"muls r6, r4 \n\t" \
"lsrs r4, r6, #15 \n\t" \
"lsls r6, r6, #17 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], r4 \n\t" \
"adcs %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va) \
: "r4", "r5", "r6", "cc" \
)
  1792. /* Square va and add double size result into: vh | vl */
  1793. #define SP_ASM_SQR_ADD_NO(vl, vh, va) \
  1794. __asm__ __volatile__ ( \
  1795. "lsrs r6, %[a], #16 \n\t" \
  1796. "uxth r6, %[a] \n\t" \
  1797. /* al * al */ \
  1798. "muls r6, r6 \n\t" \
  1799. /* ah * ah */ \
  1800. "muls r6, r6 \n\t" \
  1801. "adds %[l], %[l], r6 \n\t" \
  1802. "adcs %[h], r6 \n\t" \
  1803. "lsrs r6, %[a], #16 \n\t" \
  1804. "uxth r6, %[a] \n\t" \
  1805. /* 2 * al * ah */ \
  1806. "muls r6, r6 \n\t" \
  1807. "lsrs r6, r6, #15 \n\t" \
  1808. "lsls r6, r6, #17 \n\t" \
  1809. "adds %[l], %[l], r6 \n\t" \
  1810. "adcs %[h], r6 \n\t" \
  1811. : [l] "+l" (vl), [h] "+l" (vh) \
  1812. : [a] "l" (va) \
  1813. : "r5", "r6", "cc" \
  1814. )
/* Add va into: vh | vl */
/* Thumb-1 has no adc-immediate, so a zeroed r5 carries the flag into vh. */
#define SP_ASM_ADDC(vl, vh, va) \
__asm__ __volatile__ ( \
"adds %[l], %[l], %[a] \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh) \
: [a] "l" (va) \
: "r5", "cc" \
)
/* Sub va from: vh | vl */
/* Borrow is propagated into vh via sbcs with a zero register. */
#define SP_ASM_SUBB(vl, vh, va) \
__asm__ __volatile__ ( \
"subs %[l], %[l], %[a] \n\t" \
"movs r5, #0 \n\t" \
"sbcs %[h], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh) \
: [a] "l" (va) \
: "r5", "cc" \
)
/* Add two times vc | vb | va into vo | vh | vl */
/* Two full 96-bit additions; carry out of the top word is discarded. */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
__asm__ __volatile__ ( \
"adds %[l], %[l], %[a] \n\t" \
"adcs %[h], %[b] \n\t" \
"adcs %[o], %[c] \n\t" \
"adds %[l], %[l], %[a] \n\t" \
"adcs %[h], %[b] \n\t" \
"adcs %[o], %[c] \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb), [c] "l" (vc) \
: "cc" \
)
  1848. #elif defined(WOLFSSL_KEIL)
/* Multiply va by vb and store double size result in: vh | vl */
/* Keil syntax: muls takes three operands (Rd, Rn, Rm) unlike the
 * two-operand clang/gas form above; the algorithm is identical. */
#define SP_ASM_MUL(vl, vh, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth %[l], %[b] \n\t" \
"muls %[l], r6, %[l] \n\t" \
/* al * bh */ \
"lsrs r4, %[b], #16 \n\t" \
"muls r6, r4, r6 \n\t" \
"lsrs %[h], r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"muls r4, r6, r4 \n\t" \
"adds %[h], %[h], r4 \n\t" \
/* ah * bl */ \
"uxth r4, %[b] \n\t" \
"muls r6, r4, r6 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r4 \n\t" \
: [h] "+l" (vh), [l] "+l" (vl) \
: [a] "l" (va), [b] "l" (vb) \
: "r4", "r5", "r6", "cc" \
)
/* Multiply va by vb and store double size result in: vo | vh | vl */
/* vo is zeroed and doubles as the zero register for the carry add. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth %[l], %[b] \n\t" \
"muls %[l], r6, %[l] \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs %[h], r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"movs %[o], #0 \n\t" \
"adcs %[h], %[h], %[o] \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"muls r5, r6, r5 \n\t" \
"adds %[h], %[h], r5 \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "cc" \
)
  1909. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result into: vo | vh | vl */
/* Fast (non-small) variant: extra scratch r7 lets r5 stay zero throughout. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r7, %[b] \n\t" \
"muls r7, r6, r7 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r7, %[b], #16 \n\t" \
"muls r6, r7, r6 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r7, %[b], #16 \n\t" \
"muls r7, r6, r7 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r7, %[b] \n\t" \
"muls r6, r7, r6 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r7", "cc" \
)
  1947. #else
/* Multiply va by vb and add double size result into: vo | vh | vl */
/* Small variant: r5 serves as both product scratch and zero register,
 * so it is re-zeroed before each carry add into vo. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r5, %[b] \n\t" \
"muls r5, r6, r5 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r5, %[b], #16 \n\t" \
"muls r5, r6, r5 \n\t" \
"adds %[h], %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "cc" \
)
  1987. #endif
/* Multiply va by vb and add double size result into: vh | vl */
/* Two-word accumulate only: carry out of vh is discarded (caller
 * guarantees the total fits in vh | vl). */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r4, %[b] \n\t" \
"muls r4, r6, r4 \n\t" \
"adds %[l], %[l], r4 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
/* al * bh */ \
"lsrs r4, %[b], #16 \n\t" \
"muls r6, r4, r6 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r4 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r4, %[b], #16 \n\t" \
"muls r4, r6, r4 \n\t" \
"adds %[h], %[h], r4 \n\t" \
/* ah * bl */ \
"uxth r4, %[b] \n\t" \
"muls r6, r4, r6 \n\t" \
"lsrs r4, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r4 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh) \
: [a] "l" (va), [b] "l" (vb) \
: "r4", "r5", "r6", "cc" \
)
  2021. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
/* Each partial product is accumulated twice (product added 2x). */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r7, %[b] \n\t" \
"muls r7, r6, r7 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"movs r5, #0 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
"adds %[l], %[l], r7 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* al * bh */ \
"lsrs r7, %[b], #16 \n\t" \
"muls r6, r7, r6 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bh */ \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r7, %[b], #16 \n\t" \
"muls r7, r6, r7 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
"adds %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
/* ah * bl */ \
"uxth r7, %[b] \n\t" \
"muls r6, r7, r6 \n\t" \
"lsrs r7, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r7 \n\t" \
"adcs %[o], %[o], r5 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r7", "cc" \
)
  2070. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
/* Small variant: %[a] is parked in (high) r8 so its register can serve
 * as the zero register for the carry adds; restored before the asm ends. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
"movs r8, %[a] \n\t" \
/* al * bl */ \
"uxth r6, %[a] \n\t" \
"uxth r5, %[b] \n\t" \
"muls r5, r6, r5 \n\t" \
"adds %[l], %[l], r5 \n\t" \
"movs %[a], #0 \n\t" \
"adcs %[h], %[h], %[a] \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
"adds %[l], %[l], r5 \n\t" \
"adcs %[h], %[h], %[a] \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
/* al * bh */ \
"lsrs r5, %[b], #16 \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
/* ah * bh */ \
"movs %[a], r8 \n\t" \
"lsrs r6, %[a], #16 \n\t" \
"lsrs r5, %[b], #16 \n\t" \
"muls r5, r6, r5 \n\t" \
"adds %[h], %[h], r5 \n\t" \
"movs %[a], #0 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
"adds %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
/* ah * bl */ \
"uxth r5, %[b] \n\t" \
"muls r6, r5, r6 \n\t" \
"lsrs r5, r6, #16 \n\t" \
"lsls r6, r6, #16 \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
"adds %[l], %[l], r6 \n\t" \
"adcs %[h], %[h], r5 \n\t" \
"adcs %[o], %[o], %[a] \n\t" \
"movs %[a], r8 \n\t" \
: [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo) \
: [a] "l" (va), [b] "l" (vb) \
: "r5", "r6", "r8", "cc" \
)
  2123. #endif
  2124. #ifndef DEBUG
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * Same schoolbook 16x16 partial-product scheme as SP_ASM_MUL_ADD2, but
 * the al*bl accumulation omits the carry into vo (caller guarantees the
 * first double add cannot overflow the 64-bit vh|vl pair), saving two
 * ADCS instructions.  Uses r5 (zero), r6 and r7 as scratch.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)         \
    __asm__ __volatile__ (                             \
        /* al * bl - no carry into vo by assumption. */ \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r7, %[b]	\n\t"                      \
        "muls	r7, r6, r7	\n\t"                      \
        "adds	%[l], %[l], r7	\n\t"                  \
        "movs	r5, #0	\n\t"                          \
        "adcs	%[h], %[h], r5	\n\t"                  \
        "adds	%[l], %[l], r7	\n\t"                  \
        "adcs	%[h], %[h], r5	\n\t"                  \
        /* al * bh */                                  \
        "lsrs	r7, %[b], #16	\n\t"                  \
        "muls	r6, r7, r6	\n\t"                      \
        "lsrs	r7, r6, #16	\n\t"                      \
        "lsls	r6, r6, #16	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r7	\n\t"                  \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r7	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        /* ah * bh */                                  \
        "lsrs	r6, %[a], #16	\n\t"                  \
        "lsrs	r7, %[b], #16	\n\t"                  \
        "muls	r7, r6, r7	\n\t"                      \
        "adds	%[h], %[h], r7	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        "adds	%[h], %[h], r7	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        /* ah * bl */                                  \
        "uxth	r7, %[b]	\n\t"                      \
        "muls	r6, r7, r6	\n\t"                      \
        "lsrs	r7, r6, #16	\n\t"                      \
        "lsls	r6, r6, #16	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r7	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r7	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r7", "cc"                       \
    )
  2172. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * DEBUG-build variant of SP_ASM_MUL_ADD2_NO: avoids r7 by saving va in
 * high register r8 and reusing va's register as the zero constant for
 * carry propagation; va is restored before exit.  Same partial-product
 * and carry structure as the r7-based version above.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (va preserved via r8).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)         \
    __asm__ __volatile__ (                             \
        /* Save va; its register doubles as zero scratch below. */ \
        "movs	r8, %[a]	\n\t"                      \
        /* al * bl - no carry into vo by assumption. */ \
        "uxth	r5, %[a]	\n\t"                      \
        "uxth	r6, %[b]	\n\t"                      \
        "muls	r6, r5, r6	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "movs	%[a], #0	\n\t"                      \
        "adcs	%[h], %[h], %[a]	\n\t"              \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], %[a]	\n\t"              \
        /* al * bh */                                  \
        "lsrs	r6, %[b], #16	\n\t"                  \
        "muls	r5, r6, r5	\n\t"                      \
        "lsrs	r6, r5, #16	\n\t"                      \
        "lsls	r5, r5, #16	\n\t"                      \
        "adds	%[l], %[l], r5	\n\t"                  \
        "adcs	%[h], %[h], r6	\n\t"                  \
        "adds	%[l], %[l], r5	\n\t"                  \
        "adcs	%[h], %[h], r6	\n\t"                  \
        "adcs	%[o], %[o], %[a]	\n\t"              \
        /* ah * bh */                                  \
        /* Reload va from r8 to extract its high half. */ \
        "movs	%[a], r8	\n\t"                      \
        "lsrs	r5, %[a], #16	\n\t"                  \
        "lsrs	r6, %[b], #16	\n\t"                  \
        "muls	r6, r5, r6	\n\t"                      \
        "movs	%[a], #0	\n\t"                      \
        "adds	%[h], %[h], r6	\n\t"                  \
        "adcs	%[o], %[o], %[a]	\n\t"              \
        "adds	%[h], %[h], r6	\n\t"                  \
        "adcs	%[o], %[o], %[a]	\n\t"              \
        /* ah * bl */                                  \
        "uxth	r6, %[b]	\n\t"                      \
        "muls	r5, r6, r5	\n\t"                      \
        "lsrs	r6, r5, #16	\n\t"                      \
        "lsls	r5, r5, #16	\n\t"                      \
        "adds	%[l], %[l], r5	\n\t"                  \
        "adcs	%[h], %[h], r6	\n\t"                  \
        "adcs	%[o], %[o], %[a]	\n\t"              \
        "adds	%[l], %[l], r5	\n\t"                  \
        "adcs	%[h], %[h], r6	\n\t"                  \
        "adcs	%[o], %[o], %[a]	\n\t"              \
        /* Restore va so the input operand is unchanged on exit. */ \
        "movs	%[a], r8	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r8", "cc"                       \
    )
  2224. #endif
/* Square va and store double size result in: vh | vl
 *
 * Uses (ah*2^16 + al)^2 = ah^2*2^32 + 2*al*ah*2^16 + al^2.  The doubled
 * cross product is folded in by shifting al*ah left 17 (low word) and
 * right 15 (high word) - the extra bit of shift is the *2.
 *
 * vl, vh  [out]  low / high words of the 64-bit square.
 * va      [in]   value to square (not modified).
 */
#define SP_ASM_SQR(vl, vh, va)                         \
    __asm__ __volatile__ (                             \
        "lsrs	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        "mov	%[l], r6	\n\t"                      \
        "mov	%[h], r5	\n\t"                      \
        /* al * al */                                  \
        "muls	%[l], %[l], %[l]	\n\t"              \
        /* ah * ah */                                  \
        "muls	%[h], %[h], %[h]	\n\t"              \
        /* 2 * al * ah */                              \
        "muls	r6, r5, r6	\n\t"                      \
        "lsrs	r5, r6, #15	\n\t"                      \
        "lsls	r6, r6, #17	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r5	\n\t"                  \
        : [h] "+l" (vh), [l] "+l" (vl)                 \
        : [a] "l" (va)                                 \
        : "r5", "r6", "cc"                             \
    )
/* Square va and add double size result into: vo | vh | vl
 *
 * Adds al^2 and ah^2 into the accumulator first (with carry into vo),
 * then folds in the doubled cross product 2*al*ah via the 17/15-bit
 * shift split.  Note r4 is used as scratch in addition to r5/r6.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va          [in]      value to square (not modified).
 */
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                 \
    __asm__ __volatile__ (                             \
        "lsrs	r4, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* al * al */                                  \
        "muls	r6, r6, r6	\n\t"                      \
        /* ah * ah */                                  \
        "muls	r4, r4, r4	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r4	\n\t"                  \
        "movs	r5, #0	\n\t"                          \
        "adcs	%[o], %[o], r5	\n\t"                  \
        /* Re-split va: r4/r6 were consumed by the squares above. */ \
        "lsrs	r4, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* 2 * al * ah */                              \
        "muls	r6, r4, r6	\n\t"                      \
        "lsrs	r4, r6, #15	\n\t"                      \
        "lsls	r6, r6, #17	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r4	\n\t"                  \
        "adcs	%[o], %[o], r5	\n\t"                  \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va)                                 \
        : "r4", "r5", "r6", "cc"                       \
    )
/* Square va and add double size result into: vh | vl
 *
 * As SP_ASM_SQR_ADD but without the third (overflow) word - the caller
 * guarantees the accumulation fits in 64 bits, so no carries propagate
 * beyond vh.
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      value to square (not modified).
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                  \
    __asm__ __volatile__ (                             \
        "lsrs	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* al * al */                                  \
        "muls	r6, r6, r6	\n\t"                      \
        /* ah * ah */                                  \
        "muls	r5, r5, r5	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r5	\n\t"                  \
        /* Re-split va: r5/r6 were consumed by the squares above. */ \
        "lsrs	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* 2 * al * ah */                              \
        "muls	r6, r5, r6	\n\t"                      \
        "lsrs	r5, r6, #15	\n\t"                      \
        "lsls	r6, r6, #17	\n\t"                      \
        "adds	%[l], %[l], r6	\n\t"                  \
        "adcs	%[h], %[h], r5	\n\t"                  \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "r6", "cc"                             \
    )
/* Add va into: vh | vl
 *
 * 64-bit accumulate of a single word: vl += va with the carry
 * propagated into vh (r5 holds the zero needed for the ADCS form).
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      word to add (not modified).
 */
#define SP_ASM_ADDC(vl, vh, va)                        \
    __asm__ __volatile__ (                             \
        "adds	%[l], %[l], %[a]	\n\t"              \
        "movs	r5, #0	\n\t"                          \
        "adcs	%[h], %[h], r5	\n\t"                  \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "cc"                                   \
    )
/* Sub va from: vh | vl
 *
 * 64-bit subtract of a single word: vl -= va with the borrow
 * propagated out of vh via SBCS (r5 holds the zero operand).
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      word to subtract (not modified).
 */
#define SP_ASM_SUBB(vl, vh, va)                        \
    __asm__ __volatile__ (                             \
        "subs	%[l], %[l], %[a]	\n\t"              \
        "movs	r5, #0	\n\t"                          \
        "sbcs	%[h], %[h], r5	\n\t"                  \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "cc"                                   \
    )
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Triple-word accumulate performed twice (the doubling used when
 * folding symmetric partial products in squaring): each pass is a
 * 96-bit add with carries rippling l -> h -> o.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb, vc  [in]      low / high / overflow words of the addend.
 */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)       \
    __asm__ __volatile__ (                             \
        "adds	%[l], %[l], %[a]	\n\t"              \
        "adcs	%[h], %[h], %[b]	\n\t"              \
        "adcs	%[o], %[o], %[c]	\n\t"              \
        "adds	%[l], %[l], %[a]	\n\t"              \
        "adcs	%[h], %[h], %[b]	\n\t"              \
        "adcs	%[o], %[o], %[c]	\n\t"              \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb), [c] "l" (vc)     \
        : "cc"                                         \
    )
  2328. #elif defined(__GNUC__)
/* Multiply va by vb and store double size result in: vh | vl
 *
 * __GNUC__ path: two-operand divided (pre-UAL) Thumb-1 syntax - NOTE:
 * these unsuffixed forms are assumed to set flags like their UAL
 * 's'-suffixed counterparts on Thumb-1, which the ADC carry chain
 * relies on.  Four 16x16 partial products: al*bl into vl, al*bh and
 * ah*bl split across vl/vh, ah*bh into vh (no carry out is possible
 * since the full product fits in 64 bits).
 *
 * vl, vh  [out]  low / high words of the 64-bit product.
 * va, vb  [in]   multiplicands (not modified).
 */
#define SP_ASM_MUL(vl, vh, va, vb)                     \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	%[l], %[b]	\n\t"                      \
        "mul	%[l], r6	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r4, %[b], #16	\n\t"                  \
        "mul	r6, r4	\n\t"                          \
        "lsr	%[h], r6, #16	\n\t"                  \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "mul	r4, r6	\n\t"                          \
        "add	%[h], %[h], r4	\n\t"                  \
        /* ah * bl */                                  \
        "uxth	r4, %[b]	\n\t"                      \
        "mul	r6, r4	\n\t"                          \
        "lsr	r4, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r4	\n\t"                      \
        : [h] "+l" (vh), [l] "+l" (vl)                 \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r4", "r5", "r6", "cc"                       \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl
 *
 * As SP_ASM_MUL but also zeroes the third word: vo is set to 0 (and
 * reused as the zero operand for the carry add), since a 32x32 product
 * can never reach the third word.
 *
 * vl, vh  [out]  low / high words of the 64-bit product.
 * vo      [out]  set to zero.
 * va, vb  [in]   multiplicands (not modified).
 */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	%[l], %[b]	\n\t"                      \
        "mul	%[l], r6	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r5, %[b], #16	\n\t"                  \
        "mul	r6, r5	\n\t"                          \
        "lsr	%[h], r6, #16	\n\t"                  \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        /* vo becomes the zero constant and the final vo value. */ \
        "mov	%[o], #0	\n\t"                      \
        "adc	%[h], %[o]	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "mul	r5, r6	\n\t"                          \
        "add	%[h], %[h], r5	\n\t"                  \
        /* ah * bl */                                  \
        "uxth	r5, %[b]	\n\t"                      \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "cc"                             \
    )
  2389. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * Fast (!WOLFSSL_SP_SMALL && !DEBUG) variant: keeps zero in r5 for the
 * whole sequence and uses r7 as the partial-product scratch.  Each of
 * the four 16x16 partial products is accumulated once, with carries
 * rippling into the third word vo.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r7, r6	\n\t"                          \
        "add	%[l], %[l], r7	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r7, r6	\n\t"                          \
        "add	%[h], %[h], r7	\n\t"                  \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r7", "cc"                       \
    )
  2427. #else
/* Multiply va by vb and add double size result into: vo | vh | vl
 *
 * Small/debug variant: avoids r7 by reusing r5 for both the partial
 * product and the zero constant, re-zeroing it ("mov r5, #0") before
 * every carry add.  Functionally identical to the fast variant above.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb)             \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r5, %[b]	\n\t"                      \
        "mul	r5, r6	\n\t"                          \
        "add	%[l], %[l], r5	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r5, %[b], #16	\n\t"                  \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        /* r5 held the high half; re-zero it for the vo carry. */ \
        "mov	r5, #0	\n\t"                          \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r5, %[b], #16	\n\t"                  \
        "mul	r5, r6	\n\t"                          \
        "add	%[h], %[h], r5	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r5, %[b]	\n\t"                      \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "mov	r5, #0	\n\t"                          \
        "adc	%[o], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "cc"                             \
    )
  2468. #endif
/* Multiply va by vb and add double size result into: vh | vl
 *
 * Two-word accumulate: the caller guarantees the result fits in
 * vh | vl, so no overflow word is tracked.  r4 holds each partial
 * product, r5 the zero constant, r6 the current half of va.
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va, vb  [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb)              \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r4, %[b]	\n\t"                      \
        "mul	r4, r6	\n\t"                          \
        "add	%[l], %[l], r4	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r4, %[b], #16	\n\t"                  \
        "mul	r6, r4	\n\t"                          \
        "lsr	r4, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r4	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r4, %[b], #16	\n\t"                  \
        "mul	r4, r6	\n\t"                          \
        "add	%[h], %[h], r4	\n\t"                  \
        /* ah * bl */                                  \
        "uxth	r4, %[b]	\n\t"                      \
        "mul	r6, r4	\n\t"                          \
        "lsr	r4, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r4	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r4", "r5", "r6", "cc"                       \
    )
  2502. #if !defined(WOLFSSL_SP_SMALL) && !defined(DEBUG)
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Fast (!WOLFSSL_SP_SMALL && !DEBUG) variant: zero stays in r5, r7
 * holds each partial product.  Every one of the four 16x16 partial
 * products is accumulated twice into the 96-bit total vo | vh | vl.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)            \
    __asm__ __volatile__ (                             \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r7, r6	\n\t"                          \
        "add	%[l], %[l], r7	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        "add	%[l], %[l], r7	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r7, r6	\n\t"                          \
        "add	%[h], %[h], r7	\n\t"                  \
        "adc	%[o], r5	\n\t"                      \
        "add	%[h], %[h], r7	\n\t"                  \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r7", "cc"                       \
    )
  2551. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 *
 * Small/debug variant: avoids r7 by saving va in high register r8 and
 * reusing va's register as the zero constant for carry propagation.
 * va is restored before exit so all input operands are unchanged.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (va preserved via r8).
 */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb)            \
    __asm__ __volatile__ (                             \
        /* Save va; its register doubles as zero scratch below. */ \
        "mov	r8, %[a]	\n\t"                      \
        /* al * bl */                                  \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r5, %[b]	\n\t"                      \
        "mul	r5, r6	\n\t"                          \
        "add	%[l], %[l], r5	\n\t"                  \
        "mov	%[a], #0	\n\t"                      \
        "adc	%[h], %[a]	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[l], %[l], r5	\n\t"                  \
        "adc	%[h], %[a]	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r5, %[b], #16	\n\t"                  \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        /* ah * bh */                                  \
        /* Reload va from r8 to extract its high half. */ \
        "mov	%[a], r8	\n\t"                      \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r5, %[b], #16	\n\t"                  \
        "mul	r5, r6	\n\t"                          \
        "add	%[h], %[h], r5	\n\t"                  \
        "mov	%[a], #0	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[h], %[h], r5	\n\t"                  \
        "adc	%[o], %[a]	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r5, %[b]	\n\t"                      \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        /* Restore va so the input operand is unchanged on exit. */ \
        "mov	%[a], r8	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r8", "cc"                       \
    )
  2604. #endif
  2605. #ifndef DEBUG
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * As SP_ASM_MUL_ADD2 (fast variant) but the al*bl accumulation omits
 * the carry into vo - the caller guarantees the first double add fits
 * in the 64-bit vh | vl pair, saving two ADC instructions.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (not modified).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)         \
    __asm__ __volatile__ (                             \
        /* al * bl - no carry into vo by assumption. */ \
        "uxth	r6, %[a]	\n\t"                      \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r7, r6	\n\t"                          \
        "add	%[l], %[l], r7	\n\t"                  \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        "add	%[l], %[l], r7	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bh */                                  \
        "lsr	r6, %[a], #16	\n\t"                  \
        "lsr	r7, %[b], #16	\n\t"                  \
        "mul	r7, r6	\n\t"                          \
        "add	%[h], %[h], r7	\n\t"                  \
        "adc	%[o], r5	\n\t"                      \
        "add	%[h], %[h], r7	\n\t"                  \
        "adc	%[o], r5	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r7, %[b]	\n\t"                      \
        "mul	r6, r7	\n\t"                          \
        "lsr	r7, r6, #16	\n\t"                      \
        "lsl	r6, r6, #16	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r7	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r7", "cc"                       \
    )
  2653. #else
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 *
 * DEBUG-build variant: avoids r7 by saving va in high register r8 and
 * reusing va's register as the zero constant; va is restored on exit.
 * The al*bl accumulation omits the carry into vo per the no-overflow
 * assumption above.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb      [in]      multiplicands (va preserved via r8).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb)         \
    __asm__ __volatile__ (                             \
        /* Save va; its register doubles as zero scratch below. */ \
        "mov	r8, %[a]	\n\t"                      \
        /* al * bl - no carry into vo by assumption. */ \
        "uxth	r5, %[a]	\n\t"                      \
        "uxth	r6, %[b]	\n\t"                      \
        "mul	r6, r5	\n\t"                          \
        "add	%[l], %[l], r6	\n\t"                  \
        "mov	%[a], #0	\n\t"                      \
        "adc	%[h], %[a]	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], %[a]	\n\t"                      \
        /* al * bh */                                  \
        "lsr	r6, %[b], #16	\n\t"                  \
        "mul	r5, r6	\n\t"                          \
        "lsr	r6, r5, #16	\n\t"                      \
        "lsl	r5, r5, #16	\n\t"                      \
        "add	%[l], %[l], r5	\n\t"                  \
        "adc	%[h], r6	\n\t"                      \
        "add	%[l], %[l], r5	\n\t"                  \
        "adc	%[h], r6	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        /* ah * bh */                                  \
        /* Reload va from r8 to extract its high half. */ \
        "mov	%[a], r8	\n\t"                      \
        "lsr	r5, %[a], #16	\n\t"                  \
        "lsr	r6, %[b], #16	\n\t"                  \
        "mul	r6, r5	\n\t"                          \
        "mov	%[a], #0	\n\t"                      \
        "add	%[h], %[h], r6	\n\t"                  \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[h], %[h], r6	\n\t"                  \
        "adc	%[o], %[a]	\n\t"                      \
        /* ah * bl */                                  \
        "uxth	r6, %[b]	\n\t"                      \
        "mul	r5, r6	\n\t"                          \
        "lsr	r6, r5, #16	\n\t"                      \
        "lsl	r5, r5, #16	\n\t"                      \
        "add	%[l], %[l], r5	\n\t"                  \
        "adc	%[h], r6	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        "add	%[l], %[l], r5	\n\t"                  \
        "adc	%[h], r6	\n\t"                      \
        "adc	%[o], %[a]	\n\t"                      \
        /* Restore va so the input operand is unchanged on exit. */ \
        "mov	%[a], r8	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb)                   \
        : "r5", "r6", "r8", "cc"                       \
    )
  2705. #endif
/* Square va and store double size result in: vh | vl
 *
 * __GNUC__ path (divided Thumb syntax).  Uses
 * (ah*2^16 + al)^2 = ah^2*2^32 + 2*al*ah*2^16 + al^2; the doubled
 * cross product is folded in by shifting al*ah left 17 / right 15
 * (the extra shift bit is the *2).
 *
 * vl, vh  [out]  low / high words of the 64-bit square.
 * va      [in]   value to square (not modified).
 */
#define SP_ASM_SQR(vl, vh, va)                         \
    __asm__ __volatile__ (                             \
        "lsr	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        "mov	%[l], r6	\n\t"                      \
        "mov	%[h], r5	\n\t"                      \
        /* al * al */                                  \
        "mul	%[l], %[l]	\n\t"                      \
        /* ah * ah */                                  \
        "mul	%[h], %[h]	\n\t"                      \
        /* 2 * al * ah */                              \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #15	\n\t"                      \
        "lsl	r6, r6, #17	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        : [h] "+l" (vh), [l] "+l" (vl)                 \
        : [a] "l" (va)                                 \
        : "r5", "r6", "cc"                             \
    )
/* Square va and add double size result into: vo | vh | vl
 *
 * Adds al^2 and ah^2 (with carry into vo), then the doubled cross
 * product 2*al*ah via the 17/15-bit shift split.  r4 is used as
 * scratch in addition to r5 (zero) and r6.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va          [in]      value to square (not modified).
 */
#define SP_ASM_SQR_ADD(vl, vh, vo, va)                 \
    __asm__ __volatile__ (                             \
        "lsr	r4, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* al * al */                                  \
        "mul	r6, r6	\n\t"                          \
        /* ah * ah */                                  \
        "mul	r4, r4	\n\t"                          \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r4	\n\t"                      \
        "mov	r5, #0	\n\t"                          \
        "adc	%[o], r5	\n\t"                      \
        /* Re-split va: r4/r6 were consumed by the squares above. */ \
        "lsr	r4, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* 2 * al * ah */                              \
        "mul	r6, r4	\n\t"                          \
        "lsr	r4, r6, #15	\n\t"                      \
        "lsl	r6, r6, #17	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r4	\n\t"                      \
        "adc	%[o], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va)                                 \
        : "r4", "r5", "r6", "cc"                       \
    )
/* Square va and add double size result into: vh | vl
 *
 * As SP_ASM_SQR_ADD but without the overflow word - the caller
 * guarantees the accumulation fits in the 64-bit vh | vl pair.
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      value to square (not modified).
 */
#define SP_ASM_SQR_ADD_NO(vl, vh, va)                  \
    __asm__ __volatile__ (                             \
        "lsr	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* al * al */                                  \
        "mul	r6, r6	\n\t"                          \
        /* ah * ah */                                  \
        "mul	r5, r5	\n\t"                          \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        /* Re-split va: r5/r6 were consumed by the squares above. */ \
        "lsr	r5, %[a], #16	\n\t"                  \
        "uxth	r6, %[a]	\n\t"                      \
        /* 2 * al * ah */                              \
        "mul	r6, r5	\n\t"                          \
        "lsr	r5, r6, #15	\n\t"                      \
        "lsl	r6, r6, #17	\n\t"                      \
        "add	%[l], %[l], r6	\n\t"                  \
        "adc	%[h], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "r6", "cc"                             \
    )
/* Add va into: vh | vl
 *
 * 64-bit accumulate of a single word: vl += va, carry into vh
 * (r5 supplies the zero operand for the ADC).
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      word to add (not modified).
 */
#define SP_ASM_ADDC(vl, vh, va)                        \
    __asm__ __volatile__ (                             \
        "add	%[l], %[l], %[a]	\n\t"              \
        "mov	r5, #0	\n\t"                          \
        "adc	%[h], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "cc"                                   \
    )
/* Sub va from: vh | vl
 *
 * 64-bit subtract of a single word: vl -= va, borrow propagated out
 * of vh via SBC (r5 supplies the zero operand).
 *
 * vl, vh  [in,out]  low / high words of the accumulator.
 * va      [in]      word to subtract (not modified).
 */
#define SP_ASM_SUBB(vl, vh, va)                        \
    __asm__ __volatile__ (                             \
        "sub	%[l], %[l], %[a]	\n\t"              \
        "mov	r5, #0	\n\t"                          \
        "sbc	%[h], r5	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh)                 \
        : [a] "l" (va)                                 \
        : "r5", "cc"                                   \
    )
/* Add two times vc | vb | va into vo | vh | vl
 *
 * Triple-word add performed twice (used to double symmetric partial
 * products in squaring); carries ripple l -> h -> o on each pass.
 *
 * vl, vh, vo  [in,out]  low / high / overflow words of the accumulator.
 * va, vb, vc  [in]      low / high / overflow words of the addend.
 */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc)       \
    __asm__ __volatile__ (                             \
        "add	%[l], %[l], %[a]	\n\t"              \
        "adc	%[h], %[b]	\n\t"                      \
        "adc	%[o], %[c]	\n\t"                      \
        "add	%[l], %[l], %[a]	\n\t"              \
        "adc	%[h], %[b]	\n\t"                      \
        "adc	%[o], %[c]	\n\t"                      \
        : [l] "+l" (vl), [h] "+l" (vh), [o] "+l" (vo)  \
        : [a] "l" (va), [b] "l" (vb), [c] "l" (vc)     \
        : "cc"                                         \
    )
  2809. #endif
  2810. #ifdef WOLFSSL_SP_DIV_WORD_HALF
  2811. /* Divide a two digit number by a digit number and return. (hi | lo) / d
  2812. *
  2813. * No division instruction used - does operation bit by bit.
  2814. * Constant time.
  2815. *
  2816. * @param [in] hi SP integer digit. High digit of the dividend.
  2817. * @param [in] lo SP integer digit. Lower digit of the dividend.
  2818. * @param [in] d SP integer digit. Number to divide by.
  2819. * @return The division result.
  2820. */
  2821. static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
  2822. sp_int_digit d)
  2823. {
  2824. __asm__ __volatile__ (
  2825. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2826. "lsrs r3, %[d], #24\n\t"
  2827. #else
  2828. "lsr r3, %[d], #24\n\t"
  2829. #endif
  2830. "beq 2%=f\n\t"
  2831. "\n1%=:\n\t"
  2832. "movs r3, #0\n\t"
  2833. "b 3%=f\n\t"
  2834. "\n2%=:\n\t"
  2835. "mov r3, #8\n\t"
  2836. "\n3%=:\n\t"
  2837. "movs r4, #31\n\t"
  2838. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2839. "subs r4, r4, r3\n\t"
  2840. #else
  2841. "sub r4, r4, r3\n\t"
  2842. #endif
  2843. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2844. "lsls %[d], %[d], r3\n\t"
  2845. #else
  2846. "lsl %[d], %[d], r3\n\t"
  2847. #endif
  2848. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2849. "lsls %[hi], %[hi], r3\n\t"
  2850. #else
  2851. "lsl %[hi], %[hi], r3\n\t"
  2852. #endif
  2853. "mov r5, %[lo]\n\t"
  2854. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2855. "lsrs r5, r5, r4\n\t"
  2856. #else
  2857. "lsr r5, r5, r4\n\t"
  2858. #endif
  2859. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2860. "lsls %[lo], %[lo], r3\n\t"
  2861. #else
  2862. "lsl %[lo], %[lo], r3\n\t"
  2863. #endif
  2864. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2865. "lsrs r5, r5, #1\n\t"
  2866. #else
  2867. "lsr r5, r5, #1\n\t"
  2868. #endif
  2869. #if defined(WOLFSSL_KEIL)
  2870. "orrs %[hi], %[hi], r5\n\t"
  2871. #elif defined(__clang__)
  2872. "orrs %[hi], r5\n\t"
  2873. #else
  2874. "orr %[hi], r5\n\t"
  2875. #endif
  2876. "movs r3, #0\n\t"
  2877. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2878. "lsrs r5, %[d], #1\n\t"
  2879. #else
  2880. "lsr r5, %[d], #1\n\t"
  2881. #endif
  2882. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2883. "adds r5, r5, #1\n\t"
  2884. #else
  2885. "add r5, r5, #1\n\t"
  2886. #endif
  2887. "mov r8, %[lo]\n\t"
  2888. "mov r9, %[hi]\n\t"
  2889. /* Do top 32 */
  2890. "movs r6, r5\n\t"
  2891. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2892. "subs r6, r6, %[hi]\n\t"
  2893. #else
  2894. "sub r6, r6, %[hi]\n\t"
  2895. #endif
  2896. #ifdef WOLFSSL_KEIL
  2897. "sbcs r6, r6, r6\n\t"
  2898. #elif defined(__clang__)
  2899. "sbcs r6, r6\n\t"
  2900. #else
  2901. "sbc r6, r6\n\t"
  2902. #endif
  2903. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2904. "adds r3, r3, r3\n\t"
  2905. #else
  2906. "add r3, r3, r3\n\t"
  2907. #endif
  2908. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2909. "subs r3, r3, r6\n\t"
  2910. #else
  2911. "sub r3, r3, r6\n\t"
  2912. #endif
  2913. #ifdef WOLFSSL_KEIL
  2914. "ands r6, r6, r5\n\t"
  2915. #elif defined(__clang__)
  2916. "ands r6, r5\n\t"
  2917. #else
  2918. "and r6, r5\n\t"
  2919. #endif
  2920. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2921. "subs %[hi], %[hi], r6\n\t"
  2922. #else
  2923. "sub %[hi], %[hi], r6\n\t"
  2924. #endif
  2925. "movs r4, #29\n\t"
  2926. "\n"
  2927. "L_sp_div_word_loop%=:\n\t"
  2928. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2929. "lsls %[lo], %[lo], #1\n\t"
  2930. #else
  2931. "lsl %[lo], %[lo], #1\n\t"
  2932. #endif
  2933. #ifdef WOLFSSL_KEIL
  2934. "adcs %[hi], %[hi], %[hi]\n\t"
  2935. #elif defined(__clang__)
  2936. "adcs %[hi], %[hi]\n\t"
  2937. #else
  2938. "adc %[hi], %[hi]\n\t"
  2939. #endif
  2940. "movs r6, r5\n\t"
  2941. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2942. "subs r6, r6, %[hi]\n\t"
  2943. #else
  2944. "sub r6, r6, %[hi]\n\t"
  2945. #endif
  2946. #ifdef WOLFSSL_KEIL
  2947. "sbcs r6, r6, r6\n\t"
  2948. #elif defined(__clang__)
  2949. "sbcs r6, r6\n\t"
  2950. #else
  2951. "sbc r6, r6\n\t"
  2952. #endif
  2953. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2954. "adds r3, r3, r3\n\t"
  2955. #else
  2956. "add r3, r3, r3\n\t"
  2957. #endif
  2958. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2959. "subs r3, r3, r6\n\t"
  2960. #else
  2961. "sub r3, r3, r6\n\t"
  2962. #endif
  2963. #ifdef WOLFSSL_KEIL
  2964. "ands r6, r6, r5\n\t"
  2965. #elif defined(__clang__)
  2966. "ands r6, r5\n\t"
  2967. #else
  2968. "and r6, r5\n\t"
  2969. #endif
  2970. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2971. "subs %[hi], %[hi], r6\n\t"
  2972. #else
  2973. "sub %[hi], %[hi], r6\n\t"
  2974. #endif
  2975. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2976. "subs r4, r4, #1\n\t"
  2977. #else
  2978. "sub r4, r4, #1\n\t"
  2979. #endif
  2980. "bpl L_sp_div_word_loop%=\n\t"
  2981. "movs r7, #0\n\t"
  2982. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2983. "adds r3, r3, r3\n\t"
  2984. #else
  2985. "add r3, r3, r3\n\t"
  2986. #endif
  2987. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  2988. "adds r3, r3, #1\n\t"
  2989. #else
  2990. "add r3, r3, #1\n\t"
  2991. #endif
  2992. /* r * d - Start */
  2993. "uxth %[hi], r3\n\t"
  2994. "uxth r4, %[d]\n\t"
  2995. #ifdef WOLFSSL_KEIL
  2996. "muls r4, %[hi], r4\n\t"
  2997. #elif defined(__clang__)
  2998. "muls r4, %[hi]\n\t"
  2999. #else
  3000. "mul r4, %[hi]\n\t"
  3001. #endif
  3002. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3003. "lsrs r6, %[d], #16\n\t"
  3004. #else
  3005. "lsr r6, %[d], #16\n\t"
  3006. #endif
  3007. #ifdef WOLFSSL_KEIL
  3008. "muls %[hi], r6, %[hi]\n\t"
  3009. #elif defined(__clang__)
  3010. "muls %[hi], r6\n\t"
  3011. #else
  3012. "mul %[hi], r6\n\t"
  3013. #endif
  3014. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3015. "lsrs r5, %[hi], #16\n\t"
  3016. #else
  3017. "lsr r5, %[hi], #16\n\t"
  3018. #endif
  3019. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3020. "lsls %[hi], %[hi], #16\n\t"
  3021. #else
  3022. "lsl %[hi], %[hi], #16\n\t"
  3023. #endif
  3024. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3025. "adds r4, r4, %[hi]\n\t"
  3026. #else
  3027. "add r4, r4, %[hi]\n\t"
  3028. #endif
  3029. #ifdef WOLFSSL_KEIL
  3030. "adcs r5, r5, r7\n\t"
  3031. #elif defined(__clang__)
  3032. "adcs r5, r7\n\t"
  3033. #else
  3034. "adc r5, r7\n\t"
  3035. #endif
  3036. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3037. "lsrs %[hi], r3, #16\n\t"
  3038. #else
  3039. "lsr %[hi], r3, #16\n\t"
  3040. #endif
  3041. #ifdef WOLFSSL_KEIL
  3042. "muls r6, %[hi], r6\n\t"
  3043. #elif defined(__clang__)
  3044. "muls r6, %[hi]\n\t"
  3045. #else
  3046. "mul r6, %[hi]\n\t"
  3047. #endif
  3048. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3049. "adds r5, r5, r6\n\t"
  3050. #else
  3051. "add r5, r5, r6\n\t"
  3052. #endif
  3053. "uxth r6, %[d]\n\t"
  3054. #ifdef WOLFSSL_KEIL
  3055. "muls %[hi], r6, %[hi]\n\t"
  3056. #elif defined(__clang__)
  3057. "muls %[hi], r6\n\t"
  3058. #else
  3059. "mul %[hi], r6\n\t"
  3060. #endif
  3061. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3062. "lsrs r6, %[hi], #16\n\t"
  3063. #else
  3064. "lsr r6, %[hi], #16\n\t"
  3065. #endif
  3066. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3067. "lsls %[hi], %[hi], #16\n\t"
  3068. #else
  3069. "lsl %[hi], %[hi], #16\n\t"
  3070. #endif
  3071. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3072. "adds r4, r4, %[hi]\n\t"
  3073. #else
  3074. "add r4, r4, %[hi]\n\t"
  3075. #endif
  3076. #ifdef WOLFSSL_KEIL
  3077. "adcs r5, r5, r6\n\t"
  3078. #elif defined(__clang__)
  3079. "adcs r5, r6\n\t"
  3080. #else
  3081. "adc r5, r6\n\t"
  3082. #endif
  3083. /* r * d - Done */
  3084. "mov %[hi], r8\n\t"
  3085. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3086. "subs %[hi], %[hi], r4\n\t"
  3087. #else
  3088. "sub %[hi], %[hi], r4\n\t"
  3089. #endif
  3090. "movs r4, %[hi]\n\t"
  3091. "mov %[hi], r9\n\t"
  3092. #ifdef WOLFSSL_KEIL
  3093. "sbcs %[hi], %[hi], r5\n\t"
  3094. #elif defined(__clang__)
  3095. "sbcs %[hi], r5\n\t"
  3096. #else
  3097. "sbc %[hi], r5\n\t"
  3098. #endif
  3099. "movs r5, %[hi]\n\t"
  3100. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3101. "adds r3, r3, r5\n\t"
  3102. #else
  3103. "add r3, r3, r5\n\t"
  3104. #endif
  3105. /* r * d - Start */
  3106. "uxth %[hi], r3\n\t"
  3107. "uxth r4, %[d]\n\t"
  3108. #ifdef WOLFSSL_KEIL
  3109. "muls r4, %[hi], r4\n\t"
  3110. #elif defined(__clang__)
  3111. "muls r4, %[hi]\n\t"
  3112. #else
  3113. "mul r4, %[hi]\n\t"
  3114. #endif
  3115. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3116. "lsrs r6, %[d], #16\n\t"
  3117. #else
  3118. "lsr r6, %[d], #16\n\t"
  3119. #endif
  3120. #ifdef WOLFSSL_KEIL
  3121. "muls %[hi], r6, %[hi]\n\t"
  3122. #elif defined(__clang__)
  3123. "muls %[hi], r6\n\t"
  3124. #else
  3125. "mul %[hi], r6\n\t"
  3126. #endif
  3127. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3128. "lsrs r5, %[hi], #16\n\t"
  3129. #else
  3130. "lsr r5, %[hi], #16\n\t"
  3131. #endif
  3132. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3133. "lsls %[hi], %[hi], #16\n\t"
  3134. #else
  3135. "lsl %[hi], %[hi], #16\n\t"
  3136. #endif
  3137. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3138. "adds r4, r4, %[hi]\n\t"
  3139. #else
  3140. "add r4, r4, %[hi]\n\t"
  3141. #endif
  3142. #ifdef WOLFSSL_KEIL
  3143. "adcs r5, r5, r7\n\t"
  3144. #elif defined(__clang__)
  3145. "adcs r5, r7\n\t"
  3146. #else
  3147. "adc r5, r7\n\t"
  3148. #endif
  3149. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3150. "lsrs %[hi], r3, #16\n\t"
  3151. #else
  3152. "lsr %[hi], r3, #16\n\t"
  3153. #endif
  3154. #ifdef WOLFSSL_KEIL
  3155. "muls r6, %[hi], r6\n\t"
  3156. #elif defined(__clang__)
  3157. "muls r6, %[hi]\n\t"
  3158. #else
  3159. "mul r6, %[hi]\n\t"
  3160. #endif
  3161. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3162. "adds r5, r5, r6\n\t"
  3163. #else
  3164. "add r5, r5, r6\n\t"
  3165. #endif
  3166. "uxth r6, %[d]\n\t"
  3167. #ifdef WOLFSSL_KEIL
  3168. "muls %[hi], r6, %[hi]\n\t"
  3169. #elif defined(__clang__)
  3170. "muls %[hi], r6\n\t"
  3171. #else
  3172. "mul %[hi], r6\n\t"
  3173. #endif
  3174. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3175. "lsrs r6, %[hi], #16\n\t"
  3176. #else
  3177. "lsr r6, %[hi], #16\n\t"
  3178. #endif
  3179. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3180. "lsls %[hi], %[hi], #16\n\t"
  3181. #else
  3182. "lsl %[hi], %[hi], #16\n\t"
  3183. #endif
  3184. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3185. "adds r4, r4, %[hi]\n\t"
  3186. #else
  3187. "add r4, r4, %[hi]\n\t"
  3188. #endif
  3189. #ifdef WOLFSSL_KEIL
  3190. "adcs r5, r5, r6\n\t"
  3191. #elif defined(__clang__)
  3192. "adcs r5, r6\n\t"
  3193. #else
  3194. "adc r5, r6\n\t"
  3195. #endif
  3196. /* r * d - Done */
  3197. "mov %[hi], r8\n\t"
  3198. "mov r6, r9\n\t"
  3199. #ifdef WOLFSSL_KEIL
  3200. "subs r4, %[hi], r4\n\t"
  3201. #else
  3202. #ifdef __clang__
  3203. "subs r4, %[hi], r4\n\t"
  3204. #else
  3205. "sub r4, %[hi], r4\n\t"
  3206. #endif
  3207. #endif
  3208. #ifdef WOLFSSL_KEIL
  3209. "sbcs r6, r6, r5\n\t"
  3210. #elif defined(__clang__)
  3211. "sbcs r6, r5\n\t"
  3212. #else
  3213. "sbc r6, r5\n\t"
  3214. #endif
  3215. "movs r5, r6\n\t"
  3216. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3217. "adds r3, r3, r5\n\t"
  3218. #else
  3219. "add r3, r3, r5\n\t"
  3220. #endif
  3221. /* r * d - Start */
  3222. "uxth %[hi], r3\n\t"
  3223. "uxth r4, %[d]\n\t"
  3224. #ifdef WOLFSSL_KEIL
  3225. "muls r4, %[hi], r4\n\t"
  3226. #elif defined(__clang__)
  3227. "muls r4, %[hi]\n\t"
  3228. #else
  3229. "mul r4, %[hi]\n\t"
  3230. #endif
  3231. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3232. "lsrs r6, %[d], #16\n\t"
  3233. #else
  3234. "lsr r6, %[d], #16\n\t"
  3235. #endif
  3236. #ifdef WOLFSSL_KEIL
  3237. "muls %[hi], r6, %[hi]\n\t"
  3238. #elif defined(__clang__)
  3239. "muls %[hi], r6\n\t"
  3240. #else
  3241. "mul %[hi], r6\n\t"
  3242. #endif
  3243. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3244. "lsrs r5, %[hi], #16\n\t"
  3245. #else
  3246. "lsr r5, %[hi], #16\n\t"
  3247. #endif
  3248. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3249. "lsls %[hi], %[hi], #16\n\t"
  3250. #else
  3251. "lsl %[hi], %[hi], #16\n\t"
  3252. #endif
  3253. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3254. "adds r4, r4, %[hi]\n\t"
  3255. #else
  3256. "add r4, r4, %[hi]\n\t"
  3257. #endif
  3258. #ifdef WOLFSSL_KEIL
  3259. "adcs r5, r5, r7\n\t"
  3260. #elif defined(__clang__)
  3261. "adcs r5, r7\n\t"
  3262. #else
  3263. "adc r5, r7\n\t"
  3264. #endif
  3265. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3266. "lsrs %[hi], r3, #16\n\t"
  3267. #else
  3268. "lsr %[hi], r3, #16\n\t"
  3269. #endif
  3270. #ifdef WOLFSSL_KEIL
  3271. "muls r6, %[hi], r6\n\t"
  3272. #elif defined(__clang__)
  3273. "muls r6, %[hi]\n\t"
  3274. #else
  3275. "mul r6, %[hi]\n\t"
  3276. #endif
  3277. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3278. "adds r5, r5, r6\n\t"
  3279. #else
  3280. "add r5, r5, r6\n\t"
  3281. #endif
  3282. "uxth r6, %[d]\n\t"
  3283. #ifdef WOLFSSL_KEIL
  3284. "muls %[hi], r6, %[hi]\n\t"
  3285. #elif defined(__clang__)
  3286. "muls %[hi], r6\n\t"
  3287. #else
  3288. "mul %[hi], r6\n\t"
  3289. #endif
  3290. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3291. "lsrs r6, %[hi], #16\n\t"
  3292. #else
  3293. "lsr r6, %[hi], #16\n\t"
  3294. #endif
  3295. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3296. "lsls %[hi], %[hi], #16\n\t"
  3297. #else
  3298. "lsl %[hi], %[hi], #16\n\t"
  3299. #endif
  3300. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3301. "adds r4, r4, %[hi]\n\t"
  3302. #else
  3303. "add r4, r4, %[hi]\n\t"
  3304. #endif
  3305. #ifdef WOLFSSL_KEIL
  3306. "adcs r5, r5, r6\n\t"
  3307. #elif defined(__clang__)
  3308. "adcs r5, r6\n\t"
  3309. #else
  3310. "adc r5, r6\n\t"
  3311. #endif
  3312. /* r * d - Done */
  3313. "mov %[hi], r8\n\t"
  3314. "mov r6, r9\n\t"
  3315. #ifdef WOLFSSL_KEIL
  3316. "subs r4, %[hi], r4\n\t"
  3317. #else
  3318. #ifdef __clang__
  3319. "subs r4, %[hi], r4\n\t"
  3320. #else
  3321. "sub r4, %[hi], r4\n\t"
  3322. #endif
  3323. #endif
  3324. #ifdef WOLFSSL_KEIL
  3325. "sbcs r6, r6, r5\n\t"
  3326. #elif defined(__clang__)
  3327. "sbcs r6, r5\n\t"
  3328. #else
  3329. "sbc r6, r5\n\t"
  3330. #endif
  3331. "movs r5, r6\n\t"
  3332. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3333. "adds r3, r3, r5\n\t"
  3334. #else
  3335. "add r3, r3, r5\n\t"
  3336. #endif
  3337. "movs r6, %[d]\n\t"
  3338. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3339. "subs r6, r6, r4\n\t"
  3340. #else
  3341. "sub r6, r6, r4\n\t"
  3342. #endif
  3343. #ifdef WOLFSSL_KEIL
  3344. "sbcs r6, r6, r6\n\t"
  3345. #elif defined(__clang__)
  3346. "sbcs r6, r6\n\t"
  3347. #else
  3348. "sbc r6, r6\n\t"
  3349. #endif
  3350. #if defined(__clang__) || defined(WOLFSSL_KEIL)
  3351. "subs r3, r3, r6\n\t"
  3352. #else
  3353. "sub r3, r3, r6\n\t"
  3354. #endif
  3355. "movs %[hi], r3\n\t"
  3356. : [hi] "+l" (hi), [lo] "+l" (lo), [d] "+l" (d)
  3357. :
  3358. : "r3", "r4", "r5", "r6", "r7", "r8", "r9", "cc"
  3359. );
  3360. return (uint32_t)(size_t)hi;
  3361. }
  3362. #define SP_ASM_DIV_WORD
  3363. #endif /* !WOLFSSL_SP_DIV_WORD_HALF */
  3364. #define SP_INT_ASM_AVAILABLE
  3365. #endif /* WOLFSSL_SP_ARM_THUMB && SP_WORD_SIZE == 32 */
#if defined(WOLFSSL_SP_PPC64) && SP_WORD_SIZE == 64
/*
 * CPU: PPC64
 *
 * NOTE(review): the accumulate macros below use general purpose registers
 * 16 and 17 as fixed scratch registers for the product; they are declared
 * in the clobber lists. Macros that use the carry chain (addc/adde/addze,
 * subfc/subfe) also clobber "cc".
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mulld %[l], %[a], %[b] \n\t" \
        "mulhdu %[h], %[a], %[b] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va), [b] "r" (vb) \
        : "memory" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl.
 * The overflow word vo is zeroed. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mulhdu %[h], %[a], %[b] \n\t" \
        "mulld %[l], %[a], %[b] \n\t" \
        "li %[o], 0 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl.
 * addze propagates any carry out of vh into the overflow word vo. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[b] \n\t" \
        "mulhdu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result into: vh | vl.
 * No overflow word: caller guarantees vh | vl cannot overflow. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[b] \n\t" \
        "mulhdu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl.
 * The product is computed once and accumulated twice. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[b] \n\t" \
        "mulhdu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 * (so addze is only needed after the second accumulate).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[b] \n\t" \
        "mulhdu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mulld %[l], %[a], %[a] \n\t" \
        "mulhdu %[h], %[a], %[a] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va) \
        : "memory" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[a] \n\t" \
        "mulhdu 17, %[a], %[a] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va) \
        : "16", "17", "cc" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mulld 16, %[a], %[a] \n\t" \
        "mulhdu 17, %[a], %[a] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "16", "17", "cc" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addc %[l], %[l], %[a] \n\t" \
        "addze %[h], %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl.
 * subfc computes l - a and sets CA; subfe then subtracts the borrow
 * (via zeroed scratch register 16) from vh. */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subfc %[l], %[a], %[l] \n\t" \
        "li 16, 0 \n\t" \
        "subfe %[h], 16, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "16", "cc" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "addc %[l], %[l], %[a] \n\t" \
        "adde %[h], %[h], %[b] \n\t" \
        "adde %[o], %[o], %[c] \n\t" \
        "addc %[l], %[l], %[a] \n\t" \
        "adde %[h], %[h], %[b] \n\t" \
        "adde %[o], %[o], %[c] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "cc" \
    )
/* Count leading zeros. */
#define SP_ASM_LZCNT(va, vn) \
    __asm__ __volatile__ ( \
        "cntlzd %[n], %[a] \n\t" \
        : [n] "=r" (vn) \
        : [a] "r" (va) \
        : \
    )
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_PPC64 && SP_WORD_SIZE == 64 */
#if defined(WOLFSSL_SP_PPC) && SP_WORD_SIZE == 32
/*
 * CPU: PPC 32-bit
 *
 * NOTE(review): mirrors the PPC64 section with 32-bit instructions
 * (mullw/mulhwu/cntlzw). GPRs 16 and 17 are fixed scratch registers for
 * the accumulate macros; carry-chain macros clobber "cc".
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mullw %[l], %[a], %[b] \n\t" \
        "mulhwu %[h], %[a], %[b] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va), [b] "r" (vb) \
        : "memory" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl.
 * The overflow word vo is zeroed. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mulhwu %[h], %[a], %[b] \n\t" \
        "mullw %[l], %[a], %[b] \n\t" \
        "li %[o], 0 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl.
 * addze propagates any carry out of vh into the overflow word vo. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[b] \n\t" \
        "mulhwu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result into: vh | vl.
 * No overflow word: caller guarantees vh | vl cannot overflow. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[b] \n\t" \
        "mulhwu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl.
 * The product is computed once and accumulated twice. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[b] \n\t" \
        "mulhwu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 * (so addze is only needed after the second accumulate).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[b] \n\t" \
        "mulhwu 17, %[a], %[b] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "16", "17", "cc" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mullw %[l], %[a], %[a] \n\t" \
        "mulhwu %[h], %[a], %[a] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va) \
        : "memory" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[a] \n\t" \
        "mulhwu 17, %[a], %[a] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        "addze %[o], %[o] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va) \
        : "16", "17", "cc" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "mullw 16, %[a], %[a] \n\t" \
        "mulhwu 17, %[a], %[a] \n\t" \
        "addc %[l], %[l], 16 \n\t" \
        "adde %[h], %[h], 17 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "16", "17", "cc" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addc %[l], %[l], %[a] \n\t" \
        "addze %[h], %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "cc" \
    )
/* Sub va from: vh | vl.
 * subfc computes l - a and sets CA; subfe then subtracts the borrow
 * (via zeroed scratch register 16) from vh. */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "subfc %[l], %[a], %[l] \n\t" \
        "li 16, 0 \n\t" \
        "subfe %[h], 16, %[h] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "16", "cc" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "addc %[l], %[l], %[a] \n\t" \
        "adde %[h], %[h], %[b] \n\t" \
        "adde %[o], %[o], %[c] \n\t" \
        "addc %[l], %[l], %[a] \n\t" \
        "adde %[h], %[h], %[b] \n\t" \
        "adde %[o], %[o], %[c] \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "cc" \
    )
/* Count leading zeros. */
#define SP_ASM_LZCNT(va, vn) \
    __asm__ __volatile__ ( \
        "cntlzw %[n], %[a] \n\t" \
        : [n] "=r" (vn) \
        : [a] "r" (va) \
    )
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_PPC && SP_WORD_SIZE == 32 */
#if defined(WOLFSSL_SP_MIPS64) && SP_WORD_SIZE == 64
/*
 * CPU: MIPS 64-bit
 *
 * NOTE(review): MIPS has no flags register, so carries/borrows are
 * recomputed with sltu (set-on-less-than-unsigned): after x += y,
 * (x < y) is 1 exactly when the add wrapped. Registers $10-$12 (t2-t4)
 * are fixed scratch registers and the HI/LO multiply result registers
 * are clobbered by the dmultu-based macros.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va), [b] "r" (vb) \
        : "memory", "$lo", "$hi" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl.
 * The overflow word vo is zeroed. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        "move %[o], $0 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$lo", "$hi" \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl.
 * $10 = low product, $11 = high product, $12 = computed carry. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Multiply va by vb and add double size result into: vh | vl.
 * No overflow word: caller guarantees vh | vl cannot overflow. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl.
 * The product is computed once and accumulated twice. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 * (so carries into vo are only tracked on the second accumulate).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[a] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va) \
        : "memory", "$lo", "$hi" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[a] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "dmultu %[a], %[a] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "daddu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "daddu %[h], %[h], $11 \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$10", "$11", "$12", "$lo", "$hi" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "daddu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$12" \
    )
/* Sub va from: vh | vl.
 * Borrow detection: after l' = l - a, (old l < l') is 1 exactly when
 * the subtraction wrapped; that borrow is subtracted from vh. */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "move $12, %[l] \n\t" \
        "dsubu %[l], $12, %[a] \n\t" \
        "sltu $12, $12, %[l] \n\t" \
        "dsubu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$12" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "daddu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], %[b] \n\t" \
        "sltu $12, %[h], %[b] \n\t" \
        "daddu %[o], %[o], %[c] \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "daddu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        "daddu %[h], %[h], %[b] \n\t" \
        "sltu $12, %[h], %[b] \n\t" \
        "daddu %[o], %[o], %[c] \n\t" \
        "daddu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "$12" \
    )
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_MIPS64 && SP_WORD_SIZE == 64 */
#if defined(WOLFSSL_SP_MIPS) && SP_WORD_SIZE == 32
/*
 * CPU: MIPS 32-bit
 *
 * NOTE(review): mirrors the MIPS64 section with 32-bit instructions
 * (multu/addu/subu). Carries/borrows are recomputed with sltu since
 * MIPS has no flags register; $10-$12 (t2-t4) are fixed scratch
 * registers and HI/LO are clobbered by the multu-based macros.
 * The HI/LO clobbers here are spelled "%lo"/"%hi" while the MIPS64
 * section uses "$lo"/"$hi" -- TODO(review): confirm both spellings are
 * accepted by the targeted toolchains.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va), [b] "r" (vb) \
        : "memory", "%lo", "%hi" \
    )
/* Multiply va by vb and store double size result in: vo | vh | vl.
 * The overflow word vo is zeroed. */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        "move %[o], $0 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "%lo", "%hi" \
    )
/* Multiply va by vb and add double size result into: vo | vh | vl.
 * $10 = low product, $11 = high product, $12 = computed carry. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Multiply va by vb and add double size result into: vh | vl.
 * No overflow word: caller guarantees vh | vl cannot overflow. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl.
 * The product is computed once and accumulated twice. */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 * (so carries into vo are only tracked on the second accumulate).
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
    __asm__ __volatile__ ( \
        "multu %[a], %[b] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
    __asm__ __volatile__ ( \
        "multu %[a], %[a] \n\t" \
        "mflo %[l] \n\t" \
        "mfhi %[h] \n\t" \
        : [h] "+r" (vh), [l] "+r" (vl) \
        : [a] "r" (va) \
        : "memory", "%lo", "%hi" \
    )
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
    __asm__ __volatile__ ( \
        "multu %[a], %[a] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "sltu $12, %[h], $11 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
    __asm__ __volatile__ ( \
        "multu %[a], %[a] \n\t" \
        "mflo $10 \n\t" \
        "mfhi $11 \n\t" \
        "addu %[l], %[l], $10 \n\t" \
        "sltu $12, %[l], $10 \n\t" \
        "addu %[h], %[h], $11 \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$10", "$11", "$12", "%lo", "%hi" \
    )
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
    __asm__ __volatile__ ( \
        "addu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$12" \
    )
/* Sub va from: vh | vl.
 * Borrow detection: after l' = l - a, (old l < l') is 1 exactly when
 * the subtraction wrapped; that borrow is subtracted from vh. */
#define SP_ASM_SUBB(vl, vh, va) \
    __asm__ __volatile__ ( \
        "move $12, %[l] \n\t" \
        "subu %[l], $12, %[a] \n\t" \
        "sltu $12, $12, %[l] \n\t" \
        "subu %[h], %[h], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh) \
        : [a] "r" (va) \
        : "$12" \
    )
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
    __asm__ __volatile__ ( \
        "addu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], %[b] \n\t" \
        "sltu $12, %[h], %[b] \n\t" \
        "addu %[o], %[o], %[c] \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[l], %[l], %[a] \n\t" \
        "sltu $12, %[l], %[a] \n\t" \
        "addu %[h], %[h], $12 \n\t" \
        "sltu $12, %[h], $12 \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        "addu %[h], %[h], %[b] \n\t" \
        "sltu $12, %[h], %[b] \n\t" \
        "addu %[o], %[o], %[c] \n\t" \
        "addu %[o], %[o], $12 \n\t" \
        : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
        : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
        : "$12" \
    )
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_MIPS && SP_WORD_SIZE == 32 */
#if defined(WOLFSSL_SP_RISCV64) && SP_WORD_SIZE == 64
/*
 * CPU: RISCV 64-bit
 *
 * mul produces the low 64 bits of a product and mulhu the high 64 bits.
 * sltu recovers the carry out of an add (sum < addend implies a wrap).
 * Macros that need scratch registers clobber a5 (low), a6 (high) and
 * a7 (carry).
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "mul %[l], %[a], %[b] \n\t" \
    "mulhu %[h], %[a], %[b] \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va), [b] "r" (vb) \
    : "memory" \
)
/* Multiply va by vb and store double size result in: vo | vh | vl
 * (vo is zeroed). */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mulhu %[h], %[a], %[b] \n\t" \
    "mul %[l], %[a], %[b] \n\t" \
    "add %[o], zero, zero \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : \
)
/* Multiply va by vb and add double size result into: vo | vh | vl */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result into: vh | vl.
 * Caller guarantees no carry out of vh. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
__asm__ __volatile__ ( \
    "mul %[l], %[a], %[a] \n\t" \
    "mulhu %[h], %[a], %[a] \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va) \
    : "memory" \
)
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[a] \n\t" \
    "mulhu a6, %[a], %[a] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va) \
    : "a5", "a6", "a7" \
)
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[a] \n\t" \
    "mulhu a6, %[a], %[a] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a5", "a6", "a7" \
)
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
__asm__ __volatile__ ( \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a7" \
)
/* Sub va from: vh | vl.
 * Old low word is saved in a7 so the borrow can be computed from it. */
#define SP_ASM_SUBB(vl, vh, va) \
__asm__ __volatile__ ( \
    "add a7, %[l], zero \n\t" \
    "sub %[l], a7, %[a] \n\t" \
    "sltu a7, a7, %[l] \n\t" \
    "sub %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a7" \
)
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
__asm__ __volatile__ ( \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], %[b] \n\t" \
    "sltu a7, %[h], %[b] \n\t" \
    "add %[o], %[o], %[c] \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], %[b] \n\t" \
    "sltu a7, %[h], %[b] \n\t" \
    "add %[o], %[o], %[c] \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
    : "a7" \
)
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_RISCV64 && SP_WORD_SIZE == 64 */
#if defined(WOLFSSL_SP_RISCV32) && SP_WORD_SIZE == 32
/*
 * CPU: RISCV 32-bit
 *
 * Same macro bodies as the 64-bit variant: mul/mulhu give the low/high
 * halves of the product and sltu recovers carries; only the word size
 * differs. Scratch registers a5-a7 are clobbered where used.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "mul %[l], %[a], %[b] \n\t" \
    "mulhu %[h], %[a], %[b] \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va), [b] "r" (vb) \
    : "memory" \
)
/* Multiply va by vb and store double size result in: vo | vh | vl
 * (vo is zeroed). */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mulhu %[h], %[a], %[b] \n\t" \
    "mul %[l], %[a], %[b] \n\t" \
    "add %[o], zero, zero \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : \
)
/* Multiply va by vb and add double size result into: vo | vh | vl */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result into: vh | vl.
 * Caller guarantees no carry out of vh. */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[b] \n\t" \
    "mulhu a6, %[a], %[b] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "a5", "a6", "a7" \
)
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
__asm__ __volatile__ ( \
    "mul %[l], %[a], %[a] \n\t" \
    "mulhu %[h], %[a], %[a] \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va) \
    : "memory" \
)
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[a] \n\t" \
    "mulhu a6, %[a], %[a] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "sltu a7, %[h], a6 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va) \
    : "a5", "a6", "a7" \
)
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
__asm__ __volatile__ ( \
    "mul a5, %[a], %[a] \n\t" \
    "mulhu a6, %[a], %[a] \n\t" \
    "add %[l], %[l], a5 \n\t" \
    "sltu a7, %[l], a5 \n\t" \
    "add %[h], %[h], a6 \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a5", "a6", "a7" \
)
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
__asm__ __volatile__ ( \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a7" \
)
/* Sub va from: vh | vl.
 * Old low word is saved in a7 so the borrow can be computed from it. */
#define SP_ASM_SUBB(vl, vh, va) \
__asm__ __volatile__ ( \
    "add a7, %[l], zero \n\t" \
    "sub %[l], a7, %[a] \n\t" \
    "sltu a7, a7, %[l] \n\t" \
    "sub %[h], %[h], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "a7" \
)
/* Add two times vc | vb | va into vo | vh | vl */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
__asm__ __volatile__ ( \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], %[b] \n\t" \
    "sltu a7, %[h], %[b] \n\t" \
    "add %[o], %[o], %[c] \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[l], %[l], %[a] \n\t" \
    "sltu a7, %[l], %[a] \n\t" \
    "add %[h], %[h], a7 \n\t" \
    "sltu a7, %[h], a7 \n\t" \
    "add %[o], %[o], a7 \n\t" \
    "add %[h], %[h], %[b] \n\t" \
    "sltu a7, %[h], %[b] \n\t" \
    "add %[o], %[o], %[c] \n\t" \
    "add %[o], %[o], a7 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
    : "a7" \
)
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_RISCV32 && SP_WORD_SIZE == 32 */
#if defined(WOLFSSL_SP_S390X) && SP_WORD_SIZE == 64
/*
 * CPU: IBM s390x (z/Architecture)
 *
 * mlgr multiplies r1 by its operand leaving the 128-bit product in the
 * even/odd pair r0 (high) : r1 (low). algr/alcgr and slgr/slbgr use the
 * condition code for carry/borrow chaining, so "cc" is clobbered.
 */
/* Multiply va by vb and store double size result in: vh | vl */
#define SP_ASM_MUL(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "lgr %[l], %%r1 \n\t" \
    "lgr %[h], %%r0 \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va), [b] "r" (vb) \
    : "memory", "r0", "r1" \
)
/* Multiply va by vb and store double size result in: vo | vh | vl
 * (vo is zeroed). */
#define SP_ASM_MUL_SET(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "lghi %[o], 0 \n\t" \
    "lgr %[l], %%r1 \n\t" \
    "lgr %[h], %%r0 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "=r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "r0", "r1" \
)
/* Multiply va by vb and add double size result into: vo | vh | vl.
 * r10 holds zero so alcgr can propagate the carry into vo. */
#define SP_ASM_MUL_ADD(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "alcgr %[o], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "r0", "r1", "r10", "cc" \
)
/* Multiply va by vb and add double size result into: vh | vl */
#define SP_ASM_MUL_ADD_NO(vl, vh, va, vb) \
__asm__ __volatile__ ( \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va), [b] "r" (vb) \
    : "r0", "r1", "cc" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl */
#define SP_ASM_MUL_ADD2(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "alcgr %[o], %%r10 \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "alcgr %[o], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "r0", "r1", "r10", "cc" \
)
/* Multiply va by vb and add double size result twice into: vo | vh | vl
 * Assumes first add will not overflow vh | vl
 */
#define SP_ASM_MUL_ADD2_NO(vl, vh, vo, va, vb) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %[b] \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "alcgr %[o], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb) \
    : "r0", "r1", "r10", "cc" \
)
/* Square va and store double size result in: vh | vl */
#define SP_ASM_SQR(vl, vh, va) \
__asm__ __volatile__ ( \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %%r1 \n\t" \
    "lgr %[l], %%r1 \n\t" \
    "lgr %[h], %%r0 \n\t" \
    : [h] "+r" (vh), [l] "+r" (vl) \
    : [a] "r" (va) \
    : "memory", "r0", "r1" \
)
/* Square va and add double size result into: vo | vh | vl */
#define SP_ASM_SQR_ADD(vl, vh, vo, va) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %%r1 \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    "alcgr %[o], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va) \
    : "r0", "r1", "r10", "cc" \
)
/* Square va and add double size result into: vh | vl */
#define SP_ASM_SQR_ADD_NO(vl, vh, va) \
__asm__ __volatile__ ( \
    "lgr %%r1, %[a] \n\t" \
    "mlgr %%r0, %%r1 \n\t" \
    "algr %[l], %%r1 \n\t" \
    "alcgr %[h], %%r0 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "r0", "r1", "cc" \
)
/* Add va into: vh | vl */
#define SP_ASM_ADDC(vl, vh, va) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "algr %[l], %[a] \n\t" \
    "alcgr %[h], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "r10", "cc" \
)
/* Sub va from: vh | vl */
#define SP_ASM_SUBB(vl, vh, va) \
__asm__ __volatile__ ( \
    "lghi %%r10, 0 \n\t" \
    "slgr %[l], %[a] \n\t" \
    "slbgr %[h], %%r10 \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh) \
    : [a] "r" (va) \
    : "r10", "cc" \
)
/* Add two times vc | vb | va into vo | vh | vl.
 * Carry chains naturally through the condition code. */
#define SP_ASM_ADD_DBL_3(vl, vh, vo, va, vb, vc) \
__asm__ __volatile__ ( \
    "algr %[l], %[a] \n\t" \
    "alcgr %[h], %[b] \n\t" \
    "alcgr %[o], %[c] \n\t" \
    "algr %[l], %[a] \n\t" \
    "alcgr %[h], %[b] \n\t" \
    "alcgr %[o], %[c] \n\t" \
    : [l] "+r" (vl), [h] "+r" (vh), [o] "+r" (vo) \
    : [a] "r" (va), [b] "r" (vb), [c] "r" (vc) \
    : "cc" \
)
#define SP_INT_ASM_AVAILABLE
#endif /* WOLFSSL_SP_S390X && SP_WORD_SIZE == 64 */
#ifdef SP_INT_ASM_AVAILABLE
/* Assembly implementations of the word primitives exist for this CPU;
 * enable them unless explicitly disabled. */
#ifndef SP_INT_NO_ASM
#define SQR_MUL_ASM
#endif
/* Register-operand variants default to the generic macros when a CPU
 * does not provide a specialised version. */
#ifndef SP_ASM_ADDC_REG
#define SP_ASM_ADDC_REG SP_ASM_ADDC
#endif /* SP_ASM_ADDC_REG */
#ifndef SP_ASM_SUBB_REG
#define SP_ASM_SUBB_REG SP_ASM_SUBB
#endif /* SP_ASM_SUBB_REG */
#endif /* SP_INT_ASM_AVAILABLE */
#endif /* !WOLFSSL_NO_ASM */
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || \
!defined(NO_DSA) || !defined(NO_DH) || \
(defined(HAVE_ECC) && defined(HAVE_COMP_KEY)) || defined(OPENSSL_EXTRA) || \
(defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_PUBLIC_ONLY))
#ifndef WC_NO_CACHE_RESISTANT
/* Mask of address for constant time operations. */
const size_t sp_off_on_addr[2] =
{
(size_t) 0,
(size_t)-1
};
#endif
#endif
#if defined(WOLFSSL_HAVE_SP_DH) || defined(WOLFSSL_HAVE_SP_RSA)
#ifdef __cplusplus
extern "C" {
#endif
/* Modular exponentiation implementations using Single Precision. */
WOLFSSL_LOCAL int sp_ModExp_1024(sp_int* base, sp_int* exp, sp_int* mod,
sp_int* res);
WOLFSSL_LOCAL int sp_ModExp_1536(sp_int* base, sp_int* exp, sp_int* mod,
sp_int* res);
WOLFSSL_LOCAL int sp_ModExp_2048(sp_int* base, sp_int* exp, sp_int* mod,
sp_int* res);
WOLFSSL_LOCAL int sp_ModExp_3072(sp_int* base, sp_int* exp, sp_int* mod,
sp_int* res);
WOLFSSL_LOCAL int sp_ModExp_4096(sp_int* base, sp_int* exp, sp_int* mod,
sp_int* res);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* WOLFSSL_HAVE_SP_DH || WOLFSSL_HAVE_SP_RSA */
/* Forward declarations of file-local Montgomery helpers. */
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
defined(OPENSSL_ALL)
static int _sp_mont_red(sp_int* a, const sp_int* m, sp_int_digit mp);
#endif
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE) || \
defined(OPENSSL_ALL)
static void _sp_mont_setup(const sp_int* m, sp_int_digit* rho);
#endif
/* Determine when mp_add_d is required. */
#if !defined(NO_PWDBASED) || defined(WOLFSSL_KEY_GEN) || !defined(NO_DH) || \
!defined(NO_DSA) || (defined(HAVE_ECC) && defined(HAVE_COMP_KEY)) || \
(!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
defined(OPENSSL_EXTRA)
#define WOLFSSL_SP_ADD_D
#endif
/* Determine when mp_sub_d is required. */
#if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
!defined(NO_DH) || defined(HAVE_ECC) || !defined(NO_DSA)
#define WOLFSSL_SP_SUB_D
#endif
/* Determine when mp_read_radix with a radix of 16 is required. */
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_RSA) && \
!defined(WOLFSSL_RSA_VERIFY_ONLY)) || defined(HAVE_ECC) || \
!defined(NO_DSA) || defined(OPENSSL_EXTRA)
#define WOLFSSL_SP_READ_RADIX_16
#endif
/* Determine when mp_read_radix with a radix of 10 is required. */
#if defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_RSA) && \
!defined(WOLFSSL_RSA_VERIFY_ONLY)
#define WOLFSSL_SP_READ_RADIX_10
#endif
/* Determine when mp_invmod is required. */
#if defined(HAVE_ECC) || !defined(NO_DSA) || defined(OPENSSL_EXTRA) || \
(!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
!defined(WOLFSSL_RSA_PUBLIC_ONLY))
#define WOLFSSL_SP_INVMOD
#endif
/* Determine when mp_invmod_mont_ct is required. */
#if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
#define WOLFSSL_SP_INVMOD_MONT_CT
#endif
/* Determine when mp_prime_gen is required. */
#if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
!defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || \
(!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN))
#define WOLFSSL_SP_PRIME_GEN
#endif
  4682. /* Set the multi-precision number to zero.
  4683. *
  4684. * Assumes a is not NULL.
  4685. *
  4686. * @param [out] a SP integer to set to zero.
  4687. */
  4688. static void _sp_zero(sp_int* a)
  4689. {
  4690. sp_int_minimal* am = (sp_int_minimal *)a;
  4691. am->used = 0;
  4692. am->dp[0] = 0;
  4693. #ifdef WOLFSSL_SP_INT_NEGATIVE
  4694. am->sign = MP_ZPOS;
  4695. #endif
  4696. }
/* Initialize the multi-precision number to be zero with a given max size.
 *
 * Assumes a is not NULL and size has already been validated by the caller.
 *
 * @param [out] a     SP integer.
 * @param [in]  size  Number of words to say are available.
 */
static void _sp_init_size(sp_int* a, unsigned int size)
{
    /* NOTE(review): the volatile qualifier presumably defends against a
     * compiler reordering/eliding these stores - confirm before removing. */
    volatile sp_int_minimal* am = (sp_int_minimal *)a;
#ifdef HAVE_WOLF_BIGINT
    /* Initialize the raw (big-endian byte array) representation too. */
    wc_bigint_init((struct WC_BIGINT*)&am->raw);
#endif
    /* Value starts out as zero. */
    _sp_zero((sp_int*)am);
    am->size = size;
}
  4711. /* Initialize the multi-precision number to be zero with a given max size.
  4712. *
  4713. * @param [out] a SP integer.
  4714. * @param [in] size Number of words to say are available.
  4715. *
  4716. * @return MP_OKAY on success.
  4717. * @return MP_VAL when a is NULL.
  4718. */
  4719. int sp_init_size(sp_int* a, unsigned int size)
  4720. {
  4721. int err = MP_OKAY;
  4722. /* Validate parameters. Don't use size more than max compiled. */
  4723. if ((a == NULL) || ((size <= 0) || (size > SP_INT_DIGITS))) {
  4724. err = MP_VAL;
  4725. }
  4726. if (err == MP_OKAY) {
  4727. _sp_init_size(a, size);
  4728. }
  4729. return err;
  4730. }
  4731. /* Initialize the multi-precision number to be zero.
  4732. *
  4733. * @param [out] a SP integer.
  4734. *
  4735. * @return MP_OKAY on success.
  4736. * @return MP_VAL when a is NULL.
  4737. */
  4738. int sp_init(sp_int* a)
  4739. {
  4740. int err = MP_OKAY;
  4741. /* Validate parameter. */
  4742. if (a == NULL) {
  4743. err = MP_VAL;
  4744. }
  4745. else {
  4746. /* Assume complete sp_int with SP_INT_DIGITS digits. */
  4747. _sp_init_size(a, SP_INT_DIGITS);
  4748. }
  4749. return err;
  4750. }
  4751. #if !defined(WOLFSSL_RSA_PUBLIC_ONLY) || !defined(NO_DH) || defined(HAVE_ECC)
  4752. /* Initialize up to six multi-precision numbers to be zero.
  4753. *
  4754. * @param [out] n1 SP integer.
  4755. * @param [out] n2 SP integer.
  4756. * @param [out] n3 SP integer.
  4757. * @param [out] n4 SP integer.
  4758. * @param [out] n5 SP integer.
  4759. * @param [out] n6 SP integer.
  4760. *
  4761. * @return MP_OKAY on success.
  4762. */
  4763. int sp_init_multi(sp_int* n1, sp_int* n2, sp_int* n3, sp_int* n4, sp_int* n5,
  4764. sp_int* n6)
  4765. {
  4766. /* Initialize only those pointers that are valid. */
  4767. if (n1 != NULL) {
  4768. _sp_init_size(n1, SP_INT_DIGITS);
  4769. }
  4770. if (n2 != NULL) {
  4771. _sp_init_size(n2, SP_INT_DIGITS);
  4772. }
  4773. if (n3 != NULL) {
  4774. _sp_init_size(n3, SP_INT_DIGITS);
  4775. }
  4776. if (n4 != NULL) {
  4777. _sp_init_size(n4, SP_INT_DIGITS);
  4778. }
  4779. if (n5 != NULL) {
  4780. _sp_init_size(n5, SP_INT_DIGITS);
  4781. }
  4782. if (n6 != NULL) {
  4783. _sp_init_size(n6, SP_INT_DIGITS);
  4784. }
  4785. return MP_OKAY;
  4786. }
  4787. #endif /* !WOLFSSL_RSA_PUBLIC_ONLY || !NO_DH || HAVE_ECC */
  4788. /* Free the memory allocated in the multi-precision number.
  4789. *
  4790. * @param [in] a SP integer.
  4791. */
  4792. void sp_free(sp_int* a)
  4793. {
  4794. if (a != NULL) {
  4795. #ifdef HAVE_WOLF_BIGINT
  4796. wc_bigint_free(&a->raw);
  4797. #endif
  4798. }
  4799. }
  4800. #if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  4801. !defined(NO_DH) || defined(HAVE_ECC)
  4802. /* Grow multi-precision number to be able to hold l digits.
  4803. * This function does nothing as the number of digits is fixed.
  4804. *
  4805. * @param [in,out] a SP integer.
  4806. * @param [in] l Number of digits to grow to.
  4807. *
  4808. * @return MP_OKAY on success
  4809. * @return MP_MEM if the number of digits requested is more than available.
  4810. */
  4811. int sp_grow(sp_int* a, int l)
  4812. {
  4813. int err = MP_OKAY;
  4814. /* Validate parameter. */
  4815. if ((a == NULL) || (l < 0)) {
  4816. err = MP_VAL;
  4817. }
  4818. /* Ensure enough words allocated for grow. */
  4819. if ((err == MP_OKAY) && ((unsigned int)l > a->size)) {
  4820. err = MP_MEM;
  4821. }
  4822. if (err == MP_OKAY) {
  4823. unsigned int i;
  4824. /* Put in zeros up to the new length. */
  4825. for (i = a->used; i < (unsigned int)l; i++) {
  4826. a->dp[i] = 0;
  4827. }
  4828. }
  4829. return err;
  4830. }
  4831. #endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || !NO_DH || HAVE_ECC */
  4832. #if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  4833. defined(HAVE_ECC)
  4834. /* Set the multi-precision number to zero.
  4835. *
  4836. * @param [out] a SP integer to set to zero.
  4837. */
  4838. void sp_zero(sp_int* a)
  4839. {
  4840. /* Make an sp_int with valid pointer zero. */
  4841. if (a != NULL) {
  4842. _sp_zero(a);
  4843. }
  4844. }
  4845. #endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || HAVE_ECC */
  4846. /* Clear the data from the multi-precision number, set to zero and free.
  4847. *
  4848. * @param [out] a SP integer.
  4849. */
  4850. void sp_clear(sp_int* a)
  4851. {
  4852. /* Clear when valid pointer passed in. */
  4853. if (a != NULL) {
  4854. unsigned int i;
  4855. /* Only clear the digits being used. */
  4856. for (i = 0; i < a->used; i++) {
  4857. a->dp[i] = 0;
  4858. }
  4859. /* Set back to zero and free. */
  4860. _sp_zero(a);
  4861. sp_free(a);
  4862. }
  4863. }
#if !defined(NO_RSA) || !defined(NO_DH) || defined(HAVE_ECC) || \
!defined(NO_DSA) || defined(WOLFSSL_SP_PRIME_GEN)
/* Ensure the data in the multi-precision number is zeroed.
 *
 * Use when security sensitive data needs to be wiped.
 *
 * @param [in] a SP integer.
 */
void sp_forcezero(sp_int* a)
{
    /* Zeroize when a valid pointer passed in. */
    if (a != NULL) {
        /* Ensure all data zeroized - data not zeroed when used decreases.
         * Wipe the full allocation (size), not just the used digits. */
        ForceZero(a->dp, a->size * SP_WORD_SIZEOF);
        /* Set back to zero. */
#ifdef HAVE_WOLF_BIGINT
        /* Zeroize the raw data as well. */
        wc_bigint_zero(&a->raw);
#endif
        /* Make value zero and free. */
        _sp_zero(a);
        sp_free(a);
    }
}
#endif /* !NO_RSA || !NO_DH || HAVE_ECC || !NO_DSA || WOLFSSL_SP_PRIME_GEN */
  4889. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  4890. !defined(NO_RSA) || defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY)
  4891. /* Copy value of multi-precision number a into r.
  4892. *
  4893. * @param [in] a SP integer - source.
  4894. * @param [out] r SP integer - destination.
  4895. */
  4896. static void _sp_copy(const sp_int* a, sp_int* r)
  4897. {
  4898. /* Copy words across. */
  4899. if (a->used == 0) {
  4900. r->dp[0] = 0;
  4901. }
  4902. else {
  4903. XMEMCPY(r->dp, a->dp, a->used * SP_WORD_SIZEOF);
  4904. }
  4905. /* Set number of used words in result. */
  4906. r->used = a->used;
  4907. #ifdef WOLFSSL_SP_INT_NEGATIVE
  4908. /* Set sign of result. */
  4909. r->sign = a->sign;
  4910. #endif
  4911. }
  4912. /* Copy value of multi-precision number a into r.
  4913. *
  4914. * @param [in] a SP integer - source.
  4915. * @param [out] r SP integer - destination.
  4916. *
  4917. * @return MP_OKAY on success.
  4918. */
  4919. int sp_copy(const sp_int* a, sp_int* r)
  4920. {
  4921. int err = MP_OKAY;
  4922. /* Validate parameters. */
  4923. if ((a == NULL) || (r == NULL)) {
  4924. err = MP_VAL;
  4925. }
  4926. /* Only copy if different pointers. */
  4927. if (a != r) {
  4928. /* Validated space in result. */
  4929. if ((err == MP_OKAY) && (a->used > r->size)) {
  4930. err = MP_VAL;
  4931. }
  4932. if (err == MP_OKAY) {
  4933. _sp_copy(a, r);
  4934. }
  4935. }
  4936. return err;
  4937. }
  4938. #endif
  4939. #if ((defined(WOLFSSL_SP_MATH_ALL) && ((!defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  4940. !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH))) || \
  4941. defined(OPENSSL_ALL)) && defined(WC_PROTECT_ENCRYPTED_MEM)
  4942. /* Copy 2 numbers into two results based on y. Copy a fixed number of digits.
  4943. *
  4944. * Constant time implementation.
  4945. * When y is 0, r1 = a2 and r2 = a1.
  4946. * When y is 1, r1 = a1 and r2 = a2.
  4947. *
  4948. * @param [in] a1 First number to copy.
  4949. * @param [in] a2 Second number to copy.
  4950. * @param [out] r1 First result number to copy into.
  4951. * @param [out] r2 Second result number to copy into.
  4952. * @param [in] y Indicates which number goes into which result number.
  4953. * @param [in] used Number of digits to copy.
  4954. */
static void _sp_copy_2_ct(const sp_int* a1, const sp_int* a2, sp_int* r1,
    sp_int* r2, int y, unsigned int used)
{
    unsigned int i;

    /* Copy data - constant time. Selection is done with masks rather than
     * branches so the memory access pattern is independent of y.
     * NOTE(review): assumes wc_off_on_addr[0]/[1] are the all-zeros and
     * all-ones digit masks - confirm against its definition. */
    for (i = 0; i < used; i++) {
        r1->dp[i] = (a1->dp[i] & ((sp_digit)wc_off_on_addr[y  ])) +
                    (a2->dp[i] & ((sp_digit)wc_off_on_addr[y^1]));
        r2->dp[i] = (a1->dp[i] & ((sp_digit)wc_off_on_addr[y^1])) +
                    (a2->dp[i] & ((sp_digit)wc_off_on_addr[y  ]));
    }
    /* Copy used - same mask selection as the digit loop. */
    r1->used = (a1->used & ((int)wc_off_on_addr[y  ])) +
               (a2->used & ((int)wc_off_on_addr[y^1]));
    r2->used = (a1->used & ((int)wc_off_on_addr[y^1])) +
               (a2->used & ((int)wc_off_on_addr[y  ]));
#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Copy sign - same mask selection as the digit loop. */
    r1->sign = (a1->sign & ((int)wc_off_on_addr[y  ])) +
               (a2->sign & ((int)wc_off_on_addr[y^1]));
    r2->sign = (a1->sign & ((int)wc_off_on_addr[y^1])) +
               (a2->sign & ((int)wc_off_on_addr[y  ]));
#endif
}
  4979. #endif
  4980. #if defined(WOLFSSL_SP_MATH_ALL) || (defined(HAVE_ECC) && defined(FP_ECC))
  4981. /* Initializes r and copies in value from a.
  4982. *
  4983. * @param [out] r SP integer - destination.
  4984. * @param [in] a SP integer - source.
  4985. *
  4986. * @return MP_OKAY on success.
  4987. * @return MP_VAL when a or r is NULL.
  4988. */
  4989. int sp_init_copy(sp_int* r, const sp_int* a)
  4990. {
  4991. int err;
  4992. /* Initialize r and copy value in a into it. */
  4993. err = sp_init(r);
  4994. if (err == MP_OKAY) {
  4995. err = sp_copy(a, r);
  4996. }
  4997. return err;
  4998. }
  4999. #endif /* WOLFSSL_SP_MATH_ALL || (HAVE_ECC && FP_ECC) */
  5000. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5001. !defined(NO_DH) || !defined(NO_DSA)
  5002. /* Exchange the values in a and b.
  5003. *
  5004. * Avoid using this API as three copy operations are performed.
  5005. *
  5006. * @param [in,out] a SP integer to swap.
  5007. * @param [in,out] b SP integer to swap.
  5008. *
  5009. * @return MP_OKAY on success.
  5010. * @return MP_VAL when a or b is NULL.
  5011. * @return MP_MEM when dynamic memory allocation fails.
  5012. */
int sp_exch(sp_int* a, sp_int* b)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (b == NULL)) {
        err = MP_VAL;
    }
    /* Check space for a in b and b in a. (Guarded by err so NULL pointers
     * are never dereferenced.) */
    if ((err == MP_OKAY) && ((a->size < b->used) || (b->size < a->used))) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Declare temporary for swapping. */
        DECL_SP_INT(t, a->used);

        /* Create temporary for swapping. */
        ALLOC_SP_INT(t, a->used, err, NULL);
        if (err == MP_OKAY) {
            /* Cache allocated size of a and b - the struct copies below
             * would otherwise overwrite them. */
            unsigned int asize = a->size;
            unsigned int bsize = b->size;

            /* Copy all of SP int (header and digits): t <- a, a <- b,
             * b <- t. MP_INT_SIZEOF gives the byte size for that many
             * digits. */
            XMEMCPY(t, a, MP_INT_SIZEOF(a->used));
            XMEMCPY(a, b, MP_INT_SIZEOF(b->used));
            XMEMCPY(b, t, MP_INT_SIZEOF(t->used));

            /* Put back size of a and b. */
            a->size = asize;
            b->size = bsize;
        }

        FREE_SP_INT(t, NULL);
    }

    return err;
}
  5045. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || !NO_DH ||
  5046. * !NO_DSA */
  5047. #if defined(HAVE_ECC) && defined(ECC_TIMING_RESISTANT) && \
  5048. !defined(WC_NO_CACHE_RESISTANT)
  5049. /* Conditional swap of SP int values in constant time.
  5050. *
  5051. * @param [in] a First SP int to conditionally swap.
  5052. * @param [in] b Second SP int to conditionally swap.
  5053. * @param [in] cnt Count of words to copy.
  5054. * @param [in] swap When value is 1 then swap.
  5055. * @param [in] t Temporary SP int to use in swap.
  5056. * @return MP_OKAY on success.
  5057. * @return MP_MEM when dynamic memory allocation fails.
  5058. */
  5059. int sp_cond_swap_ct_ex(sp_int* a, sp_int* b, int cnt, int swap, sp_int* t)
  5060. {
  5061. unsigned int i;
  5062. sp_int_digit mask = (sp_int_digit)0 - (sp_int_digit)swap;
  5063. /* XOR other fields in sp_int into temp - mask set when swapping. */
  5064. t->used = (a->used ^ b->used) & (unsigned int)mask;
  5065. #ifdef WOLFSSL_SP_INT_NEGATIVE
  5066. t->sign = (a->sign ^ b->sign) & (unsigned int)mask;
  5067. #endif
  5068. /* XOR requested words into temp - mask set when swapping. */
  5069. for (i = 0; i < (unsigned int)cnt; i++) {
  5070. t->dp[i] = (a->dp[i] ^ b->dp[i]) & mask;
  5071. }
  5072. /* XOR temporary - when mask set then result will be b. */
  5073. a->used ^= t->used;
  5074. #ifdef WOLFSSL_SP_INT_NEGATIVE
  5075. a->sign ^= t->sign;
  5076. #endif
  5077. for (i = 0; i < (unsigned int)cnt; i++) {
  5078. a->dp[i] ^= t->dp[i];
  5079. }
  5080. /* XOR temporary - when mask set then result will be a. */
  5081. b->used ^= t->used;
  5082. #ifdef WOLFSSL_SP_INT_NEGATIVE
  5083. b->sign ^= b->sign;
  5084. #endif
  5085. for (i = 0; i < (unsigned int)cnt; i++) {
  5086. b->dp[i] ^= t->dp[i];
  5087. }
  5088. return MP_OKAY;
  5089. }
  5090. /* Conditional swap of SP int values in constant time.
  5091. *
  5092. * @param [in] a First SP int to conditionally swap.
  5093. * @param [in] b Second SP int to conditionally swap.
  5094. * @param [in] cnt Count of words to copy.
  5095. * @param [in] swap When value is 1 then swap.
  5096. * @return MP_OKAY on success.
  5097. * @return MP_MEM when dynamic memory allocation fails.
  5098. */
int sp_cond_swap_ct(sp_int* a, sp_int* b, int cnt, int swap)
{
    int err = MP_OKAY;
    /* Declare temporary sized to hold cnt words. */
    DECL_SP_INT(t, (size_t)cnt);

    /* Allocate temporary to hold masked xor of a and b. */
    ALLOC_SP_INT(t, cnt, err, NULL);
    if (err == MP_OKAY) {
        /* Delegate the constant-time masked-XOR swap. */
        err = sp_cond_swap_ct_ex(a, b, cnt, swap, t);
        /* Free only when allocation succeeded. */
        FREE_SP_INT(t, NULL);
    }

    return err;
}
  5111. #endif /* HAVE_ECC && ECC_TIMING_RESISTANT && !WC_NO_CACHE_RESISTANT */
  5112. #ifdef WOLFSSL_SP_INT_NEGATIVE
  5113. /* Calculate the absolute value of the multi-precision number.
  5114. *
  5115. * @param [in] a SP integer to calculate absolute value of.
  5116. * @param [out] r SP integer to hold result.
  5117. *
  5118. * @return MP_OKAY on success.
  5119. * @return MP_VAL when a or r is NULL.
  5120. */
  5121. int sp_abs(const sp_int* a, sp_int* r)
  5122. {
  5123. int err;
  5124. /* Copy a into r - copy fails when r is NULL. */
  5125. err = sp_copy(a, r);
  5126. if (err == MP_OKAY) {
  5127. r->sign = MP_ZPOS;
  5128. }
  5129. return err;
  5130. }
  5131. #endif /* WOLFSSL_SP_INT_NEGATIVE */
  5132. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  5133. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
  5134. /* Compare absolute value of two multi-precision numbers.
  5135. *
  5136. * @param [in] a SP integer.
  5137. * @param [in] b SP integer.
  5138. *
  5139. * @return MP_GT when a is greater than b.
  5140. * @return MP_LT when a is less than b.
  5141. * @return MP_EQ when a is equals b.
  5142. */
  5143. static int _sp_cmp_abs(const sp_int* a, const sp_int* b)
  5144. {
  5145. int ret = MP_EQ;
  5146. /* Check number of words first. */
  5147. if (a->used > b->used) {
  5148. ret = MP_GT;
  5149. }
  5150. else if (a->used < b->used) {
  5151. ret = MP_LT;
  5152. }
  5153. else {
  5154. int i;
  5155. /* Starting from most significant word, compare words.
  5156. * Stop when different and set comparison return.
  5157. */
  5158. for (i = (int)(a->used - 1); i >= 0; i--) {
  5159. if (a->dp[i] > b->dp[i]) {
  5160. ret = MP_GT;
  5161. break;
  5162. }
  5163. else if (a->dp[i] < b->dp[i]) {
  5164. ret = MP_LT;
  5165. break;
  5166. }
  5167. }
  5168. /* If we made to the end then ret is MP_EQ from initialization. */
  5169. }
  5170. return ret;
  5171. }
  5172. #endif
  5173. #if defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
  5174. /* Compare absolute value of two multi-precision numbers.
  5175. *
  5176. * Pointers are compared such that NULL is less than not NULL.
  5177. *
  5178. * @param [in] a SP integer.
  5179. * @param [in] b SP integer.
  5180. *
  5181. * @return MP_GT when a is greater than b.
  5182. * @return MP_LT when a is less than b.
  5183. * @return MP_EQ when a equals b.
  5184. */
  5185. int sp_cmp_mag(const sp_int* a, const sp_int* b)
  5186. {
  5187. int ret;
  5188. /* Do pointer checks first. Both NULL returns equal. */
  5189. if (a == b) {
  5190. ret = MP_EQ;
  5191. }
  5192. /* Nothing is smaller than something. */
  5193. else if (a == NULL) {
  5194. ret = MP_LT;
  5195. }
  5196. /* Something is larger than nothing. */
  5197. else if (b == NULL) {
  5198. ret = MP_GT;
  5199. }
  5200. else
  5201. {
  5202. /* Compare values - a and b are not NULL. */
  5203. ret = _sp_cmp_abs(a, b);
  5204. }
  5205. return ret;
  5206. }
  5207. #endif
  5208. #if defined(WOLFSSL_SP_MATH_ALL) || defined(HAVE_ECC) || !defined(NO_DSA) || \
  5209. defined(OPENSSL_EXTRA) || !defined(NO_DH) || \
  5210. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
  5211. /* Compare two multi-precision numbers.
  5212. *
  5213. * Assumes a and b are not NULL.
  5214. *
  5215. * @param [in] a SP integer.
* @param [in] b SP integer.
  5217. *
  5218. * @return MP_GT when a is greater than b.
  5219. * @return MP_LT when a is less than b.
  5220. * @return MP_EQ when a is equals b.
  5221. */
static int _sp_cmp(const sp_int* a, const sp_int* b)
{
    int ret;

    /* Note: the else-brace opened inside the first #ifdef block is closed
     * inside the second one - the pairing only works when the ifdefs are
     * kept together. */
#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Check sign first: negative < positive. */
    if (a->sign > b->sign) {
        ret = MP_LT;
    }
    else if (a->sign < b->sign) {
        ret = MP_GT;
    }
    else /* (a->sign == b->sign) */ {
#endif
        /* Compare magnitudes - signs are equal (or not compiled in). */
        ret = _sp_cmp_abs(a, b);
#ifdef WOLFSSL_SP_INT_NEGATIVE
        if (a->sign == MP_NEG) {
            /* MP_GT = 1, MP_LT = -1, MP_EQ = 0
             * Both negative: larger magnitude means smaller value, so
             * swap MP_GT and MP_LT by negating.
             */
            ret = -ret;
        }
    }
#endif

    return ret;
}
  5248. #endif
  5249. #if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5250. !defined(NO_DSA) || defined(HAVE_ECC) || !defined(NO_DH) || \
  5251. defined(WOLFSSL_SP_MATH_ALL)
  5252. /* Compare two multi-precision numbers.
  5253. *
  5254. * Pointers are compared such that NULL is less than not NULL.
  5255. *
  5256. * @param [in] a SP integer.
* @param [in] b SP integer.
  5258. *
  5259. * @return MP_GT when a is greater than b.
  5260. * @return MP_LT when a is less than b.
  5261. * @return MP_EQ when a is equals b.
  5262. */
  5263. int sp_cmp(const sp_int* a, const sp_int* b)
  5264. {
  5265. int ret;
  5266. /* Check pointers first. Both NULL returns equal. */
  5267. if (a == b) {
  5268. ret = MP_EQ;
  5269. }
  5270. /* Nothing is smaller than something. */
  5271. else if (a == NULL) {
  5272. ret = MP_LT;
  5273. }
  5274. /* Something is larger than nothing. */
  5275. else if (b == NULL) {
  5276. ret = MP_GT;
  5277. }
  5278. else
  5279. {
  5280. /* Compare values - a and b are not NULL. */
  5281. ret = _sp_cmp(a, b);
  5282. }
  5283. return ret;
  5284. }
  5285. #endif
  5286. /*************************
  5287. * Bit check/set functions
  5288. *************************/
  5289. #if (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5290. (defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)) || \
  5291. defined(OPENSSL_EXTRA)
  5292. /* Check if a bit is set
  5293. *
  5294. * When a is NULL, result is 0.
  5295. *
  5296. * @param [in] a SP integer.
  5297. * @param [in] b Bit position to check.
  5298. *
  5299. * @return 0 when bit is not set.
  5300. * @return 1 when bit is set.
  5301. */
  5302. int sp_is_bit_set(const sp_int* a, unsigned int b)
  5303. {
  5304. int ret = 0;
  5305. /* Index of word. */
  5306. unsigned int i = b >> SP_WORD_SHIFT;
  5307. /* Check parameters. */
  5308. if ((a != NULL) && (i < a->used)) {
  5309. /* Shift amount to get bit down to index 0. */
  5310. unsigned int s = b & SP_WORD_MASK;
  5311. /* Get and mask bit. */
  5312. ret = (int)((a->dp[i] >> s) & (sp_int_digit)1);
  5313. }
  5314. return ret;
  5315. }
  5316. #endif /* (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) ||
  5317. * (WOLFSSL_SP_MATH_ALL && HAVE_ECC) */
  5318. /* Count the number of bits in the multi-precision number.
  5319. *
  5320. * When a is NULL, result is 0.
  5321. *
  5322. * @param [in] a SP integer.
  5323. *
  5324. * @return Number of bits in the SP integer value.
  5325. */
int sp_count_bits(const sp_int* a)
{
    int n = -1;

    /* Check parameter - NULL or zero-length value leaves n at -1. */
    if ((a != NULL) && (a->used > 0)) {
        /* Get index of last word. */
        n = (int)(a->used - 1);
        /* Don't count leading zero words (used may overstate). */
        while ((n >= 0) && (a->dp[n] == 0)) {
            n--;
        }
    }

    /* -1 indicates SP integer value was zero - report zero bits. */
    if (n < 0) {
        n = 0;
    }
    else {
        /* Get the most significant non-zero word. */
        sp_int_digit d = a->dp[n];
        /* Count of bits in all the words below it. */
        n *= SP_WORD_SIZE;

#ifdef SP_ASM_HI_BIT_SET_IDX
        {
            sp_int_digit hi;
            /* Get index of highest set bit (assembly helper). */
            SP_ASM_HI_BIT_SET_IDX(d, hi);
            /* Add bits up to and including index. */
            n += (int)hi + 1;
        }
#elif defined(SP_ASM_LZCNT)
        {
            sp_int_digit lz;
            /* Count number of leading zeros in highest non-zero digit. */
            SP_ASM_LZCNT(d, lz);
            /* Add non-leading zero bits count. */
            n += SP_WORD_SIZE - (int)lz;
        }
#else
        /* Portable fallback: check if top word has more than half the bits
         * set, and scan from whichever end is cheaper. */
        if (d > SP_HALF_MAX) {
            /* Set count to a full last word. */
            n += SP_WORD_SIZE;
            /* Don't count leading zero bits. */
            while ((d & ((sp_int_digit)1 << (SP_WORD_SIZE - 1))) == 0) {
                n--;
                d <<= 1;
            }
        }
        else {
            /* Add to count until highest set bit is shifted out. */
            while (d != 0) {
                n++;
                d >>= 1;
            }
        }
#endif
    }

    return n;
}
  5385. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  5386. !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || \
  5387. (defined(HAVE_ECC) && defined(FP_ECC)) || \
  5388. (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN))
  5389. /* Number of entries in array of number of least significant zero bits. */
  5390. #define SP_LNZ_CNT 16
  5391. /* Number of bits the array checks. */
  5392. #define SP_LNZ_BITS 4
  5393. /* Mask to apply to check with array. */
  5394. #define SP_LNZ_MASK 0xf
  5395. /* Number of least significant zero bits in first SP_LNZ_CNT numbers. */
  5396. static const int sp_lnz[SP_LNZ_CNT] = {
  5397. 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
  5398. };
  5399. /* Count the number of least significant zero bits.
  5400. *
* When a is NULL, result is 0.
  5402. *
  5403. * @param [in] a SP integer to use.
  5404. *
  5405. * @return Number of least significant zero bits.
  5406. */
#if !defined(HAVE_ECC) || !defined(HAVE_COMP_KEY)
static
#endif /* !HAVE_ECC || HAVE_COMP_KEY */
int sp_cnt_lsb(const sp_int* a)
{
    unsigned int bc = 0;

    /* Check for number with a value - zero (or NULL) returns 0. */
    if ((a != NULL) && (!sp_iszero(a))) {
        unsigned int i;
        unsigned int j;

        /* Count least significant words that are zero. Since the value is
         * non-zero, the loop stops at a non-zero word with i < a->used. */
        for (i = 0; i < a->used && a->dp[i] == 0; i++, bc += SP_WORD_SIZE) {
        }

        /* Use 4-bit table to get count in the first non-zero word. */
        for (j = 0; j < SP_WORD_SIZE; j += SP_LNZ_BITS) {
            /* Get number of least significant 0 bits in nibble. */
            int cnt = sp_lnz[(a->dp[i] >> j) & SP_LNZ_MASK];
            /* Done if not all 4 bits are zero. */
            if (cnt != 4) {
                /* Add checked bits and count in last 4 bits checked. */
                bc += j + (unsigned int)cnt;
                break;
            }
        }
    }

    return (int)bc;
}
  5434. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || (HAVE_ECC && FP_ECC) */
  5435. #if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_ASN_TEMPLATE) || \
  5436. (defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_ASN))
  5437. /* Determine if the most significant byte of the encoded multi-precision number
  5438. * has the top bit set.
  5439. *
  5440. * When a is NULL, result is 0.
  5441. *
  5442. * @param [in] a SP integer.
  5443. *
  5444. * @return 1 when the top bit of top byte is set.
  5445. * @return 0 when the top bit of top byte is not set.
  5446. */
  5447. int sp_leading_bit(const sp_int* a)
  5448. {
  5449. int bit = 0;
  5450. /* Check if we have a number and value to use. */
  5451. if ((a != NULL) && (a->used > 0)) {
  5452. /* Get top word. */
  5453. sp_int_digit d = a->dp[a->used - 1];
  5454. #if SP_WORD_SIZE > 8
  5455. /* Remove bottom 8 bits until highest 8 bits left. */
  5456. while (d > (sp_int_digit)0xff) {
  5457. d >>= 8;
  5458. }
  5459. #endif
  5460. /* Get the highest bit of the 8-bit value. */
  5461. bit = (int)(d >> 7);
  5462. }
  5463. return bit;
  5464. }
  5465. #endif /* !WOLFSSL_RSA_VERIFY_ONLY */
  5466. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
  5467. defined(HAVE_ECC) || defined(WOLFSSL_KEY_GEN) || defined(OPENSSL_EXTRA) || \
  5468. !defined(NO_RSA)
  5469. /* Set one bit of a: a |= 1 << i
  5470. * The field 'used' is updated in a.
  5471. *
  5472. * @param [in,out] a SP integer to set bit into.
  5473. * @param [in] i Index of bit to set.
  5474. *
  5475. * @return MP_OKAY on success.
  5476. * @return MP_VAL when a is NULL, index is negative or index is too large.
  5477. */
  5478. int sp_set_bit(sp_int* a, int i)
  5479. {
  5480. int err = MP_OKAY;
  5481. /* Get index of word to set. */
  5482. unsigned int w = (unsigned int)(i >> SP_WORD_SHIFT);
  5483. /* Check for valid number and and space for bit. */
  5484. if ((a == NULL) || (i < 0) || (w >= a->size)) {
  5485. err = MP_VAL;
  5486. }
  5487. if (err == MP_OKAY) {
  5488. /* Amount to shift up to set bit in word. */
  5489. unsigned int s = (unsigned int)(i & (SP_WORD_SIZE - 1));
  5490. unsigned int j;
  5491. /* Set to zero all unused words up to and including word to have bit
  5492. * set.
  5493. */
  5494. for (j = a->used; j <= w; j++) {
  5495. a->dp[j] = 0;
  5496. }
  5497. /* Set bit in word. */
  5498. a->dp[w] |= (sp_int_digit)1 << s;
  5499. /* Update used if necessary */
  5500. if (a->used <= w) {
  5501. a->used = w + 1;
  5502. }
  5503. }
  5504. return err;
  5505. }
  5506. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || HAVE_ECC ||
  5507. * WOLFSSL_KEY_GEN || OPENSSL_EXTRA || !NO_RSA */
  5508. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5509. defined(WOLFSSL_KEY_GEN) || !defined(NO_DH)
  5510. /* Exponentiate 2 to the power of e: a = 2^e
  5511. * This is done by setting the 'e'th bit.
  5512. *
  5513. * @param [out] a SP integer to hold result.
  5514. * @param [in] e Exponent.
  5515. *
  5516. * @return MP_OKAY on success.
  5517. * @return MP_VAL when a is NULL, e is negative or 2^exponent is too large.
  5518. */
  5519. int sp_2expt(sp_int* a, int e)
  5520. {
  5521. int err = MP_OKAY;
  5522. /* Validate parameters. */
  5523. if ((a == NULL) || (e < 0)) {
  5524. err = MP_VAL;
  5525. }
  5526. if (err == MP_OKAY) {
  5527. /* Set number to zero and then set bit. */
  5528. _sp_zero(a);
  5529. err = sp_set_bit(a, e);
  5530. }
  5531. return err;
  5532. }
  5533. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) ||
  5534. * WOLFSSL_KEY_GEN || !NO_DH */
  5535. /**********************
  5536. * Digit/Long functions
  5537. **********************/
  5538. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_RSA) || !defined(NO_DH) || \
  5539. defined(HAVE_ECC)
  5540. /* Set the multi-precision number to be the value of the digit.
  5541. *
  5542. * @param [out] a SP integer to become number.
  5543. * @param [in] d Digit to be set.
  5544. */
  5545. static void _sp_set(sp_int* a, sp_int_digit d)
  5546. {
  5547. /* Use sp_int_minimal to support allocated byte arrays as sp_ints. */
  5548. sp_int_minimal* am = (sp_int_minimal*)a;
  5549. am->dp[0] = d;
  5550. /* d == 0 => used = 0, d > 0 => used = 1 */
  5551. am->used = (d > 0);
  5552. #ifdef WOLFSSL_SP_INT_NEGATIVE
  5553. am->sign = MP_ZPOS;
  5554. #endif
  5555. }
  5556. /* Set the multi-precision number to be the value of the digit.
  5557. *
  5558. * @param [out] a SP integer to become number.
  5559. * @param [in] d Digit to be set.
  5560. *
  5561. * @return MP_OKAY on success.
  5562. * @return MP_VAL when a is NULL.
  5563. */
  5564. int sp_set(sp_int* a, sp_int_digit d)
  5565. {
  5566. int err = MP_OKAY;
  5567. /* Validate parameters. */
  5568. if (a == NULL) {
  5569. err = MP_VAL;
  5570. }
  5571. if (err == MP_OKAY) {
  5572. _sp_set(a, d);
  5573. }
  5574. return err;
  5575. }
  5576. #endif
  5577. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_RSA) || defined(OPENSSL_EXTRA)
  5578. /* Set a number into the multi-precision number.
  5579. *
  5580. * Number may be larger than the size of a digit.
  5581. *
  5582. * @param [out] a SP integer to set.
  5583. * @param [in] n Long value to set.
  5584. *
  5585. * @return MP_OKAY on success.
  5586. * @return MP_VAL when a is NULL.
  5587. */
int sp_set_int(sp_int* a, unsigned long n)
{
    int err = MP_OKAY;

    if (a == NULL) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Note: the if-brace opened inside the first #if block is closed
         * inside the matching #if below - keep them paired. */
    #if SP_WORD_SIZE < SP_ULONG_BITS
        /* Assign directly if value fits in one word. */
        if (n <= (sp_int_digit)SP_DIGIT_MAX) {
    #endif
            a->dp[0] = (sp_int_digit)n;
            a->used = (n != 0);
    #if SP_WORD_SIZE < SP_ULONG_BITS
        }
        else {
            unsigned int i;

            /* Assign value word by word, least significant first. */
            for (i = 0; (i < a->size) && (n > 0); i++,n >>= SP_WORD_SIZE) {
                a->dp[i] = (sp_int_digit)n;
            }
            /* Update number of words used. */
            a->used = i;
            /* Check for overflow - bits of n left but no space in a. */
            if ((i == a->size) && (n != 0)) {
                err = MP_VAL;
            }
        }
    #endif
    #ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Unsigned input - result is always positive. */
        a->sign = MP_ZPOS;
    #endif
    }

    return err;
}
  5623. #endif /* WOLFSSL_SP_MATH_ALL || !NO_RSA */
  5624. #if defined(WOLFSSL_SP_MATH_ALL) || \
  5625. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5626. !defined(NO_DH) || defined(HAVE_ECC)
  5627. /* Compare a one digit number with a multi-precision number.
  5628. *
  5629. * When a is NULL, MP_LT is returned.
  5630. *
  5631. * @param [in] a SP integer to compare.
  5632. * @param [in] d Digit to compare with.
  5633. *
  5634. * @return MP_GT when a is greater than d.
  5635. * @return MP_LT when a is less than d.
  5636. * @return MP_EQ when a is equals d.
  5637. */
int sp_cmp_d(const sp_int* a, sp_int_digit d)
{
    int ret = MP_EQ;

    /* No SP integer is always less - even when d is zero. */
    if (a == NULL) {
        ret = MP_LT;
    }
    else
#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Check sign first - a negative value is less than any digit. */
    if (a->sign == MP_NEG) {
        ret = MP_LT;
    }
    else
#endif
    {
        /* Check if SP integer has more than one word - must exceed any
         * single digit. */
        if (a->used > 1) {
            ret = MP_GT;
        }
        /* Special case for zero (used == 0). */
        else if (a->used == 0) {
            if (d != 0) {
                ret = MP_LT;
            }
            /* ret initialized to equal. */
        }
        else {
            /* The single word in the SP integer can now be compared with d. */
            if (a->dp[0] > d) {
                ret = MP_GT;
            }
            else if (a->dp[0] < d) {
                ret = MP_LT;
            }
            /* ret initialized to equal. */
        }
    }

    return ret;
}
  5678. #endif
  5679. #if defined(WOLFSSL_SP_ADD_D) || (defined(WOLFSSL_SP_INT_NEGATIVE) && \
  5680. defined(WOLFSSL_SP_SUB_D)) || defined(WOLFSSL_SP_READ_RADIX_10)
  5681. /* Add a one digit number to the multi-precision number.
  5682. *
  5683. * @param [in] a SP integer be added to.
  5684. * @param [in] d Digit to add.
  5685. * @param [out] r SP integer to store result in.
  5686. *
  5687. * @return MP_OKAY on success.
  5688. * @return MP_VAL when result is too large for fixed size dp array.
  5689. */
static int _sp_add_d(const sp_int* a, sp_int_digit d, sp_int* r)
{
    int err = MP_OKAY;

    /* Special case of zero means we want result to have a digit when not
     * adding zero. */
    if (a->used == 0) {
        r->dp[0] = d;
        r->used = (d > 0);
    }
    else {
        /* i tracks the last word written; the copy loop below resumes
         * after it. */
        unsigned int i = 0;
        sp_int_digit a0 = a->dp[0];

        /* Set used of result - updated if overflow seen. */
        r->used = a->used;

        r->dp[0] = a0 + d;
        /* Check for carry - unsigned wrap means sum is below a0. */
        if (r->dp[0] < a0) {
            /* Do carry through all words - stop at first word that does
             * not wrap to zero. */
            for (++i; i < a->used; i++) {
                r->dp[i] = a->dp[i] + 1;
                if (r->dp[i] != 0) {
                    break;
                }
            }
            /* Add another word if carry rippled past the top. */
            if (i == a->used) {
                /* Check result has enough space for another word. */
                if (i < r->size) {
                    r->used++;
                    r->dp[i] = 1;
                }
                else {
                    err = MP_VAL;
                }
            }
        }
        /* When result is not the same as input, copy rest of digits. */
        if ((err == MP_OKAY) && (r != a)) {
            /* Copy any words that didn't update with carry. */
            for (++i; i < a->used; i++) {
                r->dp[i] = a->dp[i];
            }
        }
    }

    return err;
}
  5736. #endif /* WOLFSSL_SP_ADD_D || (WOLFSSL_SP_INT_NEGATIVE && WOLFSSL_SP_SUB_D) ||
  5737. * defined(WOLFSSL_SP_READ_RADIX_10) */
  5738. #if (defined(WOLFSSL_SP_INT_NEGATIVE) && defined(WOLFSSL_SP_ADD_D)) || \
  5739. defined(WOLFSSL_SP_SUB_D) || defined(WOLFSSL_SP_INVMOD) || \
  5740. defined(WOLFSSL_SP_INVMOD_MONT_CT) || (defined(WOLFSSL_SP_PRIME_GEN) && \
  5741. !defined(WC_NO_RNG))
  5742. /* Sub a one digit number from the multi-precision number.
  5743. *
  5744. * @param [in] a SP integer be subtracted from.
  5745. * @param [in] d Digit to subtract.
  5746. * @param [out] r SP integer to store result in.
  5747. */
static void _sp_sub_d(const sp_int* a, sp_int_digit d, sp_int* r)
{
    /* Set result used to be same as input. Updated with clamp. */
    r->used = a->used;
    /* Only possible when not handling negatives. */
    if (a->used == 0) {
        /* Set result to zero as no negative support. */
        r->dp[0] = 0;
    }
    else {
        /* i tracks the last word written; the copy loop below resumes
         * after it. */
        unsigned int i = 0;
        sp_int_digit a0 = a->dp[0];

        r->dp[0] = a0 - d;
        /* Check for borrow - unsigned wrap means difference is above a0. */
        if (r->dp[0] > a0) {
            /* Do borrow through all words - stop at first word that does
             * not wrap to SP_DIGIT_MAX. */
            for (++i; i < a->used; i++) {
                r->dp[i] = a->dp[i] - 1;
                if (r->dp[i] != SP_DIGIT_MAX) {
                    break;
                }
            }
        }
        /* When result is not the same as input, copy rest of digits. */
        if (r != a) {
            /* Copy any words that didn't update with borrow. */
            for (++i; i < a->used; i++) {
                r->dp[i] = a->dp[i];
            }
        }
        /* Remove leading zero words. */
        sp_clamp(r);
    }
}
  5782. #endif /* (WOLFSSL_SP_INT_NEGATIVE && WOLFSSL_SP_ADD_D) || WOLFSSL_SP_SUB_D
  5783. * WOLFSSL_SP_INVMOD || WOLFSSL_SP_INVMOD_MONT_CT ||
  5784. * WOLFSSL_SP_PRIME_GEN */
  5785. #ifdef WOLFSSL_SP_ADD_D
  5786. /* Add a one digit number to the multi-precision number.
  5787. *
  5788. * @param [in] a SP integer be added to.
  5789. * @param [in] d Digit to add.
  5790. * @param [out] r SP integer to store result in.
  5791. *
  5792. * @return MP_OKAY on success.
  5793. * @return MP_VAL when result is too large for fixed size dp array.
  5794. */
int sp_add_d(const sp_int* a, sp_int_digit d, sp_int* r)
{
    int err = MP_OKAY;

    /* Check validity of parameters. */
    if ((a == NULL) || (r == NULL)) {
        err = MP_VAL;
    }

#ifndef WOLFSSL_SP_INT_NEGATIVE
    /* Check for space in result especially when carry adds a new word. */
    if ((err == MP_OKAY) && (a->used + 1 > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Positive only so just use internal function. */
        err = _sp_add_d(a, d, r);
    }
#else
    /* Check for space in result especially when carry adds a new word. */
    if ((err == MP_OKAY) && (a->sign == MP_ZPOS) && (a->used + 1 > r->size)) {
        err = MP_VAL;
    }
    /* Check for space in result - no carry but borrow possible. */
    if ((err == MP_OKAY) && (a->sign == MP_NEG) && (a->used > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        if (a->sign == MP_ZPOS) {
            /* Positive, so use internal function. */
            r->sign = MP_ZPOS;
            err = _sp_add_d(a, d, r);
        }
        else if ((a->used > 1) || (a->dp[0] > d)) {
            /* Negative value with magnitude bigger than digit: result stays
             * negative, subtract the digit from the magnitude. */
            r->sign = MP_NEG;
            _sp_sub_d(a, d, r);
        }
        else {
            /* Negative value with magnitude smaller than or equal to digit:
             * result flips to non-negative. */
            r->sign = MP_ZPOS;
            /* Subtract magnitude from digit. */
            r->dp[0] = d - a->dp[0];
            /* Result is a single digit; used is 0 when it is zero. */
            r->used = (r->dp[0] > 0);
        }
    }
#endif

    return err;
}
  5843. #endif /* WOLFSSL_SP_ADD_D */
  5844. #ifdef WOLFSSL_SP_SUB_D
  5845. /* Sub a one digit number from the multi-precision number.
  5846. *
  5847. * @param [in] a SP integer be subtracted from.
  5848. * @param [in] d Digit to subtract.
  5849. * @param [out] r SP integer to store result in.
  5850. *
  5851. * @return MP_OKAY on success.
  5852. * @return MP_VAL when a or r is NULL.
  5853. */
int sp_sub_d(const sp_int* a, sp_int_digit d, sp_int* r)
{
    int err = MP_OKAY;

    /* Check validity of parameters. */
    if ((a == NULL) || (r == NULL)) {
        err = MP_VAL;
    }

#ifndef WOLFSSL_SP_INT_NEGATIVE
    /* Check for space in result. */
    if ((err == MP_OKAY) && (a->used > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Positive only so just use internal function. */
        _sp_sub_d(a, d, r);
    }
#else
    /* Check for space in result especially when borrow adds a new word. */
    if ((err == MP_OKAY) && (a->sign == MP_NEG) && (a->used + 1 > r->size)) {
        err = MP_VAL;
    }
    /* Check for space in result - no carry but borrow possible. */
    if ((err == MP_OKAY) && (a->sign == MP_ZPOS) && (a->used > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        if (a->sign == MP_NEG) {
            /* Subtracting from negative: magnitude grows, use internal
             * add. */
            r->sign = MP_NEG;
            err = _sp_add_d(a, d, r);
        }
        else if ((a->used > 1) || (a->dp[0] >= d)) {
            /* Positive number greater than or equal to digit - subtract
             * digit. */
            r->sign = MP_ZPOS;
            _sp_sub_d(a, d, r);
        }
        else {
            /* Positive value smaller than digit - result is negative. */
            r->sign = MP_NEG;
            /* Subtract positive value from digit. */
            r->dp[0] = d - a->dp[0];
            /* Result is a single strictly positive digit (a < d here). */
            r->used = 1;
        }
    }
#endif

    return err;
}
  5903. #endif /* WOLFSSL_SP_SUB_D */
  5904. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5905. defined(WOLFSSL_SP_SMALL) && (defined(WOLFSSL_SP_MATH_ALL) || \
  5906. !defined(NO_DH) || defined(HAVE_ECC) || \
  5907. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  5908. !defined(WOLFSSL_RSA_PUBLIC_ONLY))) || \
  5909. (defined(WOLFSSL_KEY_GEN) && !defined(NO_RSA))
/* Multiply a by digit d and put result into r shifting up o digits.
 * r = (a * d) << (o * SP_WORD_SIZE)
 *
 * @param  [in]   a  SP integer to be multiplied.
 * @param  [in]   d  SP digit to multiply by.
 * @param  [out]  r  SP integer result.
 * @param  [in]   o  Number of digits to move result up by. Only used when
 *                   WOLFSSL_SP_SMALL is defined (small code size division).
 * @return  MP_OKAY on success.
 * @return  MP_VAL when result is too large for sp_int.
 */
static int _sp_mul_d(const sp_int* a, sp_int_digit d, sp_int* r, unsigned int o)
{
    int err = MP_OKAY;
    unsigned int i;
#ifndef SQR_MUL_ASM
    sp_int_word t = 0;
#else
    sp_int_digit l = 0;
    sp_int_digit h = 0;
#endif

#ifdef WOLFSSL_SP_SMALL
    /* Zero out offset words. */
    for (i = 0; i < o; i++) {
        r->dp[i] = 0;
    }
#else
    /* Don't use the offset. Only when doing small code size div. */
    (void)o;
#endif

    /* Multiply each word of a by d. */
    for (i = 0; i < a->used; i++, o++) {
#ifndef SQR_MUL_ASM
        /* Add product to top word of previous result. */
        t += (sp_int_word)a->dp[i] * d;
        /* Store low word. */
        r->dp[o] = (sp_int_digit)t;
        /* Move top word down. */
        t >>= SP_WORD_SIZE;
#else
        /* Multiply and add into low and high from previous result.
         * No overflow possible with add. */
        SP_ASM_MUL_ADD_NO(l, h, a->dp[i], d);
        /* Store low word. */
        r->dp[o] = l;
        /* Move high word into low word and set high word to 0. */
        l = h;
        h = 0;
#endif
    }

    /* Check whether new word to be appended to result. */
#ifndef SQR_MUL_ASM
    if (t > 0)
#else
    if (l > 0)
#endif
    {
        /* Validate space available in result. */
        if (o == r->size) {
            err = MP_VAL;
        }
        else {
            /* Store new top word. */
#ifndef SQR_MUL_ASM
            r->dp[o++] = (sp_int_digit)t;
#else
            r->dp[o++] = l;
#endif
        }
    }
    /* Update number of words in result. */
    r->used = o;
    /* In case d is zero. */
    sp_clamp(r);

    return err;
}
  5985. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) ||
  5986. * WOLFSSL_SP_SMALL || (WOLFSSL_KEY_GEN && !NO_RSA) */
  5987. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  5988. (defined(WOLFSSL_KEY_GEN) && !defined(NO_RSA))
  5989. /* Multiply a by digit n and put result into r. r = a * n
  5990. *
  5991. * @param [in] a SP integer to multiply.
  5992. * @param [in] n Digit to multiply by.
  5993. * @param [out] r SP integer to hold result.
  5994. *
  5995. * @return MP_OKAY on success.
  5996. * @return MP_VAL when a or b is NULL, or a has maximum number of digits used.
  5997. */
  5998. int sp_mul_d(const sp_int* a, sp_int_digit d, sp_int* r)
  5999. {
  6000. int err = MP_OKAY;
  6001. /* Validate parameters. */
  6002. if ((a == NULL) || (r == NULL)) {
  6003. err = MP_VAL;
  6004. }
  6005. /* Check space for product result - _sp_mul_d checks when new word added. */
  6006. if ((err == MP_OKAY) && (a->used > r->size)) {
  6007. err = MP_VAL;
  6008. }
  6009. if (err == MP_OKAY) {
  6010. err = _sp_mul_d(a, d, r, 0);
  6011. #ifdef WOLFSSL_SP_INT_NEGATIVE
  6012. /* Update sign. */
  6013. if (d == 0) {
  6014. r->sign = MP_ZPOS;
  6015. }
  6016. else {
  6017. r->sign = a->sign;
  6018. }
  6019. #endif
  6020. }
  6021. return err;
  6022. }
  6023. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) ||
  6024. * (WOLFSSL_KEY_GEN && !NO_RSA) */
  6025. /* Predefine complicated rules of when to compile in sp_div_d and sp_mod_d. */
  6026. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  6027. defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
  6028. defined(OPENSSL_EXTRA) || defined(WC_MP_TO_RADIX)
  6029. #define WOLFSSL_SP_DIV_D
  6030. #endif
  6031. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  6032. !defined(NO_DH) || \
  6033. (defined(HAVE_ECC) && (defined(FP_ECC) || defined(HAVE_COMP_KEY))) || \
  6034. (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN))
  6035. #define WOLFSSL_SP_MOD_D
  6036. #endif
  6037. #if (defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  6038. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  6039. !defined(WOLFSSL_RSA_PUBLIC_ONLY))) || \
  6040. defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)
  6041. #ifndef SP_ASM_DIV_WORD
/* Divide a two digit number by a digit number and return. (hi | lo) / d
 *
 * Callers pass the remainder of a previous division as hi, so hi < d and the
 * quotient fits in a single digit.
 *
 * @param  [in]  hi  SP integer digit. High digit of the dividend.
 * @param  [in]  lo  SP integer digit. Lower digit of the dividend.
 * @param  [in]  d   SP integer digit. Number to divide by.
 * @return  The division result.
 */
static WC_INLINE sp_int_digit sp_div_word(sp_int_digit hi, sp_int_digit lo,
    sp_int_digit d)
{
#ifdef WOLFSSL_SP_DIV_WORD_HALF
    sp_int_digit r;

    /* Trial division using half of the bits in d. */

    /* Check for shortcut when no high word set. */
    if (hi == 0) {
        r = lo / d;
    }
    else {
        /* Half the bits of d. */
        sp_int_digit divh = d >> SP_HALF_SIZE;
        /* Number to divide in one value. */
        sp_int_word w = ((sp_int_word)hi << SP_WORD_SIZE) | lo;
        sp_int_word trial;
        sp_int_digit r2;

        /* Calculation for top SP_WORD_SIZE / 2 bits of dividend. */
        /* Divide high word by top half of divisor. */
        r = hi / divh;
        /* When result too big then assume only max value. */
        if (r > SP_HALF_MAX) {
            r = SP_HALF_MAX;
        }
        /* Shift up result for trial division calculation. */
        r <<= SP_HALF_SIZE;
        /* Calculate trial value. */
        trial = r * (sp_int_word)d;
        /* Decrease r while trial is too big. */
        while (trial > w) {
            r -= (sp_int_digit)1 << SP_HALF_SIZE;
            trial -= (sp_int_word)d << SP_HALF_SIZE;
        }
        /* Subtract trial. */
        w -= trial;

        /* Calculation for remaining second SP_WORD_SIZE / 2 bits. */
        /* Divide top SP_WORD_SIZE bits of remainder by top half of divisor. */
        r2 = ((sp_int_digit)(w >> SP_HALF_SIZE)) / divh;
        /* Calculate trial value. */
        trial = r2 * (sp_int_word)d;
        /* Decrease r2 while trial is too big. */
        while (trial > w) {
            r2--;
            trial -= d;
        }
        /* Subtract trial. */
        w -= trial;
        /* Update result. */
        r += r2;

        /* Calculation for remaining bottom SP_WORD_SIZE bits. */
        r2 = ((sp_int_digit)w) / d;
        /* Update result. */
        r += r2;
    }

    return r;
#else
    sp_int_word w;
    sp_int_digit r;

    /* Use built-in divide on the double-width word type. */
    w = ((sp_int_word)hi << SP_WORD_SIZE) | lo;
    w /= d;
    r = (sp_int_digit)w;

    return r;
#endif /* WOLFSSL_SP_DIV_WORD_HALF */
}
  6114. #endif /* !SP_ASM_DIV_WORD */
  6115. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
  6116. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  6117. #if (defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)) && \
  6118. !defined(WOLFSSL_SP_SMALL)
  6119. #if SP_WORD_SIZE == 64
  6120. /* 2^64 / 3 */
  6121. #define SP_DIV_3_CONST 0x5555555555555555L
  6122. /* 2^64 / 10 */
  6123. #define SP_DIV_10_CONST 0x1999999999999999L
  6124. #elif SP_WORD_SIZE == 32
  6125. /* 2^32 / 3 */
  6126. #define SP_DIV_3_CONST 0x55555555
  6127. /* 2^32 / 10 */
  6128. #define SP_DIV_10_CONST 0x19999999
  6129. #elif SP_WORD_SIZE == 16
  6130. /* 2^16 / 3 */
  6131. #define SP_DIV_3_CONST 0x5555
  6132. /* 2^16 / 10 */
  6133. #define SP_DIV_10_CONST 0x1999
  6134. #elif SP_WORD_SIZE == 8
  6135. /* 2^8 / 3 */
  6136. #define SP_DIV_3_CONST 0x55
  6137. /* 2^8 / 10 */
  6138. #define SP_DIV_10_CONST 0x19
  6139. #endif
  6140. #if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE < 64)
/* Divide by 3: r = a / 3 and rem = a % 3
 *
 * Used in checking prime: (a % 3) == 0?
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  Digit that is the remainder. May be NULL only when r is
 *                     not NULL (dereferenced unconditionally when r is NULL).
 */
static void _sp_div_3(const sp_int* a, sp_int* r, sp_int_digit* rem)
{
#ifndef SQR_MUL_ASM
    sp_int_word t;
    sp_int_digit tt;
#else
    sp_int_digit l = 0;
    sp_int_digit tt = 0;
    sp_int_digit t = SP_DIV_3_CONST;
    sp_int_digit lm = 0;
    sp_int_digit hm = 0;
#endif
    sp_int_digit tr = 0;
    /* Quotient fixup. */
    static const unsigned char sp_r6[6] = { 0, 0, 0, 1, 1, 1 };
    /* Remainder fixup. */
    static const unsigned char sp_rem6[6] = { 0, 1, 2, 0, 1, 2 };

    /* Check whether only mod value needed. */
    if (r == NULL) {
        unsigned int i;

        /* 2^2 mod 3 = 4 mod 3 = 1.
         * => 2^(2*n) mod 3 = (2^2 mod 3)^n mod 3 = 1^n mod 3 = 1
         * => (2^(2*n) * x) mod 3 = (2^(2*n) mod 3) * (x mod 3) = x mod 3
         *
         * Calculate mod 3 on sum of digits as SP_WORD_SIZE is a multiple of 2.
         */
#ifndef SQR_MUL_ASM
        t = 0;
        /* Sum the digits. */
        for (i = 0; i < a->used; i++) {
            t += a->dp[i];
        }
        /* Sum digits of sum. */
        t = (t >> SP_WORD_SIZE) + (t & SP_MASK);
        /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 3. */
        tt = (sp_int_digit)((t * SP_DIV_3_CONST) >> SP_WORD_SIZE);
        /* Subtract trial division. */
        tr = (sp_int_digit)(t - (sp_int_word)tt * 3);
#else
        /* Sum the digits. */
        for (i = 0; i < a->used; i++) {
            SP_ASM_ADDC_REG(l, tr, a->dp[i]);
        }
        /* Sum digits of sum - can get carry. */
        SP_ASM_ADDC_REG(l, tt, tr);
        /* Multiply digit by (2^SP_WORD_SIZE) / 3. */
        SP_ASM_MUL(lm, hm, l, t);
        /* Add remainder multiplied by (2^SP_WORD_SIZE) / 3 to top digit. */
        hm += tt * SP_DIV_3_CONST;
        /* Subtract trial division from digit. */
        tr = l - (hm * 3);
#endif
        /* tr is 0..5 but need 0..2 */
        /* Fix up remainder. */
        tr = sp_rem6[tr];
        *rem = tr;
    }
    /* At least result needed - remainder is calculated anyway. */
    else {
        int i;

        /* Divide starting at most significant word down to least. */
        for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
            /* Combine remainder from last operation with this word. */
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 3. */
            tt = (sp_int_digit)((t * SP_DIV_3_CONST) >> SP_WORD_SIZE);
            /* Subtract trial division. */
            tr = (sp_int_digit)(t - (sp_int_word)tt * 3);
#else
            /* Multiply digit by (2^SP_WORD_SIZE) / 3. */
            SP_ASM_MUL(l, tt, a->dp[i], t);
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 3 to top digit. */
            tt += tr * SP_DIV_3_CONST;
            /* Subtract trial division from digit. */
            tr = a->dp[i] - (tt * 3);
#endif
            /* tr is 0..5 but need 0..2 */
            /* Fix up result. */
            tt += sp_r6[tr];
            /* Fix up remainder. */
            tr = sp_rem6[tr];
            /* Store result of digit divided by 3. */
            r->dp[i] = tt;
        }
        /* Set the used amount to maximal amount. */
        r->used = a->used;
        /* Remove leading zeros. */
        sp_clamp(r);
        /* Return remainder if required. */
        if (rem != NULL) {
            *rem = tr;
        }
    }
}
#endif /* !WOLFSSL_SP_SMALL && (SP_WORD_SIZE < 64) */
/* Divide by 10: r = a / 10 and rem = a % 10
 *
 * Used when writing with a radix of 10 - decimal number.
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  Digit that is the remainder. May be NULL only when r is
 *                     not NULL (dereferenced unconditionally when r is NULL).
 */
static void _sp_div_10(const sp_int* a, sp_int* r, sp_int_digit* rem)
{
    int i;
#ifndef SQR_MUL_ASM
    sp_int_word t;
    sp_int_digit tt;
#else
    sp_int_digit l = 0;
    sp_int_digit tt = 0;
    sp_int_digit t = SP_DIV_10_CONST;
#endif
    sp_int_digit tr = 0;

    /* Check whether only mod value needed. */
    if (r == NULL) {
        /* Divide starting at most significant word down to least. */
        for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
            /* Combine remainder from last operation with this word. */
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 10. */
            tt = (sp_int_digit)((t * SP_DIV_10_CONST) >> SP_WORD_SIZE);
            /* Subtract trial division. */
            tr = (sp_int_digit)(t - (sp_int_word)tt * 10);
#else
            /* Multiply digit by (2^SP_WORD_SIZE) / 10. */
            SP_ASM_MUL(l, tt, a->dp[i], t);
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 10 to top digit.
             */
            tt += tr * SP_DIV_10_CONST;
            /* Subtract trial division from digit. */
            tr = a->dp[i] - (tt * 10);
#endif
            /* tr is 0..99 but need 0..9 */
            /* Fix up remainder. */
            tr = tr % 10;
        }
        *rem = tr;
    }
    /* At least result needed - remainder is calculated anyway. */
    else {
        /* Divide starting at most significant word down to least. */
        for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
            /* Combine remainder from last operation with this word. */
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
            /* Get top digit after multiplying by (2^SP_WORD_SIZE) / 10. */
            tt = (sp_int_digit)((t * SP_DIV_10_CONST) >> SP_WORD_SIZE);
            /* Subtract trial division. */
            tr = (sp_int_digit)(t - (sp_int_word)tt * 10);
#else
            /* Multiply digit by (2^SP_WORD_SIZE) / 10. */
            SP_ASM_MUL(l, tt, a->dp[i], t);
            /* Add remainder multiplied by (2^SP_WORD_SIZE) / 10 to top digit.
             */
            tt += tr * SP_DIV_10_CONST;
            /* Subtract trial division from digit. */
            tr = a->dp[i] - (tt * 10);
#endif
            /* tr is 0..99 but need 0..9 */
            /* Fix up result. */
            tt += tr / 10;
            /* Fix up remainder. */
            tr %= 10;
            /* Store result of digit divided by 10. */
            r->dp[i] = tt;
        }
        /* Set the used amount to maximal amount. */
        r->used = a->used;
        /* Remove leading zeros. */
        sp_clamp(r);
        /* Return remainder if required. */
        if (rem != NULL) {
            *rem = tr;
        }
    }
}
  6329. #endif /* (WOLFSSL_SP_DIV_D || WOLFSSL_SP_MOD_D) && !WOLFSSL_SP_SMALL */
  6330. #if defined(WOLFSSL_SP_DIV_D) || defined(WOLFSSL_SP_MOD_D)
/* Divide by small number: r = a / d and rem = a % d
 *
 * Only called with divisors d <= SP_HALF_MAX (see sp_div_d/sp_mod_d callers)
 * so the trial products and fix-ups stay within range.
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [in]   d    Digit to divide by. Must not be 0.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  Digit that is the remainder. May be NULL. In non-small
 *                     builds rem must not be NULL when r is NULL.
 */
static void _sp_div_small(const sp_int* a, sp_int_digit d, sp_int* r,
    sp_int_digit* rem)
{
    int i;
#ifndef SQR_MUL_ASM
    sp_int_word t;
    sp_int_digit tt;
#else
    sp_int_digit l = 0;
    sp_int_digit tt = 0;
#endif
    sp_int_digit tr = 0;
    /* Multiplier for trial division: floor(SP_DIGIT_MAX / d). */
    sp_int_digit m = SP_DIGIT_MAX / d;

#ifndef WOLFSSL_SP_SMALL
    /* Check whether only mod value needed. */
    if (r == NULL) {
        /* Divide starting at most significant word down to least. */
        for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
            /* Combine remainder from last operation with this word. */
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
            /* Get top digit after multiplying. */
            tt = (sp_int_digit)((t * m) >> SP_WORD_SIZE);
            /* Subtract trial division. */
            tr = (sp_int_digit)t - (sp_int_digit)(tt * d);
#else
            /* Multiply digit. */
            SP_ASM_MUL(l, tt, a->dp[i], m);
            /* Add multiplied remainder to top digit. */
            tt += tr * m;
            /* Subtract trial division from digit. */
            tr = a->dp[i] - (tt * d);
#endif
            /* tr < d * d */
            /* Fix up remainder. */
            tr = tr % d;
        }
        *rem = tr;
    }
    /* At least result needed - remainder is calculated anyway. */
    else
#endif /* !WOLFSSL_SP_SMALL */
    {
        /* Divide starting at most significant word down to least. */
        for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
            /* Combine remainder from last operation with this word. */
            t = ((sp_int_word)tr << SP_WORD_SIZE) | a->dp[i];
            /* Get top digit after multiplying. */
            tt = (sp_int_digit)((t * m) >> SP_WORD_SIZE);
            /* Subtract trial division. */
            tr = (sp_int_digit)t - (sp_int_digit)(tt * d);
#else
            /* Multiply digit. */
            SP_ASM_MUL(l, tt, a->dp[i], m);
            /* Add multiplied remainder to top digit. */
            tt += tr * m;
            /* Subtract trial division from digit. */
            tr = a->dp[i] - (tt * d);
#endif
            /* tr < d * d */
            /* Fix up result. */
            tt += tr / d;
            /* Fix up remainder. */
            tr %= d;
            /* Store result of dividing the digit. */
#ifdef WOLFSSL_SP_SMALL
            if (r != NULL)
#endif
            {
                r->dp[i] = tt;
            }
        }
#ifdef WOLFSSL_SP_SMALL
        if (r != NULL)
#endif
        {
            /* Set the used amount to maximal amount. */
            r->used = a->used;
            /* Remove leading zeros. */
            sp_clamp(r);
        }
        /* Return remainder if required. */
        if (rem != NULL) {
            *rem = tr;
        }
    }
}
  6426. #endif
  6427. #ifdef WOLFSSL_SP_DIV_D
/* Divide a multi-precision number by a digit size number and calculate
 * remainder.
 * r = a / d; rem = a % d
 *
 * Use trial division algorithm - one word at a time via sp_div_word.
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [in]   d    Digit to divide by. Must not be 0.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  Digit that is the remainder. May be NULL.
 */
static void _sp_div_d(const sp_int* a, sp_int_digit d, sp_int* r,
    sp_int_digit* rem)
{
    int i;
#ifndef SQR_MUL_ASM
    sp_int_word w = 0;
#else
    sp_int_digit l;
    sp_int_digit h = 0;
#endif
    sp_int_digit t;

    /* Divide starting at most significant word down to least. */
    for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
        /* Combine remainder from last operation with this word and divide. */
        t = sp_div_word((sp_int_digit)w, a->dp[i], d);
        /* Combine remainder from last operation with this word. */
        w = (w << SP_WORD_SIZE) | a->dp[i];
        /* Subtract to get modulo result. */
        w -= (sp_int_word)t * d;
#else
        /* Get current word. */
        l = a->dp[i];
        /* Combine remainder from last operation with this word and divide. */
        t = sp_div_word(h, l, d);
        /* Subtract to get modulo result. */
        h = l - t * d;
#endif
        /* Store result of dividing the digit. */
        if (r != NULL) {
            r->dp[i] = t;
        }
    }
    if (r != NULL) {
        /* Set the used amount to maximal amount. */
        r->used = a->used;
        /* Remove leading zeros. */
        sp_clamp(r);
    }
    /* Return remainder if required. */
    if (rem != NULL) {
#ifndef SQR_MUL_ASM
        *rem = (sp_int_digit)w;
#else
        *rem = h;
#endif
    }
}
/* Divide a multi-precision number by a digit size number and calculate
 * remainder.
 * r = a / d; rem = a % d
 *
 * Use trial division algorithm. Fast paths for divisors of 3 and 10, and for
 * divisors fitting in half a word.
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [in]   d    Digit to divide by.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  Digit that is the remainder. May be NULL.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a is NULL, d is 0, or r is too small for the quotient.
 */
int sp_div_d(const sp_int* a, sp_int_digit d, sp_int* r, sp_int_digit* rem)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (d == 0)) {
        err = MP_VAL;
    }
    /* Check space for maximal sized result. */
    if ((err == MP_OKAY) && (r != NULL) && (a->used > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
#if !defined(WOLFSSL_SP_SMALL)
#if SP_WORD_SIZE < 64
        if (d == 3) {
            /* Fast implementation for divisor of 3. */
            _sp_div_3(a, r, rem);
        }
        else
#endif
        if (d == 10) {
            /* Fast implementation for divisor of 10 - sp_todecimal(). */
            _sp_div_10(a, r, rem);
        }
        else
#endif
        if (d <= SP_HALF_MAX) {
            /* For small divisors. */
            _sp_div_small(a, d, r, rem);
        }
        else
        {
            /* General case - word-at-a-time trial division. */
            _sp_div_d(a, d, r, rem);
        }
#ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Quotient takes the sign of the dividend - d is unsigned. */
        if (r != NULL) {
            r->sign = a->sign;
        }
#endif
    }

    return err;
}
  6541. #endif /* WOLFSSL_SP_DIV_D */
  6542. #ifdef WOLFSSL_SP_MOD_D
/* Calculate a modulo the digit d into r: r = a mod d
 *
 * Trial division one word at a time via sp_div_word; only the running
 * remainder is kept - no quotient is stored.
 *
 * @param  [in]   a  SP integer to reduce.
 * @param  [in]   d  Digit that is the modulus. Must not be 0.
 * @param  [out]  r  Digit that is the result.
 */
static void _sp_mod_d(const sp_int* a, const sp_int_digit d, sp_int_digit* r)
{
    int i;
#ifndef SQR_MUL_ASM
    sp_int_word w = 0;
#else
    sp_int_digit h = 0;
#endif

    /* Divide starting at most significant word down to least. */
    for (i = (int)(a->used - 1); i >= 0; i--) {
#ifndef SQR_MUL_ASM
        /* Combine remainder from last operation with this word and divide. */
        sp_int_digit t = sp_div_word((sp_int_digit)w, a->dp[i], d);
        /* Combine remainder from last operation with this word. */
        w = (w << SP_WORD_SIZE) | a->dp[i];
        /* Subtract to get modulo result. */
        w -= (sp_int_word)t * d;
#else
        /* Combine remainder from last operation with this word and divide. */
        sp_int_digit t = sp_div_word(h, a->dp[i], d);
        /* Subtract to get modulo result. */
        h = a->dp[i] - t * d;
#endif
    }

    /* Return remainder. */
#ifndef SQR_MUL_ASM
    *r = (sp_int_digit)w;
#else
    *r = h;
#endif
}
  6580. /* Calculate a modulo the digit d into r: r = a mod d
  6581. *
  6582. * @param [in] a SP integer to reduce.
  6583. * @param [in] d Digit to that is the modulus.
  6584. * @param [out] r Digit that is the result.
  6585. *
  6586. * @return MP_OKAY on success.
  6587. * @return MP_VAL when a is NULL or d is 0.
  6588. */
  6589. #if !defined(WOLFSSL_SP_MATH_ALL) && (!defined(HAVE_ECC) || \
  6590. !defined(HAVE_COMP_KEY)) && !defined(OPENSSL_EXTRA)
  6591. static
  6592. #endif /* !WOLFSSL_SP_MATH_ALL && (!HAVE_ECC || !HAVE_COMP_KEY) */
  6593. int sp_mod_d(const sp_int* a, sp_int_digit d, sp_int_digit* r)
  6594. {
  6595. int err = MP_OKAY;
  6596. /* Validate parameters. */
  6597. if ((a == NULL) || (r == NULL) || (d == 0)) {
  6598. err = MP_VAL;
  6599. }
  6600. #if 0
  6601. sp_print(a, "a");
  6602. sp_print_digit(d, "m");
  6603. #endif
  6604. if (err == MP_OKAY) {
  6605. /* Check whether d is a power of 2. */
  6606. if ((d & (d - 1)) == 0) {
  6607. if (a->used == 0) {
  6608. *r = 0;
  6609. }
  6610. else {
  6611. *r = a->dp[0] & (d - 1);
  6612. }
  6613. }
  6614. #if !defined(WOLFSSL_SP_SMALL)
  6615. #if SP_WORD_SIZE < 64
  6616. else if (d == 3) {
  6617. /* Fast implementation for divisor of 3. */
  6618. _sp_div_3(a, NULL, r);
  6619. }
  6620. #endif
  6621. else if (d == 10) {
  6622. /* Fast implementation for divisor of 10. */
  6623. _sp_div_10(a, NULL, r);
  6624. }
  6625. #endif
  6626. else if (d <= SP_HALF_MAX) {
  6627. /* For small divisors. */
  6628. _sp_div_small(a, d, NULL, r);
  6629. }
  6630. else {
  6631. _sp_mod_d(a, d, r);
  6632. }
  6633. #ifdef WOLFSSL_SP_INT_NEGATIVE
  6634. if (a->sign == MP_NEG) {
  6635. *r = d - *r;
  6636. }
  6637. #endif
  6638. }
  6639. #if 0
  6640. sp_print_digit(*r, "rmod");
  6641. #endif
  6642. return err;
  6643. }
  6644. #endif /* WOLFSSL_SP_MOD_D */
  6645. #if defined(HAVE_ECC) || !defined(NO_DSA) || defined(OPENSSL_EXTRA) || \
  6646. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  6647. !defined(WOLFSSL_RSA_PUBLIC_ONLY))
  6648. /* Divides a by 2 and stores in r: r = a >> 1
  6649. *
  6650. * @param [in] a SP integer to divide.
  6651. * @param [out] r SP integer to hold result.
  6652. */
  6653. static void _sp_div_2(const sp_int* a, sp_int* r)
  6654. {
  6655. int i;
  6656. /* Shift down each word by 1 and include bottom bit of next at top. */
  6657. for (i = 0; i < (int)a->used - 1; i++) {
  6658. r->dp[i] = (a->dp[i] >> 1) | (a->dp[i+1] << (SP_WORD_SIZE - 1));
  6659. }
  6660. /* Last word only needs to be shifted down. */
  6661. r->dp[i] = a->dp[i] >> 1;
  6662. /* Set used to be all words seen. */
  6663. r->used = (unsigned int)i + 1;
  6664. /* Remove leading zeros. */
  6665. sp_clamp(r);
  6666. #ifdef WOLFSSL_SP_INT_NEGATIVE
  6667. /* Same sign in result. */
  6668. r->sign = a->sign;
  6669. #endif
  6670. }
  6671. #if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
  6672. /* Divides a by 2 and stores in r: r = a >> 1
  6673. *
  6674. * @param [in] a SP integer to divide.
  6675. * @param [out] r SP integer to hold result.
  6676. *
  6677. * @return MP_OKAY on success.
  6678. * @return MP_VAL when a or r is NULL.
  6679. */
  6680. int sp_div_2(const sp_int* a, sp_int* r)
  6681. {
  6682. int err = MP_OKAY;
  6683. /* Only when a public API. */
  6684. if ((a == NULL) || (r == NULL)) {
  6685. err = MP_VAL;
  6686. }
  6687. /* Ensure maximal size is supported by result. */
  6688. if ((err == MP_OKAY) && (a->used > r->size)) {
  6689. err = MP_VAL;
  6690. }
  6691. if (err == MP_OKAY) {
  6692. _sp_div_2(a, r);
  6693. }
  6694. return err;
  6695. }
  6696. #endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
  6697. #endif /* HAVE_ECC || !NO_DSA || OPENSSL_EXTRA ||
  6698. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  6699. #if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
/* Divides a by 2 mod m and stores in r: r = (a / 2) mod m
 *
 * r = a / 2 (mod m) - constant time (a < m and positive)
 *
 * Adds m to a when a is odd (via masking, not branching) so the sum is even,
 * then shifts the sum down one bit. Loop bound is m->used regardless of a.
 *
 * @param  [in]   a  SP integer to divide.
 * @param  [in]   m  SP integer that is modulus.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a, m or r is NULL, or r is too small to hold a + m.
 */
int sp_div_2_mod_ct(const sp_int* a, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (m == NULL) || (r == NULL)) {
        err = MP_VAL;
    }
    /* Check result has enough space for a + m. */
    if ((err == MP_OKAY) && (m->used + 1 > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
#ifndef SQR_MUL_ASM
        sp_int_word w = 0;
#else
        sp_int_digit l = 0;
        sp_int_digit h;
        sp_int_digit t;
#endif
        /* Mask to apply to modulus: all ones when a is odd, zero otherwise. */
        sp_int_digit mask = (sp_int_digit)0 - (a->dp[0] & 1);
        unsigned int i;

#if 0
        sp_print(a, "a");
        sp_print(m, "m");
#endif

        /* Add a to m, if a is odd, into r in constant time. */
        for (i = 0; i < m->used; i++) {
            /* Mask to apply to a - set when used value at index. */
            sp_int_digit mask_a = (sp_int_digit)0 - (i < a->used);

#ifndef SQR_MUL_ASM
            /* Conditionally add modulus. */
            w += m->dp[i] & mask;
            /* Conditionally add a. */
            w += a->dp[i] & mask_a;
            /* Store low digit in result. */
            r->dp[i] = (sp_int_digit)w;
            /* Move high digit down. */
            w >>= DIGIT_BIT;
#else
            /* No high digit. */
            h = 0;
            /* Conditionally use modulus. */
            t = m->dp[i] & mask;
            /* Add with carry modulus. */
            SP_ASM_ADDC_REG(l, h, t);
            /* Conditionally use a. */
            t = a->dp[i] & mask_a;
            /* Add with carry a. */
            SP_ASM_ADDC_REG(l, h, t);
            /* Store low digit in result. */
            r->dp[i] = l;
            /* Move high digit down. */
            l = h;
#endif
        }
        /* Store carry. */
#ifndef SQR_MUL_ASM
        r->dp[i] = (sp_int_digit)w;
#else
        r->dp[i] = l;
#endif
        /* Used includes carry - set or not. */
        r->used = i + 1;
#ifdef WOLFSSL_SP_INT_NEGATIVE
        r->sign = MP_ZPOS;
#endif
        /* Divide conditional sum by 2. */
        _sp_div_2(r, r);

#if 0
        sp_print(r, "rd2");
#endif
    }

    return err;
}
  6786. #endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
  6787. /************************
  6788. * Add/Subtract Functions
  6789. ************************/
  6790. #if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD)
/* Add offset b to a into r: r = a + (b << (o * SP_WORD_SIZEOF))
 *
 * Two carry-propagation implementations are provided: a double-width word
 * accumulator (SQR_MUL_ASM undefined) and the platform add-with-carry
 * assembly macros. Behavior is identical.
 *
 * @param  [in]   a  SP integer to add to.
 * @param  [in]   b  SP integer to add.
 * @param  [out]  r  SP integer to store result in.
 * @param  [in]   o  Number of digits to offset b.
 */
static void _sp_add_off(const sp_int* a, const sp_int* b, sp_int* r, int o)
{
    unsigned int i = 0;
#ifndef SQR_MUL_ASM
    /* Double-width accumulator: low bits become the result digit, high bits
     * carry into the next iteration. */
    sp_int_word t = 0;
#else
    /* Low digit, high (carry) digit and temporary for the ASM carry macros. */
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit t = 0;
#endif

#ifdef SP_MATH_NEED_ADD_OFF
    unsigned int j;

    /* Copy a into result up to offset. */
    for (; (i < o) && (i < a->used); i++) {
        r->dp[i] = a->dp[i];
    }
    /* Set result to 0 for digits beyond those in a. */
    for (; i < o; i++) {
        r->dp[i] = 0;
    }
    /* Add each digit from a and b where both have values. */
    for (j = 0; (i < a->used) && (j < b->used); i++, j++) {
#ifndef SQR_MUL_ASM
        t += a->dp[i];
        t += b->dp[j];
        /* Store low digit in result. */
        r->dp[i] = (sp_int_digit)t;
        /* Keep only the carry for the next digit. */
        t >>= SP_WORD_SIZE;
#else
        t = a->dp[i];
        SP_ASM_ADDC(l, h, t);
        t = b->dp[j];
        SP_ASM_ADDC(l, h, t);
        /* Store low digit in result. */
        r->dp[i] = l;
        /* Carry becomes the next low digit. */
        l = h;
        h = 0;
#endif
    }
    /* Either a and/or b are out of digits. Add carry and remaining a digits. */
    for (; i < a->used; i++) {
#ifndef SQR_MUL_ASM
        t += a->dp[i];
        r->dp[i] = (sp_int_digit)t;
        t >>= SP_WORD_SIZE;
#else
        t = a->dp[i];
        SP_ASM_ADDC(l, h, t);
        r->dp[i] = l;
        l = h;
        h = 0;
#endif
    }
    /* a is out of digits. Add carry and remaining b digits. */
    for (; j < b->used; i++, j++) {
#ifndef SQR_MUL_ASM
        t += b->dp[j];
        r->dp[i] = (sp_int_digit)t;
        t >>= SP_WORD_SIZE;
#else
        t = b->dp[j];
        SP_ASM_ADDC(l, h, t);
        r->dp[i] = l;
        l = h;
        h = 0;
#endif
    }
#else
    /* Offset not needed in this build - callers pass 0. */
    (void)o;

    /* Add each digit from a and b where both have values. */
    for (; (i < a->used) && (i < b->used); i++) {
#ifndef SQR_MUL_ASM
        t += a->dp[i];
        t += b->dp[i];
        /* Store low digit in result. */
        r->dp[i] = (sp_int_digit)t;
        /* Keep only the carry for the next digit. */
        t >>= SP_WORD_SIZE;
#else
        t = a->dp[i];
        SP_ASM_ADDC(l, h, t);
        t = b->dp[i];
        SP_ASM_ADDC(l, h, t);
        /* Store low digit in result. */
        r->dp[i] = l;
        /* Carry becomes the next low digit. */
        l = h;
        h = 0;
#endif
    }
    /* Either a and/or b are out of digits. Add carry and remaining a digits. */
    for (; i < a->used; i++) {
#ifndef SQR_MUL_ASM
        t += a->dp[i];
        r->dp[i] = (sp_int_digit)t;
        t >>= SP_WORD_SIZE;
#else
        t = a->dp[i];
        SP_ASM_ADDC(l, h, t);
        r->dp[i] = l;
        l = h;
        h = 0;
#endif
    }
    /* a is out of digits. Add carry and remaining b digits. */
    for (; i < b->used; i++) {
#ifndef SQR_MUL_ASM
        t += b->dp[i];
        r->dp[i] = (sp_int_digit)t;
        t >>= SP_WORD_SIZE;
#else
        t = b->dp[i];
        SP_ASM_ADDC(l, h, t);
        r->dp[i] = l;
        l = h;
        h = 0;
#endif
    }
#endif

    /* Set used based on last digit put in. */
    r->used = i;
    /* Put in carry - result uses one extra digit when carry is non-zero. */
#ifndef SQR_MUL_ASM
    r->dp[i] = (sp_int_digit)t;
    r->used += (t != 0);
#else
    r->dp[i] = l;
    r->used += (l != 0);
#endif
    /* Remove leading zeros. */
    sp_clamp(r);
}
  6924. #endif /* !WOLFSSL_RSA_VERIFY_ONLY */
  6925. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_SP_INT_NEGATIVE) || \
  6926. !defined(NO_DH) || defined(HAVE_ECC) || (!defined(NO_RSA) && \
  6927. !defined(WOLFSSL_RSA_VERIFY_ONLY))
/* Sub offset b from a into r: r = a - (b << (o * SP_WORD_SIZEOF))
 * a must be greater than b.
 *
 * When using offset, r == a is faster.
 *
 * @param  [in]   a  SP integer to subtract from.
 * @param  [in]   b  SP integer to subtract.
 * @param  [out]  r  SP integer to store result in.
 * @param  [in]   o  Number of digits to offset b.
 */
static void _sp_sub_off(const sp_int* a, const sp_int* b, sp_int* r,
    unsigned int o)
{
    unsigned int i = 0;
    unsigned int j;
#ifndef SQR_MUL_ASM
    /* Signed double-width accumulator: the arithmetic right shift below
     * carries the borrow (as -1) into the next digit. */
    sp_int_sword t = 0;
#else
    sp_int_digit l = 0;
    sp_int_digit h = 0;
#endif

    /* Need to copy digits up to offset into result. */
    if (r != a) {
        for (; (i < o) && (i < a->used); i++) {
            r->dp[i] = a->dp[i];
        }
    }
    else {
        /* In-place: digits below the offset are already in place. */
        i = o;
    }
    /* Index to add at is the offset now. */
    for (j = 0; (i < a->used) && (j < b->used); i++, j++) {
#ifndef SQR_MUL_ASM
        /* Add a into and subtract b from current value. */
        t += a->dp[i];
        t -= b->dp[j];
        /* Store low digit in result. */
        r->dp[i] = (sp_int_digit)t;
        /* Move high digit down - sign-extends to keep any borrow. */
        t >>= SP_WORD_SIZE;
#else
        /* Add a into and subtract b from current value. */
        SP_ASM_ADDC(l, h, a->dp[i]);
        SP_ASM_SUBB(l, h, b->dp[j]);
        /* Store low digit in result. */
        r->dp[i] = l;
        /* Move high digit down. */
        l = h;
        /* High digit is 0 when positive or -1 on negative. */
        h = (sp_int_digit)0 - (h >> (SP_WORD_SIZE - 1));
#endif
    }
    /* b is exhausted: propagate the borrow through remaining a digits. */
    for (; i < a->used; i++) {
#ifndef SQR_MUL_ASM
        /* Add a into current value. */
        t += a->dp[i];
        /* Store low digit in result. */
        r->dp[i] = (sp_int_digit)t;
        /* Move high digit down - sign-extends to keep any borrow. */
        t >>= SP_WORD_SIZE;
#else
        /* Add a into current value. */
        SP_ASM_ADDC(l, h, a->dp[i]);
        /* Store low digit in result. */
        r->dp[i] = l;
        /* Move high digit down. */
        l = h;
        /* High digit is 0 when positive or -1 on negative. */
        h = (sp_int_digit)0 - (h >> (SP_WORD_SIZE - 1));
#endif
    }
    /* Set used based on last digit put in. */
    r->used = i;
    /* Remove leading zeros. */
    sp_clamp(r);
}
  7004. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_SP_INT_NEGATIVE || !NO_DH ||
  7005. * HAVE_ECC || (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  7006. #if !defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_SP_INVMOD)
  7007. /* Add b to a into r: r = a + b
  7008. *
  7009. * @param [in] a SP integer to add to.
  7010. * @param [in] b SP integer to add.
  7011. * @param [out] r SP integer to store result in.
  7012. *
  7013. * @return MP_OKAY on success.
  7014. * @return MP_VAL when a, b, or r is NULL.
  7015. */
  7016. int sp_add(const sp_int* a, const sp_int* b, sp_int* r)
  7017. {
  7018. int err = MP_OKAY;
  7019. /* Validate parameters. */
  7020. if ((a == NULL) || (b == NULL) || (r == NULL)) {
  7021. err = MP_VAL;
  7022. }
  7023. /* Check that r as big as a and b plus one word. */
  7024. if ((err == MP_OKAY) && ((a->used >= r->size) || (b->used >= r->size))) {
  7025. err = MP_VAL;
  7026. }
  7027. if (err == MP_OKAY) {
  7028. #ifndef WOLFSSL_SP_INT_NEGATIVE
  7029. /* Add two positive numbers. */
  7030. _sp_add_off(a, b, r, 0);
  7031. #else
  7032. /* Same sign then add absolute values and use sign. */
  7033. if (a->sign == b->sign) {
  7034. _sp_add_off(a, b, r, 0);
  7035. r->sign = a->sign;
  7036. }
  7037. /* Different sign and abs(a) >= abs(b). */
  7038. else if (_sp_cmp_abs(a, b) != MP_LT) {
  7039. /* Subtract absolute values and use sign of a unless result 0. */
  7040. _sp_sub_off(a, b, r, 0);
  7041. if (sp_iszero(r)) {
  7042. r->sign = MP_ZPOS;
  7043. }
  7044. else {
  7045. r->sign = a->sign;
  7046. }
  7047. }
  7048. /* Different sign and abs(a) < abs(b). */
  7049. else {
  7050. /* Reverse subtract absolute values and use sign of b. */
  7051. _sp_sub_off(b, a, r, 0);
  7052. r->sign = b->sign;
  7053. }
  7054. #endif
  7055. }
  7056. return err;
  7057. }
  7058. #endif /* !WOLFSSL_RSA_VERIFY_ONLY */
  7059. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  7060. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
  7061. /* Subtract b from a into r: r = a - b
  7062. *
  7063. * a must be greater than b unless WOLFSSL_SP_INT_NEGATIVE is defined.
  7064. *
  7065. * @param [in] a SP integer to subtract from.
  7066. * @param [in] b SP integer to subtract.
  7067. * @param [out] r SP integer to store result in.
  7068. *
  7069. * @return MP_OKAY on success.
  7070. * @return MP_VAL when a, b, or r is NULL.
  7071. */
  7072. int sp_sub(const sp_int* a, const sp_int* b, sp_int* r)
  7073. {
  7074. int err = MP_OKAY;
  7075. /* Validate parameters. */
  7076. if ((a == NULL) || (b == NULL) || (r == NULL)) {
  7077. err = MP_VAL;
  7078. }
  7079. /* Check that r as big as a and b plus one word. */
  7080. if ((err == MP_OKAY) && ((a->used >= r->size) || (b->used >= r->size))) {
  7081. err = MP_VAL;
  7082. }
  7083. if (err == MP_OKAY) {
  7084. #ifndef WOLFSSL_SP_INT_NEGATIVE
  7085. /* Subtract positive numbers b from a. */
  7086. _sp_sub_off(a, b, r, 0);
  7087. #else
  7088. /* Different sign. */
  7089. if (a->sign != b->sign) {
  7090. /* Add absolute values and use sign of a. */
  7091. _sp_add_off(a, b, r, 0);
  7092. r->sign = a->sign;
  7093. }
  7094. /* Same sign and abs(a) >= abs(b). */
  7095. else if (_sp_cmp_abs(a, b) != MP_LT) {
  7096. /* Subtract absolute values and use sign of a unless result 0. */
  7097. _sp_sub_off(a, b, r, 0);
  7098. if (sp_iszero(r)) {
  7099. r->sign = MP_ZPOS;
  7100. }
  7101. else {
  7102. r->sign = a->sign;
  7103. }
  7104. }
  7105. /* Same sign and abs(a) < abs(b). */
  7106. else {
  7107. /* Reverse subtract absolute values and use opposite sign of a */
  7108. _sp_sub_off(b, a, r, 0);
  7109. r->sign = 1 - a->sign;
  7110. }
  7111. #endif
  7112. }
  7113. return err;
  7114. }
  7115. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
  7116. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY)*/
  7117. /****************************
  7118. * Add/Subtract mod functions
  7119. ****************************/
  7120. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  7121. (!defined(WOLFSSL_SP_MATH) && defined(WOLFSSL_CUSTOM_CURVES)) || \
  7122. defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE)
  7123. /* Add two value and reduce: r = (a + b) % m
  7124. *
  7125. * @param [in] a SP integer to add.
  7126. * @param [in] b SP integer to add with.
  7127. * @param [in] m SP integer that is the modulus.
  7128. * @param [out] r SP integer to hold result.
  7129. *
  7130. * @return MP_OKAY on success.
  7131. * @return MP_MEM when dynamic memory allocation fails.
  7132. */
  7133. static int _sp_addmod(const sp_int* a, const sp_int* b, const sp_int* m,
  7134. sp_int* r)
  7135. {
  7136. int err = MP_OKAY;
  7137. /* Calculate used based on digits used in a and b. */
  7138. unsigned int used = ((a->used >= b->used) ? a->used + 1 : b->used + 1);
  7139. DECL_SP_INT(t, used);
  7140. /* Allocate a temporary SP int to hold sum. */
  7141. ALLOC_SP_INT_SIZE(t, used, err, NULL);
  7142. if (err == MP_OKAY) {
  7143. /* Do sum. */
  7144. err = sp_add(a, b, t);
  7145. }
  7146. if (err == MP_OKAY) {
  7147. /* Mod result. */
  7148. err = sp_mod(t, m, r);
  7149. }
  7150. FREE_SP_INT(t, NULL);
  7151. return err;
  7152. }
  7153. /* Add two value and reduce: r = (a + b) % m
  7154. *
  7155. * @param [in] a SP integer to add.
  7156. * @param [in] b SP integer to add with.
  7157. * @param [in] m SP integer that is the modulus.
  7158. * @param [out] r SP integer to hold result.
  7159. *
  7160. * @return MP_OKAY on success.
  7161. * @return MP_VAL when a, b, m or r is NULL.
  7162. * @return MP_MEM when dynamic memory allocation fails.
  7163. */
  7164. int sp_addmod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
  7165. {
  7166. int err = MP_OKAY;
  7167. /* Validate parameters. */
  7168. if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
  7169. err = MP_VAL;
  7170. }
  7171. /* Ensure a and b aren't too big a number to operate on. */
  7172. else if (a->used >= SP_INT_DIGITS) {
  7173. err = MP_VAL;
  7174. }
  7175. else if (b->used >= SP_INT_DIGITS) {
  7176. err = MP_VAL;
  7177. }
  7178. #if 0
  7179. if (err == MP_OKAY) {
  7180. sp_print(a, "a");
  7181. sp_print(b, "b");
  7182. sp_print(m, "m");
  7183. }
  7184. #endif
  7185. if (err == MP_OKAY) {
  7186. /* Do add and modular reduction. */
  7187. err = _sp_addmod(a, b, m, r);
  7188. }
  7189. #if 0
  7190. if (err == MP_OKAY) {
  7191. sp_print(r, "rma");
  7192. }
  7193. #endif
  7194. return err;
  7195. }
  7196. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_CUSTOM_CURVES) ||
  7197. * WOLFCRYPT_HAVE_ECCSI || WOLFCRYPT_HAVE_SAKKE */
  7198. #if defined(WOLFSSL_SP_MATH_ALL) && (!defined(WOLFSSL_RSA_VERIFY_ONLY) || \
  7199. defined(HAVE_ECC))
/* Sub b from a and reduce: r = (a - b) % m
 * Result is always positive.
 *
 * @param  [in]   a  SP integer to subtract from.
 * @param  [in]   b  SP integer to subtract.
 * @param  [in]   m  SP integer that is the modulus.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_submod(const sp_int* a, const sp_int* b, const sp_int* m,
    sp_int* r)
{
    int err = MP_OKAY;
#ifndef WOLFSSL_SP_INT_NEGATIVE
    /* No negative representation: keep every intermediate non-negative.
     * Temporaries must hold a reduced operand or a + m, whichever is
     * larger. */
    unsigned int used = ((a->used >= m->used) ?
        ((a->used >= b->used) ? (a->used + 1) : (b->used + 1)) :
        ((b->used >= m->used)) ? (b->used + 1) : (m->used + 1));
    DECL_SP_INT_ARRAY(t, used, 2);

    ALLOC_SP_INT_ARRAY(t, used, 2, err, NULL);
    if (err == MP_OKAY) {
        /* Reduce a to less than m. (a is retargeted at the temporary.) */
        if (_sp_cmp(a, m) != MP_LT) {
            err = sp_mod(a, m, t[0]);
            a = t[0];
        }
    }
    if (err == MP_OKAY) {
        /* Reduce b to less than m. (b is retargeted at the temporary.) */
        if (_sp_cmp(b, m) != MP_LT) {
            err = sp_mod(b, m, t[1]);
            b = t[1];
        }
    }
    if (err == MP_OKAY) {
        /* Add m to a if a smaller than b so the subtraction below cannot
         * go negative. */
        if (_sp_cmp(a, b) == MP_LT) {
            err = sp_add(a, m, t[0]);
            a = t[0];
        }
    }
    if (err == MP_OKAY) {
        /* Subtract b from a - result is already in range. */
        err = sp_sub(a, b, r);
    }
    FREE_SP_INT_ARRAY(t, NULL);
#else /* WOLFSSL_SP_INT_NEGATIVE */
    /* Negative values supported: subtract first, then let sp_mod bring the
     * (possibly negative) difference into range. */
    unsigned int used = ((a->used >= b->used) ? a->used + 1 : b->used + 1);
    DECL_SP_INT(t, used);

    ALLOC_SP_INT_SIZE(t, used, err, NULL);
    /* Subtract b from a into temporary. */
    if (err == MP_OKAY) {
        err = sp_sub(a, b, t);
    }
    if (err == MP_OKAY) {
        /* Reduce result mod m into result. */
        err = sp_mod(t, m, r);
    }
    FREE_SP_INT(t, NULL);
#endif /* WOLFSSL_SP_INT_NEGATIVE */

    return err;
}
  7263. /* Sub b from a and reduce: r = (a - b) % m
  7264. * Result is always positive.
  7265. *
  7266. * @param [in] a SP integer to subtract from
  7267. * @param [in] b SP integer to subtract.
  7268. * @param [in] m SP integer that is the modulus.
  7269. * @param [out] r SP integer to hold result.
  7270. *
  7271. * @return MP_OKAY on success.
  7272. * @return MP_VAL when a, b, m or r is NULL.
  7273. * @return MP_MEM when dynamic memory allocation fails.
  7274. */
  7275. int sp_submod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
  7276. {
  7277. int err = MP_OKAY;
  7278. /* Validate parameters. */
  7279. if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
  7280. err = MP_VAL;
  7281. }
  7282. /* Ensure a, b and m aren't too big a number to operate on. */
  7283. else if (a->used >= SP_INT_DIGITS) {
  7284. err = MP_VAL;
  7285. }
  7286. else if (b->used >= SP_INT_DIGITS) {
  7287. err = MP_VAL;
  7288. }
  7289. else if (m->used >= SP_INT_DIGITS) {
  7290. err = MP_VAL;
  7291. }
  7292. #if 0
  7293. if (err == MP_OKAY) {
  7294. sp_print(a, "a");
  7295. sp_print(b, "b");
  7296. sp_print(m, "m");
  7297. }
  7298. #endif
  7299. if (err == MP_OKAY) {
  7300. /* Do submod. */
  7301. err = _sp_submod(a, b, m, r);
  7302. }
  7303. #if 0
  7304. if (err == MP_OKAY) {
  7305. sp_print(r, "rms");
  7306. }
  7307. #endif
  7308. return err;
  7309. }
  7310. #endif /* WOLFSSL_SP_MATH_ALL */
  7311. #if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
/* Add two values and reduce: r = (a + b) % m
 *
 * r = a + b (mod m) - constant time (a < m and b < m, a, b and m are positive)
 *
 * Assumes a, b, m and r are not NULL.
 * m and r must not be the same pointer.
 *
 * @param  [in]   a  SP integer to add.
 * @param  [in]   b  SP integer to add with.
 * @param  [in]   m  SP integer that is the modulus.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when r is too small or r == m.
 */
int sp_addmod_ct(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
#ifndef SQR_MUL_ASM
    /* w accumulates the sum a + b; s the trial subtraction (a + b) - m. */
    sp_int_sword w;
    sp_int_sword s;
#else
    /* Low/high digit pairs: w* for the sum, s* for the trial subtraction. */
    sp_int_digit wl;
    sp_int_digit wh;
    sp_int_digit sl;
    sp_int_digit sh;
    sp_int_digit t;
#endif
    /* All-ones when the modulus must be subtracted from the sum. */
    sp_int_digit mask;
    /* Operand masks start all-ones and drop to zero once i reaches the
     * operand's 'used' count, so uninitialized digits are never read while
     * the loop still runs a fixed number of iterations. */
    sp_int_digit mask_a = (sp_int_digit)-1;
    sp_int_digit mask_b = (sp_int_digit)-1;
    unsigned int i;

    /* Check result is as big as modulus. */
    if (m->used > r->size) {
        err = MP_VAL;
    }
    /* Validate parameters. */
    if ((err == MP_OKAY) && (r == m)) {
        err = MP_VAL;
    }
    if (err == MP_OKAY) {
#if 0
        sp_print(a, "a");
        sp_print(b, "b");
        sp_print(m, "m");
#endif

        /* Add a to b into r. Do the subtract of modulus but don't store
         * result. When subtract result is negative, the overflow will be
         * negative. Only need to subtract mod when result is positive -
         * overflow is positive.
         */
#ifndef SQR_MUL_ASM
        w = 0;
        s = 0;
#else
        wl = 0;
        sl = 0;
        sh = 0;
#endif
        /* Constant time - add modulus digits worth from a and b. */
        for (i = 0; i < m->used; i++) {
            /* Values past 'used' are not initialized - zero the mask there. */
            mask_a += (i == a->used);
            mask_b += (i == b->used);
#ifndef SQR_MUL_ASM
            /* Add next digits from a and b to current value. */
            w += a->dp[i] & mask_a;
            w += b->dp[i] & mask_b;
            /* Store low digit in result. */
            r->dp[i] = (sp_int_digit)w;
            /* Add result to reducing value. */
            s += (sp_int_digit)w;
            /* Subtract next digit of modulus. */
            s -= m->dp[i];
            /* Move high digit of reduced result down (sign-preserving). */
            s >>= DIGIT_BIT;
            /* Move high digit of sum result down. */
            w >>= DIGIT_BIT;
#else
            wh = 0;
            /* Add next digits from a and b to current value. */
            t = a->dp[i] & mask_a;
            SP_ASM_ADDC_REG(wl, wh, t);
            t = b->dp[i] & mask_b;
            SP_ASM_ADDC_REG(wl, wh, t);
            /* Store low digit in result. */
            r->dp[i] = wl;
            /* Add result to reducing value. */
            SP_ASM_ADDC_REG(sl, sh, wl);
            /* Subtract next digit of modulus. */
            SP_ASM_SUBB(sl, sh, m->dp[i]);
            /* Move high digit of reduced result down. */
            sl = sh;
            /* High digit is 0 when positive or -1 on negative. */
            sh = (sp_int_digit)0 - (sh >> (SP_WORD_SIZE-1));
            /* Move high digit of sum result down. */
            wl = wh;
#endif
        }
#ifndef SQR_MUL_ASM
        /* Add carry into reduced result. */
        s += (sp_int_digit)w;
        /* s will be positive when subtracting modulus is needed. */
        mask = (sp_int_digit)0 - (s >= 0);
#else
        /* Add carry into reduced result. */
        SP_ASM_ADDC_REG(sl, sh, wl);
        /* s will be positive when subtracting modulus is needed. */
        mask = (sh >> (SP_WORD_SIZE-1)) - 1;
#endif

        /* Constant time, conditionally, subtract modulus from sum. */
#ifndef SQR_MUL_ASM
        w = 0;
#else
        wl = 0;
        wh = 0;
#endif
        for (i = 0; i < m->used; i++) {
#ifndef SQR_MUL_ASM
            /* Add result to current value and conditionally subtract
             * modulus. */
            w += r->dp[i];
            w -= m->dp[i] & mask;
            /* Store low digit in result. */
            r->dp[i] = (sp_int_digit)w;
            /* Move high digit of sum result down. */
            w >>= DIGIT_BIT;
#else
            /* Add result to current value and conditionally subtract
             * modulus. */
            SP_ASM_ADDC(wl, wh, r->dp[i]);
            t = m->dp[i] & mask;
            SP_ASM_SUBB_REG(wl, wh, t);
            /* Store low digit in result. */
            r->dp[i] = wl;
            /* Move high digit of sum result down. */
            wl = wh;
            /* High digit is 0 when positive or -1 on negative. */
            wh = (sp_int_digit)0 - (wl >> (SP_WORD_SIZE-1));
#endif
        }
        /* Result will always have digits equal to or less than those in
         * modulus. */
        r->used = i;
#ifdef WOLFSSL_SP_INT_NEGATIVE
        r->sign = MP_ZPOS;
#endif /* WOLFSSL_SP_INT_NEGATIVE */
        /* Remove leading zeros. */
        sp_clamp(r);

#if 0
        sp_print(r, "rma");
#endif
    }

    return err;
}
  7466. #endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
  7467. #if defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC)
/* Sub b from a and reduce: r = (a - b) % m
 * Result is always positive.
 *
 * r = a - b (mod m) - constant time (a < m and b < m, a, b and m are positive)
 *
 * Assumes a, b, m and r are not NULL.
 * m and r must not be the same pointer.
 *
 * @param  [in]   a  SP integer to subtract from.
 * @param  [in]   b  SP integer to subtract.
 * @param  [in]   m  SP integer that is the modulus.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when r is too small or r == m.
 */
int sp_submod_ct(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
#ifndef SQR_MUL_ASM
    /* Signed accumulator - its sign after the loop tells whether a - b
     * went negative. */
    sp_int_sword w;
#else
    sp_int_digit l;
    sp_int_digit h;
    sp_int_digit t;
#endif
    /* All-ones when the modulus must be added back to the difference. */
    sp_int_digit mask;
    /* Operand masks start all-ones and drop to zero once i reaches the
     * operand's 'used' count, so uninitialized digits are never read while
     * the loop still runs a fixed number of iterations. */
    sp_int_digit mask_a = (sp_int_digit)-1;
    sp_int_digit mask_b = (sp_int_digit)-1;
    unsigned int i;

    /* Check result is as big as modulus plus one digit. */
    if (m->used > r->size) {
        err = MP_VAL;
    }
    /* Validate parameters. */
    if ((err == MP_OKAY) && (r == m)) {
        err = MP_VAL;
    }
    if (err == MP_OKAY) {
#if 0
        sp_print(a, "a");
        sp_print(b, "b");
        sp_print(m, "m");
#endif

        /* In constant time, subtract b from a putting result in r. */
#ifndef SQR_MUL_ASM
        w = 0;
#else
        l = 0;
        h = 0;
#endif
        for (i = 0; i < m->used; i++) {
            /* Values past 'used' are not initialized - zero the mask there. */
            mask_a += (i == a->used);
            mask_b += (i == b->used);
#ifndef SQR_MUL_ASM
            /* Add a to and subtract b from current value. */
            w += a->dp[i] & mask_a;
            w -= b->dp[i] & mask_b;
            /* Store low digit in result. */
            r->dp[i] = (sp_int_digit)w;
            /* Move high digit down - sign-extends to keep any borrow. */
            w >>= DIGIT_BIT;
#else
            /* Add a and subtract b from current value. */
            t = a->dp[i] & mask_a;
            SP_ASM_ADDC_REG(l, h, t);
            t = b->dp[i] & mask_b;
            SP_ASM_SUBB_REG(l, h, t);
            /* Store low digit in result. */
            r->dp[i] = l;
            /* Move high digit down. */
            l = h;
            /* High digit is 0 when positive or -1 on negative. */
            h = (sp_int_digit)0 - (l >> (SP_WORD_SIZE - 1));
#endif
        }

        /* When w is negative then we need to add modulus to make result
         * positive. */
#ifndef SQR_MUL_ASM
        mask = (sp_int_digit)0 - (w < 0);
#else
        /* h is already all-ones on borrow, zero otherwise. */
        mask = h;
#endif

        /* Constant time, conditionally, add modulus to difference. */
#ifndef SQR_MUL_ASM
        w = 0;
#else
        l = 0;
#endif
        for (i = 0; i < m->used; i++) {
#ifndef SQR_MUL_ASM
            /* Add result and conditionally modulus to current value. */
            w += r->dp[i];
            w += m->dp[i] & mask;
            /* Store low digit in result. */
            r->dp[i] = (sp_int_digit)w;
            /* Move high digit down. */
            w >>= DIGIT_BIT;
#else
            h = 0;
            /* Add result and conditionally modulus to current value. */
            SP_ASM_ADDC(l, h, r->dp[i]);
            t = m->dp[i] & mask;
            SP_ASM_ADDC_REG(l, h, t);
            /* Store low digit in result. */
            r->dp[i] = l;
            /* Move high digit down. */
            l = h;
#endif
        }
        /* Result will always have digits equal to or less than those in
         * modulus. */
        r->used = i;
#ifdef WOLFSSL_SP_INT_NEGATIVE
        r->sign = MP_ZPOS;
#endif /* WOLFSSL_SP_INT_NEGATIVE */
        /* Remove leading zeros. */
        sp_clamp(r);

#if 0
        sp_print(r, "rms");
#endif
    }

    return err;
}
  7592. #endif /* WOLFSSL_SP_MATH_ALL && HAVE_ECC */
  7593. /********************
 * Shifting functions
  7595. ********************/
  7596. #if !defined(NO_DH) || defined(HAVE_ECC) || (!defined(NO_RSA) && \
  7597. defined(WC_RSA_BLINDING) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
  7598. /* Left shift the multi-precision number by a number of digits.
  7599. *
  7600. * @param [in,out] a SP integer to shift.
  7601. * @param [in] s Number of digits to shift.
  7602. *
  7603. * @return MP_OKAY on success.
  7604. * @return MP_VAL when a is NULL, s is negative or the result is too big.
  7605. */
  7606. int sp_lshd(sp_int* a, int s)
  7607. {
  7608. int err = MP_OKAY;
  7609. /* Validate parameters. */
  7610. if ((a == NULL) || (s < 0)) {
  7611. err = MP_VAL;
  7612. }
  7613. /* Ensure number has enough digits for operation. */
  7614. if ((err == MP_OKAY) && (a->used + (unsigned int)s > a->size)) {
  7615. err = MP_VAL;
  7616. }
  7617. if (err == MP_OKAY) {
  7618. /* Move up digits. */
  7619. XMEMMOVE(a->dp + s, a->dp, a->used * SP_WORD_SIZEOF);
  7620. /* Back fill with zeros. */
  7621. XMEMSET(a->dp, 0, (size_t)s * SP_WORD_SIZEOF);
  7622. /* Update used. */
  7623. a->used += (unsigned int)s;
  7624. /* Remove leading zeros. */
  7625. sp_clamp(a);
  7626. }
  7627. return err;
  7628. }
  7629. #endif
  7630. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  7631. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  7632. !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Left shift the multi-precision number by n bits.
 * Bits may be larger than the word size.
 *
 * Used by sp_mul_2d() and other internal functions.
 *
 * @param  [in,out]  a  SP integer to shift.
 * @param  [in]      n  Number of bits to shift left.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when the result is too big.
 */
static int sp_lshb(sp_int* a, int n)
{
    int err = MP_OKAY;

    if (a->used != 0) {
        /* Calculate number of whole digits to shift. */
        unsigned int s = (unsigned int)n >> SP_WORD_SHIFT;

        /* Ensure number has enough digits for result. */
        if (a->used + s >= a->size) {
            err = MP_VAL;
        }
        if (err == MP_OKAY) {
            /* Get count of bits to move within a digit. */
            n &= SP_WORD_MASK;
            /* Check whether this is a complicated case - bits move across
             * digit boundaries. */
            if (n != 0) {
                unsigned int i;

                /* Shift up starting at most significant digit. */
                /* Get new most significant digit - the bits shifted out of
                 * the top of the current one. */
                sp_int_digit v = a->dp[a->used - 1] >> (SP_WORD_SIZE - n);

                /* Shift up each digit, pulling low bits from the digit
                 * below. Iterating downwards means each source digit is
                 * read before it is overwritten. */
                for (i = a->used - 1; i >= 1; i--) {
                    a->dp[i + s] = (a->dp[i] << n) |
                                   (a->dp[i - 1] >> (SP_WORD_SIZE - n));
                }
                /* Shift up least significant digit. */
                a->dp[s] = a->dp[0] << n;
                /* Add new high digit unless zero. */
                if (v != 0) {
                    a->dp[a->used + s] = v;
                    a->used++;
                }
            }
            /* Only whole digits to move and ensure not zero. */
            else if (s > 0) {
                /* Move up digits. */
                XMEMMOVE(a->dp + s, a->dp, a->used * SP_WORD_SIZEOF);
            }

            /* Update used digit count. */
            a->used += s;
            /* Back fill with zeros. */
            XMEMSET(a->dp, 0, SP_WORD_SIZEOF * s);
        }
    }

    return err;
}
  7689. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
  7690. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  7691. #ifdef WOLFSSL_SP_MATH_ALL
  7692. /* Shift a right by c digits: a = a >> (n * SP_WORD_SIZE)
  7693. *
  7694. * @param [in, out] a SP integer to shift.
  7695. * @param [in] c Number of digits to shift.
  7696. */
  7697. void sp_rshd(sp_int* a, int c)
  7698. {
  7699. /* Do shift if we have an SP int. */
  7700. if ((a != NULL) && (c > 0)) {
  7701. /* Make zero if shift removes all digits. */
  7702. if ((unsigned int)c >= a->used) {
  7703. _sp_zero(a);
  7704. }
  7705. else {
  7706. unsigned int i;
  7707. /* Update used digits count. */
  7708. a->used -= (unsigned int)c;
  7709. /* Move digits down. */
  7710. for (i = 0; i < a->used; i++, c++) {
  7711. a->dp[i] = a->dp[c];
  7712. }
  7713. }
  7714. }
  7715. }
  7716. #endif /* WOLFSSL_SP_MATH_ALL */
  7717. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  7718. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  7719. defined(WOLFSSL_HAVE_SP_DH)
/* Shift a right by n bits into r: r = a >> n
 *
 * @param  [in]   a  SP integer to shift.
 * @param  [in]   n  Number of bits to shift.
 * @param  [out]  r  SP integer to store result in.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a is NULL, n is negative or r is too small.
 */
int sp_rshb(const sp_int* a, int n, sp_int* r)
{
    int err = MP_OKAY;
    /* Number of whole digits to shift down. */
    unsigned int i = (unsigned int)(n >> SP_WORD_SHIFT);

    if ((a == NULL) || (n < 0)) {
        err = MP_VAL;
    }
    /* Handle case where shifting out all digits. */
    if ((err == MP_OKAY) && (i >= a->used)) {
        _sp_zero(r);
    }
    /* Change callers when more error cases returned. */
    else if ((err == MP_OKAY) && (a->used - i > r->size)) {
        err = MP_VAL;
    }
    else if (err == MP_OKAY) {
        unsigned int j;

        /* Number of bits to shift within digits. */
        n &= SP_WORD_SIZE - 1;
        /* Handle simple case - shift by whole digits only. */
        if (n == 0) {
            /* Set the count of used digits. */
            r->used = a->used - i;
            /* Move digits down - memmove for the overlapping in-place case. */
            if (r == a) {
                XMEMMOVE(r->dp, r->dp + i, SP_WORD_SIZEOF * r->used);
            }
            else {
                XMEMCPY(r->dp, a->dp + i, SP_WORD_SIZEOF * r->used);
            }
        }
        else {
            /* Move the bits down starting at least significant digit,
             * pulling high bits from the digit above. */
            for (j = 0; i < a->used-1; i++, j++)
                r->dp[j] = (a->dp[i] >> n) | (a->dp[i+1] << (SP_WORD_SIZE - n));
            /* Most significant digit has no higher digit to pull from. */
            r->dp[j] = a->dp[i] >> n;
            /* Set the count of used digits - drop the top digit if zero. */
            r->used = j + (r->dp[j] > 0);
        }
#ifdef WOLFSSL_SP_INT_NEGATIVE
        if (sp_iszero(r)) {
            /* Set zero sign. */
            r->sign = MP_ZPOS;
        }
        else {
            /* Retain sign. */
            r->sign = a->sign;
        }
#endif
    }

    return err;
}
  7780. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC ||
  7781. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) || WOLFSSL_HAVE_SP_DH */
  7782. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  7783. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  7784. !defined(WOLFSSL_RSA_PUBLIC_ONLY))
/* Handle the portion of division where the top d->used digits of the
 * dividend may be greater than or equal to the divisor: if so, subtract the
 * divisor once from the top of a and add one to the matching quotient digit.
 *
 * @param  [in, out]  a  SP integer dividend; top digits reduced on return.
 * @param  [in]       d  SP integer divisor.
 * @param  [in, out]  r  SP integer quotient being accumulated; digit at
 *                       index a->used - d->used may be incremented.
 */
static void _sp_div_same_size(sp_int* a, const sp_int* d, sp_int* r)
{
    unsigned int i;

    /* Compare top digits of dividend with those of divisor up to last. */
    for (i = d->used - 1; i > 0; i--) {
        /* Break if top divisor is not equal to dividend. */
        if (a->dp[a->used - d->used + i] != d->dp[i]) {
            break;
        }
    }
    /* Check if top dividend is greater than or equal to divisor. */
    if (a->dp[a->used - d->used + i] >= d->dp[i]) {
        /* Update quotient result. */
        r->dp[a->used - d->used] += 1;
        /* Get 'used' to restore - ensure zeros put into quotient. */
        i = a->used;
        /* Subtract d from top of a. */
        _sp_sub_off(a, d, a, a->used - d->used);
        /* Restore 'used' on remainder. */
        a->used = i;
    }
}
/* Divide a by d and return the quotient in r and the remainder in a.
 * r = a / d; a = a % d
 *
 * Schoolbook long division one digit at a time: a trial quotient digit is
 * estimated from the top words of the dividend and divisor, the trial
 * product is corrected downward until it no longer exceeds the dividend,
 * then subtracted. Requires the divisor to be normalized (top bit of its
 * most significant digit set) - the caller (_sp_div) arranges this.
 *
 * Note: a is constantly having multiples of d subtracted.
 *
 * @param  [in, out]  a      SP integer to be divided and remainder on out.
 * @param  [in]       d      SP integer to divide by (normalized).
 * @param  [out]      r      SP integer that is the quotient.
 * @param  [out]      trial  SP integer that is product in trial division.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when operation fails - only when compiling small code.
 */
static int _sp_div_impl(sp_int* a, const sp_int* d, sp_int* r, sp_int* trial)
{
    int err = MP_OKAY;
    unsigned int i;
#ifdef WOLFSSL_SP_SMALL
    int c;
#else
    unsigned int j;
    unsigned int o;
#ifndef SQR_MUL_ASM
    sp_int_sword sw;
#else
    sp_int_digit sl;
    sp_int_digit sh;
    sp_int_digit st;
#endif
#endif /* WOLFSSL_SP_SMALL */
    sp_int_digit t;
    sp_int_digit dt;

    /* Set result size to clear. */
    r->used = a->used - d->used + 1;
    /* Set all potentially used digits to zero. */
    for (i = 0; i < r->used; i++) {
        r->dp[i] = 0;
    }
#ifdef WOLFSSL_SP_INT_NEGATIVE
    r->sign = MP_ZPOS;
#endif
    /* Get the most significant digit (will have top bit set). */
    dt = d->dp[d->used-1];

    /* Handle when a >= d ^ (2 ^ (SP_WORD_SIZE * x)). */
    _sp_div_same_size(a, d, r);

    /* Keep subtracting multiples of d as long as the digit count of a is
     * greater than equal to d.
     */
    for (i = a->used - 1; i >= d->used; i--) {
        /* When top digits equal, guestimate maximum multiplier.
         * Worst case, multiplier is actually SP_DIGIT_MAX - 1.
         * That is, for w (word size in bits) > 1, n > 1, let:
         *   a = 2^((n+1)*w-1), d = 2^(n*w-1) + 2^((n-1)*w) - 1, t = 2^w - 2
         * Then,
         *   d * t
         *   = (2^(n*w-1) + 2^((n-1)*w) - 1) * (2^w - 2)
         *   = 2^((n+1)*w-1) - 2^(n*w) + 2^(n*w) - 2^((n-1)*w+1) - 2^w + 2
         *   = 2^((n+1)*w-1) - 2^((n-1)*w+1) - 2^w + 2
         *   = a - 2^((n-1)*w+1) - 2^w + 2
         *   d > 2^((n-1)*w+1) + 2^w - 2, when w > 1, n > 1
         */
        if (a->dp[i] == dt) {
            t = SP_DIGIT_MAX;
        }
        else {
            /* Calculate trial quotient by dividing top word of dividend by top
             * digit of divisor.
             * Some implementations segfault when quotient > SP_DIGIT_MAX.
             * Implementations in assembly, using builtins or using
             * digits only (WOLFSSL_SP_DIV_WORD_HALF).
             */
            t = sp_div_word(a->dp[i], a->dp[i-1], dt);
        }
#ifdef WOLFSSL_SP_SMALL
        do {
            /* Calculate trial from trial quotient. */
            err = _sp_mul_d(d, t, trial, i - d->used);
            if (err != MP_OKAY) {
                break;
            }
            /* Check if trial is bigger. */
            c = _sp_cmp_abs(trial, a);
            if (c == MP_GT) {
                /* Decrement trial quotient and try again. */
                t--;
            }
        }
        while (c == MP_GT);
        if (err != MP_OKAY) {
            break;
        }

        /* Subtract the trial and add quotient to result. */
        _sp_sub_off(a, trial, a, 0);
        r->dp[i - d->used] += t;
        /* Handle overflow of digit. */
        if (r->dp[i - d->used] < t) {
            r->dp[i + 1 - d->used]++;
        }
#else
        /* Index of lowest digit trial is subtracted from. */
        o = i - d->used;
        do {
        #ifndef SQR_MUL_ASM
            sp_int_word tw = 0;
        #else
            sp_int_digit tl = 0;
            sp_int_digit th = 0;
        #endif
            /* Multiply divisor by trial quotient. */
            for (j = 0; j < d->used; j++) {
            #ifndef SQR_MUL_ASM
                tw += (sp_int_word)d->dp[j] * t;
                trial->dp[j] = (sp_int_digit)tw;
                tw >>= SP_WORD_SIZE;
            #else
                SP_ASM_MUL_ADD_NO(tl, th, d->dp[j], t);
                trial->dp[j] = tl;
                tl = th;
                th = 0;
            #endif
            }
            /* Store the final carry digit of the trial product. */
        #ifndef SQR_MUL_ASM
            trial->dp[j] = (sp_int_digit)tw;
        #else
            trial->dp[j] = tl;
        #endif

            /* Check trial quotient isn't larger than dividend. */
            for (j = d->used; j > 0; j--) {
                if (trial->dp[j] != a->dp[j + o]) {
                    break;
                }
            }
            /* Decrement trial quotient if larger and try again. */
            if (trial->dp[j] > a->dp[j + o]) {
                t--;
            }
        }
        while (trial->dp[j] > a->dp[j + o]);

    #ifndef SQR_MUL_ASM
        sw = 0;
    #else
        sl = 0;
        sh = 0;
    #endif
        /* Subtract trial - don't need to update used. */
        for (j = 0; j <= d->used; j++) {
        #ifndef SQR_MUL_ASM
            sw += a->dp[j + o];
            sw -= trial->dp[j];
            a->dp[j + o] = (sp_int_digit)sw;
            sw >>= SP_WORD_SIZE;
        #else
            st = a->dp[j + o];
            SP_ASM_ADDC(sl, sh, st);
            st = trial->dp[j];
            SP_ASM_SUBB(sl, sh, st);
            a->dp[j + o] = sl;
            sl = sh;
            /* Sign-extend the borrow into the high word. */
            sh = (sp_int_digit)0 - (sl >> (SP_WORD_SIZE - 1));
        #endif
        }
        r->dp[o] = t;
#endif /* WOLFSSL_SP_SMALL */
    }
    /* Update used. */
    a->used = i + 1;
    if (a->used == d->used) {
        /* Finish div now that length of dividend is same as divisor. */
        _sp_div_same_size(a, d, r);
    }

    return err;
}
/* Divide a by d and return the quotient in r and the remainder in rem.
 * r = a / d; rem = a % d
 *
 * Fast paths handle a < d, a == d and same-bit-length a > d without full
 * division. Otherwise the divisor is normalized (shifted so its top bit is
 * set), the dividend shifted to match, _sp_div_impl does the work, and the
 * remainder is shifted back down.
 *
 * @param  [in]   a     SP integer to be divided.
 * @param  [in]   d     SP integer to divide by.
 * @param  [out]  r     SP integer that is the quotient. May be NULL.
 * @param  [out]  rem   SP integer that is the remainder. May be NULL.
 * @param  [in]   used  Number of digits in temporaries to use.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_div(const sp_int* a, const sp_int* d, sp_int* r, sp_int* rem,
    unsigned int used)
{
    int err = MP_OKAY;
    int ret;
    int done = 0;
    int s = 0;
    sp_int* sa = NULL;
    sp_int* sd = NULL;
    sp_int* tr = NULL;
    sp_int* trial = NULL;
#ifdef WOLFSSL_SP_INT_NEGATIVE
    unsigned int signA = MP_ZPOS;
    unsigned int signD = MP_ZPOS;
#endif /* WOLFSSL_SP_INT_NEGATIVE */
    /* Intermediates will always be less than or equal to dividend. */
    DECL_SP_INT_ARRAY(td, used, 4);

#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Cache sign for results. */
    signA = a->sign;
    signD = d->sign;
#endif /* WOLFSSL_SP_INT_NEGATIVE */

    /* Handle simple case of: dividend < divisor. */
    ret = _sp_cmp_abs(a, d);
    if (ret == MP_LT) {
        /* a = 0 * d + a */
        if ((rem != NULL) && (a != rem)) {
            _sp_copy(a, rem);
        }
        if (r != NULL) {
            _sp_set(r, 0);
        }
        done = 1;
    }
    /* Handle simple case of: dividend == divisor. */
    else if (ret == MP_EQ) {
        /* a = 1 * d + 0 */
        if (rem != NULL) {
            _sp_set(rem, 0);
        }
        if (r != NULL) {
            _sp_set(r, 1);
#ifdef WOLFSSL_SP_INT_NEGATIVE
            r->sign = (signA == signD) ? MP_ZPOS : MP_NEG;
#endif /* WOLFSSL_SP_INT_NEGATIVE */
        }
        done = 1;
    }
    else if (sp_count_bits(a) == sp_count_bits(d)) {
        /* a is greater than d but same bit length - subtract.
         * Quotient must be 1 since 2*d would need one more bit. */
        if (rem != NULL) {
            _sp_sub_off(a, d, rem, 0);
#ifdef WOLFSSL_SP_INT_NEGATIVE
            rem->sign = signA;
#endif
        }
        if (r != NULL) {
            _sp_set(r, 1);
#ifdef WOLFSSL_SP_INT_NEGATIVE
            r->sign = (signA == signD) ? MP_ZPOS : MP_NEG;
#endif /* WOLFSSL_SP_INT_NEGATIVE */
        }
        done = 1;
    }

    /* Allocate temporary 'sp_int's and assign. */
    if ((!done) && (err == MP_OKAY)) {
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
    !defined(WOLFSSL_SP_NO_MALLOC)
        int cnt = 4;
        /* Reuse remainder sp_int where possible. */
        if ((rem != NULL) && (rem != d) && (rem->size > a->used)) {
            sa = rem;
            cnt--;
        }
        /* Reuse result sp_int where possible. */
        if ((r != NULL) && (r != d)) {
            tr = r;
            cnt--;
        }
        /* Macro always has code associated with it and checks err first. */
        ALLOC_SP_INT_ARRAY(td, used, cnt, err, NULL);
#else
        ALLOC_SP_INT_ARRAY(td, used, 4, err, NULL);
#endif
    }
    if ((!done) && (err == MP_OKAY)) {
#if (defined(WOLFSSL_SMALL_STACK) || defined(SP_ALLOC)) && \
    !defined(WOLFSSL_SP_NO_MALLOC)
        int i = 2;

        /* Set to temporary when not reusing. */
        if (sa == NULL) {
            sa = td[i++];
            _sp_init_size(sa, used);
        }
        if (tr == NULL) {
            tr = td[i];
            _sp_init_size(tr, a->used - d->used + 2);
        }
#else
        sa = td[2];
        tr = td[3];

        _sp_init_size(sa, used);
        _sp_init_size(tr, a->used - d->used + 2);
#endif
        sd    = td[0];
        trial = td[1];

        /* Initialize sizes to minimal values. */
        _sp_init_size(sd, d->used + 1);
        _sp_init_size(trial, used);

        /* Move divisor to top of word. Adjust dividend as well. */
        s = sp_count_bits(d);
        s = SP_WORD_SIZE - (s & SP_WORD_MASK);
        _sp_copy(a, sa);
        /* Only shift if top bit of divisor not set. */
        if (s != SP_WORD_SIZE) {
            err = sp_lshb(sa, s);
            if (err == MP_OKAY) {
                _sp_copy(d, sd);
                /* Use the shifted copy as the divisor from here on. */
                d = sd;
                err = sp_lshb(sd, s);
            }
        }
    }
    if ((!done) && (err == MP_OKAY) && (d->used > 0)) {
        /* Do division: tr = sa / d, sa = sa % d. */
        err = _sp_div_impl(sa, d, tr, trial);
        /* Return the remainder if required. */
        if ((err == MP_OKAY) && (rem != NULL)) {
            /* Move result back down if moved up for divisor value. */
            if (s != SP_WORD_SIZE) {
                (void)sp_rshb(sa, s, sa);
            }
            _sp_copy(sa, rem);
            sp_clamp(rem);
#ifdef WOLFSSL_SP_INT_NEGATIVE
            rem->sign = (rem->used == 0) ? MP_ZPOS : signA;
#endif
        }
        /* Return the quotient if required. */
        if ((err == MP_OKAY) && (r != NULL)) {
            _sp_copy(tr, r);
            sp_clamp(r);
#ifdef WOLFSSL_SP_INT_NEGATIVE
            if ((r->used == 0) || (signA == signD)) {
                r->sign = MP_ZPOS;
            }
            else {
                r->sign = MP_NEG;
            }
#endif /* WOLFSSL_SP_INT_NEGATIVE */
        }
    }

    FREE_SP_INT_ARRAY(td, NULL);
    return err;
}
/* Divide a by d and return the quotient in r and the remainder in rem.
 * r = a / d; rem = a % d
 *
 * Public entry point: validates arguments and result sizes, works out the
 * number of temporary digits needed, then calls _sp_div.
 *
 * @param  [in]   a    SP integer to be divided.
 * @param  [in]   d    SP integer to divide by.
 * @param  [out]  r    SP integer that is the quotient. May be NULL.
 * @param  [out]  rem  SP integer that is the remainder. May be NULL.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or d is NULL, r and rem are NULL, d is 0, or a
 *          result does not have enough digits available.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
int sp_div(const sp_int* a, const sp_int* d, sp_int* r, sp_int* rem)
{
    int err = MP_OKAY;
    unsigned int used = 1;

    /* Validate parameters. */
    if ((a == NULL) || (d == NULL) || ((r == NULL) && (rem == NULL))) {
        err = MP_VAL;
    }
    /* a / 0 = infinity. */
    if ((err == MP_OKAY) && sp_iszero(d)) {
        err = MP_VAL;
    }
    /* Ensure quotient result has enough memory.
     * NOTE(review): 'used' fields are unsigned so when a->used + 2 <
     * d->used the expression wraps to a huge value and this rejects -
     * presumably callers never pass such sizes; confirm. */
    if ((err == MP_OKAY) && (r != NULL) && (r->size < a->used - d->used + 2)) {
        err = MP_VAL;
    }
    if ((err == MP_OKAY) && (rem != NULL)) {
        /* Ensure remainder has enough memory. */
        if ((a->used <= d->used) && (rem->size < a->used + 1)) {
            err = MP_VAL;
        }
        else if ((a->used > d->used) && (rem->size < d->used + 1)) {
            err = MP_VAL;
        }
    }
    if (err == MP_OKAY) {
        if (a->used == SP_INT_DIGITS) {
            /* May need to shift number being divided left into a new word. */
            int bits = SP_WORD_SIZE - (sp_count_bits(d) % SP_WORD_SIZE);
            if ((bits != SP_WORD_SIZE) &&
                    (sp_count_bits(a) + bits > SP_INT_DIGITS * SP_WORD_SIZE)) {
                /* Normalization shift would overflow the digit array. */
                err = MP_VAL;
            }
            else {
                used = SP_INT_DIGITS;
            }
        }
        else {
            /* One extra digit for the normalization shift. */
            used = a->used + 1;
        }
    }

    if (err == MP_OKAY) {
#if 0
        sp_print(a, "a");
        sp_print(d, "b");
#endif
        /* Do operation. */
        err = _sp_div(a, d, r, rem, used);
#if 0
        if (err == MP_OKAY) {
            if (rem != NULL) {
                sp_print(rem, "rdr");
            }
            if (r != NULL) {
                sp_print(r, "rdw");
            }
        }
#endif
    }

    return err;
}
  8219. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC || \
  8220. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  8221. #if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(HAVE_ECC) || \
  8222. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  8223. !defined(WOLFSSL_RSA_PUBLIC_ONLY))
  8224. #ifndef FREESCALE_LTC_TFM
  8225. #ifdef WOLFSSL_SP_INT_NEGATIVE
/* Calculate the remainder of dividing a by m: r = a mod m, for the case
 * where r aliases m. A temporary holds the division remainder so m is not
 * clobbered before the sign fix-up that needs it.
 *
 * @param  [in]   a  SP integer to reduce.
 * @param  [in]   m  SP integer that is the modulus.
 * @param  [out]  r  SP integer to store result in (aliases m).
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mod(const sp_int* a, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
    /* Remainder will start as a. */
    DECL_SP_INT(t, (a == NULL) ? 1 : a->used + 1);

    /* In case remainder is modulus - allocate temporary. */
    ALLOC_SP_INT(t, a->used + 1, err, NULL);
    if (err == MP_OKAY) {
        _sp_init_size(t, a->used + 1);
        /* Use divide to calculate remainder and don't get quotient. */
        err = sp_div(a, m, NULL, t);
    }
    if (err == MP_OKAY) {
        /* Make remainder positive and copy into result.
         * A non-zero remainder whose sign differs from m is shifted into
         * the modulus' sign range by adding m. */
        if ((!sp_iszero(t)) && (t->sign != m->sign)) {
            err = sp_add(t, m, r);
        }
        else {
            _sp_copy(t, r);
        }
    }

    FREE_SP_INT(t, NULL);
    return err;
}
  8259. #endif
  8260. /* Calculate the remainder of dividing a by m: r = a mod m.
  8261. *
  8262. * @param [in] a SP integer to reduce.
  8263. * @param [in] m SP integer that is the modulus.
  8264. * @param [out] r SP integer to store result in.
  8265. *
  8266. * @return MP_OKAY on success.
  8267. * @return MP_VAL when a, m or r is NULL or m is 0.
  8268. * @return MP_MEM when dynamic memory allocation fails.
  8269. */
  8270. int sp_mod(const sp_int* a, const sp_int* m, sp_int* r)
  8271. {
  8272. int err = MP_OKAY;
  8273. /* Validate parameters. */
  8274. if ((a == NULL) || (m == NULL) || (r == NULL)) {
  8275. err = MP_VAL;
  8276. }
  8277. /* Ensure a isn't too big a number to operate on. */
  8278. else if (a->used >= SP_INT_DIGITS) {
  8279. err = MP_VAL;
  8280. }
  8281. #ifndef WOLFSSL_SP_INT_NEGATIVE
  8282. if (err == MP_OKAY) {
  8283. /* Use divide to calculate remainder and don't get quotient. */
  8284. err = sp_div(a, m, NULL, r);
  8285. }
  8286. #else
  8287. if ((err == MP_OKAY) && (r != m)) {
  8288. err = sp_div(a, m, NULL, r);
  8289. if ((err == MP_OKAY) && (!sp_iszero(r)) && (r->sign != m->sign)) {
  8290. err = sp_add(r, m, r);
  8291. }
  8292. }
  8293. else if (err == MP_OKAY) {
  8294. err = _sp_mod(a, m, r);
  8295. }
  8296. #endif /* WOLFSSL_SP_INT_NEGATIVE */
  8297. return err;
  8298. }
  8299. #endif /* !FREESCALE_LTC_TFM */
  8300. #endif /* WOLFSSL_SP_MATH_ALL || !NO_DH || HAVE_ECC || \
  8301. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  8302. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
  8303. defined(HAVE_ECC) || !defined(NO_RSA)
  8304. /* START SP_MUL implementations. */
  8305. /* This code is generated.
  8306. * To generate:
  8307. * cd scripts/sp/sp_int
  8308. * ./gen.sh
  8309. * File sp_mul.c contains code.
  8310. */
  8311. #ifdef SQR_MUL_ASM
/* Multiply a by b into r where a and b have same no. digits. r = a * b
 *
 * Optimised comba (column-wise) code for when the number of digits in a and
 * b are the same. Low half columns are accumulated into a temporary so r
 * may alias a or b; the temporary is copied into r at the end.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply by. Same digit count as a.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_nxn(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    int j;
    unsigned int k;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
    sp_int_digit t[a->used];
#else
    sp_int_digit t[SP_INT_DIGITS / 2];
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * a->used, NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if (err == MP_OKAY) {
        /* l/h/o form a three-digit column accumulator. */
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o;
        const sp_int_digit* dp;

        h = 0;
        l = 0;
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
        t[0] = h;
        h = 0;
        o = 0;
        /* Low half of the columns - results go into the temporary. */
        for (k = 1; k <= a->used - 1; k++) {
            j = (int)k;
            dp = a->dp;
            for (; j >= 0; dp++, j--) {
                SP_ASM_MUL_ADD(l, h, o, dp[0], b->dp[j]);
            }
            t[k] = l;
            l = h;
            h = o;
            o = 0;
        }
        /* High half of the columns - results go straight into r. */
        for (; k <= (a->used - 1) * 2; k++) {
            i = k - (b->used - 1);
            dp = &b->dp[b->used - 1];
            for (; i < a->used; i++, dp--) {
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], dp[0]);
            }
            r->dp[k] = l;
            l = h;
            h = o;
            o = 0;
        }
        r->dp[k] = l;
        /* Copy the saved low-half digits into the result. */
        XMEMCPY(r->dp, t, a->used * sizeof(sp_int_digit));
        r->used = k + 1;
        sp_clamp(r);
    }
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
/* Multiply a by b into r. r = a * b
 *
 * Generic comba (column-wise) multiplication using the assembly-macro
 * accumulator. All columns are accumulated into a temporary so r may alias
 * a or b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply by.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    int j;
    unsigned int k;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
    sp_int_digit t[a->used + b->used];
#else
    sp_int_digit t[SP_INT_DIGITS];
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * (a->used + b->used), NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if (err == MP_OKAY) {
        /* l/h/o form a three-digit column accumulator. */
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o;

        h = 0;
        l = 0;
        SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
        t[0] = h;
        h = 0;
        o = 0;
        /* Columns where every digit of b (up to k) contributes. */
        for (k = 1; k <= b->used - 1; k++) {
            i = 0;
            j = (int)k;
            for (; (i < a->used) && (j >= 0); i++, j--) {
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], b->dp[j]);
            }
            t[k] = l;
            l = h;
            h = o;
            o = 0;
        }
        /* Remaining columns start from the top digit of b. */
        for (; k <= (a->used - 1) + (b->used - 1); k++) {
            j = (int)(b->used - 1);
            i = k - (unsigned int)j;
            for (; (i < a->used) && (j >= 0); i++, j--) {
                SP_ASM_MUL_ADD(l, h, o, a->dp[i], b->dp[j]);
            }
            t[k] = l;
            l = h;
            h = o;
            o = 0;
        }
        t[k] = l;
        r->used = k + 1;
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
        sp_clamp(r);
    }
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  8463. #else
/* Multiply a by b into r. r = a * b
 *
 * Generic comba (column-wise) multiplication using double-width word
 * arithmetic instead of assembly macros. All columns are accumulated into
 * a temporary so r may alias a or b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply by.
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    int j;
    unsigned int k;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
    sp_int_digit t[a->used + b->used];
#else
    sp_int_digit t[SP_INT_DIGITS];
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * (a->used + b->used), NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if (err == MP_OKAY) {
        /* w holds one digit product; l/h (and o) accumulate a column. */
        sp_int_word w;
        sp_int_word l;
        sp_int_word h;
#ifdef SP_WORD_OVERFLOW
        sp_int_word o;
#endif

        w = (sp_int_word)a->dp[0] * b->dp[0];
        t[0] = (sp_int_digit)w;
        l = (sp_int_digit)(w >> SP_WORD_SIZE);
        h = 0;
#ifdef SP_WORD_OVERFLOW
        o = 0;
#endif
        for (k = 1; k <= (a->used - 1) + (b->used - 1); k++) {
            /* i = max(0, k - (b->used - 1)) without branching: the mask is
             * all-ones when the subtraction didn't go negative. */
            i = k - (b->used - 1);
            i &= (((unsigned int)i >> (sizeof(i) * 8 - 1)) - 1U);
            j = (int)(k - i);
            for (; (i < a->used) && (j >= 0); i++, j--) {
                w = (sp_int_word)a->dp[i] * b->dp[j];
                l += (sp_int_digit)w;
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
#ifdef SP_WORD_OVERFLOW
                /* Push carries up when the accumulators may overflow. */
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
                l &= SP_MASK;
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
                h &= SP_MASK;
#endif
            }
            t[k] = (sp_int_digit)l;
            l >>= SP_WORD_SIZE;
            l += (sp_int_digit)h;
            h >>= SP_WORD_SIZE;
#ifdef SP_WORD_OVERFLOW
            h += o & SP_MASK;
            o >>= SP_WORD_SIZE;
#endif
        }
        t[k] = (sp_int_digit)l;
        r->used = k + 1;
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
        sp_clamp(r);
    }
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  8544. #endif
  8545. #ifndef WOLFSSL_SP_SMALL
  8546. #if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
  8547. #if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
  8548. #ifndef SQR_MUL_ASM
/* Multiply a by b and store in r: r = a * b
 *
 * Long-hand implementation for 4-digit operands: all 16 digit products are
 * computed up front, then summed column by column with w[0] carrying into
 * the next column. r may alias a or b since all products are read first.
 *
 * @param  [in]   a  SP integer to multiply (4 digits).
 * @param  [in]   b  SP integer to multiply (4 digits).
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_4(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_word* w = NULL;
#else
    sp_int_word w[16];
#endif
    const sp_int_digit* da = a->dp;
    const sp_int_digit* db = b->dp;

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    w = (sp_int_word*)XMALLOC(sizeof(sp_int_word) * 16, NULL,
        DYNAMIC_TYPE_BIGINT);
    if (w == NULL) {
        err = MP_MEM;
    }
#endif

    if (err == MP_OKAY) {
        /* All digit products, grouped by output column (0..6). */
        w[0] = (sp_int_word)da[0] * db[0];
        w[1] = (sp_int_word)da[0] * db[1];
        w[2] = (sp_int_word)da[1] * db[0];
        w[3] = (sp_int_word)da[0] * db[2];
        w[4] = (sp_int_word)da[1] * db[1];
        w[5] = (sp_int_word)da[2] * db[0];
        w[6] = (sp_int_word)da[0] * db[3];
        w[7] = (sp_int_word)da[1] * db[2];
        w[8] = (sp_int_word)da[2] * db[1];
        w[9] = (sp_int_word)da[3] * db[0];
        w[10] = (sp_int_word)da[1] * db[3];
        w[11] = (sp_int_word)da[2] * db[2];
        w[12] = (sp_int_word)da[3] * db[1];
        w[13] = (sp_int_word)da[2] * db[3];
        w[14] = (sp_int_word)da[3] * db[2];
        w[15] = (sp_int_word)da[3] * db[3];

        /* w[0] doubles as the running column accumulator/carry: each
         * column adds the low digits of its products, emits a result
         * digit, then folds in the products' high digits as carry. */
        r->dp[0] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[1];
        w[0] += (sp_int_digit)w[2];
        r->dp[1] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[1] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[1];
        w[2] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[2];
        w[0] += (sp_int_digit)w[3];
        w[0] += (sp_int_digit)w[4];
        w[0] += (sp_int_digit)w[5];
        r->dp[2] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[3] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[3];
        w[4] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[4];
        w[5] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[5];
        w[0] += (sp_int_digit)w[6];
        w[0] += (sp_int_digit)w[7];
        w[0] += (sp_int_digit)w[8];
        w[0] += (sp_int_digit)w[9];
        r->dp[3] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[6] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[6];
        w[7] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[7];
        w[8] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[8];
        w[9] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[9];
        w[0] += (sp_int_digit)w[10];
        w[0] += (sp_int_digit)w[11];
        w[0] += (sp_int_digit)w[12];
        r->dp[4] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[10] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[10];
        w[11] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[11];
        w[12] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[12];
        w[0] += (sp_int_digit)w[13];
        w[0] += (sp_int_digit)w[14];
        r->dp[5] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[13] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[13];
        w[14] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[14];
        w[0] += (sp_int_digit)w[15];
        r->dp[6] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        w[15] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[15];
        r->dp[7] = (sp_int_digit)w[0];
        r->used = 8;
        sp_clamp(r);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (w != NULL) {
        XFREE(w, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  8664. #else /* SQR_MUL_ASM */
/* Multiply a by b and store in r: r = a * b
 *
 * Comba implementation for 4-digit operands using the assembly-macro
 * three-digit accumulator (l/h/o). The low four result digits are staged
 * in a temporary so r may alias a or b.
 *
 * @param  [in]   a  SP integer to multiply (4 digits).
 * @param  [in]   b  SP integer to multiply (4 digits).
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY always.
 */
static int _sp_mul_4(const sp_int* a, const sp_int* b, sp_int* r)
{
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    sp_int_digit t[4];

    /* Column 0. */
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
    t[1] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 2. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4 - high digits go straight into r. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
    r->dp[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
    r->dp[5] = l;
    l = h;
    h = o;
    /* Column 6 - final product cannot overflow into o. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[3], b->dp[3]);
    r->dp[6] = l;
    r->dp[7] = h;
    /* Copy the staged low digits into the result. */
    XMEMCPY(r->dp, t, 4 * sizeof(sp_int_digit));
    r->used = 8;
    sp_clamp(r);

    return MP_OKAY;
}
  8726. #endif /* SQR_MUL_ASM */
  8727. #endif /* SP_WORD_SIZE == 64 */
  8728. #if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
  8729. #ifdef SQR_MUL_ASM
  8730. /* Multiply a by b and store in r: r = a * b
  8731. *
  8732. * Comba implementation.
  8733. *
  8734. * @param [in] a SP integer to multiply.
  8735. * @param [in] b SP integer to multiply.
  8736. * @param [out] r SP integer result.
  8737. *
  8738. * @return MP_OKAY on success.
 * @return MP_MEM is never returned: this fixed-size variant allocates no memory.
  8740. */
static int _sp_mul_6(const sp_int* a, const sp_int* b, sp_int* r)
{
    /* Comba column accumulator: (o:h:l) is a triple-digit running sum. */
    sp_int_digit l = 0;    /* low word of the current column sum */
    sp_int_digit h = 0;    /* middle word of the current column sum */
    sp_int_digit o = 0;    /* overflow word (carries out of h) */
    sp_int_digit t[6];     /* low 6 digits staged here so r may alias a or b */
    /* Column 0 (i+j == 0): h receives the digit stored below. */
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1 (i+j == 1). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
    t[1] = l;
    /* Shift accumulator down one digit for the next column. */
    l = h;
    h = o;
    o = 0;
    /* Column 2 (i+j == 2). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3 (i+j == 3). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4 (i+j == 4). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5 (i+j == 5). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 6 (i+j == 6). Indices >= 6 lie past the 6 input digits, so
     * from here digits are written directly to r even if r aliases a/b. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
    r->dp[6] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 7 (i+j == 7). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
    r->dp[7] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 8 (i+j == 8). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
    r->dp[8] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 9 (i+j == 9). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
    r->dp[9] = l;
    l = h;
    h = o;
    /* Columns 10 and 11: final digit pair from a[5]*b[5]. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[5], b->dp[5]);
    r->dp[10] = l;
    r->dp[11] = h;
    /* Inputs fully consumed: copy the staged low digits into the result. */
    XMEMCPY(r->dp, t, 6 * sizeof(sp_int_digit));
    r->used = 12;
    sp_clamp(r);    /* Trim leading zero digits of the 12-digit result. */
    return MP_OKAY;
}
  8827. #endif /* SQR_MUL_ASM */
  8828. #endif /* SP_WORD_SIZE == 64 */
  8829. #if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
  8830. #ifdef SQR_MUL_ASM
  8831. /* Multiply a by b and store in r: r = a * b
  8832. *
  8833. * Comba implementation.
  8834. *
  8835. * @param [in] a SP integer to multiply.
  8836. * @param [in] b SP integer to multiply.
  8837. * @param [out] r SP integer result.
  8838. *
  8839. * @return MP_OKAY on success.
 * @return MP_MEM is never returned: this fixed-size variant allocates no memory.
  8841. */
static int _sp_mul_8(const sp_int* a, const sp_int* b, sp_int* r)
{
    /* Comba column accumulator: (o:h:l) is a triple-digit running sum. */
    sp_int_digit l = 0;    /* low word of the current column sum */
    sp_int_digit h = 0;    /* middle word of the current column sum */
    sp_int_digit o = 0;    /* overflow word (carries out of h) */
    sp_int_digit t[8];     /* low 8 digits staged here so r may alias a or b */
    /* Column 0 (i+j == 0): h receives the digit stored below. */
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1 (i+j == 1). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
    t[1] = l;
    /* Shift accumulator down one digit for the next column. */
    l = h;
    h = o;
    o = 0;
    /* Column 2 (i+j == 2). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3 (i+j == 3). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4 (i+j == 4). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5 (i+j == 5). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 6 (i+j == 6). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
    t[6] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 7 (i+j == 7). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
    t[7] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 8 (i+j == 8). Indices >= 8 lie past the 8 input digits, so
     * from here digits are written directly to r even if r aliases a/b. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
    r->dp[8] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 9 (i+j == 9). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
    r->dp[9] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 10 (i+j == 10). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
    r->dp[10] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 11 (i+j == 11). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
    r->dp[11] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 12 (i+j == 12). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
    r->dp[12] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 13 (i+j == 13). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
    r->dp[13] = l;
    l = h;
    h = o;
    /* Columns 14 and 15: final digit pair from a[7]*b[7]. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[7], b->dp[7]);
    r->dp[14] = l;
    r->dp[15] = h;
    /* Inputs fully consumed: copy the staged low digits into the result. */
    XMEMCPY(r->dp, t, 8 * sizeof(sp_int_digit));
    r->used = 16;
    sp_clamp(r);    /* Trim leading zero digits of the 16-digit result. */
    return MP_OKAY;
}
  8972. #endif /* SQR_MUL_ASM */
  8973. #endif /* SP_WORD_SIZE == 32 */
  8974. #if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
  8975. #ifdef SQR_MUL_ASM
  8976. /* Multiply a by b and store in r: r = a * b
  8977. *
  8978. * Comba implementation.
  8979. *
  8980. * @param [in] a SP integer to multiply.
  8981. * @param [in] b SP integer to multiply.
  8982. * @param [out] r SP integer result.
  8983. *
  8984. * @return MP_OKAY on success.
 * @return MP_MEM is never returned: this fixed-size variant allocates no memory.
  8986. */
static int _sp_mul_12(const sp_int* a, const sp_int* b, sp_int* r)
{
    /* Comba column accumulator: (o:h:l) is a triple-digit running sum. */
    sp_int_digit l = 0;    /* low word of the current column sum */
    sp_int_digit h = 0;    /* middle word of the current column sum */
    sp_int_digit o = 0;    /* overflow word (carries out of h) */
    sp_int_digit t[12];    /* low 12 digits staged here so r may alias a or b */
    /* Column 0 (i+j == 0): h receives the digit stored below. */
    SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1 (i+j == 1). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
    t[1] = l;
    /* Shift accumulator down one digit for the next column. */
    l = h;
    h = o;
    o = 0;
    /* Column 2 (i+j == 2). */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3 (i+j == 3). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4 (i+j == 4). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5 (i+j == 5). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 6 (i+j == 6). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
    t[6] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 7 (i+j == 7). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
    t[7] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 8 (i+j == 8). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
    t[8] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 9 (i+j == 9). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
    t[9] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 10 (i+j == 10). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
    t[10] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 11 (i+j == 11). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
    t[11] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 12 (i+j == 12). Indices >= 12 lie past the 12 input digits,
     * so from here digits are written directly to r even if r aliases
     * a or b. */
    SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
    r->dp[12] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 13 (i+j == 13). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
    r->dp[13] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 14 (i+j == 14). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
    r->dp[14] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 15 (i+j == 15). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
    r->dp[15] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 16 (i+j == 16). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
    r->dp[16] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 17 (i+j == 17). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
    r->dp[17] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 18 (i+j == 18). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
    r->dp[18] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 19 (i+j == 19). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
    r->dp[19] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 20 (i+j == 20). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
    r->dp[20] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 21 (i+j == 21). */
    SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
    SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
    r->dp[21] = l;
    l = h;
    h = o;
    /* Columns 22 and 23: final digit pair from a[11]*b[11]. */
    SP_ASM_MUL_ADD_NO(l, h, a->dp[11], b->dp[11]);
    r->dp[22] = l;
    r->dp[23] = h;
    /* Inputs fully consumed: copy the staged low digits into the result. */
    XMEMCPY(r->dp, t, 12 * sizeof(sp_int_digit));
    r->used = 24;
    sp_clamp(r);    /* Trim leading zero digits of the 24-digit result. */
    return MP_OKAY;
}
  9229. #endif /* SQR_MUL_ASM */
  9230. #endif /* SP_WORD_SIZE == 32 */
  9231. #endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
  9232. #if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
  9233. (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
  9234. (SP_WORD_SIZE == 64)))
  9235. #if SP_INT_DIGITS >= 32
  9236. /* Multiply a by b and store in r: r = a * b
  9237. *
  9238. * Comba implementation.
  9239. *
  9240. * @param [in] a SP integer to multiply.
  9241. * @param [in] b SP integer to multiply.
  9242. * @param [out] r SP integer result.
  9243. *
  9244. * @return MP_OKAY on success.
  9245. * @return MP_MEM when dynamic memory allocation fails.
  9246. */
  9247. static int _sp_mul_16(const sp_int* a, const sp_int* b, sp_int* r)
  9248. {
  9249. int err = MP_OKAY;
  9250. sp_int_digit l = 0;
  9251. sp_int_digit h = 0;
  9252. sp_int_digit o = 0;
  9253. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  9254. sp_int_digit* t = NULL;
  9255. #else
  9256. sp_int_digit t[16];
  9257. #endif
  9258. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  9259. t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 16, NULL,
  9260. DYNAMIC_TYPE_BIGINT);
  9261. if (t == NULL) {
  9262. err = MP_MEM;
  9263. }
  9264. #endif
  9265. if (err == MP_OKAY) {
  9266. SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
  9267. t[0] = h;
  9268. h = 0;
  9269. SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
  9270. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
  9271. t[1] = l;
  9272. l = h;
  9273. h = o;
  9274. o = 0;
  9275. SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
  9276. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
  9277. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
  9278. t[2] = l;
  9279. l = h;
  9280. h = o;
  9281. o = 0;
  9282. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
  9283. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
  9284. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
  9285. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
  9286. t[3] = l;
  9287. l = h;
  9288. h = o;
  9289. o = 0;
  9290. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
  9291. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
  9292. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
  9293. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
  9294. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
  9295. t[4] = l;
  9296. l = h;
  9297. h = o;
  9298. o = 0;
  9299. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
  9300. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
  9301. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
  9302. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
  9303. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
  9304. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
  9305. t[5] = l;
  9306. l = h;
  9307. h = o;
  9308. o = 0;
  9309. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
  9310. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
  9311. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
  9312. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
  9313. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
  9314. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
  9315. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
  9316. t[6] = l;
  9317. l = h;
  9318. h = o;
  9319. o = 0;
  9320. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
  9321. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
  9322. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
  9323. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
  9324. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
  9325. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
  9326. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
  9327. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
  9328. t[7] = l;
  9329. l = h;
  9330. h = o;
  9331. o = 0;
  9332. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
  9333. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
  9334. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
  9335. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
  9336. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
  9337. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
  9338. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
  9339. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
  9340. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
  9341. t[8] = l;
  9342. l = h;
  9343. h = o;
  9344. o = 0;
  9345. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
  9346. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
  9347. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
  9348. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
  9349. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
  9350. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
  9351. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
  9352. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
  9353. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
  9354. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
  9355. t[9] = l;
  9356. l = h;
  9357. h = o;
  9358. o = 0;
  9359. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
  9360. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
  9361. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
  9362. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
  9363. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
  9364. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
  9365. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
  9366. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
  9367. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
  9368. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
  9369. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
  9370. t[10] = l;
  9371. l = h;
  9372. h = o;
  9373. o = 0;
  9374. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
  9375. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
  9376. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
  9377. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
  9378. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
  9379. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
  9380. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
  9381. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
  9382. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
  9383. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
  9384. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
  9385. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
  9386. t[11] = l;
  9387. l = h;
  9388. h = o;
  9389. o = 0;
  9390. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[12]);
  9391. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
  9392. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
  9393. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
  9394. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
  9395. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
  9396. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
  9397. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
  9398. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
  9399. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
  9400. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
  9401. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
  9402. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[0]);
  9403. t[12] = l;
  9404. l = h;
  9405. h = o;
  9406. o = 0;
  9407. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[13]);
  9408. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[12]);
  9409. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
  9410. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
  9411. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
  9412. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
  9413. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
  9414. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
  9415. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
  9416. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
  9417. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
  9418. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
  9419. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[1]);
  9420. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[0]);
  9421. t[13] = l;
  9422. l = h;
  9423. h = o;
  9424. o = 0;
  9425. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[14]);
  9426. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[13]);
  9427. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[12]);
  9428. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
  9429. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
  9430. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
  9431. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
  9432. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
  9433. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
  9434. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
  9435. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
  9436. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
  9437. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[2]);
  9438. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[1]);
  9439. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[0]);
  9440. t[14] = l;
  9441. l = h;
  9442. h = o;
  9443. o = 0;
  9444. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[15]);
  9445. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[14]);
  9446. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[13]);
  9447. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[12]);
  9448. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
  9449. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
  9450. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
  9451. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
  9452. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
  9453. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
  9454. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
  9455. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
  9456. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[3]);
  9457. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[2]);
  9458. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[1]);
  9459. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[0]);
  9460. t[15] = l;
  9461. l = h;
  9462. h = o;
  9463. o = 0;
  9464. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[15]);
  9465. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[14]);
  9466. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[13]);
  9467. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[12]);
  9468. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
  9469. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
  9470. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
  9471. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
  9472. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
  9473. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
  9474. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
  9475. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[4]);
  9476. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[3]);
  9477. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[2]);
  9478. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[1]);
  9479. r->dp[16] = l;
  9480. l = h;
  9481. h = o;
  9482. o = 0;
  9483. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[15]);
  9484. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[14]);
  9485. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[13]);
  9486. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[12]);
  9487. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
  9488. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
  9489. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
  9490. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
  9491. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
  9492. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
  9493. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[5]);
  9494. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[4]);
  9495. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[3]);
  9496. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[2]);
  9497. r->dp[17] = l;
  9498. l = h;
  9499. h = o;
  9500. o = 0;
  9501. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[15]);
  9502. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[14]);
  9503. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[13]);
  9504. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[12]);
  9505. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
  9506. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
  9507. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
  9508. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
  9509. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
  9510. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[6]);
  9511. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[5]);
  9512. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[4]);
  9513. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[3]);
  9514. r->dp[18] = l;
  9515. l = h;
  9516. h = o;
  9517. o = 0;
  9518. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[15]);
  9519. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[14]);
  9520. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[13]);
  9521. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[12]);
  9522. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
  9523. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
  9524. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
  9525. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
  9526. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[7]);
  9527. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[6]);
  9528. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[5]);
  9529. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[4]);
  9530. r->dp[19] = l;
  9531. l = h;
  9532. h = o;
  9533. o = 0;
  9534. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[15]);
  9535. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[14]);
  9536. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[13]);
  9537. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[12]);
  9538. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
  9539. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
  9540. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
  9541. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[8]);
  9542. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[7]);
  9543. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[6]);
  9544. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[5]);
  9545. r->dp[20] = l;
  9546. l = h;
  9547. h = o;
  9548. o = 0;
  9549. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[15]);
  9550. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[14]);
  9551. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[13]);
  9552. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[12]);
  9553. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
  9554. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
  9555. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[9]);
  9556. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[8]);
  9557. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[7]);
  9558. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[6]);
  9559. r->dp[21] = l;
  9560. l = h;
  9561. h = o;
  9562. o = 0;
  9563. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[15]);
  9564. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[14]);
  9565. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[13]);
  9566. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[12]);
  9567. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[11]);
  9568. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[10]);
  9569. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[9]);
  9570. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[8]);
  9571. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[7]);
  9572. r->dp[22] = l;
  9573. l = h;
  9574. h = o;
  9575. o = 0;
  9576. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[15]);
  9577. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[14]);
  9578. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[13]);
  9579. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[12]);
  9580. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[11]);
  9581. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[10]);
  9582. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[9]);
  9583. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[8]);
  9584. r->dp[23] = l;
  9585. l = h;
  9586. h = o;
  9587. o = 0;
  9588. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[15]);
  9589. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[14]);
  9590. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[13]);
  9591. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[12]);
  9592. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[11]);
  9593. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[10]);
  9594. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[9]);
  9595. r->dp[24] = l;
  9596. l = h;
  9597. h = o;
  9598. o = 0;
  9599. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[15]);
  9600. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[14]);
  9601. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[13]);
  9602. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[12]);
  9603. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[11]);
  9604. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[10]);
  9605. r->dp[25] = l;
  9606. l = h;
  9607. h = o;
  9608. o = 0;
  9609. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[15]);
  9610. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[14]);
  9611. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[13]);
  9612. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[12]);
  9613. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[11]);
  9614. r->dp[26] = l;
  9615. l = h;
  9616. h = o;
  9617. o = 0;
  9618. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[15]);
  9619. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[14]);
  9620. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[13]);
  9621. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[12]);
  9622. r->dp[27] = l;
  9623. l = h;
  9624. h = o;
  9625. o = 0;
  9626. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[15]);
  9627. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[14]);
  9628. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[13]);
  9629. r->dp[28] = l;
  9630. l = h;
  9631. h = o;
  9632. o = 0;
  9633. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[15]);
  9634. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[14]);
  9635. r->dp[29] = l;
  9636. l = h;
  9637. h = o;
  9638. SP_ASM_MUL_ADD_NO(l, h, a->dp[15], b->dp[15]);
  9639. r->dp[30] = l;
  9640. r->dp[31] = h;
  9641. XMEMCPY(r->dp, t, 16 * sizeof(sp_int_digit));
  9642. r->used = 32;
  9643. sp_clamp(r);
  9644. }
  9645. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  9646. if (t != NULL) {
  9647. XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
  9648. }
  9649. #endif
  9650. return err;
  9651. }
  9652. #endif /* SP_INT_DIGITS >= 32 */
9653. #endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || (!WOLFSSL_SP_MATH &&
9654. * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64)) */
  9655. #if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
  9656. #if SP_INT_DIGITS >= 48
  9657. /* Multiply a by b and store in r: r = a * b
  9658. *
  9659. * Comba implementation.
  9660. *
  9661. * @param [in] a SP integer to multiply.
  9662. * @param [in] b SP integer to multiply.
  9663. * @param [out] r SP integer result.
  9664. *
  9665. * @return MP_OKAY on success.
  9666. * @return MP_MEM when dynamic memory allocation fails.
  9667. */
  9668. static int _sp_mul_24(const sp_int* a, const sp_int* b, sp_int* r)
  9669. {
  9670. int err = MP_OKAY;
  9671. sp_int_digit l = 0;
  9672. sp_int_digit h = 0;
  9673. sp_int_digit o = 0;
  9674. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  9675. sp_int_digit* t = NULL;
  9676. #else
  9677. sp_int_digit t[24];
  9678. #endif
  9679. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  9680. t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 24, NULL,
  9681. DYNAMIC_TYPE_BIGINT);
  9682. if (t == NULL) {
  9683. err = MP_MEM;
  9684. }
  9685. #endif
  9686. if (err == MP_OKAY) {
  9687. SP_ASM_MUL(h, l, a->dp[0], b->dp[0]);
  9688. t[0] = h;
  9689. h = 0;
  9690. SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[1]);
  9691. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[0]);
  9692. t[1] = l;
  9693. l = h;
  9694. h = o;
  9695. o = 0;
  9696. SP_ASM_MUL_ADD_NO(l, h, a->dp[0], b->dp[2]);
  9697. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[1]);
  9698. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[0]);
  9699. t[2] = l;
  9700. l = h;
  9701. h = o;
  9702. o = 0;
  9703. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[3]);
  9704. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[2]);
  9705. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[1]);
  9706. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[0]);
  9707. t[3] = l;
  9708. l = h;
  9709. h = o;
  9710. o = 0;
  9711. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[4]);
  9712. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[3]);
  9713. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[2]);
  9714. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[1]);
  9715. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[0]);
  9716. t[4] = l;
  9717. l = h;
  9718. h = o;
  9719. o = 0;
  9720. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[5]);
  9721. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[4]);
  9722. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[3]);
  9723. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[2]);
  9724. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[1]);
  9725. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[0]);
  9726. t[5] = l;
  9727. l = h;
  9728. h = o;
  9729. o = 0;
  9730. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[6]);
  9731. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[5]);
  9732. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[4]);
  9733. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[3]);
  9734. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[2]);
  9735. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[1]);
  9736. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[0]);
  9737. t[6] = l;
  9738. l = h;
  9739. h = o;
  9740. o = 0;
  9741. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[7]);
  9742. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[6]);
  9743. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[5]);
  9744. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[4]);
  9745. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[3]);
  9746. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[2]);
  9747. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[1]);
  9748. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[0]);
  9749. t[7] = l;
  9750. l = h;
  9751. h = o;
  9752. o = 0;
  9753. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[8]);
  9754. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[7]);
  9755. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[6]);
  9756. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[5]);
  9757. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[4]);
  9758. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[3]);
  9759. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[2]);
  9760. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[1]);
  9761. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[0]);
  9762. t[8] = l;
  9763. l = h;
  9764. h = o;
  9765. o = 0;
  9766. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[9]);
  9767. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[8]);
  9768. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[7]);
  9769. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[6]);
  9770. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[5]);
  9771. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[4]);
  9772. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[3]);
  9773. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[2]);
  9774. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[1]);
  9775. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[0]);
  9776. t[9] = l;
  9777. l = h;
  9778. h = o;
  9779. o = 0;
  9780. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[10]);
  9781. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[9]);
  9782. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[8]);
  9783. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[7]);
  9784. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[6]);
  9785. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[5]);
  9786. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[4]);
  9787. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[3]);
  9788. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[2]);
  9789. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[1]);
  9790. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[0]);
  9791. t[10] = l;
  9792. l = h;
  9793. h = o;
  9794. o = 0;
  9795. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[11]);
  9796. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[10]);
  9797. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[9]);
  9798. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[8]);
  9799. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[7]);
  9800. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[6]);
  9801. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[5]);
  9802. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[4]);
  9803. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[3]);
  9804. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[2]);
  9805. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[1]);
  9806. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[0]);
  9807. t[11] = l;
  9808. l = h;
  9809. h = o;
  9810. o = 0;
  9811. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[12]);
  9812. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[11]);
  9813. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[10]);
  9814. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[9]);
  9815. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[8]);
  9816. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[7]);
  9817. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[6]);
  9818. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[5]);
  9819. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[4]);
  9820. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[3]);
  9821. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[2]);
  9822. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[1]);
  9823. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[0]);
  9824. t[12] = l;
  9825. l = h;
  9826. h = o;
  9827. o = 0;
  9828. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[13]);
  9829. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[12]);
  9830. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[11]);
  9831. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[10]);
  9832. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[9]);
  9833. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[8]);
  9834. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[7]);
  9835. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[6]);
  9836. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[5]);
  9837. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[4]);
  9838. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[3]);
  9839. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[2]);
  9840. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[1]);
  9841. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[0]);
  9842. t[13] = l;
  9843. l = h;
  9844. h = o;
  9845. o = 0;
  9846. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[14]);
  9847. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[13]);
  9848. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[12]);
  9849. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[11]);
  9850. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[10]);
  9851. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[9]);
  9852. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[8]);
  9853. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[7]);
  9854. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[6]);
  9855. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[5]);
  9856. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[4]);
  9857. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[3]);
  9858. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[2]);
  9859. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[1]);
  9860. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[0]);
  9861. t[14] = l;
  9862. l = h;
  9863. h = o;
  9864. o = 0;
  9865. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[15]);
  9866. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[14]);
  9867. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[13]);
  9868. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[12]);
  9869. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[11]);
  9870. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[10]);
  9871. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[9]);
  9872. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[8]);
  9873. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[7]);
  9874. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[6]);
  9875. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[5]);
  9876. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[4]);
  9877. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[3]);
  9878. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[2]);
  9879. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[1]);
  9880. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[0]);
  9881. t[15] = l;
  9882. l = h;
  9883. h = o;
  9884. o = 0;
  9885. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[16]);
  9886. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[15]);
  9887. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[14]);
  9888. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[13]);
  9889. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[12]);
  9890. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[11]);
  9891. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[10]);
  9892. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[9]);
  9893. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[8]);
  9894. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[7]);
  9895. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[6]);
  9896. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[5]);
  9897. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[4]);
  9898. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[3]);
  9899. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[2]);
  9900. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[1]);
  9901. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[0]);
  9902. t[16] = l;
  9903. l = h;
  9904. h = o;
  9905. o = 0;
  9906. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[17]);
  9907. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[16]);
  9908. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[15]);
  9909. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[14]);
  9910. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[13]);
  9911. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[12]);
  9912. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[11]);
  9913. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[10]);
  9914. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[9]);
  9915. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[8]);
  9916. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[7]);
  9917. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[6]);
  9918. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[5]);
  9919. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[4]);
  9920. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[3]);
  9921. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[2]);
  9922. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[1]);
  9923. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[0]);
  9924. t[17] = l;
  9925. l = h;
  9926. h = o;
  9927. o = 0;
  9928. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[18]);
  9929. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[17]);
  9930. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[16]);
  9931. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[15]);
  9932. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[14]);
  9933. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[13]);
  9934. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[12]);
  9935. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[11]);
  9936. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[10]);
  9937. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[9]);
  9938. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[8]);
  9939. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[7]);
  9940. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[6]);
  9941. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[5]);
  9942. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[4]);
  9943. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[3]);
  9944. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[2]);
  9945. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[1]);
  9946. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[0]);
  9947. t[18] = l;
  9948. l = h;
  9949. h = o;
  9950. o = 0;
  9951. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[19]);
  9952. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[18]);
  9953. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[17]);
  9954. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[16]);
  9955. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[15]);
  9956. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[14]);
  9957. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[13]);
  9958. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[12]);
  9959. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[11]);
  9960. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[10]);
  9961. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[9]);
  9962. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[8]);
  9963. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[7]);
  9964. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[6]);
  9965. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[5]);
  9966. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[4]);
  9967. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[3]);
  9968. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[2]);
  9969. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[1]);
  9970. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[0]);
  9971. t[19] = l;
  9972. l = h;
  9973. h = o;
  9974. o = 0;
  9975. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[20]);
  9976. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[19]);
  9977. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[18]);
  9978. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[17]);
  9979. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[16]);
  9980. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[15]);
  9981. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[14]);
  9982. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[13]);
  9983. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[12]);
  9984. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[11]);
  9985. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[10]);
  9986. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[9]);
  9987. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[8]);
  9988. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[7]);
  9989. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[6]);
  9990. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[5]);
  9991. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[4]);
  9992. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[3]);
  9993. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[2]);
  9994. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[1]);
  9995. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[0]);
  9996. t[20] = l;
  9997. l = h;
  9998. h = o;
  9999. o = 0;
  10000. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[21]);
  10001. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[20]);
  10002. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[19]);
  10003. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[18]);
  10004. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[17]);
  10005. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[16]);
  10006. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[15]);
  10007. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[14]);
  10008. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[13]);
  10009. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[12]);
  10010. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[11]);
  10011. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[10]);
  10012. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[9]);
  10013. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[8]);
  10014. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[7]);
  10015. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[6]);
  10016. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[5]);
  10017. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[4]);
  10018. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[3]);
  10019. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[2]);
  10020. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[1]);
  10021. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[0]);
  10022. t[21] = l;
  10023. l = h;
  10024. h = o;
  10025. o = 0;
  10026. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[22]);
  10027. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[21]);
  10028. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[20]);
  10029. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[19]);
  10030. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[18]);
  10031. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[17]);
  10032. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[16]);
  10033. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[15]);
  10034. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[14]);
  10035. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[13]);
  10036. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[12]);
  10037. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[11]);
  10038. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[10]);
  10039. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[9]);
  10040. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[8]);
  10041. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[7]);
  10042. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[6]);
  10043. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[5]);
  10044. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[4]);
  10045. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[3]);
  10046. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[2]);
  10047. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[1]);
  10048. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[0]);
  10049. t[22] = l;
  10050. l = h;
  10051. h = o;
  10052. o = 0;
  10053. SP_ASM_MUL_ADD(l, h, o, a->dp[0], b->dp[23]);
  10054. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[22]);
  10055. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[21]);
  10056. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[20]);
  10057. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[19]);
  10058. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[18]);
  10059. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[17]);
  10060. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[16]);
  10061. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[15]);
  10062. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[14]);
  10063. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[13]);
  10064. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[12]);
  10065. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[11]);
  10066. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[10]);
  10067. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[9]);
  10068. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[8]);
  10069. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[7]);
  10070. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[6]);
  10071. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[5]);
  10072. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[4]);
  10073. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[3]);
  10074. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[2]);
  10075. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[1]);
  10076. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[0]);
  10077. t[23] = l;
  10078. l = h;
  10079. h = o;
  10080. o = 0;
  10081. SP_ASM_MUL_ADD(l, h, o, a->dp[1], b->dp[23]);
  10082. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[22]);
  10083. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[21]);
  10084. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[20]);
  10085. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[19]);
  10086. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[18]);
  10087. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[17]);
  10088. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[16]);
  10089. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[15]);
  10090. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[14]);
  10091. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[13]);
  10092. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[12]);
  10093. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[11]);
  10094. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[10]);
  10095. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[9]);
  10096. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[8]);
  10097. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[7]);
  10098. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[6]);
  10099. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[5]);
  10100. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[4]);
  10101. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[3]);
  10102. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[2]);
  10103. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[1]);
  10104. r->dp[24] = l;
  10105. l = h;
  10106. h = o;
  10107. o = 0;
  10108. SP_ASM_MUL_ADD(l, h, o, a->dp[2], b->dp[23]);
  10109. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[22]);
  10110. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[21]);
  10111. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[20]);
  10112. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[19]);
  10113. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[18]);
  10114. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[17]);
  10115. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[16]);
  10116. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[15]);
  10117. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[14]);
  10118. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[13]);
  10119. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[12]);
  10120. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[11]);
  10121. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[10]);
  10122. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[9]);
  10123. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[8]);
  10124. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[7]);
  10125. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[6]);
  10126. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[5]);
  10127. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[4]);
  10128. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[3]);
  10129. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[2]);
  10130. r->dp[25] = l;
  10131. l = h;
  10132. h = o;
  10133. o = 0;
  10134. SP_ASM_MUL_ADD(l, h, o, a->dp[3], b->dp[23]);
  10135. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[22]);
  10136. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[21]);
  10137. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[20]);
  10138. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[19]);
  10139. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[18]);
  10140. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[17]);
  10141. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[16]);
  10142. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[15]);
  10143. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[14]);
  10144. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[13]);
  10145. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[12]);
  10146. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[11]);
  10147. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[10]);
  10148. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[9]);
  10149. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[8]);
  10150. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[7]);
  10151. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[6]);
  10152. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[5]);
  10153. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[4]);
  10154. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[3]);
  10155. r->dp[26] = l;
  10156. l = h;
  10157. h = o;
  10158. o = 0;
  10159. SP_ASM_MUL_ADD(l, h, o, a->dp[4], b->dp[23]);
  10160. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[22]);
  10161. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[21]);
  10162. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[20]);
  10163. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[19]);
  10164. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[18]);
  10165. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[17]);
  10166. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[16]);
  10167. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[15]);
  10168. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[14]);
  10169. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[13]);
  10170. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[12]);
  10171. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[11]);
  10172. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[10]);
  10173. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[9]);
  10174. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[8]);
  10175. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[7]);
  10176. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[6]);
  10177. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[5]);
  10178. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[4]);
  10179. r->dp[27] = l;
  10180. l = h;
  10181. h = o;
  10182. o = 0;
  10183. SP_ASM_MUL_ADD(l, h, o, a->dp[5], b->dp[23]);
  10184. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[22]);
  10185. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[21]);
  10186. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[20]);
  10187. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[19]);
  10188. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[18]);
  10189. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[17]);
  10190. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[16]);
  10191. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[15]);
  10192. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[14]);
  10193. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[13]);
  10194. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[12]);
  10195. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[11]);
  10196. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[10]);
  10197. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[9]);
  10198. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[8]);
  10199. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[7]);
  10200. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[6]);
  10201. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[5]);
  10202. r->dp[28] = l;
  10203. l = h;
  10204. h = o;
  10205. o = 0;
  10206. SP_ASM_MUL_ADD(l, h, o, a->dp[6], b->dp[23]);
  10207. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[22]);
  10208. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[21]);
  10209. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[20]);
  10210. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[19]);
  10211. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[18]);
  10212. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[17]);
  10213. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[16]);
  10214. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[15]);
  10215. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[14]);
  10216. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[13]);
  10217. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[12]);
  10218. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[11]);
  10219. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[10]);
  10220. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[9]);
  10221. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[8]);
  10222. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[7]);
  10223. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[6]);
  10224. r->dp[29] = l;
  10225. l = h;
  10226. h = o;
  10227. o = 0;
  10228. SP_ASM_MUL_ADD(l, h, o, a->dp[7], b->dp[23]);
  10229. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[22]);
  10230. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[21]);
  10231. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[20]);
  10232. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[19]);
  10233. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[18]);
  10234. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[17]);
  10235. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[16]);
  10236. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[15]);
  10237. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[14]);
  10238. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[13]);
  10239. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[12]);
  10240. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[11]);
  10241. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[10]);
  10242. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[9]);
  10243. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[8]);
  10244. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[7]);
  10245. r->dp[30] = l;
  10246. l = h;
  10247. h = o;
  10248. o = 0;
  10249. SP_ASM_MUL_ADD(l, h, o, a->dp[8], b->dp[23]);
  10250. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[22]);
  10251. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[21]);
  10252. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[20]);
  10253. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[19]);
  10254. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[18]);
  10255. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[17]);
  10256. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[16]);
  10257. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[15]);
  10258. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[14]);
  10259. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[13]);
  10260. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[12]);
  10261. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[11]);
  10262. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[10]);
  10263. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[9]);
  10264. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[8]);
  10265. r->dp[31] = l;
  10266. l = h;
  10267. h = o;
  10268. o = 0;
  10269. SP_ASM_MUL_ADD(l, h, o, a->dp[9], b->dp[23]);
  10270. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[22]);
  10271. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[21]);
  10272. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[20]);
  10273. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[19]);
  10274. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[18]);
  10275. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[17]);
  10276. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[16]);
  10277. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[15]);
  10278. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[14]);
  10279. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[13]);
  10280. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[12]);
  10281. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[11]);
  10282. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[10]);
  10283. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[9]);
  10284. r->dp[32] = l;
  10285. l = h;
  10286. h = o;
  10287. o = 0;
  10288. SP_ASM_MUL_ADD(l, h, o, a->dp[10], b->dp[23]);
  10289. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[22]);
  10290. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[21]);
  10291. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[20]);
  10292. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[19]);
  10293. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[18]);
  10294. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[17]);
  10295. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[16]);
  10296. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[15]);
  10297. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[14]);
  10298. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[13]);
  10299. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[12]);
  10300. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[11]);
  10301. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[10]);
  10302. r->dp[33] = l;
  10303. l = h;
  10304. h = o;
  10305. o = 0;
  10306. SP_ASM_MUL_ADD(l, h, o, a->dp[11], b->dp[23]);
  10307. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[22]);
  10308. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[21]);
  10309. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[20]);
  10310. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[19]);
  10311. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[18]);
  10312. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[17]);
  10313. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[16]);
  10314. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[15]);
  10315. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[14]);
  10316. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[13]);
  10317. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[12]);
  10318. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[11]);
  10319. r->dp[34] = l;
  10320. l = h;
  10321. h = o;
  10322. o = 0;
  10323. SP_ASM_MUL_ADD(l, h, o, a->dp[12], b->dp[23]);
  10324. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[22]);
  10325. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[21]);
  10326. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[20]);
  10327. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[19]);
  10328. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[18]);
  10329. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[17]);
  10330. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[16]);
  10331. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[15]);
  10332. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[14]);
  10333. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[13]);
  10334. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[12]);
  10335. r->dp[35] = l;
  10336. l = h;
  10337. h = o;
  10338. o = 0;
  10339. SP_ASM_MUL_ADD(l, h, o, a->dp[13], b->dp[23]);
  10340. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[22]);
  10341. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[21]);
  10342. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[20]);
  10343. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[19]);
  10344. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[18]);
  10345. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[17]);
  10346. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[16]);
  10347. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[15]);
  10348. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[14]);
  10349. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[13]);
  10350. r->dp[36] = l;
  10351. l = h;
  10352. h = o;
  10353. o = 0;
  10354. SP_ASM_MUL_ADD(l, h, o, a->dp[14], b->dp[23]);
  10355. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[22]);
  10356. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[21]);
  10357. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[20]);
  10358. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[19]);
  10359. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[18]);
  10360. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[17]);
  10361. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[16]);
  10362. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[15]);
  10363. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[14]);
  10364. r->dp[37] = l;
  10365. l = h;
  10366. h = o;
  10367. o = 0;
  10368. SP_ASM_MUL_ADD(l, h, o, a->dp[15], b->dp[23]);
  10369. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[22]);
  10370. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[21]);
  10371. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[20]);
  10372. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[19]);
  10373. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[18]);
  10374. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[17]);
  10375. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[16]);
  10376. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[15]);
  10377. r->dp[38] = l;
  10378. l = h;
  10379. h = o;
  10380. o = 0;
  10381. SP_ASM_MUL_ADD(l, h, o, a->dp[16], b->dp[23]);
  10382. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[22]);
  10383. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[21]);
  10384. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[20]);
  10385. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[19]);
  10386. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[18]);
  10387. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[17]);
  10388. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[16]);
  10389. r->dp[39] = l;
  10390. l = h;
  10391. h = o;
  10392. o = 0;
  10393. SP_ASM_MUL_ADD(l, h, o, a->dp[17], b->dp[23]);
  10394. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[22]);
  10395. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[21]);
  10396. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[20]);
  10397. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[19]);
  10398. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[18]);
  10399. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[17]);
  10400. r->dp[40] = l;
  10401. l = h;
  10402. h = o;
  10403. o = 0;
  10404. SP_ASM_MUL_ADD(l, h, o, a->dp[18], b->dp[23]);
  10405. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[22]);
  10406. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[21]);
  10407. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[20]);
  10408. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[19]);
  10409. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[18]);
  10410. r->dp[41] = l;
  10411. l = h;
  10412. h = o;
  10413. o = 0;
  10414. SP_ASM_MUL_ADD(l, h, o, a->dp[19], b->dp[23]);
  10415. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[22]);
  10416. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[21]);
  10417. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[20]);
  10418. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[19]);
  10419. r->dp[42] = l;
  10420. l = h;
  10421. h = o;
  10422. o = 0;
  10423. SP_ASM_MUL_ADD(l, h, o, a->dp[20], b->dp[23]);
  10424. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[22]);
  10425. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[21]);
  10426. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[20]);
  10427. r->dp[43] = l;
  10428. l = h;
  10429. h = o;
  10430. o = 0;
  10431. SP_ASM_MUL_ADD(l, h, o, a->dp[21], b->dp[23]);
  10432. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[22]);
  10433. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[21]);
  10434. r->dp[44] = l;
  10435. l = h;
  10436. h = o;
  10437. o = 0;
  10438. SP_ASM_MUL_ADD(l, h, o, a->dp[22], b->dp[23]);
  10439. SP_ASM_MUL_ADD(l, h, o, a->dp[23], b->dp[22]);
  10440. r->dp[45] = l;
  10441. l = h;
  10442. h = o;
  10443. SP_ASM_MUL_ADD_NO(l, h, a->dp[23], b->dp[23]);
  10444. r->dp[46] = l;
  10445. r->dp[47] = h;
  10446. XMEMCPY(r->dp, t, 24 * sizeof(sp_int_digit));
  10447. r->used = 48;
  10448. sp_clamp(r);
  10449. }
  10450. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  10451. if (t != NULL) {
  10452. XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
  10453. }
  10454. #endif
  10455. return err;
  10456. }
  10457. #endif /* SP_INT_DIGITS >= 48 */
  10458. #if SP_INT_DIGITS >= 64
/* Multiply a by b and store in r: r = a * b
 *
 * Karatsuba implementation.
 * Operands are split into 16-digit halves: a = a1 * 2^(16*W) + a0, likewise b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_32(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    sp_int_digit l;
    sp_int_digit h;
    sp_int* a1;
    sp_int* b1;
    sp_int* z0;
    sp_int* z1;
    sp_int* z2;
    sp_int_digit ca;
    sp_int_digit cb;
    /* t holds the 16-digit operand sums, z the 33-digit partial products. */
    DECL_SP_INT_ARRAY(t, 16, 2);
    DECL_SP_INT_ARRAY(z, 33, 2);

    ALLOC_SP_INT_ARRAY(t, 16, 2, err, NULL);
    ALLOC_SP_INT_ARRAY(z, 33, 2, err, NULL);
    if (err == MP_OKAY) {
        a1 = t[0];
        b1 = t[1];
        z1 = z[0];
        z2 = z[1];
        /* z0 (low partial product) is computed directly into r. */
        z0 = r;

        /* Copy the high 16 digits of each operand into a1/b1. */
        XMEMCPY(a1->dp, &a->dp[16], sizeof(sp_int_digit) * 16);
        a1->used = 16;
        XMEMCPY(b1->dp, &b->dp[16], sizeof(sp_int_digit) * 16);
        b1->used = 16;

        /* z2 = a1 * b1 */
        err = _sp_mul_16(a1, b1, z2);
    }
    if (err == MP_OKAY) {
        /* a01 = a0 + a1 (in place in a1; final carry kept in ca) */
        l = a1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, a->dp[0]);
        a1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 16; i++) {
            SP_ASM_ADDC(l, h, a1->dp[i]);
            SP_ASM_ADDC(l, h, a->dp[i]);
            a1->dp[i] = l;
            l = h;
            h = 0;
        }
        ca = l;
        /* b01 = b0 + b1 (in place in b1; final carry kept in cb) */
        l = b1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, b->dp[0]);
        b1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 16; i++) {
            SP_ASM_ADDC(l, h, b1->dp[i]);
            SP_ASM_ADDC(l, h, b->dp[i]);
            b1->dp[i] = l;
            l = h;
            h = 0;
        }
        cb = l;

        /* z0 = a0 * b0 */
        err = _sp_mul_16(a, b, z0);
    }
    if (err == MP_OKAY) {
        /* z1 = (a0 + a1) * (b0 + b1) */
        err = _sp_mul_16(a1, b1, z1);
    }
    if (err == MP_OKAY) {
        /* r = (z2 << 32) + ((z1 - z0 - z2) << 16) + z0 */
        /* r = z0 */
        /* r += (z1 - z0 - z2) << 16 */
        /* Account for the carries out of the 16-digit operand sums. */
        z1->dp[32] = ca & cb;
        l = 0;
        if (ca) {
            /* Carry from a01: add b01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 16; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 16]);
                SP_ASM_ADDC(l, h, b1->dp[i]);
                z1->dp[i + 16] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[32] += l;
        l = 0;
        if (cb) {
            /* Carry from b01: add a01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 16; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 16]);
                SP_ASM_ADDC(l, h, a1->dp[i]);
                z1->dp[i + 16] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[32] += l;
        /* z1 = z1 - z0 - z2 */
        l = 0;
        h = 0;
        for (i = 0; i < 32; i++) {
            l += z1->dp[i];
            SP_ASM_SUBB(l, h, z0->dp[i]);
            SP_ASM_SUBB(l, h, z2->dp[i]);
            z1->dp[i] = l;
            l = h;
            h = 0;
        }
        /* Propagate final borrow into the top digit of z1. */
        z1->dp[i] += l;
        /* r += z1 << 16 */
        l = 0;
        h = 0;
        for (i = 0; i < 16; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 16]);
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 16] = l;
            l = h;
            h = 0;
        }
        /* Above digit 31 of r only z1 digits contribute. */
        for (; i < 33; i++) {
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 16] = l;
            l = h;
            h = 0;
        }
        /* r += z2 << 32 */
        l = 0;
        h = 0;
        for (i = 0; i < 17; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 32] = l;
            l = h;
            h = 0;
        }
        for (; i < 32; i++) {
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 32] = l;
            l = h;
            h = 0;
        }
        r->used = 64;
        sp_clamp(r);
    }

    FREE_SP_INT_ARRAY(z, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  10617. #endif /* SP_INT_DIGITS >= 64 */
  10618. #if SP_INT_DIGITS >= 96
/* Multiply a by b and store in r: r = a * b
 *
 * Karatsuba implementation.
 * Operands are split into 24-digit halves: a = a1 * 2^(24*W) + a0, likewise b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_48(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    sp_int_digit l;
    sp_int_digit h;
    sp_int* a1;
    sp_int* b1;
    sp_int* z0;
    sp_int* z1;
    sp_int* z2;
    sp_int_digit ca;
    sp_int_digit cb;
    /* t holds the 24-digit operand sums, z the 49-digit partial products. */
    DECL_SP_INT_ARRAY(t, 24, 2);
    DECL_SP_INT_ARRAY(z, 49, 2);

    ALLOC_SP_INT_ARRAY(t, 24, 2, err, NULL);
    ALLOC_SP_INT_ARRAY(z, 49, 2, err, NULL);
    if (err == MP_OKAY) {
        a1 = t[0];
        b1 = t[1];
        z1 = z[0];
        z2 = z[1];
        /* z0 (low partial product) is computed directly into r. */
        z0 = r;

        /* Copy the high 24 digits of each operand into a1/b1. */
        XMEMCPY(a1->dp, &a->dp[24], sizeof(sp_int_digit) * 24);
        a1->used = 24;
        XMEMCPY(b1->dp, &b->dp[24], sizeof(sp_int_digit) * 24);
        b1->used = 24;

        /* z2 = a1 * b1 */
        err = _sp_mul_24(a1, b1, z2);
    }
    if (err == MP_OKAY) {
        /* a01 = a0 + a1 (in place in a1; final carry kept in ca) */
        l = a1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, a->dp[0]);
        a1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 24; i++) {
            SP_ASM_ADDC(l, h, a1->dp[i]);
            SP_ASM_ADDC(l, h, a->dp[i]);
            a1->dp[i] = l;
            l = h;
            h = 0;
        }
        ca = l;
        /* b01 = b0 + b1 (in place in b1; final carry kept in cb) */
        l = b1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, b->dp[0]);
        b1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 24; i++) {
            SP_ASM_ADDC(l, h, b1->dp[i]);
            SP_ASM_ADDC(l, h, b->dp[i]);
            b1->dp[i] = l;
            l = h;
            h = 0;
        }
        cb = l;

        /* z0 = a0 * b0 */
        err = _sp_mul_24(a, b, z0);
    }
    if (err == MP_OKAY) {
        /* z1 = (a0 + a1) * (b0 + b1) */
        err = _sp_mul_24(a1, b1, z1);
    }
    if (err == MP_OKAY) {
        /* r = (z2 << 48) + ((z1 - z0 - z2) << 24) + z0 */
        /* r = z0 */
        /* r += (z1 - z0 - z2) << 24 */
        /* Account for the carries out of the 24-digit operand sums. */
        z1->dp[48] = ca & cb;
        l = 0;
        if (ca) {
            /* Carry from a01: add b01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 24; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 24]);
                SP_ASM_ADDC(l, h, b1->dp[i]);
                z1->dp[i + 24] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[48] += l;
        l = 0;
        if (cb) {
            /* Carry from b01: add a01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 24; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 24]);
                SP_ASM_ADDC(l, h, a1->dp[i]);
                z1->dp[i + 24] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[48] += l;
        /* z1 = z1 - z0 - z2 */
        l = 0;
        h = 0;
        for (i = 0; i < 48; i++) {
            l += z1->dp[i];
            SP_ASM_SUBB(l, h, z0->dp[i]);
            SP_ASM_SUBB(l, h, z2->dp[i]);
            z1->dp[i] = l;
            l = h;
            h = 0;
        }
        /* Propagate final borrow into the top digit of z1. */
        z1->dp[i] += l;
        /* r += z1 << 24 */
        l = 0;
        h = 0;
        for (i = 0; i < 24; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 24]);
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 24] = l;
            l = h;
            h = 0;
        }
        /* Above digit 47 of r only z1 digits contribute. */
        for (; i < 49; i++) {
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 24] = l;
            l = h;
            h = 0;
        }
        /* r += z2 << 48 */
        l = 0;
        h = 0;
        for (i = 0; i < 25; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 48] = l;
            l = h;
            h = 0;
        }
        for (; i < 48; i++) {
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 48] = l;
            l = h;
            h = 0;
        }
        r->used = 96;
        sp_clamp(r);
    }

    FREE_SP_INT_ARRAY(z, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  10777. #endif /* SP_INT_DIGITS >= 96 */
  10778. #if SP_INT_DIGITS >= 128
/* Multiply a by b and store in r: r = a * b
 *
 * Karatsuba implementation.
 * Operands are split into 32-digit halves: a = a1 * 2^(32*W) + a0, likewise b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_64(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    sp_int_digit l;
    sp_int_digit h;
    sp_int* a1;
    sp_int* b1;
    sp_int* z0;
    sp_int* z1;
    sp_int* z2;
    sp_int_digit ca;
    sp_int_digit cb;
    /* t holds the 32-digit operand sums, z the 65-digit partial products. */
    DECL_SP_INT_ARRAY(t, 32, 2);
    DECL_SP_INT_ARRAY(z, 65, 2);

    ALLOC_SP_INT_ARRAY(t, 32, 2, err, NULL);
    ALLOC_SP_INT_ARRAY(z, 65, 2, err, NULL);
    if (err == MP_OKAY) {
        a1 = t[0];
        b1 = t[1];
        z1 = z[0];
        z2 = z[1];
        /* z0 (low partial product) is computed directly into r. */
        z0 = r;

        /* Copy the high 32 digits of each operand into a1/b1. */
        XMEMCPY(a1->dp, &a->dp[32], sizeof(sp_int_digit) * 32);
        a1->used = 32;
        XMEMCPY(b1->dp, &b->dp[32], sizeof(sp_int_digit) * 32);
        b1->used = 32;

        /* z2 = a1 * b1 */
        err = _sp_mul_32(a1, b1, z2);
    }
    if (err == MP_OKAY) {
        /* a01 = a0 + a1 (in place in a1; final carry kept in ca) */
        l = a1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, a->dp[0]);
        a1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 32; i++) {
            SP_ASM_ADDC(l, h, a1->dp[i]);
            SP_ASM_ADDC(l, h, a->dp[i]);
            a1->dp[i] = l;
            l = h;
            h = 0;
        }
        ca = l;
        /* b01 = b0 + b1 (in place in b1; final carry kept in cb) */
        l = b1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, b->dp[0]);
        b1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 32; i++) {
            SP_ASM_ADDC(l, h, b1->dp[i]);
            SP_ASM_ADDC(l, h, b->dp[i]);
            b1->dp[i] = l;
            l = h;
            h = 0;
        }
        cb = l;

        /* z0 = a0 * b0 */
        err = _sp_mul_32(a, b, z0);
    }
    if (err == MP_OKAY) {
        /* z1 = (a0 + a1) * (b0 + b1) */
        err = _sp_mul_32(a1, b1, z1);
    }
    if (err == MP_OKAY) {
        /* r = (z2 << 64) + ((z1 - z0 - z2) << 32) + z0 */
        /* r = z0 */
        /* r += (z1 - z0 - z2) << 32 */
        /* Account for the carries out of the 32-digit operand sums. */
        z1->dp[64] = ca & cb;
        l = 0;
        if (ca) {
            /* Carry from a01: add b01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 32; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 32]);
                SP_ASM_ADDC(l, h, b1->dp[i]);
                z1->dp[i + 32] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[64] += l;
        l = 0;
        if (cb) {
            /* Carry from b01: add a01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 32; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 32]);
                SP_ASM_ADDC(l, h, a1->dp[i]);
                z1->dp[i + 32] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[64] += l;
        /* z1 = z1 - z0 - z2 */
        l = 0;
        h = 0;
        for (i = 0; i < 64; i++) {
            l += z1->dp[i];
            SP_ASM_SUBB(l, h, z0->dp[i]);
            SP_ASM_SUBB(l, h, z2->dp[i]);
            z1->dp[i] = l;
            l = h;
            h = 0;
        }
        /* Propagate final borrow into the top digit of z1. */
        z1->dp[i] += l;
        /* r += z1 << 32 */
        l = 0;
        h = 0;
        for (i = 0; i < 32; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 32]);
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 32] = l;
            l = h;
            h = 0;
        }
        /* Above digit 63 of r only z1 digits contribute. */
        for (; i < 65; i++) {
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 32] = l;
            l = h;
            h = 0;
        }
        /* r += z2 << 64 */
        l = 0;
        h = 0;
        for (i = 0; i < 33; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 64]);
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 64] = l;
            l = h;
            h = 0;
        }
        for (; i < 64; i++) {
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 64] = l;
            l = h;
            h = 0;
        }
        r->used = 128;
        sp_clamp(r);
    }

    FREE_SP_INT_ARRAY(z, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  10937. #endif /* SP_INT_DIGITS >= 128 */
  10938. #if SP_INT_DIGITS >= 192
/* Multiply a by b and store in r: r = a * b
 *
 * Karatsuba implementation.
 * Operands are split into 48-digit halves: a = a1 * 2^(48*W) + a0, likewise b.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_mul_96(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    sp_int_digit l;
    sp_int_digit h;
    sp_int* a1;
    sp_int* b1;
    sp_int* z0;
    sp_int* z1;
    sp_int* z2;
    sp_int_digit ca;
    sp_int_digit cb;
    /* t holds the 48-digit operand sums, z the 97-digit partial products. */
    DECL_SP_INT_ARRAY(t, 48, 2);
    DECL_SP_INT_ARRAY(z, 97, 2);

    ALLOC_SP_INT_ARRAY(t, 48, 2, err, NULL);
    ALLOC_SP_INT_ARRAY(z, 97, 2, err, NULL);
    if (err == MP_OKAY) {
        a1 = t[0];
        b1 = t[1];
        z1 = z[0];
        z2 = z[1];
        /* z0 (low partial product) is computed directly into r. */
        z0 = r;

        /* Copy the high 48 digits of each operand into a1/b1. */
        XMEMCPY(a1->dp, &a->dp[48], sizeof(sp_int_digit) * 48);
        a1->used = 48;
        XMEMCPY(b1->dp, &b->dp[48], sizeof(sp_int_digit) * 48);
        b1->used = 48;

        /* z2 = a1 * b1 */
        err = _sp_mul_48(a1, b1, z2);
    }
    if (err == MP_OKAY) {
        /* a01 = a0 + a1 (in place in a1; final carry kept in ca) */
        l = a1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, a->dp[0]);
        a1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 48; i++) {
            SP_ASM_ADDC(l, h, a1->dp[i]);
            SP_ASM_ADDC(l, h, a->dp[i]);
            a1->dp[i] = l;
            l = h;
            h = 0;
        }
        ca = l;
        /* b01 = b0 + b1 (in place in b1; final carry kept in cb) */
        l = b1->dp[0];
        h = 0;
        SP_ASM_ADDC(l, h, b->dp[0]);
        b1->dp[0] = l;
        l = h;
        h = 0;
        for (i = 1; i < 48; i++) {
            SP_ASM_ADDC(l, h, b1->dp[i]);
            SP_ASM_ADDC(l, h, b->dp[i]);
            b1->dp[i] = l;
            l = h;
            h = 0;
        }
        cb = l;

        /* z0 = a0 * b0 */
        err = _sp_mul_48(a, b, z0);
    }
    if (err == MP_OKAY) {
        /* z1 = (a0 + a1) * (b0 + b1) */
        err = _sp_mul_48(a1, b1, z1);
    }
    if (err == MP_OKAY) {
        /* r = (z2 << 96) + ((z1 - z0 - z2) << 48) + z0 */
        /* r = z0 */
        /* r += (z1 - z0 - z2) << 48 */
        /* Account for the carries out of the 48-digit operand sums. */
        z1->dp[96] = ca & cb;
        l = 0;
        if (ca) {
            /* Carry from a01: add b01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 48; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 48]);
                SP_ASM_ADDC(l, h, b1->dp[i]);
                z1->dp[i + 48] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[96] += l;
        l = 0;
        if (cb) {
            /* Carry from b01: add a01 into the high half of z1. */
            h = 0;
            for (i = 0; i < 48; i++) {
                SP_ASM_ADDC(l, h, z1->dp[i + 48]);
                SP_ASM_ADDC(l, h, a1->dp[i]);
                z1->dp[i + 48] = l;
                l = h;
                h = 0;
            }
        }
        z1->dp[96] += l;
        /* z1 = z1 - z0 - z2 */
        l = 0;
        h = 0;
        for (i = 0; i < 96; i++) {
            l += z1->dp[i];
            SP_ASM_SUBB(l, h, z0->dp[i]);
            SP_ASM_SUBB(l, h, z2->dp[i]);
            z1->dp[i] = l;
            l = h;
            h = 0;
        }
        /* Propagate final borrow into the top digit of z1. */
        z1->dp[i] += l;
        /* r += z1 << 48 */
        l = 0;
        h = 0;
        for (i = 0; i < 48; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 48]);
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 48] = l;
            l = h;
            h = 0;
        }
        /* Above digit 95 of r only z1 digits contribute. */
        for (; i < 97; i++) {
            SP_ASM_ADDC(l, h, z1->dp[i]);
            r->dp[i + 48] = l;
            l = h;
            h = 0;
        }
        /* r += z2 << 96 */
        l = 0;
        h = 0;
        for (i = 0; i < 49; i++) {
            SP_ASM_ADDC(l, h, r->dp[i + 96]);
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 96] = l;
            l = h;
            h = 0;
        }
        for (; i < 96; i++) {
            SP_ASM_ADDC(l, h, z2->dp[i]);
            r->dp[i + 96] = l;
            l = h;
            h = 0;
        }
        r->used = 192;
        sp_clamp(r);
    }

    FREE_SP_INT_ARRAY(z, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  11097. #endif /* SP_INT_DIGITS >= 192 */
  11098. #endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
  11099. #endif /* !WOLFSSL_SP_SMALL */
/* Multiply a by b and store in r: r = a * b
 *
 * Dispatches to a fixed-size comba/Karatsuba implementation when both
 * operands have a matching, supported digit count; otherwise falls back to
 * the generic implementations.
 *
 * @param  [in]   a  SP integer to multiply.
 * @param  [in]   b  SP integer to multiply.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a, b or is NULL; or the result will be too big for fixed
 *          data length.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
int sp_mul(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
#ifdef WOLFSSL_SP_INT_NEGATIVE
    unsigned int sign = MP_ZPOS;
#endif

    if ((a == NULL) || (b == NULL) || (r == NULL)) {
        err = MP_VAL;
    }
    /* Need extra digit during calculation. */
    if ((err == MP_OKAY) && (a->used + b->used > r->size)) {
        err = MP_VAL;
    }

#if 0
    if (err == MP_OKAY) {
        sp_print(a, "a");
        sp_print(b, "b");
    }
#endif

    if (err == MP_OKAY) {
#ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Product sign is the XOR of the operand signs. */
        sign = a->sign ^ b->sign;
#endif

        /* Anything times zero is zero. */
        if ((a->used == 0) || (b->used == 0)) {
            _sp_zero(r);
        }
        else
#ifndef WOLFSSL_SP_SMALL
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
        if ((a->used == 4) && (b->used == 4)) {
            err = _sp_mul_4(a, b, r);
        }
        else
#endif /* SP_WORD_SIZE == 64 */
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
#ifdef SQR_MUL_ASM
        if ((a->used == 6) && (b->used == 6)) {
            err = _sp_mul_6(a, b, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 64 */
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
#ifdef SQR_MUL_ASM
        if ((a->used == 8) && (b->used == 8)) {
            err = _sp_mul_8(a, b, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 32 */
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
#ifdef SQR_MUL_ASM
        if ((a->used == 12) && (b->used == 12)) {
            err = _sp_mul_12(a, b, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 32 */
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
    (SP_WORD_SIZE == 64)))
#if SP_INT_DIGITS >= 32
        if ((a->used == 16) && (b->used == 16)) {
            err = _sp_mul_16(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 32 */
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
#if SP_INT_DIGITS >= 48
        if ((a->used == 24) && (b->used == 24)) {
            err = _sp_mul_24(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 48 */
#if SP_INT_DIGITS >= 64
        if ((a->used == 32) && (b->used == 32)) {
            err = _sp_mul_32(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 64 */
#if SP_INT_DIGITS >= 96
        if ((a->used == 48) && (b->used == 48)) {
            err = _sp_mul_48(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 96 */
#if SP_INT_DIGITS >= 128
        if ((a->used == 64) && (b->used == 64)) {
            err = _sp_mul_64(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 128 */
#if SP_INT_DIGITS >= 192
        if ((a->used == 96) && (b->used == 96)) {
            err = _sp_mul_96(a, b, r);
        }
        else
#endif /* SP_INT_DIGITS >= 192 */
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
#endif /* !WOLFSSL_SP_SMALL */
#ifdef SQR_MUL_ASM
        /* Generic implementation for equal-length operands. */
        if (a->used == b->used) {
            err = _sp_mul_nxn(a, b, r);
        }
        else
#endif
        {
            /* Fully generic fallback. */
            err = _sp_mul(a, b, r);
        }
    }

#ifdef WOLFSSL_SP_INT_NEGATIVE
    if (err == MP_OKAY) {
        /* A zero result is always positive. */
        r->sign = (r->used == 0) ? MP_ZPOS : sign;
    }
#endif

#if 0
    if (err == MP_OKAY) {
        sp_print(r, "rmul");
    }
#endif

    return err;
}
  11237. /* END SP_MUL implementations. */
  11238. #endif
  11239. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
  11240. defined(WOLFCRYPT_HAVE_ECCSI) || \
  11241. (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || defined(OPENSSL_ALL)
  11242. /* Multiply a by b mod m and store in r: r = (a * b) mod m
  11243. *
  11244. * @param [in] a SP integer to multiply.
  11245. * @param [in] b SP integer to multiply.
  11246. * @param [in] m SP integer that is the modulus.
  11247. * @param [out] r SP integer result.
  11248. *
  11249. * @return MP_OKAY on success.
  11250. * @return MP_MEM when dynamic memory allocation fails.
  11251. */
  11252. static int _sp_mulmod_tmp(const sp_int* a, const sp_int* b, const sp_int* m,
  11253. sp_int* r)
  11254. {
  11255. int err = MP_OKAY;
  11256. /* Create temporary for multiplication result. */
  11257. DECL_SP_INT(t, a->used + b->used);
  11258. ALLOC_SP_INT(t, a->used + b->used, err, NULL);
  11259. if (err == MP_OKAY) {
  11260. err = sp_init_size(t, a->used + b->used);
  11261. }
  11262. /* Multiply and reduce. */
  11263. if (err == MP_OKAY) {
  11264. err = sp_mul(a, b, t);
  11265. }
  11266. if (err == MP_OKAY) {
  11267. err = sp_mod(t, m, r);
  11268. }
  11269. /* Dispose of an allocated SP int. */
  11270. FREE_SP_INT(t, NULL);
  11271. return err;
  11272. }
  11273. /* Multiply a by b mod m and store in r: r = (a * b) mod m
  11274. *
  11275. * @param [in] a SP integer to multiply.
  11276. * @param [in] b SP integer to multiply.
  11277. * @param [in] m SP integer that is the modulus.
  11278. * @param [out] r SP integer result.
  11279. *
  11280. * @return MP_OKAY on success.
  11281. * @return MP_MEM when dynamic memory allocation fails.
  11282. */
  11283. static int _sp_mulmod(const sp_int* a, const sp_int* b, const sp_int* m,
  11284. sp_int* r)
  11285. {
  11286. int err = MP_OKAY;
  11287. /* Use r as intermediate result if not same as pointer m which is needed
  11288. * after first intermediate result.
  11289. */
  11290. if (r != m) {
  11291. /* Multiply and reduce. */
  11292. err = sp_mul(a, b, r);
  11293. if (err == MP_OKAY) {
  11294. err = sp_mod(r, m, r);
  11295. }
  11296. }
  11297. else {
  11298. /* Do operation using temporary. */
  11299. err = _sp_mulmod_tmp(a, b, m, r);
  11300. }
  11301. return err;
  11302. }
  11303. /* Multiply a by b mod m and store in r: r = (a * b) mod m
  11304. *
  11305. * @param [in] a SP integer to multiply.
  11306. * @param [in] b SP integer to multiply.
  11307. * @param [in] m SP integer that is the modulus.
  11308. * @param [out] r SP integer result.
  11309. *
  11310. * @return MP_OKAY on success.
  11311. * @return MP_VAL when a, b, m or r is NULL; m is 0; or a * b is too big for
  11312. * fixed data length.
  11313. * @return MP_MEM when dynamic memory allocation fails.
  11314. */
  11315. int sp_mulmod(const sp_int* a, const sp_int* b, const sp_int* m, sp_int* r)
  11316. {
  11317. int err = MP_OKAY;
  11318. /* Validate parameters. */
  11319. if ((a == NULL) || (b == NULL) || (m == NULL) || (r == NULL)) {
  11320. err = MP_VAL;
  11321. }
  11322. /* Ensure result SP int is big enough for intermediates. */
  11323. if ((err == MP_OKAY) && (r != m) && (a->used + b->used > r->size)) {
  11324. err = MP_VAL;
  11325. }
  11326. #if 0
  11327. if (err == 0) {
  11328. sp_print(a, "a");
  11329. sp_print(b, "b");
  11330. sp_print(m, "m");
  11331. }
  11332. #endif
  11333. if (err == MP_OKAY) {
  11334. err = _sp_mulmod(a, b, m, r);
  11335. }
  11336. #if 0
  11337. if (err == 0) {
  11338. sp_print(r, "rmm");
  11339. }
  11340. #endif
  11341. return err;
  11342. }
  11343. #endif
  11344. #ifdef WOLFSSL_SP_INVMOD
/* Calculates the multiplicative inverse in the field. r*a = x*m + 1
 * Right-shift Algorithm. NOT constant time.
 *
 * Invariants maintained: b*a = u (mod m) and c*a = v (mod m), so when the
 * loop ends with v == 1, c is the inverse of a.
 *
 * Algorithm:
 *  1. u = m, v = a, b = 0, c = 1
 *  2. While v != 1 and u != 0
 *   2.1. If u even
 *    2.1.1. u /= 2
 *    2.1.2. b = (b / 2) mod m
 *   2.2. Else if v even
 *    2.2.1. v /= 2
 *    2.2.2. c = (c / 2) mod m
 *   2.3. Else if u >= v
 *    2.3.1. u -= v
 *    2.3.2. b = (b - c) mod m
 *   2.4. Else (v > u)
 *    2.4.1. v -= u
 *    2.4.2. c = (c - b) mod m
 *  3. NO_INVERSE if u == 0
 *
 * @param  [in]   a  SP integer to find inverse of.
 * @param  [in]   m  SP integer this is the modulus.
 * @param  [in]   u  SP integer to use in calculation. (Scratch; set to m.)
 * @param  [in]   v  SP integer to use in calculation. (Scratch; set to a.)
 * @param  [in]   b  SP integer to use in calculation. (Scratch.)
 * @param  [out]  c  SP integer that is the inverse.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when no inverse.
 */
static int _sp_invmod_bin(const sp_int* a, const sp_int* m, sp_int* u,
    sp_int* v, sp_int* b, sp_int* c)
{
    int err = MP_OKAY;

    /* 1. u = m, v = a, b = 0, c = 1 */
    _sp_copy(m, u);
    if (a != v) {
        _sp_copy(a, v);
    }
    _sp_zero(b);
    _sp_set(c, 1);

    /* 2. While v != 1 and u != 0 */
    while (!sp_isone(v) && !sp_iszero(u)) {
        /* 2.1. If u even */
        if ((u->dp[0] & 1) == 0) {
            /* 2.1.1. u /= 2 */
            _sp_div_2(u, u);
            /* 2.1.2. b = (b / 2) mod m
             * Make b even first; adding m doesn't change b mod m. */
            if (sp_isodd(b)) {
                _sp_add_off(b, m, b, 0);
            }
            _sp_div_2(b, b);
        }
        /* 2.2. Else if v even */
        else if ((v->dp[0] & 1) == 0) {
            /* 2.2.1. v /= 2 */
            _sp_div_2(v, v);
            /* 2.2.2. c = (c / 2) mod m */
            if (sp_isodd(c)) {
                _sp_add_off(c, m, c, 0);
            }
            _sp_div_2(c, c);
        }
        /* 2.3. Else if u >= v */
        else if (_sp_cmp_abs(u, v) != MP_LT) {
            /* 2.3.1. u -= v */
            _sp_sub_off(u, v, u, 0);
            /* 2.3.2. b = (b - c) mod m
             * Add m first when b < c so the magnitude subtraction cannot
             * go negative. */
            if (_sp_cmp_abs(b, c) == MP_LT) {
                _sp_add_off(b, m, b, 0);
            }
            _sp_sub_off(b, c, b, 0);
        }
        /* 2.4. Else (v > u) */
        else {
            /* 2.4.1. v -= u */
            _sp_sub_off(v, u, v, 0);
            /* 2.4.2. c = (c - b) mod m */
            if (_sp_cmp_abs(c, b) == MP_LT) {
                _sp_add_off(c, m, c, 0);
            }
            _sp_sub_off(c, b, c, 0);
        }
    }
    /* 3. NO_INVERSE if u == 0 */
    if (sp_iszero(u)) {
        err = MP_VAL;
    }

    return err;
}
#if !defined(WOLFSSL_SP_LOW_MEM) && !defined(WOLFSSL_SP_SMALL) && \
    (!defined(NO_RSA) || !defined(NO_DH))
/* Calculates the multiplicative inverse in the field. r*a = x*m + 1
 * Extended Euclidean Algorithm. NOT constant time.
 *
 * Creates one new SP int (d); x, y, b and c are caller-provided scratch.
 *
 * Algorithm:
 *  1. x = m, y = a, b = 1, c = 0
 *  2. while x > 1
 *   2.1. d = x / y, r = x mod y
 *   2.2. c -= d * b
 *   2.3. x = y, y = r
 *   2.4. s = b, b = c, c = s
 *  3. If y != 0 then NO_INVERSE
 *  4. If c < 0 then c += m
 *  5. inv = c
 *
 * @param  [in]   a    SP integer to find inverse of.
 * @param  [in]   m    SP integer this is the modulus.
 * @param  [in]   x    SP integer to use in calculation.
 * @param  [in]   y    SP integer to use in calculation.
 * @param  [in]   b    SP integer to use in calculation.
 * @param  [in]   c    SP integer to use in calculation.
 * @param  [out]  inv  SP integer that is the inverse.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when no inverse.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_invmod_div(const sp_int* a, const sp_int* m, sp_int* x,
    sp_int* y, sp_int* b, sp_int* c, sp_int* inv)
{
    int err = MP_OKAY;
    sp_int* s;
#ifndef WOLFSSL_SP_INT_NEGATIVE
    /* Without negative number support, signs of b and c are tracked in
     * separate flags and the sp_ints hold magnitudes only. */
    int bneg = 0;
    int cneg = 0;
    int neg;
#endif
    DECL_SP_INT(d, m->used + 1);

    ALLOC_SP_INT(d, m->used + 1, err, NULL);
    if (err == MP_OKAY) {
        /* NOTE(review): mp_init() return value ignored - presumably cannot
         * fail for a freshly allocated SP int. Confirm. */
        mp_init(d);

        /* 1. x = m, y = a, b = 1, c = 0 */
        if (a != y) {
            _sp_copy(a, y);
        }
        _sp_copy(m, x);
        _sp_set(b, 1);
        _sp_zero(c);
    }

#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* 2. while x > 1 */
    while ((err == MP_OKAY) && (!sp_isone(x)) && (!sp_iszero(x))) {
        /* 2.1. d = x / y, r = x mod y (remainder written back into x) */
        err = sp_div(x, y, d, x);
        if (err == MP_OKAY) {
            /* 2.2. c -= d * b */
            if (sp_isone(d)) {
                /* c -= 1 * b */
                err = sp_sub(c, b, c);
            }
            else {
                /* d *= b */
                err = sp_mul(d, b, d);
                /* c -= d */
                if (err == MP_OKAY) {
                    err = sp_sub(c, d, c);
                }
            }
            /* 2.3. x = y, y = r - pointer swap; remainder is already in x. */
            s = y; y = x; x = s;
            /* 2.4. s = b, b = c, c = s */
            s = b; b = c; c = s;
        }
    }
    /* 3. If y != 0 then NO_INVERSE */
    if ((err == MP_OKAY) && (!sp_iszero(y))) {
        err = MP_VAL;
    }
    /* 4. If c < 0 then c += m */
    if ((err == MP_OKAY) && sp_isneg(c)) {
        err = sp_add(c, m, c);
    }
    if (err == MP_OKAY) {
        /* 5. inv = c */
        err = sp_copy(c, inv);
    }
#else
    /* 2. while x > 1 */
    while ((err == MP_OKAY) && (!sp_isone(x)) && (!sp_iszero(x))) {
        /* 2.1. d = x / y, r = x mod y (remainder written back into x) */
        err = sp_div(x, y, d, x);
        if (err == MP_OKAY) {
            /* 2.2. c -= d * b, with signs handled via bneg/cneg flags. */
            if (sp_isone(d)) {
                /* c -= 1 * b */
                if ((bneg ^ cneg) == 1) {
                    /* c -= -b or -c -= b, therefore add. */
                    _sp_add_off(c, b, c, 0);
                }
                else if (_sp_cmp_abs(c, b) == MP_LT) {
                    /* |c| < |b| and same sign, reverse subtract and negate. */
                    _sp_sub_off(b, c, c, 0);
                    cneg = !cneg;
                }
                else {
                    /* |c| >= |b| */
                    _sp_sub_off(c, b, c, 0);
                }
            }
            else {
                /* d *= b */
                err = sp_mul(d, b, d);
                /* c -= d */
                if (err == MP_OKAY) {
                    if ((bneg ^ cneg) == 1) {
                        /* c -= -d or -c -= d, therefore add. */
                        _sp_add_off(c, d, c, 0);
                    }
                    else if (_sp_cmp_abs(c, d) == MP_LT) {
                        /* |c| < |d| and same sign, reverse subtract and negate.
                         */
                        _sp_sub_off(d, c, c, 0);
                        cneg = !cneg;
                    }
                    else {
                        _sp_sub_off(c, d, c, 0);
                    }
                }
            }
            /* 2.3. x = y, y = r - pointer swap; remainder is already in x. */
            s = y; y = x; x = s;
            /* 2.4. s = b, b = c, c = s - swap values and sign flags. */
            s = b; b = c; c = s;
            neg = bneg; bneg = cneg; cneg = neg;
        }
    }
    /* 3. If y != 0 then NO_INVERSE */
    if ((err == MP_OKAY) && (!sp_iszero(y))) {
        err = MP_VAL;
    }
    /* 4. If c < 0 then c += m */
    if ((err == MP_OKAY) && cneg) {
        /* c = m - |c| */
        _sp_sub_off(m, c, c, 0);
    }
    if (err == MP_OKAY) {
        /* 5. inv = c */
        err = sp_copy(c, inv);
    }
#endif

    FREE_SP_INT(d, NULL);
    return err;
}
#endif
/* Calculates the multiplicative inverse in the field.
 * Right-shift Algorithm or Extended Euclidean Algorithm. NOT constant time.
 *
 * r*a = x*m + 1
 *
 * @param  [in]   a  SP integer to find inverse of.
 * @param  [in]   m  SP integer this is the modulus.
 * @param  [out]  r  SP integer to hold result. r cannot be m.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when m is even and a divides m evenly.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_invmod(const sp_int* a, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
    sp_int* u = NULL;
    sp_int* v = NULL;
    sp_int* b = NULL;
    DECL_SP_INT_ARRAY(t, m->used + 1, 3);
    DECL_SP_INT(c, 2 * m->used + 1);

    /* Allocate SP ints:
     *  - x3 one word larger than modulus
     *  - x1 one word longer than twice modulus used
     */
    ALLOC_SP_INT_ARRAY(t, m->used + 1, 3, err, NULL);
    ALLOC_SP_INT(c, 2 * m->used + 1, err, NULL);
    if (err == MP_OKAY) {
        u = t[0];
        v = t[1];
        b = t[2];
        /* c allocated separately and larger for even mod case. */
    }
    /* Initialize intermediate values with minimal sizes. */
    if (err == MP_OKAY) {
        err = sp_init_size(u, m->used + 1);
    }
    if (err == MP_OKAY) {
        err = sp_init_size(v, m->used + 1);
    }
    if (err == MP_OKAY) {
        err = sp_init_size(b, m->used + 1);
    }
    if (err == MP_OKAY) {
        err = sp_init_size(c, 2 * m->used + 1);
    }

    if (err == MP_OKAY) {
        const sp_int* mm = m;
        const sp_int* ma = a;
        int evenMod = 0;

        if (sp_iseven(m)) {
            /* Even modulus: invert m modulo a instead, then fix up below.
             * a^-1 mod m = m + ((1 - m*(m^-1 % a)) / a) */
            mm = a;
            ma = v;
            _sp_copy(a, u);
            err = sp_mod(m, a, v);
            /* v == 0 when a divides m evenly - no inverse. */
            if ((err == MP_OKAY) && sp_iszero(v)) {
                err = MP_VAL;
            }
            evenMod = 1;
        }

        if (err == MP_OKAY) {
            /* Calculate inverse. */
#if !defined(WOLFSSL_SP_LOW_MEM) && !defined(WOLFSSL_SP_SMALL) && \
    (!defined(NO_RSA) || !defined(NO_DH))
            /* Use the division-based algorithm for moduli of 1024 bits or
             * more; the right-shift algorithm for smaller ones. */
            if (sp_count_bits(mm) >= 1024) {
                err = _sp_invmod_div(ma, mm, u, v, b, c, c);
            }
            else
#endif
            {
                err = _sp_invmod_bin(ma, mm, u, v, b, c);
            }
        }

        /* Fixup for even modulus. */
        if ((err == MP_OKAY) && evenMod) {
            /* Finish operation.
             *    a^-1 mod m = m + ((1 - m*c) / a)
             * => a^-1 mod m = m - ((m*c - 1) / a)
             */
            err = sp_mul(c, m, c);
            if (err == MP_OKAY) {
                _sp_sub_d(c, 1, c);
                err = sp_div(c, a, c, NULL);
            }
            if (err == MP_OKAY) {
                err = sp_sub(m, c, r);
            }
        }
        else if (err == MP_OKAY) {
            _sp_copy(c, r);
        }
    }

    FREE_SP_INT(c, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
/* Calculates the multiplicative inverse in the field.
 * Right-shift Algorithm or Extended Euclidean Algorithm. NOT constant time.
 *
 * r*a = x*m + 1
 *
 * @param  [in]   a  SP integer to find inverse of.
 * @param  [in]   m  SP integer this is the modulus.
 * @param  [out]  r  SP integer to hold result. r cannot be m.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a, m or r is NULL; a or m is zero; a and m are even or
 *          m is negative.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
int sp_invmod(const sp_int* a, const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (m == NULL) || (r == NULL) || (r == m)) {
        err = MP_VAL;
    }
    /* Result must be able to hold intermediates up to twice the modulus
     * size. */
    if ((err == MP_OKAY) && (m->used * 2 > r->size)) {
        err = MP_VAL;
    }

#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Don't support negative modulus. */
    if ((err == MP_OKAY) && (m->sign == MP_NEG)) {
        err = MP_VAL;
    }
#endif

    if (err == MP_OKAY) {
        /* Ensure number is less than modulus - reduce into r and use that as
         * the value to invert from here on. */
        if (_sp_cmp_abs(a, m) != MP_LT) {
            err = sp_mod(a, m, r);
            a = r;
        }
    }
#ifdef WOLFSSL_SP_INT_NEGATIVE
    if ((err == MP_OKAY) && (a->sign == MP_NEG)) {
        /* Make 'a' positive */
        err = sp_add(m, a, r);
        a = r;
    }
#endif

    /* 0 != n*m + 1 (+ve m), r*a mod 0 is always 0 (never 1) */
    if ((err == MP_OKAY) && (sp_iszero(a) || sp_iszero(m))) {
        err = MP_VAL;
    }
    /* r*2*x != n*2*y + 1 for integer x,y - no inverse when both even. */
    if ((err == MP_OKAY) && sp_iseven(a) && sp_iseven(m)) {
        err = MP_VAL;
    }

    /* 1*1 = 0*m + 1 - the inverse of 1 is 1. */
    if ((err == MP_OKAY) && sp_isone(a)) {
        _sp_set(r, 1);
    }
    else if (err == MP_OKAY) {
        err = _sp_invmod(a, m, r);
    }

    return err;
}
  11750. #endif /* WOLFSSL_SP_INVMOD */
  11751. #ifdef WOLFSSL_SP_INVMOD_MONT_CT
  11752. /* Number of entries to pre-compute.
  11753. * Many pre-defined primes have multiple of 8 consecutive 1s.
  11754. * P-256 modulus - 2 => 32x1, 31x0, 1x1, 96x0, 94x1, 1x0, 1x1.
  11755. */
  11756. #define CT_INV_MOD_PRE_CNT 8
/* Calculates the multiplicative inverse in the field - constant time.
 *
 * Modulus (m) must be a prime and greater than 2.
 * For prime m, inv = a ^ (m-2) mod m as 1 = a ^ (m-1) mod m.
 *
 * Algorithm:
 *  pre = pre-computed values, m = modulus, a = value to find inverse of,
 *  e = exponent
 *  Pre-calc:
 *   1. pre[0] = 2^0 * a mod m
 *   2. For i in 2..CT_INV_MOD_PRE_CNT
 *    2.1. pre[i-1] = ((pre[i-2] ^ 2) * a) mod m
 *  Calc inverse:
 *   1. e = m - 2
 *   2. j = Count leading 1's up to CT_INV_MOD_PRE_CNT
 *   3. t = pre[j-1]
 *   4. s = 0
 *   5. j = 0
 *   6. For i index of next top bit..0
 *    6.1. bit = e[i]
 *    6.2. j += bit
 *    6.3. s += 1
 *    6.4. if j == CT_INV_MOD_PRE_CNT or (bit == 0 and j > 0)
 *     6.4.1. s -= 1 - bit
 *     6.4.2. For s downto 1
 *      6.4.2.1. t = (t ^ 2) mod m
 *     6.4.3. s = 1 - bit
 *     6.4.4. t = (t * pre[j-1]) mod m
 *     6.4.5. j = 0
 *   7. For s downto 1
 *    7.1. t = (t ^ 2) mod m
 *   8. If j > 0 then r = (t * pre[j-1]) mod m
 *   9. Else r = t
 *
 * @param  [in]   a   SP integer, Montgomery form, to find inverse of.
 * @param  [in]   m   SP integer this is the modulus.
 * @param  [out]  r   SP integer to hold result.
 * @param  [in]   mp  SP integer digit that is the bottom digit of inv(-m).
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_invmod_mont_ct(const sp_int* a, const sp_int* m, sp_int* r,
    sp_int_digit mp)
{
    int err = MP_OKAY;
    int i;
    int j = 0;
    int s = 0;
    sp_int* t = NULL;
    sp_int* e = NULL;
    /* Table of CT_INV_MOD_PRE_CNT pre-computed values plus two extra entries
     * used as working values t and e. */
#ifndef WOLFSSL_SP_NO_MALLOC
    DECL_DYN_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2);
#else
    DECL_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2);
#endif

#ifndef WOLFSSL_SP_NO_MALLOC
    ALLOC_DYN_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2, err,
        NULL);
#else
    ALLOC_SP_INT_ARRAY(pre, m->used * 2 + 1, CT_INV_MOD_PRE_CNT + 2, err, NULL);
#endif
    if (err == MP_OKAY) {
        t = pre[CT_INV_MOD_PRE_CNT + 0];
        e = pre[CT_INV_MOD_PRE_CNT + 1];
        /* Space for sqr and mul result. */
        _sp_init_size(t, m->used * 2 + 1);
        /* e = mod - 2 */
        _sp_init_size(e, m->used + 1);

        /* Create pre-computation results: ((2^(1..8))-1).a. */
        _sp_init_size(pre[0], m->used * 2 + 1);
        /* 1. pre[0] = 2^0 * a mod m
         *    Start with 1.a = a.
         */
        _sp_copy(a, pre[0]);
        /* 2. For i in 2..CT_INV_MOD_PRE_CNT
         *    For rest of entries in table.
         */
        for (i = 1; (err == MP_OKAY) && (i < CT_INV_MOD_PRE_CNT); i++) {
            /* 2.1 pre[i] = ((pre[i-1] ^ 2) * a) mod m */
            /* Previous value ..1 -> ..10 */
            _sp_init_size(pre[i], m->used * 2 + 1);
            err = sp_sqr(pre[i-1], pre[i]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(pre[i], m, mp);
            }
            /* ..10 -> ..11 */
            if (err == MP_OKAY) {
                err = sp_mul(pre[i], a, pre[i]);
            }
            if (err == MP_OKAY) {
                err = _sp_mont_red(pre[i], m, mp);
            }
        }
    }

    if (err == MP_OKAY) {
        /* 1. e = m - 2 */
        _sp_sub_d(m, 2, e);
        /* 2. j = Count leading 1's up to CT_INV_MOD_PRE_CNT
         *    One or more of the top bits is 1 so count.
         */
        for (i = sp_count_bits(e)-2, j = 1; i >= 0; i--, j++) {
            if ((!sp_is_bit_set(e, (unsigned int)i)) ||
                    (j == CT_INV_MOD_PRE_CNT)) {
                break;
            }
        }
        /* 3. Set tmp to product of leading bits. */
        _sp_copy(pre[j-1], t);

        /* 4. s = 0 */
        s = 0;
        /* 5. j = 0 */
        j = 0;
        /* 6. For i index of next top bit..0
         *    Do remaining bits in exponent.
         */
        for (; (err == MP_OKAY) && (i >= 0); i--) {
            /* 6.1. bit = e[i] */
            int bit = sp_is_bit_set(e, (unsigned int)i);

            /* 6.2. j += bit
             *      Update count of consecutive 1 bits.
             */
            j += bit;
            /* 6.3. s += 1
             *      Update count of squares required.
             */
            s++;

            /* 6.4. if j == CT_INV_MOD_PRE_CNT or (bit == 0 and j > 0)
             *      Check if max 1 bits or 0 and have seen at least one 1 bit.
             */
            if ((j == CT_INV_MOD_PRE_CNT) || ((!bit) && (j > 0))) {
                /* 6.4.1. s -= 1 - bit */
                bit = 1 - bit;
                s -= bit;
                /* 6.4.2. For s downto 1
                 *        Do s squares.
                 */
                for (; (err == MP_OKAY) && (s > 0); s--) {
                    /* 6.4.2.1. t = (t ^ 2) mod m */
                    err = sp_sqr(t, t);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(t, m, mp);
                    }
                }
                /* 6.4.3. s = 1 - bit */
                s = bit;

                /* 6.4.4. t = (t * pre[j-1]) mod m */
                if (err == MP_OKAY) {
                    err = sp_mul(t, pre[j-1], t);
                }
                if (err == MP_OKAY) {
                    err = _sp_mont_red(t, m, mp);
                }
                /* 6.4.5. j = 0
                 *        Reset number of 1 bits seen.
                 */
                j = 0;
            }
        }
    }
    if (err == MP_OKAY) {
        /* 7. For s downto 1
         *    Do s squares - total remaining. */
        for (; (err == MP_OKAY) && (s > 0); s--) {
            /* 7.1. t = (t ^ 2) mod m */
            err = sp_sqr(t, t);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t, m, mp);
            }
        }
    }
    if (err == MP_OKAY) {
        /* 8. If j > 0 then r = (t * pre[j-1]) mod m */
        if (j > 0) {
            err = sp_mul(t, pre[j-1], r);
            if (err == MP_OKAY) {
                err = _sp_mont_red(r, m, mp);
            }
        }
        /* 9. Else r = t */
        else {
            _sp_copy(t, r);
        }
    }

#ifndef WOLFSSL_SP_NO_MALLOC
    FREE_DYN_SP_INT_ARRAY(pre, NULL);
#else
    FREE_SP_INT_ARRAY(pre, NULL);
#endif
    return err;
}
  11948. /* Calculates the multiplicative inverse in the field - constant time.
  11949. *
  11950. * Modulus (m) must be a prime and greater than 2.
  11951. * For prime m, inv = a ^ (m-2) mod m as 1 = a ^ (m-1) mod m.
  11952. *
  11953. * @param [in] a SP integer, Montgomery form, to find inverse of.
  11954. * @param [in] m SP integer this is the modulus.
  11955. * @param [out] r SP integer to hold result.
  11956. * @param [in] mp SP integer digit that is the bottom digit of inv(-m).
  11957. *
  11958. * @return MP_OKAY on success.
  11959. * @return MP_VAL when a, m or r is NULL; a is 0 or m is less than 3.
  11960. * @return MP_MEM when dynamic memory allocation fails.
  11961. */
  11962. int sp_invmod_mont_ct(const sp_int* a, const sp_int* m, sp_int* r,
  11963. sp_int_digit mp)
  11964. {
  11965. int err = MP_OKAY;
  11966. /* Validate parameters. */
  11967. if ((a == NULL) || (m == NULL) || (r == NULL)) {
  11968. err = MP_VAL;
  11969. }
  11970. /* Ensure m is not too big. */
  11971. else if (m->used * 2 >= SP_INT_DIGITS) {
  11972. err = MP_VAL;
  11973. }
  11974. /* check that r can hold the range of the modulus result */
  11975. else if (m->used > r->size) {
  11976. err = MP_VAL;
  11977. }
  11978. /* 0 != n*m + 1 (+ve m), r*a mod 0 is always 0 (never 1) */
  11979. if ((err == MP_OKAY) && (sp_iszero(a) || sp_iszero(m) ||
  11980. ((m->used == 1) && (m->dp[0] < 3)))) {
  11981. err = MP_VAL;
  11982. }
  11983. if (err == MP_OKAY) {
  11984. /* Do operation. */
  11985. err = _sp_invmod_mont_ct(a, m, r, mp);
  11986. }
  11987. return err;
  11988. }
  11989. #endif /* WOLFSSL_SP_INVMOD_MONT_CT */
  11990. /**************************
  11991. * Exponentiation functions
  11992. **************************/
  11993. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  11994. !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || \
  11995. defined(OPENSSL_ALL)
  11996. #ifndef WC_PROTECT_ENCRYPTED_MEM
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
 * Process the exponent one bit at a time.
 * Is constant time and can be cache attack resistant.
 *
 * Algorithm:
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
 *  1. s = 0
 *  2. t[0] = b mod m.
 *  3. t[1] = t[0]
 *  4. For i in (bits-1)...0
 *   4.1. t[s] = t[s] ^ 2
 *   4.2. y = e[i]
 *   4.3  j = y & s
 *   4.4  s = s | y
 *   4.5. t[j] = t[j] * b
 *  5. r = t[1]
 *
 * t[0] is a dummy working value operated on until the first 1 bit of the
 * exponent is seen (s == 0); t[1] is the real accumulator thereafter.
 *
 * @param  [in]   b     SP integer that is the base.
 * @param  [in]   e     SP integer that is the exponent.
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
 *                      count of bits in e.
 * @param  [in]   m     SP integer that is the modulus.
 * @param  [out]  r     SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_exptmod_ex(const sp_int* b, const sp_int* e, int bits,
    const sp_int* m, sp_int* r)
{
    int i;
    int err = MP_OKAY;
    int done = 0;
    /* 1. s = 0 */
    int s = 0;
#ifdef WC_NO_CACHE_RESISTANT
    DECL_SP_INT_ARRAY(t, 2 * m->used + 1, 2);
#else
    DECL_SP_INT_ARRAY(t, 2 * m->used + 1, 3);
#endif

    /* Allocate temporaries. */
#ifdef WC_NO_CACHE_RESISTANT
    ALLOC_SP_INT_ARRAY(t, 2 * m->used + 1, 2, err, NULL);
#else
    /* Working SP int needed when cache resistant. */
    ALLOC_SP_INT_ARRAY(t, 2 * m->used + 1, 3, err, NULL);
#endif
    if (err == MP_OKAY) {
        /* Initialize temporaries. */
        _sp_init_size(t[0], 2 * m->used + 1);
        _sp_init_size(t[1], 2 * m->used + 1);
#ifndef WC_NO_CACHE_RESISTANT
        _sp_init_size(t[2], 2 * m->used + 1);
#endif

        /* 2. t[0] = b mod m
         * Ensure base is less than modulus - set fake working value to base.
         */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[0]);
            /* Handle base == modulus. */
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into working variable. */
            _sp_copy(b, t[0]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        /* 3. t[1] = t[0]
         *    Set real working value to base.
         */
        _sp_copy(t[0], t[1]);

        /* 4. For i in (bits-1)...0 */
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
#ifdef WC_NO_CACHE_RESISTANT
            /* 4.1. t[s] = t[s] ^ 2 */
            err = sp_sqrmod(t[s], m, t[s]);
            if (err == MP_OKAY) {
                /* 4.2. y = e[i] */
                int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
                /* 4.3. j = y & s */
                int j = y & s;
                /* 4.4 s = s | y */
                s |= y;
                /* 4.5. t[j] = t[j] * b */
                err = _sp_mulmod(t[j], b, m, t[j]);
            }
#else
            /* 4.1. t[s] = t[s] ^ 2
             * Working value selected by masking its address with
             * sp_off_on_addr so the addresses accessed do not depend on the
             * secret index s. */
            _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
                               ((size_t)t[1] & sp_off_on_addr[s  ])),
                     t[2]);
            err = sp_sqrmod(t[2], m, t[2]);
            _sp_copy(t[2],
                     (sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
                               ((size_t)t[1] & sp_off_on_addr[s  ])));

            if (err == MP_OKAY) {
                /* 4.2. y = e[i] */
                int y = (int)((e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1);
                /* 4.3. j = y & s */
                int j = y & s;
                /* 4.4 s = s | y */
                s |= y;

                /* 4.5. t[j] = t[j] * b
                 * Multiplies into the dummy t[0] until the first 1 bit of
                 * the exponent has been processed. */
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[j  ])),
                         t[2]);
                err = _sp_mulmod(t[2], b, m, t[2]);
                _sp_copy(t[2],
                         (sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[j  ])));
            }
#endif
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* 5. r = t[1] */
        _sp_copy(t[1], r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  12122. #else
/* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
 * Process the exponent one bit at a time with base in Montgomery form.
 * Is constant time and cache attack resistant.
 *
 * Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder",
 * Cryptographic Hardware and Embedded Systems, CHES 2002
 *
 * Algorithm:
 *  b: base, e: exponent, m: modulus, r: result, bits: #bits to use
 *  1. t[1] = b mod m.
 *  2. t[0] = 1
 *  3. For i in (bits-1)...0
 *   3.1. y = e[i]
 *   3.2. t[2] = t[0] * t[1]
 *   3.3. t[3] = t[y] ^ 2
 *   3.4. t[y] = t[3], t[y^1] = t[2]
 *  4. r = t[0]
 *
 * Both a multiply and a square are performed on every iteration regardless of
 * the exponent bit (the ladder property).
 *
 * @param  [in]   b     SP integer that is the base.
 * @param  [in]   e     SP integer that is the exponent.
 * @param  [in]   bits  Number of bits in exponent to use. May be greater than
 *                      count of bits in e.
 * @param  [in]   m     SP integer that is the modulus.
 * @param  [out]  r     SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_exptmod_ex(const sp_int* b, const sp_int* e, int bits,
    const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
    int done = 0;
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);

    /* Allocate temporaries. */
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 4, err, NULL);
    if (err == MP_OKAY) {
        /* Initialize temporaries. */
        _sp_init_size(t[0], m->used * 2 + 1);
        _sp_init_size(t[1], m->used * 2 + 1);
        _sp_init_size(t[2], m->used * 2 + 1);
        _sp_init_size(t[3], m->used * 2 + 1);

        /* 1. Ensure base is less than modulus. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[1]);
            /* Handle base == modulus. */
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into working variable. */
            err = sp_copy(b, t[1]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        int i;

        /* 2. t[0] = 1 */
        _sp_set(t[0], 1);

        /* 3. For i in (bits-1)...0 */
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
            /* 3.1. y = e[i] */
            int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;

            /* 3.2. t[2] = t[0] * t[1] */
            err = sp_mulmod(t[0], t[1], m, t[2]);
            /* 3.3. t[3] = t[y] ^ 2 */
            if (err == MP_OKAY) {
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[y^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[y  ])),
                         t[3]);
                err = sp_sqrmod(t[3], m, t[3]);
            }
            /* 3.4. t[y] = t[3], t[y^1] = t[2]
             * Constant-time two-way copy keyed on the exponent bit. */
            if (err == MP_OKAY) {
                _sp_copy_2_ct(t[2], t[3], t[0], t[1], y, m->used);
            }
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* 4. r = t[0] */
        err = sp_copy(t[0], r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  12209. #endif /* WC_PROTECT_ENCRYPTED_MEM */
  12210. #endif
  12211. #if (defined(WOLFSSL_SP_MATH_ALL) && ((!defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  12212. !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH))) || \
  12213. defined(OPENSSL_ALL)
  12214. #ifndef WC_NO_HARDEN
  12215. #if !defined(WC_NO_CACHE_RESISTANT)
  12216. #ifndef WC_PROTECT_ENCRYPTED_MEM
  12217. /* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  12218. * Process the exponent one bit at a time with base in Montgomery form.
  12219. * Is constant time and cache attack resistant.
  12220. *
  12221. * Algorithm:
  12222. * b: base, e: exponent, m: modulus, r: result, bits: #bits to use
  12223. * 1. t[0] = b mod m.
  12224. * 2. s = 0
  12225. * 3. t[0] = ToMont(t[0])
  12226. * 4. t[1] = t[0]
  12227. * 5. bm = t[0]
  12228. * 6. For i in (bits-1)...0
  12229. * 6.1. t[s] = t[s] ^ 2
  12230. * 6.2. y = e[i]
  12231. * 6.3 j = y & s
  12232. * 6.4 s = s | y
  12233. * 6.5. t[j] = t[j] * bm
  12234. * 7. t[1] = FromMont(t[1])
  12235. * 8. r = t[1]
  12236. *
  12237. * @param [in] b SP integer that is the base.
  12238. * @param [in] e SP integer that is the exponent.
  12239. * @param [in] bits Number of bits in exponent to use. May be greater than
  12240. * count of bits in e.
  12241. * @param [in] m SP integer that is the modulus.
  12242. * @param [out] r SP integer to hold result.
  12243. *
  12244. * @return MP_OKAY on success.
  12245. * @return MP_MEM when dynamic memory allocation fails.
  12246. */
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
    const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
    int done = 0;
    /* Four temporaries: t[0] = fake working value, t[1] = real working value,
     * t[2] = base in Montgomery form, t[3] = scratch for square/multiply.
     * Each sized to m->used * 2 + 1 digits to hold unreduced products. */
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);

    /* Allocate temporaries. */
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 4, err, NULL);
    if (err == MP_OKAY) {
        /* Initialize temporaries. */
        _sp_init_size(t[0], m->used * 2 + 1);
        _sp_init_size(t[1], m->used * 2 + 1);
        _sp_init_size(t[2], m->used * 2 + 1);
        _sp_init_size(t[3], m->used * 2 + 1);

        /* 1. Ensure base is less than modulus. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[0]);
            /* Handle base == modulus: b mod m == 0, so result is 0. */
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into working variable. */
            _sp_copy(b, t[0]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        int i;
        /* 2. s = 0
         * s stays 0 until the first 1 bit of the exponent is seen; while
         * s == 0 the multiply below targets the fake value, so the position
         * of the top set bit is not visible in the operation sequence. */
        int s = 0;
        sp_int_digit mp;

        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* 3. t[0] = ToMont(t[0])
         * Convert base to Montgomery form - as fake working value.
         * t[1] temporarily holds the Montgomery normalizer here.
         */
        err = sp_mont_norm(t[1], m);
        if (err == MP_OKAY) {
            err = sp_mul(t[0], t[1], t[0]);
        }
        if (err == MP_OKAY) {
            /* t[0] = t[0] mod m, temporary size has to be bigger than t[0]. */
            err = _sp_div(t[0], m, NULL, t[0], t[0]->used + 1);
        }
        if (err == MP_OKAY) {
            /* 4. t[1] = t[0]
             * Set real working value to base.
             */
            _sp_copy(t[0], t[1]);
            /* 5. bm = t[0]. Keep Montgomery-form base in t[2]. */
            _sp_copy(t[0], t[2]);
        }

        /* 6. For i in (bits-1)...0 */
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
            /* 6.1. t[s] = t[s] ^ 2
             * NOTE(review): sp_off_on_addr[] is assumed to hold {0, all-ones}
             * masks so that exactly one pointer survives the AND - a
             * branch-free select of t[0] or t[1] whose memory access pattern
             * does not depend on the secret index. Confirm against the
             * table's definition. */
            _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
                               ((size_t)t[1] & sp_off_on_addr[s  ])),
                     t[3]);
            err = sp_sqr(t[3], t[3]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[3], m, mp);
            }
            /* Write the squared value back to whichever of t[0]/t[1] was
             * selected, again without a secret-dependent branch. */
            _sp_copy(t[3],
                     (sp_int*)(((size_t)t[0] & sp_off_on_addr[s^1]) +
                               ((size_t)t[1] & sp_off_on_addr[s  ])));
            if (err == MP_OKAY) {
                /* 6.2. y = e[i] - the i-th bit of the exponent. */
                int y = (int)((e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1);
                /* 6.3 j = y & s - multiply into the real value only once a
                 * set bit has been seen (s == 1) and this bit is set. */
                int j = y & s;
                /* 6.4 s = s | y - latch after the first set bit. */
                s |= y;
                /* 6.5. t[j] = t[j] * bm - performed unconditionally every
                 * iteration; only the (masked) destination differs. */
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[j  ])),
                         t[3]);
                err = sp_mul(t[3], t[2], t[3]);
                if (err == MP_OKAY) {
                    err = _sp_mont_red(t[3], m, mp);
                }
                _sp_copy(t[3],
                         (sp_int*)(((size_t)t[0] & sp_off_on_addr[j^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[j  ])));
            }
        }
        if (err == MP_OKAY) {
            /* 7. t[1] = FromMont(t[1]) */
            err = _sp_mont_red(t[1], m, mp);
            /* Reduction implementation returns number to range: 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* 8. r = t[1] */
        _sp_copy(t[1], r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  12347. #else
  12348. /* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  12349. * Process the exponent one bit at a time with base in Montgomery form.
  12350. * Is constant time and cache attack resistant.
  12351. *
  12352. * Based on work by Marc Joye, Sung-Ming Yen, "The Montgomery Powering Ladder",
  12353. * Cryptographic Hardware and Embedded Systems, CHES 2002
  12354. *
  12355. * Algorithm:
  12356. * b: base, e: exponent, m: modulus, r: result, bits: #bits to use
  12357. * 1. t[1] = b mod m.
  12358. * 2. t[0] = ToMont(1)
  12359. * 3. t[1] = ToMont(t[1])
  12360. * 4. For i in (bits-1)...0
  12361. * 4.1. y = e[i]
  12362. * 4.2. t[2] = t[0] * t[1]
  12363. * 4.3. t[3] = t[y] ^ 2
  12364. * 4.4. t[y] = t[3], t[y^1] = t[2]
  12365. * 5. t[0] = FromMont(t[0])
  12366. * 6. r = t[0]
  12367. *
  12368. * @param [in] b SP integer that is the base.
  12369. * @param [in] e SP integer that is the exponent.
  12370. * @param [in] bits Number of bits in exponent to use. May be greater than
  12371. * count of bits in e.
  12372. * @param [in] m SP integer that is the modulus.
  12373. * @param [out] r SP integer to hold result.
  12374. *
  12375. * @return MP_OKAY on success.
  12376. * @return MP_MEM when dynamic memory allocation fails.
  12377. */
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
    const sp_int* m, sp_int* r)
{
    int err = MP_OKAY;
    int done = 0;
    /* Ladder state: t[0] and t[1] are the two rungs, t[2]/t[3] hold the
     * multiply and square results for the current step. */
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 4);

    /* Allocate temporaries. */
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 4, err, NULL);
    if (err == MP_OKAY) {
        /* Initialize temporaries. */
        _sp_init_size(t[0], m->used * 2 + 1);
        _sp_init_size(t[1], m->used * 2 + 1);
        _sp_init_size(t[2], m->used * 2 + 1);
        _sp_init_size(t[3], m->used * 2 + 1);

        /* 1. Ensure base is less than modulus. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[1]);
            /* Handle base == modulus: b mod m == 0, so result is 0. */
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into working variable. */
            err = sp_copy(b, t[1]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        int i;
        sp_int_digit mp;

        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* 2. t[0] = ToMont(1)
         *    Calculate 1 in Montgomery form.
         */
        err = sp_mont_norm(t[0], m);
        if (err == MP_OKAY) {
            /* 3. t[1] = ToMont(t[1])
             *    Convert base to Montgomery form.
             */
            err = sp_mulmod(t[1], t[0], m, t[1]);
        }

        /* 4. For i in (bits-1)...0
         * Montgomery ladder: every iteration performs exactly one multiply
         * and one square regardless of the exponent bit, so the operation
         * sequence does not leak the bit values. */
        for (i = bits - 1; (err == MP_OKAY) && (i >= 0); i--) {
            /* 4.1. y = e[i] - the i-th bit of the exponent. */
            int y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;

            /* 4.2. t[2] = t[0] * t[1] */
            err = sp_mul(t[0], t[1], t[2]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[2], m, mp);
            }
            /* 4.3. t[3] = t[y] ^ 2
             * NOTE(review): sp_off_on_addr[] is assumed to hold {0, all-ones}
             * masks so exactly one pointer survives the AND - a branch-free
             * select of t[0] or t[1]. Confirm against the table's
             * definition. */
            if (err == MP_OKAY) {
                _sp_copy((sp_int*)(((size_t)t[0] & sp_off_on_addr[y^1]) +
                                   ((size_t)t[1] & sp_off_on_addr[y  ])),
                         t[3]);
                err = sp_sqr(t[3], t[3]);
            }
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[3], m, mp);
            }
            /* 4.4. t[y] = t[3], t[y^1] = t[2]
             * NOTE(review): _sp_copy_2_ct presumably writes both rungs with
             * a constant-time (y-masked) two-way copy - verify. */
            if (err == MP_OKAY) {
                _sp_copy_2_ct(t[2], t[3], t[0], t[1], y, m->used);
            }
        }

        if (err == MP_OKAY) {
            /* 5. t[0] = FromMont(t[0]) */
            err = _sp_mont_red(t[0], m, mp);
            /* Reduction implementation returns number to range: 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* 6. r = t[0] */
        err = sp_copy(t[0], r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  12458. #endif /* WC_PROTECT_ENCRYPTED_MEM */
  12459. #else
  12460. #ifdef SP_ALLOC
  12461. #define SP_ALLOC_PREDEFINED
  12462. #endif
  12463. /* Always allocate large array of sp_ints unless defined WOLFSSL_SP_NO_MALLOC */
  12464. #define SP_ALLOC
  12465. /* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  12466. * Creates a window of precalculated exponents with base in Montgomery form.
  12467. * Is constant time but NOT cache attack resistant.
  12468. *
  12469. * Algorithm:
  12470. * b: base, e: exponent, m: modulus, r: result, bits: #bits to use
  12471. * w: window size based on bits.
  12472. * 1. t[1] = b mod m.
  12473. * 2. t[0] = MontNorm(m) = ToMont(1)
  12474. * 3. t[1] = ToMont(t[1])
  12475. * 4. For i in 2..(2 ^ w) - 1
  12476. * 4.1 if i[0] == 0 then t[i] = t[i/2] ^ 2
  12477. * 4.2 if i[0] == 1 then t[i] = t[i-1] * t[1]
  12478. * 5. cb = w * (bits / w)
  12479. * 5. tr = t[e / (2 ^ cb)]
  12480. * 6. For i in cb..w
  12481. * 6.1. y = e[(i-1)..(i-w)]
  12482. * 6.2. tr = tr ^ (2 * w)
  12483. * 6.3. tr = tr * t[y]
  12484. * 7. tr = FromMont(tr)
  12485. * 8. r = tr
  12486. *
  12487. * @param [in] b SP integer that is the base.
  12488. * @param [in] e SP integer that is the exponent.
  12489. * @param [in] bits Number of bits in exponent to use. May be greater than
  12490. * count of bits in e.
  12491. * @param [in] m SP integer that is the modulus.
  12492. * @param [out] r SP integer to hold result.
  12493. *
  12494. * @return MP_OKAY on success.
  12495. * @return MP_MEM when dynamic memory allocation fails.
  12496. */
static int _sp_exptmod_mont_ex(const sp_int* b, const sp_int* e, int bits,
    const sp_int* m, sp_int* r)
{
    int i;
    int c;                  /* Number of unconsumed bits in current word n. */
    int y;                  /* Current window value / table index. */
    int winBits;            /* Window size in bits. */
    int preCnt;             /* Number of pre-computed table entries. */
    int err = MP_OKAY;
    int done = 0;
    sp_int_digit mask;
    sp_int* tr = NULL;
    /* Declared at the maximum table size (winBits == 6); the actual
     * allocation below uses preCnt + 1. */
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 6) + 1);

    /* Window bits based on number of pre-calculations versus number of loop
     * calculations.
     * Exponents for RSA and DH will result in 6-bit windows.
     */
    if (bits > 450) {
        winBits = 6;
    }
    else if (bits <= 21) {
        winBits = 1;
    }
    else if (bits <= 36) {
        winBits = 3;
    }
    else if (bits <= 140) {
        winBits = 4;
    }
    else {
        winBits = 5;
    }
    /* An entry for each possible 0..2^winBits-1 value. */
    preCnt = 1 << winBits;
    /* Mask for calculating index into pre-computed table. */
    mask = preCnt - 1;

    /* Allocate sp_ints for:
     *  - pre-computation table
     *  - temporary result
     */
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, preCnt + 1, err, NULL);
    if (err == MP_OKAY) {
        /* Set variable to use allocate memory. */
        tr = t[preCnt];

        /* Initialize all allocated. */
        for (i = 0; i < preCnt; i++) {
            _sp_init_size(t[i], m->used * 2 + 1);
        }
        _sp_init_size(tr, m->used * 2 + 1);

        /* 1. t[1] = b mod m. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[1]);
            /* Handle base == modulus: b mod m == 0, so result is 0. */
            if ((err == MP_OKAY) && sp_iszero(t[1])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into entry of table to contain b^1. */
            _sp_copy(b, t[1]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        sp_int_digit mp;
        sp_int_digit n;

        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* 2. t[0] = MontNorm(m) = ToMont(1) */
        err = sp_mont_norm(t[0], m);
        if (err == MP_OKAY) {
            /* 3. t[1] = ToMont(t[1]) */
            err = sp_mul(t[1], t[0], t[1]);
        }
        if (err == MP_OKAY) {
            /* t[1] = t[1] mod m, temporary size has to be bigger than t[1]. */
            err = _sp_div(t[1], m, NULL, t[1], t[1]->used + 1);
        }

        /* 4. For i in 2..(2 ^ w) - 1
         * Build table: t[i] = b^i in Montgomery form. */
        for (i = 2; (i < preCnt) && (err == MP_OKAY); i++) {
            /* 4.1 if i[0] == 0 then t[i] = t[i/2] ^ 2 */
            if ((i & 1) == 0) {
                err = sp_sqr(t[i/2], t[i]);
            }
            /* 4.2 if i[0] == 1 then t[i] = t[i-1] * t[1] */
            else {
                err = sp_mul(t[i-1], t[1], t[i]);
            }
            /* Montgomery reduce square or multiplication result. */
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[i], m, mp);
            }
        }

        if (err == MP_OKAY) {
            /* 5. cb = w * (bits / w) */
            i = (bits - 1) >> SP_WORD_SHIFT;
            n = e->dp[i--];
            /* Find top bit index in last word. */
            c = bits & (SP_WORD_SIZE - 1);
            if (c == 0) {
                c = SP_WORD_SIZE;
            }
            /* Use as many bits from top to make remaining a multiple of window
             * size.
             */
            if ((bits % winBits) != 0) {
                c -= bits % winBits;
            }
            else {
                c -= winBits;
            }

            /* 5. tr = t[e / (2 ^ cb)] */
            y = (int)(n >> c);
            n <<= SP_WORD_SIZE - c;
            /* 5. Copy table value for first window.
             * Table lookups here are indexed by exponent bits - this is why
             * this variant is constant time but NOT cache attack resistant
             * (see function header). */
            _sp_copy(t[y], tr);

            /* 6. For i in cb..w - one window of winBits bits per iteration. */
            for (; (i >= 0) || (c >= winBits); ) {
                int j;

                /* 6.1. y = e[(i-1)..(i-w)] */
                if (c == 0) {
                    /* Bits up to end of digit */
                    n = e->dp[i--];
                    y = (int)(n >> (SP_WORD_SIZE - winBits));
                    n <<= winBits;
                    c = SP_WORD_SIZE - winBits;
                }
                else if (c < winBits) {
                    /* Bits to end of digit and part of next */
                    y = (int)(n >> (SP_WORD_SIZE - winBits));
                    n = e->dp[i--];
                    c = winBits - c;
                    y |= (int)(n >> (SP_WORD_SIZE - c));
                    n <<= c;
                    c = SP_WORD_SIZE - c;
                }
                else {
                    /* Bits from middle of digit */
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
                    n <<= winBits;
                    c -= winBits;
                }

                /* 6.2. tr = tr ^ (2 * w) - square once per window bit. */
                for (j = 0; (j < winBits) && (err == MP_OKAY); j++) {
                    err = sp_sqr(tr, tr);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(tr, m, mp);
                    }
                }

                /* 6.3. tr = tr * t[y] */
                if (err == MP_OKAY) {
                    err = sp_mul(tr, t[y], tr);
                }
                if (err == MP_OKAY) {
                    err = _sp_mont_red(tr, m, mp);
                }
            }
        }

        if (err == MP_OKAY) {
            /* 7. tr = FromMont(tr) */
            err = _sp_mont_red(tr, m, mp);
            /* Reduction implementation returns number to range: 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* 8. r = tr */
        _sp_copy(tr, r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  12668. #ifndef SP_ALLOC_PREDEFINED
  12669. #undef SP_ALLOC
  12670. #undef SP_ALLOC_PREDEFINED
  12671. #endif
  12672. #endif /* !WC_NO_CACHE_RESISTANT */
  12673. #endif /* !WC_NO_HARDEN */
  12674. /* w = Log2(SP_WORD_SIZE) - 1 */
  12675. #if SP_WORD_SIZE == 8
  12676. #define EXP2_WINSIZE 2
  12677. #elif SP_WORD_SIZE == 16
  12678. #define EXP2_WINSIZE 3
  12679. #elif SP_WORD_SIZE == 32
  12680. #define EXP2_WINSIZE 4
  12681. #elif SP_WORD_SIZE == 64
  12682. #define EXP2_WINSIZE 5
  12683. #else
  12684. #error "sp_exptmod_base_2: Unexpected SP_WORD_SIZE"
  12685. #endif
  12686. /* Mask is all bits in window set. */
  12687. #define EXP2_MASK ((1 << EXP2_WINSIZE) - 1)
  12688. /* Internal. Exponentiates 2 to the power of e modulo m into r: r = 2 ^ e mod m
  12689. * Is constant time and cache attack resistant.
  12690. *
 * Calculates value to make mod operations constant time except when
 * WC_NO_HARDEN is defined or the modulus fits in one word.
  12693. *
  12694. * Algorithm:
  12695. * b: base, e: exponent, m: modulus, r: result, bits: #bits to use
  12696. * w: window size based on #bits in word.
  12697. * 1. if Words(m) > 1 then tr = MontNorm(m) = ToMont(1)
  12698. * else tr = 1
  12699. * 2. if Words(m) > 1 and HARDEN then a = m * (2 ^ (2^w))
  12700. * else a = 0
  12701. * 3. cb = w * (bits / w)
  12702. * 4. y = e / (2 ^ cb)
  12703. * 5. tr = (tr * (2 ^ y) + a) mod m
  12704. * 6. For i in cb..w
  12705. * 6.1. y = e[(i-1)..(i-w)]
  12706. * 6.2. tr = tr ^ (2 * w)
  12707. * 6.3. tr = ((tr * (2 ^ y) + a) mod m
  12708. * 7. if Words(m) > 1 then tr = FromMont(tr)
  12709. * 8. r = tr
  12710. *
  12711. * @param [in] e SP integer that is the exponent.
  12712. * @param [in] digits Number of digits in base to use. May be greater than
  12713. * count of bits in b.
  12714. * @param [in] m SP integer that is the modulus.
  12715. * @param [out] r SP integer to hold result.
  12716. *
  12717. * @return MP_OKAY on success.
  12718. * @return MP_MEM when dynamic memory allocation fails.
  12719. */
static int _sp_exptmod_base_2(const sp_int* e, int digits, const sp_int* m,
    sp_int* r)
{
    int i = 0;              /* Index of exponent digit being consumed. */
    int c = 0;              /* Unconsumed bits remaining in current word n. */
    int y;                  /* Current window value - amount to shift by. */
    int err = MP_OKAY;
    sp_int_digit mp = 0;
    sp_int_digit n = 0;
#ifndef WC_NO_HARDEN
    sp_int* a = NULL;       /* Hardening addend: multiple of m added before
                             * each mod to make its timing input-independent. */
    sp_int* tr = NULL;      /* Temporary result. */
    DECL_SP_INT_ARRAY(d, m->used * 2 + 1, 2);
#else
    DECL_SP_INT(tr, m->used * 2 + 1);
#endif
    /* Montgomery form is only used for multi-word moduli. */
    int useMont = (m->used > 1);

#if 0
    sp_print_int(2, "a");
    sp_print(e, "b");
    sp_print(m, "m");
#endif

#ifndef WC_NO_HARDEN
    /* Allocate sp_ints for:
     *  - constant time add value for mod operation
     *  - temporary result
     */
    ALLOC_SP_INT_ARRAY(d, m->used * 2 + 1, 2, err, NULL);
#else
    /* Allocate sp_int for temporary result. */
    ALLOC_SP_INT(tr, m->used * 2 + 1, err, NULL);
#endif
    if (err == MP_OKAY) {
#ifndef WC_NO_HARDEN
        a  = d[0];
        tr = d[1];

        _sp_init_size(a, m->used * 2 + 1);
#endif
        _sp_init_size(tr, m->used * 2 + 1);
    }

    if ((err == MP_OKAY) && useMont) {
        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
    }
    if (err == MP_OKAY) {
        /* 1. if Words(m) > 1 then tr = MontNorm(m) = ToMont(1)
         *    else                  tr = 1
         */
        if (useMont) {
            /* Calculate Montgomery normalizer for modulus - 1 in Montgomery
             * form.
             */
            err = sp_mont_norm(tr, m);
        }
        else {
            /* For single word modulus don't use Montgomery form. */
            err = sp_set(tr, 1);
        }
    }
    /* 2. if Words(m) > 1 and HARDEN then a = m * (2 ^ (2^w))
     *    else                            a = 0
     */
#ifndef WC_NO_HARDEN
    if ((err == MP_OKAY) && useMont) {
        err = sp_mul_2d(m, 1 << EXP2_WINSIZE, a);
    }
#endif

    if (err == MP_OKAY) {
        /* 3. cb = w * (bits / w) */
        i = digits - 1;
        n = e->dp[i--];
        c = SP_WORD_SIZE;
#if EXP2_WINSIZE != 1
        /* First window is the leftover bits that make the rest a multiple of
         * the window size. */
        c -= (digits * SP_WORD_SIZE) % EXP2_WINSIZE;
        if (c != SP_WORD_SIZE) {
            /* 4. y = e / (2 ^ cb) */
            y = (int)(n >> c);
            n <<= SP_WORD_SIZE - c;
        }
        else
#endif
        {
            /* 4. y = e / (2 ^ cb) */
            y = (int)((n >> (SP_WORD_SIZE - EXP2_WINSIZE)) & EXP2_MASK);
            n <<= EXP2_WINSIZE;
            c -= EXP2_WINSIZE;
        }
        /* 5. tr = (tr * (2 ^ y) + a) mod m
         * Base is 2, so multiplying by 2^y is a left shift by y bits. */
        err = sp_mul_2d(tr, y, tr);
    }
#ifndef WC_NO_HARDEN
    if ((err == MP_OKAY) && useMont) {
        /* Add value to make mod operation constant time. */
        err = sp_add(tr, a, tr);
    }
#endif
    if (err == MP_OKAY) {
        err = sp_mod(tr, m, tr);
    }
    /* 6. For i in cb..w - one EXP2_WINSIZE-bit window per iteration. */
    for (; (err == MP_OKAY) && ((i >= 0) || (c >= EXP2_WINSIZE)); ) {
        int j;

        /* 6.1. y = e[(i-1)..(i-w)] */
        if (c == 0) {
            /* Bits from next digit. */
            n = e->dp[i--];
            y = (int)(n >> (SP_WORD_SIZE - EXP2_WINSIZE));
            n <<= EXP2_WINSIZE;
            c = SP_WORD_SIZE - EXP2_WINSIZE;
        }
#if (EXP2_WINSIZE != 1) && (EXP2_WINSIZE != 2) && (EXP2_WINSIZE != 4)
        /* Only needed when the window size does not divide the word size. */
        else if (c < EXP2_WINSIZE) {
            /* Bits to end of digit and part of next */
            y = (int)(n >> (SP_WORD_SIZE - EXP2_WINSIZE));
            n = e->dp[i--];
            c = EXP2_WINSIZE - c;
            y |= (int)(n >> (SP_WORD_SIZE - c));
            n <<= c;
            c = SP_WORD_SIZE - c;
        }
#endif
        else {
            /* Bits from middle of digit */
            y = (int)((n >> (SP_WORD_SIZE - EXP2_WINSIZE)) & EXP2_MASK);
            n <<= EXP2_WINSIZE;
            c -= EXP2_WINSIZE;
        }

        /* 6.2. tr = tr ^ (2 * w) - square once per window bit. */
        for (j = 0; (j < EXP2_WINSIZE) && (err == MP_OKAY); j++) {
            err = sp_sqr(tr, tr);
            if (err == MP_OKAY) {
                if (useMont) {
                    err = _sp_mont_red(tr, m, mp);
                }
                else {
                    err = sp_mod(tr, m, tr);
                }
            }
        }

        /* 6.3. tr = ((tr * (2 ^ y) + a) mod m */
        if (err == MP_OKAY) {
            err = sp_mul_2d(tr, y, tr);
        }
#ifndef WC_NO_HARDEN
        if ((err == MP_OKAY) && useMont) {
            /* Add value to make mod operation constant time. */
            err = sp_add(tr, a, tr);
        }
#endif
        if (err == MP_OKAY) {
            /* Reduce current result by modulus. */
            err = sp_mod(tr, m, tr);
        }
    }

    /* 7. if Words(m) > 1 then tr = FromMont(tr) */
    if ((err == MP_OKAY) && useMont) {
        err = _sp_mont_red(tr, m, mp);
        /* Reduction implementation returns number to range: 0..m-1. */
    }
    if (err == MP_OKAY) {
        /* 8. r = tr */
        _sp_copy(tr, r);
    }

#if 0
    sp_print(r, "rme");
#endif

#ifndef WC_NO_HARDEN
    FREE_SP_INT_ARRAY(d, NULL);
#else
    FREE_SP_INT(tr, m->used * 2 + 1);
#endif
    return err;
}
  12893. #endif
  12894. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  12895. !defined(NO_DH) || (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || \
  12896. defined(OPENSSL_ALL)
  12897. /* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  12898. *
  12899. * Error returned when parameters r == e or r == m and base >= modulus.
  12900. *
  12901. * @param [in] b SP integer that is the base.
  12902. * @param [in] e SP integer that is the exponent.
  12903. * @param [in] digits Number of digits in exponent to use. May be greater
  12904. * than count of digits in e.
  12905. * @param [in] m SP integer that is the modulus.
  12906. * @param [out] r SP integer to hold result.
  12907. *
  12908. * @return MP_OKAY on success.
  12909. * @return MP_VAL when b, e, m or r is NULL, digits is negative, or m <= 0 or
  12910. * e is negative.
  12911. * @return MP_MEM when dynamic memory allocation fails.
  12912. */
int sp_exptmod_ex(const sp_int* b, const sp_int* e, int digits, const sp_int* m,
    sp_int* r)
{
    int err = MP_OKAY;
    int done = 0;
    /* Bit counts used to select a size-specific implementation below.
     * Computed before NULL checks; sp_count_bits is assumed to tolerate a
     * NULL argument - confirm against its implementation. */
    int mBits = sp_count_bits(m);
    int bBits = sp_count_bits(b);
    int eBits = sp_count_bits(e);

    if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL) ||
            (digits < 0)) {
        err = MP_VAL;
    }
    /* Ensure m is not too big. Intermediates need m->used * 2 digits. */
    else if (m->used * 2 >= SP_INT_DIGITS) {
        err = MP_VAL;
    }

#if 0
    if (err == MP_OKAY) {
        sp_print(b, "a");
        sp_print(e, "b");
        sp_print(m, "m");
    }
#endif

    /* Check for invalid modulus. */
    if ((err == MP_OKAY) && sp_iszero(m)) {
        err = MP_VAL;
    }
#ifdef WOLFSSL_SP_INT_NEGATIVE
    /* Check for unsupported negative values of exponent and modulus. */
    if ((err == MP_OKAY) && ((e->sign == MP_NEG) || (m->sign == MP_NEG))) {
        err = MP_VAL;
    }
#endif

    /* Check for degenerate cases: x mod 1 == 0 and b^0 == 1. */
    if ((err == MP_OKAY) && sp_isone(m)) {
        _sp_set(r, 0);
        done = 1;
    }
    if ((!done) && (err == MP_OKAY) && sp_iszero(e)) {
        _sp_set(r, 1);
        done = 1;
    }

    /* Ensure base is less than modulus.
     * Reduces b into r and then aliases b to r; this is why r == e or
     * r == m is rejected here (see function header). */
    if ((!done) && (err == MP_OKAY) && (_sp_cmp_abs(b, m) != MP_LT)) {
        if ((r == e) || (r == m)) {
            err = MP_VAL;
        }
        if (err == MP_OKAY) {
            err = sp_mod(b, m, r);
        }
        if (err == MP_OKAY) {
            b = r;
        }
    }
    /* Check for degenerate case of base: 0^e == 0 (e != 0 here). */
    if ((!done) && (err == MP_OKAY) && sp_iszero(b)) {
        _sp_set(r, 0);
        done = 1;
    }

    /* Ensure SP integers have space for intermediate values. */
    if ((!done) && (err == MP_OKAY) && (m->used * 2 >= r->size)) {
        err = MP_VAL;
    }

    if ((!done) && (err == MP_OKAY)) {
        /* Use code optimized for specific sizes if possible */
#if (defined(WOLFSSL_SP_MATH) || defined(WOLFSSL_SP_MATH_ALL)) && \
    (defined(WOLFSSL_HAVE_SP_RSA) || defined(WOLFSSL_HAVE_SP_DH))
    #ifndef WOLFSSL_SP_NO_2048
        if ((mBits == 1024) && sp_isodd(m) && (bBits <= 1024) &&
                (eBits <= 1024)) {
            err = sp_ModExp_1024((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
            done = 1;
        }
        else if ((mBits == 2048) && sp_isodd(m) && (bBits <= 2048) &&
                 (eBits <= 2048)) {
            err = sp_ModExp_2048((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
            done = 1;
        }
        else
    #endif
    #ifndef WOLFSSL_SP_NO_3072
        if ((mBits == 1536) && sp_isodd(m) && (bBits <= 1536) &&
                (eBits <= 1536)) {
            err = sp_ModExp_1536((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
            done = 1;
        }
        else if ((mBits == 3072) && sp_isodd(m) && (bBits <= 3072) &&
                 (eBits <= 3072)) {
            err = sp_ModExp_3072((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
            done = 1;
        }
        else
    #endif
    #ifdef WOLFSSL_SP_4096
        if ((mBits == 4096) && sp_isodd(m) && (bBits <= 4096) &&
                (eBits <= 4096)) {
            err = sp_ModExp_4096((sp_int*)b, (sp_int*)e, (sp_int*)m, r);
            done = 1;
        }
        else
    #endif
#endif
        {
            /* SP does not support size. */
        }
    }
#if defined(WOLFSSL_SP_MATH_ALL) || !defined(NO_DH) || defined(OPENSSL_ALL)
#if (defined(WOLFSSL_RSA_VERIFY_ONLY) || defined(WOLFSSL_RSA_PUBLIC_ONLY)) && \
    defined(NO_DH)
    if ((!done) && (err == MP_OKAY)) {
        /* Use non-constant time version - fastest. */
        err = sp_exptmod_nct(b, e, m, r);
    }
#else
#if defined(WOLFSSL_SP_MATH_ALL) || defined(OPENSSL_ALL)
    if ((!done) && (err == MP_OKAY) && (b->used == 1) && (b->dp[0] == 2) &&
            mp_isodd(m)) {
        /* Use the generic base 2 implementation. */
        err = _sp_exptmod_base_2(e, digits, m, r);
    }
    else if ((!done) && (err == MP_OKAY) && ((m->used > 1) && mp_isodd(m))) {
    #ifndef WC_NO_HARDEN
        /* Use constant time version hardened against timing attacks and
         * cache attacks when WC_NO_CACHE_RESISTANT not defined. */
        err = _sp_exptmod_mont_ex(b, e, digits * SP_WORD_SIZE, m, r);
    #else
        /* Use non-constant time version - fastest. */
        err = sp_exptmod_nct(b, e, m, r);
    #endif
    }
    else
#endif /* WOLFSSL_SP_MATH_ALL || OPENSSL_ALL */
    if ((!done) && (err == MP_OKAY)) {
        /* Otherwise use the generic implementation hardened against
         * timing and cache attacks. */
        err = _sp_exptmod_ex(b, e, digits * SP_WORD_SIZE, m, r);
    }
#endif /* WOLFSSL_RSA_VERIFY_ONLY || WOLFSSL_RSA_PUBLIC_ONLY */
#else
    if ((!done) && (err == MP_OKAY)) {
        err = MP_VAL;
    }
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH */

    /* Suppress unused warnings when configuration excludes their uses. */
    (void)mBits;
    (void)bBits;
    (void)eBits;
    (void)digits;

#if 0
    if (err == MP_OKAY) {
        sp_print(r, "rme");
    }
#endif
    return err;
}
  13067. #endif
  13068. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  13069. !defined(NO_DH) || (!defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)) || \
  13070. defined(OPENSSL_ALL)
  13071. /* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  13072. *
  13073. * @param [in] b SP integer that is the base.
  13074. * @param [in] e SP integer that is the exponent.
  13075. * @param [in] m SP integer that is the modulus.
  13076. * @param [out] r SP integer to hold result.
  13077. *
  13078. * @return MP_OKAY on success.
  13079. * @return MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
  13080. * @return MP_MEM when dynamic memory allocation fails.
  13081. */
  13082. int sp_exptmod(const sp_int* b, const sp_int* e, const sp_int* m, sp_int* r)
  13083. {
  13084. int err = MP_OKAY;
  13085. /* Validate parameters. */
  13086. if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL)) {
  13087. err = MP_VAL;
  13088. }
  13089. SAVE_VECTOR_REGISTERS(err = _svr_ret;);
  13090. if (err == MP_OKAY) {
  13091. err = sp_exptmod_ex(b, e, (int)e->used, m, r);
  13092. }
  13093. RESTORE_VECTOR_REGISTERS();
  13094. return err;
  13095. }
  13096. #endif
  13097. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH)
  13098. #if defined(WOLFSSL_SP_FAST_NCT_EXPTMOD) || !defined(WOLFSSL_SP_SMALL)
  13099. /* Internal. Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  13100. * Creates a window of precalculated exponents with base in Montgomery form.
  13101. * Sliding window and is NOT constant time.
  13102. *
  13103. * n-bit window is: (b^(2^(n-1))*b^0)...(b^(2^(n-1))*b^(2^(n-1)-1))
  13104. * e.g. when n=6, b^32..b^63
  13105. * Algorithm:
  13106. * 1. Ensure base is less than modulus.
  13107. * 2. Convert base to Montgomery form
  13108. * 3. Set result to table entry for top window bits, or
  13109. * if less than windows bits in exponent, 1 in Montgomery form.
  13110. * 4. While at least window bits left:
 * 4.1. Count number of and skip leading 0 bits unless fewer than window bits
 *      are left.
  13113. * 4.2. Montgomery square result for each leading 0 and window bits if bits
  13114. * left.
  13115. * 4.3. Break if less than window bits left.
 * 4.4. Get top window bits from exponent and drop.
  13117. * 4.5. Montgomery multiply result by table entry.
  13118. * 5. While bits left:
 * 5.1. Montgomery square result
  13120. * 5.2. If exponent bit set
  13121. * 5.2.1. Montgomery multiply result by Montgomery form of base.
  13122. * 6. Convert result back from Montgomery form.
  13123. *
  13124. * @param [in] b SP integer that is the base.
  13125. * @param [in] e SP integer that is the exponent.
  13126. * @param [in] bits Number of bits in exponent to use. May be greater than
  13127. * count of bits in e.
  13128. * @param [in] m SP integer that is the modulus.
  13129. * @param [out] r SP integer to hold result.
  13130. *
  13131. * @return MP_OKAY on success.
  13132. * @return MP_MEM when dynamic memory allocation fails.
  13133. */
static int _sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m,
    sp_int* r)
{
    int i = 0;
    int bits;
    int winBits;
    int preCnt;
    int err = MP_OKAY;
    int done = 0;
    sp_int* tr = NULL;   /* Temporary result (in Montgomery form). */
    sp_int* bm = NULL;   /* Montgomery form of the base. */
    /* Maximum winBits is 6 and preCnt is (1 << (winBits - 1)). */
#ifndef WOLFSSL_SP_NO_MALLOC
    DECL_DYN_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 5) + 2);
#else
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, (1 << 5) + 2);
#endif

    bits = sp_count_bits(e);

    /* Window bits based on number of pre-calculations versus number of loop
     * calculations.
     * Exponents for RSA and DH will result in 6-bit windows.
     * Note: for 4096-bit values, 7-bit window is slightly better.
     */
    if (bits > 450) {
        winBits = 6;
    }
    else if (bits <= 21) {
        winBits = 1;
    }
    else if (bits <= 36) {
        winBits = 3;
    }
    else if (bits <= 140) {
        winBits = 4;
    }
    else {
        winBits = 5;
    }
    /* Top bit of exponent fixed as 1 for pre-calculated window, so only half
     * of the 2^winBits values need table entries. */
    preCnt = 1 << (winBits - 1);

    /* Allocate sp_ints for:
     *  - pre-computation table (preCnt entries)
     *  - temporary result
     *  - Montgomery form of base
     */
#ifndef WOLFSSL_SP_NO_MALLOC
    ALLOC_DYN_SP_INT_ARRAY(t, m->used * 2 + 1, (size_t)preCnt + 2, err, NULL);
#else
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, (size_t)preCnt + 2, err, NULL);
#endif
    if (err == MP_OKAY) {
        /* Set variables to use allocated memory: the two entries after the
         * table are the temporary result and the Montgomery base. */
        tr = t[preCnt + 0];
        bm = t[preCnt + 1];

        /* Initialize all allocated sp_ints. */
        for (i = 0; i < preCnt; i++) {
            _sp_init_size(t[i], m->used * 2 + 1);
        }
        _sp_init_size(tr, m->used * 2 + 1);
        _sp_init_size(bm, m->used * 2 + 1);

        /* 1. Ensure base is less than modulus. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, bm);
            /* Handle base == modulus: b mod m == 0 so the result is 0. */
            if ((err == MP_OKAY) && sp_iszero(bm)) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into Montgomery base variable. */
            _sp_copy(b, bm);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        int y = 0;          /* Window value / table index. */
        int c = 0;          /* Count of unconsumed bits in current digit n. */
        sp_int_digit mp;    /* Montgomery multiplier: -1/m mod 2^SP_WORD_SIZE. */

        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* Calculate Montgomery normalizer for modulus. */
        err = sp_mont_norm(t[0], m);
        if (err == MP_OKAY) {
            /* 2. Convert base to Montgomery form. */
            err = sp_mul(bm, t[0], bm);
        }
        if (err == MP_OKAY) {
            /* bm = bm mod m, temporary size has to be bigger than bm->used. */
            err = _sp_div(bm, m, NULL, bm, bm->used + 1);
        }
        if (err == MP_OKAY) {
            /* Copy Montgomery form of base into first element of table. */
            _sp_copy(bm, t[0]);
        }
        /* Calculate b^(2^(winBits-1)) - base of the table since the top
         * window bit is implicitly 1. */
        for (i = 1; (i < winBits) && (err == MP_OKAY); i++) {
            err = sp_sqr(t[0], t[0]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[0], m, mp);
            }
        }
        /* For each table entry after first. */
        for (i = 1; (i < preCnt) && (err == MP_OKAY); i++) {
            /* Multiply previous entry by the base in Mont form into table. */
            err = sp_mul(t[i-1], bm, t[i]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[i], m, mp);
            }
        }

        /* 3. Set result to table entry for top window bits, or
         *    if less than window bits in exponent, 1 in Montgomery form.
         */
        if (err == MP_OKAY) {
            sp_int_digit n;     /* Current exponent digit, top bits first. */
            /* Mask for calculating index into pre-computed table. */
            sp_int_digit mask = (sp_int_digit)preCnt - 1;

            /* Find the top bit. */
            i = (bits - 1) >> SP_WORD_SHIFT;
            n = e->dp[i--];
            c = bits % SP_WORD_SIZE;
            if (c == 0) {
                c = SP_WORD_SIZE;
            }
            /* Put top bit at highest offset in digit. */
            n <<= SP_WORD_SIZE - c;

            if (bits >= winBits) {
                /* Top bit set. Copy from window. */
                if (c < winBits) {
                    /* Bits to end of digit and part of next */
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
                    n = e->dp[i--];
                    c = winBits - c;
                    y |= (int)(n >> (SP_WORD_SIZE - c));
                    n <<= c;
                    c = SP_WORD_SIZE - c;
                }
                else {
                    /* Bits from middle of digit */
                    y = (int)((n >> (SP_WORD_SIZE - winBits)) & mask);
                    n <<= winBits;
                    c -= winBits;
                }
                _sp_copy(t[y], tr);
            }
            else {
                /* 1 in Montgomery form. */
                err = sp_mont_norm(tr, m);
            }

            /* 4. While at least window bits left. */
            while ((err == MP_OKAY) && ((i >= 0) || (c >= winBits))) {
                /* Number of squares to do before the multiply, due to top
                 * bits being 0. */
                int sqrs = 0;

                /* 4.1. Count number of and skip leading 0 bits unless less
                 *      than window bits left.
                 */
                do {
                    /* Make sure n has bits from the right digit. */
                    if (c == 0) {
                        n = e->dp[i--];
                        c = SP_WORD_SIZE;
                    }
                    /* Mask off the next bit. */
                    if ((n & ((sp_int_digit)1 << (SP_WORD_SIZE - 1))) != 0) {
                        break;
                    }

                    /* Another square needed. */
                    sqrs++;
                    /* Skip bit. */
                    n <<= 1;
                    c--;
                }
                while ((err == MP_OKAY) && ((i >= 0) || (c >= winBits)));

                if ((err == MP_OKAY) && ((i >= 0) || (c >= winBits))) {
                    /* Add squares needed before using table entry. */
                    sqrs += winBits;
                }

                /* 4.2. Montgomery square result for each leading 0 and window
                 *      bits if bits left.
                 */
                for (; (err == MP_OKAY) && (sqrs > 0); sqrs--) {
                    err = sp_sqr(tr, tr);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(tr, m, mp);
                    }
                }

                /* 4.3. Break if less than window bits left. */
                if ((err == MP_OKAY) && (i < 0) && (c < winBits)) {
                    break;
                }

                /* 4.4. Get top window bits from exponent and drop. */
                if (err == MP_OKAY) {
                    if (c == 0) {
                        /* Bits from next digit. */
                        n = e->dp[i--];
                        y = (int)(n >> (SP_WORD_SIZE - winBits));
                        n <<= winBits;
                        c = SP_WORD_SIZE - winBits;
                    }
                    else if (c < winBits) {
                        /* Bits to end of digit and part of next. */
                        y = (int)(n >> (SP_WORD_SIZE - winBits));
                        n = e->dp[i--];
                        c = winBits - c;
                        y |= (int)(n >> (SP_WORD_SIZE - c));
                        n <<= c;
                        c = SP_WORD_SIZE - c;
                    }
                    else {
                        /* Bits from middle of digit. */
                        y = (int)(n >> (SP_WORD_SIZE - winBits));
                        n <<= winBits;
                        c -= winBits;
                    }
                    /* Top bit of window implicitly 1 - mask to table index. */
                    y &= (int)mask;
                }

                /* 4.5. Montgomery multiply result by table entry. */
                if (err == MP_OKAY) {
                    err = sp_mul(tr, t[y], tr);
                }
                if (err == MP_OKAY) {
                    err = _sp_mont_red(tr, m, mp);
                }
            }

            /* Finished multiplying in table entries. */
            if ((err == MP_OKAY) && (c > 0)) {
                /* Handle remaining bits one at a time.
                 * Window values have top bit set and can't be used. */
                n = e->dp[0];
                /* 5. While bits left: */
                for (--c; (err == MP_OKAY) && (c >= 0); c--) {
                    /* 5.1. Montgomery square result */
                    err = sp_sqr(tr, tr);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(tr, m, mp);
                    }
                    /* 5.2. If exponent bit set */
                    if ((err == MP_OKAY) && ((n >> c) & 1)) {
                        /* 5.2.1. Montgomery multiply result by Montgomery form
                         *        of base.
                         */
                        err = sp_mul(tr, bm, tr);
                        if (err == MP_OKAY) {
                            err = _sp_mont_red(tr, m, mp);
                        }
                    }
                }
            }
        }

        if (err == MP_OKAY) {
            /* 6. Convert result back from Montgomery form. */
            err = _sp_mont_red(tr, m, mp);
            /* Reduction implementation returns number to range: 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* Copy temporary result into parameter. */
        _sp_copy(tr, r);
    }

#ifndef WOLFSSL_SP_NO_MALLOC
    FREE_DYN_SP_INT_ARRAY(t, NULL);
#else
    FREE_SP_INT_ARRAY(t, NULL);
#endif
    return err;
}
  13399. #else
/* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
 * Non-constant time implementation.
 *
 * Algorithm:
 *  1. Convert base to Montgomery form
 *  2. Set result to base (assumes exponent is not zero)
 *  3. For each bit in exponent starting at second highest
 *   3.1. Montgomery square result
 *   3.2. If exponent bit set
 *    3.2.1. Montgomery multiply result by Montgomery form of base.
 *  4. Convert result back from Montgomery form.
 *
 * @param  [in]   b     SP integer that is the base.
 * @param  [in]   e     SP integer that is the exponent.
 * @param  [in]   m     SP integer that is the modulus.
 * @param  [out]  r     SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m,
    sp_int* r)
{
    int i;
    int err = MP_OKAY;
    int done = 0;
    int y = 0;
    int bits = sp_count_bits(e);
    sp_int_digit mp;
    DECL_SP_INT_ARRAY(t, m->used * 2 + 1, 2);

    /* Allocate memory for:
     *  - Montgomery form of base
     *  - Temporary result (in case r is same var as another parameter). */
    ALLOC_SP_INT_ARRAY(t, m->used * 2 + 1, 2, err, NULL);
    if (err == MP_OKAY) {
        _sp_init_size(t[0], m->used * 2 + 1);
        _sp_init_size(t[1], m->used * 2 + 1);

        /* Ensure base is less than modulus and copy into temp. */
        if (_sp_cmp_abs(b, m) != MP_LT) {
            err = sp_mod(b, m, t[0]);
            /* Handle base == modulus: b mod m == 0 so the result is 0. */
            if ((err == MP_OKAY) && sp_iszero(t[0])) {
                _sp_set(r, 0);
                done = 1;
            }
        }
        else {
            /* Copy base into temp. */
            _sp_copy(b, t[0]);
        }
    }

    if ((!done) && (err == MP_OKAY)) {
        /* Calculate Montgomery multiplier for reduction. */
        _sp_mont_setup(m, &mp);
        /* Calculate Montgomery normalizer for modulus. */
        err = sp_mont_norm(t[1], m);
        if (err == MP_OKAY) {
            /* 1. Convert base to Montgomery form. */
            err = sp_mul(t[0], t[1], t[0]);
        }
        if (err == MP_OKAY) {
            /* t[0] = t[0] mod m, temporary size has to be bigger than t[0]. */
            err = _sp_div(t[0], m, NULL, t[0], t[0]->used + 1);
        }
        if (err == MP_OKAY) {
            /* 2. Result starts as Montgomery form of base (assuming e > 0). */
            _sp_copy(t[0], t[1]);
        }

        /* 3. For each bit in exponent starting at second highest. */
        for (i = bits - 2; (err == MP_OKAY) && (i >= 0); i--) {
            /* 3.1. Montgomery square result. */
            err = sp_sqr(t[0], t[0]);
            if (err == MP_OKAY) {
                err = _sp_mont_red(t[0], m, mp);
            }
            if (err == MP_OKAY) {
                /* Get bit at index i. */
                y = (e->dp[i >> SP_WORD_SHIFT] >> (i & SP_WORD_MASK)) & 1;
                /* 3.2. If exponent bit set */
                if (y != 0) {
                    /* 3.2.1. Montgomery multiply result by Mont of base. */
                    err = sp_mul(t[0], t[1], t[0]);
                    if (err == MP_OKAY) {
                        err = _sp_mont_red(t[0], m, mp);
                    }
                }
            }
        }

        if (err == MP_OKAY) {
            /* 4. Convert from Montgomery form. */
            err = _sp_mont_red(t[0], m, mp);
            /* Reduction implementation returns number of range 0..m-1. */
        }
    }
    if ((!done) && (err == MP_OKAY)) {
        /* Copy temporary result into parameter. */
        _sp_copy(t[0], r);
    }

    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  13502. #endif /* WOLFSSL_SP_FAST_NCT_EXPTMOD || !WOLFSSL_SP_SMALL */
  13503. /* Exponentiates b to the power of e modulo m into r: r = b ^ e mod m
  13504. * Non-constant time implementation.
  13505. *
  13506. * @param [in] b SP integer that is the base.
  13507. * @param [in] e SP integer that is the exponent.
  13508. * @param [in] m SP integer that is the modulus.
  13509. * @param [out] r SP integer to hold result.
  13510. *
  13511. * @return MP_OKAY on success.
  13512. * @return MP_VAL when b, e, m or r is NULL; or m <= 0 or e is negative.
  13513. * @return MP_MEM when dynamic memory allocation fails.
  13514. */
  13515. int sp_exptmod_nct(const sp_int* b, const sp_int* e, const sp_int* m, sp_int* r)
  13516. {
  13517. int err = MP_OKAY;
  13518. /* Validate parameters. */
  13519. if ((b == NULL) || (e == NULL) || (m == NULL) || (r == NULL)) {
  13520. err = MP_VAL;
  13521. }
  13522. #if 0
  13523. if (err == MP_OKAY) {
  13524. sp_print(b, "a");
  13525. sp_print(e, "b");
  13526. sp_print(m, "m");
  13527. }
  13528. #endif
  13529. if (err != MP_OKAY) {
  13530. }
  13531. /* Handle special cases. */
  13532. else if (sp_iszero(m)) {
  13533. err = MP_VAL;
  13534. }
  13535. #ifdef WOLFSSL_SP_INT_NEGATIVE
  13536. else if ((e->sign == MP_NEG) || (m->sign == MP_NEG)) {
  13537. err = MP_VAL;
  13538. }
  13539. #endif
  13540. /* x mod 1 is always 0. */
  13541. else if (sp_isone(m)) {
  13542. _sp_set(r, 0);
  13543. }
  13544. /* b^0 mod m = 1 mod m = 1. */
  13545. else if (sp_iszero(e)) {
  13546. _sp_set(r, 1);
  13547. }
  13548. /* 0^x mod m = 0 mod m = 0. */
  13549. else if (sp_iszero(b)) {
  13550. _sp_set(r, 0);
  13551. }
  13552. /* Ensure SP integers have space for intermediate values. */
  13553. else if (m->used * 2 >= r->size) {
  13554. err = MP_VAL;
  13555. }
  13556. #if !defined(WOLFSSL_RSA_VERIFY_ONLY) && !defined(WOLFSSL_RSA_PUBLIC_ONLY)
  13557. else if (mp_iseven(m)) {
  13558. err = _sp_exptmod_ex(b, e, (int)(e->used * SP_WORD_SIZE), m, r);
  13559. }
  13560. #endif
  13561. else {
  13562. err = _sp_exptmod_nct(b, e, m, r);
  13563. }
  13564. #if 0
  13565. if (err == MP_OKAY) {
  13566. sp_print(r, "rme");
  13567. }
  13568. #endif
  13569. return err;
  13570. }
  13571. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH */
  13572. /***************
  13573. * 2^e functions
  13574. ***************/
  13575. #if defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)
  13576. /* Divide by 2^e: r = a >> e and rem = bits shifted out
  13577. *
  13578. * @param [in] a SP integer to divide.
  13579. * @param [in] e Exponent bits (dividing by 2^e).
  13580. * @param [in] m SP integer that is the modulus.
  13581. * @param [out] r SP integer to hold result.
  13582. * @param [out] rem SP integer to hold remainder.
  13583. *
  13584. * @return MP_OKAY on success.
  13585. * @return MP_VAL when a is NULL or e is negative.
  13586. */
  13587. int sp_div_2d(const sp_int* a, int e, sp_int* r, sp_int* rem)
  13588. {
  13589. int err = MP_OKAY;
  13590. if ((a == NULL) || (e < 0)) {
  13591. err = MP_VAL;
  13592. }
  13593. if (err == MP_OKAY) {
  13594. /* Number of bits remaining after shift. */
  13595. int remBits = sp_count_bits(a) - e;
  13596. if (remBits <= 0) {
  13597. /* Shifting down by more bits than in number. */
  13598. _sp_zero(r);
  13599. if (rem != NULL) {
  13600. err = sp_copy(a, rem);
  13601. }
  13602. }
  13603. else {
  13604. if (rem != NULL) {
  13605. /* Copy a in to remainder. */
  13606. err = sp_copy(a, rem);
  13607. }
  13608. if (err == MP_OKAY) {
  13609. /* Shift a down by into result. */
  13610. err = sp_rshb(a, e, r);
  13611. }
  13612. if ((err == MP_OKAY) && (rem != NULL)) {
  13613. /* Set used and mask off top digit of remainder. */
  13614. rem->used = ((unsigned int)e + SP_WORD_SIZE - 1) >>
  13615. SP_WORD_SHIFT;
  13616. e &= SP_WORD_MASK;
  13617. if (e > 0) {
  13618. rem->dp[rem->used - 1] &= ((sp_int_digit)1 << e) - 1;
  13619. }
  13620. /* Remove leading zeros from remainder. */
  13621. sp_clamp(rem);
  13622. #ifdef WOLFSSL_SP_INT_NEGATIVE
  13623. rem->sign = MP_ZPOS;
  13624. #endif
  13625. }
  13626. }
  13627. }
  13628. return err;
  13629. }
  13630. #endif /* WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY */
  13631. #if defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)
/* The bottom e bits: r = a & ((1 << e) - 1)
 *
 * @param  [in]   a  SP integer to reduce.
 * @param  [in]   e  Modulus bits (modulus equals 2^e).
 * @param  [out]  r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a or r is NULL, e is negative or e is too large for
 *          result.
 */
int sp_mod_2d(const sp_int* a, int e, sp_int* r)
{
    int err = MP_OKAY;
    /* Number of digits needed to hold e bits. */
    unsigned int digits = ((unsigned int)e + SP_WORD_SIZE - 1) >> SP_WORD_SHIFT;

    if ((a == NULL) || (r == NULL) || (e < 0)) {
        err = MP_VAL;
    }
    /* Result must be able to hold all digits of the masked value. */
    if ((err == MP_OKAY) && (digits > r->size)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Copy a into r if not same pointer. */
        if (a != r) {
            XMEMCPY(r->dp, a->dp, digits * SP_WORD_SIZEOF);
            r->used = a->used;
        #ifdef WOLFSSL_SP_INT_NEGATIVE
            r->sign = a->sign;
        #endif
        }

        /* Modify result if a is bigger or same digit size. */
    #ifndef WOLFSSL_SP_INT_NEGATIVE
        if (digits <= a->used)
    #else
        /* Need to make negative positive and mask. */
        if ((a->sign == MP_NEG) || (digits <= a->used))
    #endif
        {
        #ifdef WOLFSSL_SP_INT_NEGATIVE
            if (a->sign == MP_NEG) {
                unsigned int i;
                sp_int_digit carry = 0;

                /* Negate value: two's complement over the digit array so the
                 * subsequent mask yields the positive residue mod 2^e. */
                for (i = 0; i < r->used; i++) {
                    /* Borrow propagates past this digit only if it was 0. */
                    sp_int_digit next = r->dp[i] > 0;
                    r->dp[i] = (sp_int_digit)0 - r->dp[i] - carry;
                    carry |= next;
                }
                /* Fill remaining digits with all-ones (minus borrow). */
                for (; i < digits; i++) {
                    r->dp[i] = (sp_int_digit)0 - carry;
                }
                r->sign = MP_ZPOS;
            }
        #endif
            /* Set used and mask off top digit of result. */
            r->used = digits;
            e &= SP_WORD_MASK;
            if (e > 0) {
                r->dp[r->used - 1] &= ((sp_int_digit)1 << e) - 1;
            }
            /* Remove leading zeros from result. */
            sp_clamp(r);
        }
    }

    return err;
}
  13696. #endif /* WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY */
  13697. #if (defined(WOLFSSL_SP_MATH_ALL) && (!defined(WOLFSSL_RSA_VERIFY_ONLY) || \
  13698. !defined(NO_DH))) || defined(OPENSSL_ALL)
  13699. /* Multiply by 2^e: r = a << e
  13700. *
  13701. * @param [in] a SP integer to multiply.
  13702. * @param [in] e Multiplier bits (multiplier equals 2^e).
  13703. * @param [out] r SP integer to hold result.
  13704. *
  13705. * @return MP_OKAY on success.
  13706. * @return MP_VAL when a or r is NULL, e is negative, or result is too big for
  13707. * result size.
  13708. */
  13709. int sp_mul_2d(const sp_int* a, int e, sp_int* r)
  13710. {
  13711. int err = MP_OKAY;
  13712. /* Validate parameters. */
  13713. if ((a == NULL) || (r == NULL) || (e < 0)) {
  13714. err = MP_VAL;
  13715. }
  13716. /* Ensure result has enough allocated digits for result. */
  13717. if ((err == MP_OKAY) &&
  13718. ((unsigned int)(sp_count_bits(a) + e) > r->size * SP_WORD_SIZE)) {
  13719. err = MP_VAL;
  13720. }
  13721. if (err == MP_OKAY) {
  13722. /* Copy a into r as left shift function works on the number. */
  13723. if (a != r) {
  13724. err = sp_copy(a, r);
  13725. }
  13726. }
  13727. if (err == MP_OKAY) {
  13728. #if 0
  13729. sp_print(a, "a");
  13730. sp_print_int(e, "n");
  13731. #endif
  13732. err = sp_lshb(r, e);
  13733. #if 0
  13734. sp_print(r, "rsl");
  13735. #endif
  13736. }
  13737. return err;
  13738. }
  13739. #endif /* WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY */
  13740. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
  13741. defined(HAVE_ECC) || (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY))
  13742. /* START SP_SQR implementations */
  13743. /* This code is generated.
  13744. * To generate:
  13745. * cd scripts/sp/sp_int
  13746. * ./gen.sh
  13747. * File sp_sqr.c contains code.
  13748. */
  13749. #if !defined(WOLFSSL_SP_MATH) || !defined(WOLFSSL_SP_SMALL)
  13750. #ifdef SQR_MUL_ASM
/* Square a and store in r. r = a * a
 *
 * Comba (column-wise) implementation using the SP_ASM_* accumulator macros.
 * l/h/o form a three-digit accumulator; each column writes out l and shifts
 * the accumulator down one digit.
 *
 * @param  [in]   a  SP integer to square.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_sqr(const sp_int* a, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    int j;
    unsigned int k;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
    /* Temporary for the low half of the result (a may alias r). */
    sp_int_digit t[((a->used + 1) / 2) * 2 + 1];
#else
    sp_int_digit t[(SP_INT_DIGITS + 1) / 2];
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(
        sizeof(sp_int_digit) * (((a->used + 1) / 2) * 2 + 1), NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if ((err == MP_OKAY) && (a->used <= 1)) {
        /* Single digit: one squaring gives the full two-digit result. */
        sp_int_digit l;
        sp_int_digit h;

        h = 0;
        l = 0;
        SP_ASM_SQR(h, l, a->dp[0]);
        r->dp[0] = h;
        r->dp[1] = l;
    }
    else if (err == MP_OKAY) {
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o;
        /* Low-half columns go to t; high-half columns go straight to r. */
        sp_int_digit* p = t;

        h = 0;
        l = 0;
        SP_ASM_SQR(h, l, a->dp[0]);
        t[0] = h;
        h = 0;
        o = 0;
        /* Compute the low half of the columns into t. */
        for (k = 1; k < (a->used + 1) / 2; k++) {
            /* Odd column 2k-1: products a[i]*a[j] with i+j == 2k-1 (doubled). */
            i = k;
            j = (int)(k - 1);
            for (; (j >= 0); i++, j--) {
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
            }
            t[k * 2 - 1] = l;
            l = h;
            h = o;
            o = 0;

            /* Even column 2k: square term plus doubled cross products. */
            SP_ASM_SQR_ADD(l, h, o, a->dp[k]);
            i = k + 1;
            j = (int)(k - 1);
            for (; (j >= 0); i++, j--) {
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
            }
            t[k * 2] = l;
            l = h;
            h = o;
            o = 0;
        }
        /* Compute the high half of the columns; i is bounded by a->used. */
        for (; k < a->used; k++) {
            i = k;
            j = (int)(k - 1);
            for (; (i < a->used); i++, j--) {
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
            }
            p[k * 2 - 1] = l;
            l = h;
            h = o;
            o = 0;

            SP_ASM_SQR_ADD(l, h, o, a->dp[k]);
            i = k + 1;
            j = (int)(k - 1);
            for (; (i < a->used); i++, j--) {
                SP_ASM_MUL_ADD2(l, h, o, a->dp[i], a->dp[j]);
            }
            p[k * 2] = l;
            l = h;
            h = o;
            o = 0;
            /* After first high-half iteration, write directly into r. */
            p = r->dp;
        }
        r->dp[k * 2 - 1] = l;
        /* Copy the buffered low half into the result. */
        XMEMCPY(r->dp, t, (((a->used + 1) / 2) * 2 + 1) * sizeof(sp_int_digit));
    }
    if (err == MP_OKAY) {
        r->used = a->used * 2;
        sp_clamp(r);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  13858. #else /* !SQR_MUL_ASM */
/* Square a and store in r. r = a * a
 *
 * Column-wise implementation using double-width words (sp_int_word) for the
 * accumulator. Each column k sums a[i]*a[j] for i+j == k, with cross products
 * added twice and the diagonal square added once.
 *
 * @param  [in]   a  SP integer to square.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_sqr(const sp_int* a, sp_int* r)
{
    int err = MP_OKAY;
    unsigned int i;
    int j;
    unsigned int k;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
    !defined(WOLFSSL_SP_NO_DYN_STACK)
    /* Temporary result buffer (a may alias r). */
    sp_int_digit t[a->used * 2];
#else
    sp_int_digit t[SP_INT_DIGITS];
#endif

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * (a->used * 2), NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if (err == MP_OKAY) {
        sp_int_word w;  /* Current double-width product. */
        sp_int_word l;  /* Low word of the column accumulator. */
        sp_int_word h;  /* High word of the column accumulator. */
#ifdef SP_WORD_OVERFLOW
        sp_int_word o;  /* Overflow word when columns can exceed two words. */
#endif

        /* Column 0 is just the square of the lowest digit. */
        w = (sp_int_word)a->dp[0] * a->dp[0];
        t[0] = (sp_int_digit)w;
        l = (sp_int_digit)(w >> SP_WORD_SIZE);
        h = 0;
#ifdef SP_WORD_OVERFLOW
        o = 0;
#endif
        for (k = 1; k <= (a->used - 1) * 2; k++) {
            /* Start at the middle pair of the column. */
            i = k / 2;
            j = (int)(k - i);
            if (i == (unsigned int)j) {
                /* Diagonal: square term added once. */
                w = (sp_int_word)a->dp[i] * a->dp[j];
                l += (sp_int_digit)w;
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
#ifdef SP_WORD_OVERFLOW
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
                l &= SP_MASK;
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
                h &= SP_MASK;
#endif
            }
            /* Cross products a[i]*a[j], i > j, each added twice. */
            for (++i, --j; (i < a->used) && (j >= 0); i++, j--) {
                w = (sp_int_word)a->dp[i] * a->dp[j];
                l += (sp_int_digit)w;
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
#ifdef SP_WORD_OVERFLOW
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
                l &= SP_MASK;
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
                h &= SP_MASK;
#endif
                /* Second addition doubles the cross product. */
                l += (sp_int_digit)w;
                h += (sp_int_digit)(w >> SP_WORD_SIZE);
#ifdef SP_WORD_OVERFLOW
                h += (sp_int_digit)(l >> SP_WORD_SIZE);
                l &= SP_MASK;
                o += (sp_int_digit)(h >> SP_WORD_SIZE);
                h &= SP_MASK;
#endif
            }
            /* Emit this column's digit and shift accumulator down. */
            t[k] = (sp_int_digit)l;
            l >>= SP_WORD_SIZE;
            l += (sp_int_digit)h;
            h >>= SP_WORD_SIZE;
#ifdef SP_WORD_OVERFLOW
            h += o & SP_MASK;
            o >>= SP_WORD_SIZE;
#endif
        }
        /* Final carry digit. */
        t[k] = (sp_int_digit)l;
        r->used = k + 1;
        XMEMCPY(r->dp, t, r->used * sizeof(sp_int_digit));
        sp_clamp(r);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  13956. #endif /* SQR_MUL_ASM */
  13957. #endif /* !WOLFSSL_SP_MATH || !WOLFSSL_SP_SMALL */
  13958. #ifndef WOLFSSL_SP_SMALL
  13959. #if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
  13960. #if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
  13961. #ifndef SQR_MUL_ASM
/* Square a and store in r. r = a * a
 *
 * Long-hand implementation for 4-digit numbers: all 10 unique digit products
 * are computed up front, then combined column by column with cross products
 * added twice. w[0] is reused as the running column accumulator.
 *
 * @param  [in]   a  SP integer to square.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_sqr_4(const sp_int* a, sp_int* r)
{
    int err = MP_OKAY;
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_word* w = NULL;
#else
    sp_int_word w[10];
#endif
    const sp_int_digit* da = a->dp;

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    w = (sp_int_word*)XMALLOC(sizeof(sp_int_word) * 10, NULL,
        DYNAMIC_TYPE_BIGINT);
    if (w == NULL) {
        err = MP_MEM;
    }
#endif

    if (err == MP_OKAY) {
        /* All unique products da[i] * da[j], i <= j. */
        w[0] = (sp_int_word)da[0] * da[0];
        w[1] = (sp_int_word)da[0] * da[1];
        w[2] = (sp_int_word)da[0] * da[2];
        w[3] = (sp_int_word)da[1] * da[1];
        w[4] = (sp_int_word)da[0] * da[3];
        w[5] = (sp_int_word)da[1] * da[2];
        w[6] = (sp_int_word)da[1] * da[3];
        w[7] = (sp_int_word)da[2] * da[2];
        w[8] = (sp_int_word)da[2] * da[3];
        w[9] = (sp_int_word)da[3] * da[3];

        /* Column 0: da[0]^2. */
        r->dp[0] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 1: 2 * da[0]*da[1]. */
        w[0] += (sp_int_digit)w[1];
        w[0] += (sp_int_digit)w[1];
        r->dp[1] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 2: high of w[1] (x2), 2 * da[0]*da[2], da[1]^2. */
        w[1] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[1];
        w[0] += (sp_int_digit)w[1];
        w[0] += (sp_int_digit)w[2];
        w[0] += (sp_int_digit)w[2];
        w[0] += (sp_int_digit)w[3];
        r->dp[2] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 3: carries plus 2 * da[0]*da[3] and 2 * da[1]*da[2]. */
        w[2] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[2];
        w[0] += (sp_int_digit)w[2];
        w[3] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[3];
        w[0] += (sp_int_digit)w[4];
        w[0] += (sp_int_digit)w[4];
        w[0] += (sp_int_digit)w[5];
        w[0] += (sp_int_digit)w[5];
        r->dp[3] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 4: carries plus 2 * da[1]*da[3] and da[2]^2. */
        w[4] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[4];
        w[0] += (sp_int_digit)w[4];
        w[5] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[5];
        w[0] += (sp_int_digit)w[5];
        w[0] += (sp_int_digit)w[6];
        w[0] += (sp_int_digit)w[6];
        w[0] += (sp_int_digit)w[7];
        r->dp[4] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 5: carries plus 2 * da[2]*da[3]. */
        w[6] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[6];
        w[0] += (sp_int_digit)w[6];
        w[7] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[7];
        w[0] += (sp_int_digit)w[8];
        w[0] += (sp_int_digit)w[8];
        r->dp[5] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 6: carries plus da[3]^2. */
        w[8] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[8];
        w[0] += (sp_int_digit)w[8];
        w[0] += (sp_int_digit)w[9];
        r->dp[6] = (sp_int_digit)w[0];
        w[0] >>= SP_WORD_SIZE;
        /* Column 7: final carry plus high of da[3]^2. */
        w[9] >>= SP_WORD_SIZE;
        w[0] += (sp_int_digit)w[9];
        r->dp[7] = (sp_int_digit)w[0];

        r->used = 8;
        sp_clamp(r);
    }

#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (w != NULL) {
        XFREE(w, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  14063. #else /* SQR_MUL_ASM */
/* Square a and store in r. r = a * a
 *
 * Comba implementation for 4-digit numbers using the SP_ASM_* accumulator
 * macros. l/h/o form a three-digit accumulator; each column stores l then
 * shifts the accumulator down one digit. The low four columns are buffered
 * in t because a may alias r.
 *
 * @param  [in]   a  SP integer to square.
 * @param  [out]  r  SP integer result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_sqr_4(const sp_int* a, sp_int* r)
{
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    /* Buffer for the low half of the result (a may alias r). */
    sp_int_digit t[4];

    /* Column 0: a[0]^2. */
    SP_ASM_SQR(h, l, a->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1: 2 * a[0]*a[1]. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
    t[1] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 2: 2 * a[0]*a[2] + a[1]^2. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3: 2 * a[0]*a[3] + 2 * a[1]*a[2]. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4: 2 * a[1]*a[3] + a[2]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
    r->dp[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5: 2 * a[2]*a[3]. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[3]);
    r->dp[5] = l;
    l = h;
    h = o;
    /* Columns 6 and 7: a[3]^2 plus remaining carry. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[3]);
    r->dp[6] = l;
    r->dp[7] = h;
    /* Copy the buffered low half into the result. */
    XMEMCPY(r->dp, t, 4 * sizeof(sp_int_digit));
    r->used = 8;
    sp_clamp(r);
    return MP_OKAY;
}
  14118. #endif /* SQR_MUL_ASM */
  14119. #endif /* SP_WORD_SIZE == 64 */
  14120. #if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
  14121. #ifdef SQR_MUL_ASM
  14122. /* Square a and store in r. r = a * a
  14123. *
  14124. * Comba implementation.
  14125. *
  14126. * @param [in] a SP integer to square.
  14127. * @param [out] r SP integer result.
  14128. *
  14129. * @return MP_OKAY on success.
  14130. * @return MP_MEM when dynamic memory allocation fails.
  14131. */
static int _sp_sqr_6(const sp_int* a, sp_int* r)
{
    /* Three-digit column accumulator: l = low, h = middle, o = overflow. */
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    /* Secondary accumulator for columns with 3+ cross-products: the
     * undoubled sum of a[i]*a[j] is collected in (tl, th, to) and then added
     * in twice via SP_ASM_ADD_DBL_3, so doubling happens once per column. */
    sp_int_digit tl = 0;
    sp_int_digit th = 0;
    sp_int_digit to;
    /* Buffer for the low 6 result digits; written back last so r may
     * alias a. */
    sp_int_digit t[6];
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
    /* NOTE(review): the Thumb variants of the macros appear to consume 'to'
     * before first writing it, hence the explicit clear here. */
    to = 0;
#endif
    /* Column 0: a[0]^2 (h = low word, l = carry into column 1). */
    SP_ASM_SQR(h, l, a->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1: 2*a[0]*a[1]. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
    t[1] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 2: 2*a[0]*a[2] + a[1]^2. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]). */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]) via the doubled
     * pre-sum. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 6: 2*(a[1]*a[5] + a[2]*a[4]) + a[3]^2. High half of the result
     * is written straight into r from here on. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[5]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[4]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
    r->dp[6] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 7: 2*(a[2]*a[5] + a[3]*a[4]). */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[2], a->dp[5]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[4]);
    r->dp[7] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 8: 2*a[3]*a[5] + a[4]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[5]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
    r->dp[8] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 9: 2*a[4]*a[5]. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[5]);
    r->dp[9] = l;
    l = h;
    h = o;
    /* Column 10: a[5]^2 - top column, no overflow digit needed. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[5]);
    r->dp[10] = l;
    r->dp[11] = h;
    /* Commit the buffered low half of the result. */
    XMEMCPY(r->dp, t, 6 * sizeof(sp_int_digit));
    r->used = 12;
    sp_clamp(r);
    return MP_OKAY;
}
  14210. #endif /* SQR_MUL_ASM */
  14211. #endif /* SP_WORD_SIZE == 64 */
  14212. #if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
  14213. #ifdef SQR_MUL_ASM
  14214. /* Square a and store in r. r = a * a
  14215. *
  14216. * Comba implementation.
  14217. *
  14218. * @param [in] a SP integer to square.
  14219. * @param [out] r SP integer result.
  14220. *
  14221. * @return MP_OKAY on success.
  14222. * @return MP_MEM when dynamic memory allocation fails.
  14223. */
static int _sp_sqr_8(const sp_int* a, sp_int* r)
{
    /* Three-digit column accumulator: l = low, h = middle, o = overflow. */
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    /* Secondary accumulator: undoubled cross-product sum for wide columns,
     * added in twice at once with SP_ASM_ADD_DBL_3. */
    sp_int_digit tl = 0;
    sp_int_digit th = 0;
    sp_int_digit to;
    /* Buffer for the low 8 result digits; committed last so r may alias a. */
    sp_int_digit t[8];
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
    /* NOTE(review): Thumb macro variants appear to consume 'to' before
     * first writing it, hence the explicit clear. */
    to = 0;
#endif
    /* Column 0: a[0]^2 (h = low word, l = carry into column 1). */
    SP_ASM_SQR(h, l, a->dp[0]);
    t[0] = h;
    h = 0;
    /* Column 1: 2*a[0]*a[1]. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
    t[1] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 2: 2*a[0]*a[2] + a[1]^2. */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 3: 2*(a[0]*a[3] + a[1]*a[2]). */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 4: 2*(a[0]*a[4] + a[1]*a[3]) + a[2]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Columns 5..9 have 3+ cross-products: pre-sum once, double once. */
    /* Column 5: 2*(a[0]*a[5] + a[1]*a[4] + a[2]*a[3]). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 6: 2*(a[0]*a[6] + a[1]*a[5] + a[2]*a[4]) + a[3]^2. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[6] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 7: 2*(a[0]*a[7] + a[1]*a[6] + a[2]*a[5] + a[3]*a[4]). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[7] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 8: 2*(a[1]*a[7] + a[2]*a[6] + a[3]*a[5]) + a[4]^2. High half of
     * the result is written straight into r from here on. */
    SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[8] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 9: 2*(a[2]*a[7] + a[3]*a[6] + a[4]*a[5]). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[9] = l;
    l = h;
    h = o;
    o = 0;
    /* Narrow tail columns go back to direct doubled accumulation. */
    /* Column 10: 2*(a[3]*a[7] + a[4]*a[6]) + a[5]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[3], a->dp[7]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[6]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
    r->dp[10] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 11: 2*(a[4]*a[7] + a[5]*a[6]). */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[4], a->dp[7]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[5], a->dp[6]);
    r->dp[11] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 12: 2*a[5]*a[7] + a[6]^2. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[5], a->dp[7]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
    r->dp[12] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 13: 2*a[6]*a[7]. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[6], a->dp[7]);
    r->dp[13] = l;
    l = h;
    h = o;
    /* Column 14: a[7]^2 - top column, no overflow digit needed. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[7]);
    r->dp[14] = l;
    r->dp[15] = h;
    /* Commit the buffered low half of the result. */
    XMEMCPY(r->dp, t, 8 * sizeof(sp_int_digit));
    r->used = 16;
    sp_clamp(r);
    return MP_OKAY;
}
  14337. #endif /* SQR_MUL_ASM */
  14338. #endif /* SP_WORD_SIZE == 32 */
  14339. #if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
  14340. #ifdef SQR_MUL_ASM
  14341. /* Square a and store in r. r = a * a
  14342. *
  14343. * Comba implementation.
  14344. *
  14345. * @param [in] a SP integer to square.
  14346. * @param [out] r SP integer result.
  14347. *
  14348. * @return MP_OKAY on success.
  14349. * @return MP_MEM when dynamic memory allocation fails.
  14350. */
static int _sp_sqr_12(const sp_int* a, sp_int* r)
{
    /* Three-digit column accumulator: l = low, h = middle, o = overflow. */
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    /* Secondary accumulator: undoubled cross-product sum for wide columns,
     * added in twice at once with SP_ASM_ADD_DBL_3. */
    sp_int_digit tl = 0;
    sp_int_digit th = 0;
    sp_int_digit to;
    /* Buffer for the low 12 result digits; committed last so r may alias a. */
    sp_int_digit t[12];
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
    /* NOTE(review): Thumb macro variants appear to consume 'to' before
     * first writing it, hence the explicit clear. */
    to = 0;
#endif
    /* Column 0: a[0]^2 (h = low word, l = carry into column 1). */
    SP_ASM_SQR(h, l, a->dp[0]);
    t[0] = h;
    h = 0;
    /* Columns 1..4: few products - direct doubled accumulation.
     * Column k sums 2*a[i]*a[j] for all i < j, i + j == k, plus a[k/2]^2
     * when k is even; after each column the accumulator shifts down one
     * digit (l = h; h = o; o = 0). */
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
    t[1] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
    t[2] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
    t[3] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
    t[4] = l;
    l = h;
    h = o;
    o = 0;
    /* Columns 5..17: 3+ cross-products - pre-sum undoubled in (tl, th, to)
     * then add twice with SP_ASM_ADD_DBL_3 (square term added separately). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[5] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[6] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[7] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[8] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[9] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[10] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    t[11] = l;
    l = h;
    h = o;
    o = 0;
    /* Column 12 onwards: high half of the result is written straight into
     * r (no aliasing hazard with a->dp[0..11]). */
    SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[12] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[13] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[14] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[15] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[16] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[11]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
    SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
    SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
    r->dp[17] = l;
    l = h;
    h = o;
    o = 0;
    /* Columns 18..21: narrow again - direct doubled accumulation. */
    SP_ASM_MUL_ADD2(l, h, o, a->dp[7], a->dp[11]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[8], a->dp[10]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
    r->dp[18] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2(l, h, o, a->dp[8], a->dp[11]);
    SP_ASM_MUL_ADD2(l, h, o, a->dp[9], a->dp[10]);
    r->dp[19] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2(l, h, o, a->dp[9], a->dp[11]);
    SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
    r->dp[20] = l;
    l = h;
    h = o;
    o = 0;
    SP_ASM_MUL_ADD2(l, h, o, a->dp[10], a->dp[11]);
    r->dp[21] = l;
    l = h;
    h = o;
    /* Column 22: a[11]^2 - top column, no overflow digit needed. */
    SP_ASM_SQR_ADD_NO(l, h, a->dp[11]);
    r->dp[22] = l;
    r->dp[23] = h;
    /* Commit the buffered low half of the result. */
    XMEMCPY(r->dp, t, 12 * sizeof(sp_int_digit));
    r->used = 24;
    sp_clamp(r);
    return MP_OKAY;
}
  14546. #endif /* SQR_MUL_ASM */
  14547. #endif /* SP_WORD_SIZE == 32 */
  14548. #endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
  14549. #if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
  14550. (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
  14551. (SP_WORD_SIZE == 64)))
  14552. #if SP_INT_DIGITS >= 32
  14553. /* Square a and store in r. r = a * a
  14554. *
  14555. * Comba implementation.
  14556. *
  14557. * @param [in] a SP integer to square.
  14558. * @param [out] r SP integer result.
  14559. *
  14560. * @return MP_OKAY on success.
  14561. * @return MP_MEM when dynamic memory allocation fails.
  14562. */
static int _sp_sqr_16(const sp_int* a, sp_int* r)
{
    int err = MP_OKAY;
    /* Three-digit column accumulator: l = low, h = middle, o = overflow. */
    sp_int_digit l = 0;
    sp_int_digit h = 0;
    sp_int_digit o = 0;
    /* Secondary accumulator: undoubled cross-product sum for wide columns,
     * added in twice at once with SP_ASM_ADD_DBL_3. */
    sp_int_digit tl = 0;
    sp_int_digit th = 0;
    sp_int_digit to;
    /* Buffer for the low 16 result digits (heap-allocated on small-stack
     * builds); committed last so r may alias a. */
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    sp_int_digit* t = NULL;
#else
    sp_int_digit t[16];
#endif
#if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
    /* NOTE(review): Thumb macro variants appear to consume 'to' before
     * first writing it, hence the explicit clear. */
    to = 0;
#endif
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 16, NULL,
        DYNAMIC_TYPE_BIGINT);
    if (t == NULL) {
        err = MP_MEM;
    }
#endif
    if (err == MP_OKAY) {
        /* Column 0: a[0]^2 (h = low word, l = carry into column 1). */
        SP_ASM_SQR(h, l, a->dp[0]);
        t[0] = h;
        h = 0;
        /* Columns 1..4: few products - direct doubled accumulation.
         * Column k sums 2*a[i]*a[j] for all i < j, i + j == k, plus
         * a[k/2]^2 when k is even; after each column the accumulator shifts
         * down one digit (l = h; h = o; o = 0). */
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
        t[1] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
        t[2] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
        t[3] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
        SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
        t[4] = l;
        l = h;
        h = o;
        o = 0;
        /* Columns 5..25: 3+ cross-products - pre-sum undoubled in
         * (tl, th, to) then add twice with SP_ASM_ADD_DBL_3 (square term
         * added separately on even columns). */
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[5] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[6] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[7] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[8] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[9] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[10] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[11] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[12] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[13] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[14] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        t[15] = l;
        l = h;
        h = o;
        o = 0;
        /* Column 16 onwards: high half of the result is written straight
         * into r (no aliasing hazard with a->dp[0..15]). */
        SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[16] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[17] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[10]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[18] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[11]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[10]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[19] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[11]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[20] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[12]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[11]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[21] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[7], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[12]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[11]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[22] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[8], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[13]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[12]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[23] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[9], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[13]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[12]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[24] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_SET(tl, th, to, a->dp[10], a->dp[15]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[14]);
        SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[13]);
        SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
        r->dp[25] = l;
        l = h;
        h = o;
        o = 0;
        /* Columns 26..29: narrow again - direct doubled accumulation. */
        SP_ASM_MUL_ADD2(l, h, o, a->dp[11], a->dp[15]);
        SP_ASM_MUL_ADD2(l, h, o, a->dp[12], a->dp[14]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[13]);
        r->dp[26] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2(l, h, o, a->dp[12], a->dp[15]);
        SP_ASM_MUL_ADD2(l, h, o, a->dp[13], a->dp[14]);
        r->dp[27] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2(l, h, o, a->dp[13], a->dp[15]);
        SP_ASM_SQR_ADD(l, h, o, a->dp[14]);
        r->dp[28] = l;
        l = h;
        h = o;
        o = 0;
        SP_ASM_MUL_ADD2(l, h, o, a->dp[14], a->dp[15]);
        r->dp[29] = l;
        l = h;
        h = o;
        /* Column 30: a[15]^2 - top column, no overflow digit needed. */
        SP_ASM_SQR_ADD_NO(l, h, a->dp[15]);
        r->dp[30] = l;
        r->dp[31] = h;
        /* Commit the buffered low half of the result. */
        XMEMCPY(r->dp, t, 16 * sizeof(sp_int_digit));
        r->used = 32;
        sp_clamp(r);
    }
#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
    if (t != NULL) {
        XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
    }
#endif
    return err;
}
  14875. #endif /* SP_INT_DIGITS >= 32 */
  14876. #endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
  14877. * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
  14878. #if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
  14879. #if SP_INT_DIGITS >= 48
  14880. /* Square a and store in r. r = a * a
  14881. *
  14882. * Comba implementation.
  14883. *
  14884. * @param [in] a SP integer to square.
  14885. * @param [out] r SP integer result.
  14886. *
  14887. * @return MP_OKAY on success.
  14888. * @return MP_MEM when dynamic memory allocation fails.
  14889. */
  14890. static int _sp_sqr_24(const sp_int* a, sp_int* r)
  14891. {
  14892. int err = MP_OKAY;
  14893. sp_int_digit l = 0;
  14894. sp_int_digit h = 0;
  14895. sp_int_digit o = 0;
  14896. sp_int_digit tl = 0;
  14897. sp_int_digit th = 0;
  14898. sp_int_digit to;
  14899. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  14900. sp_int_digit* t = NULL;
  14901. #else
  14902. sp_int_digit t[24];
  14903. #endif
  14904. #if defined(WOLFSSL_SP_ARM_THUMB) && SP_WORD_SIZE == 32
  14905. to = 0;
  14906. #endif
  14907. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  14908. t = (sp_int_digit*)XMALLOC(sizeof(sp_int_digit) * 24, NULL,
  14909. DYNAMIC_TYPE_BIGINT);
  14910. if (t == NULL) {
  14911. err = MP_MEM;
  14912. }
  14913. #endif
  14914. if (err == MP_OKAY) {
  14915. SP_ASM_SQR(h, l, a->dp[0]);
  14916. t[0] = h;
  14917. h = 0;
  14918. SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[1]);
  14919. t[1] = l;
  14920. l = h;
  14921. h = o;
  14922. o = 0;
  14923. SP_ASM_MUL_ADD2_NO(l, h, o, a->dp[0], a->dp[2]);
  14924. SP_ASM_SQR_ADD(l, h, o, a->dp[1]);
  14925. t[2] = l;
  14926. l = h;
  14927. h = o;
  14928. o = 0;
  14929. SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[3]);
  14930. SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[2]);
  14931. t[3] = l;
  14932. l = h;
  14933. h = o;
  14934. o = 0;
  14935. SP_ASM_MUL_ADD2(l, h, o, a->dp[0], a->dp[4]);
  14936. SP_ASM_MUL_ADD2(l, h, o, a->dp[1], a->dp[3]);
  14937. SP_ASM_SQR_ADD(l, h, o, a->dp[2]);
  14938. t[4] = l;
  14939. l = h;
  14940. h = o;
  14941. o = 0;
  14942. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[5]);
  14943. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[4]);
  14944. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[3]);
  14945. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14946. t[5] = l;
  14947. l = h;
  14948. h = o;
  14949. o = 0;
  14950. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[6]);
  14951. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[5]);
  14952. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[4]);
  14953. SP_ASM_SQR_ADD(l, h, o, a->dp[3]);
  14954. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14955. t[6] = l;
  14956. l = h;
  14957. h = o;
  14958. o = 0;
  14959. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[7]);
  14960. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[6]);
  14961. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[5]);
  14962. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[4]);
  14963. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14964. t[7] = l;
  14965. l = h;
  14966. h = o;
  14967. o = 0;
  14968. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[8]);
  14969. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[7]);
  14970. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[6]);
  14971. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[5]);
  14972. SP_ASM_SQR_ADD(l, h, o, a->dp[4]);
  14973. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14974. t[8] = l;
  14975. l = h;
  14976. h = o;
  14977. o = 0;
  14978. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[9]);
  14979. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[8]);
  14980. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[7]);
  14981. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[6]);
  14982. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[5]);
  14983. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14984. t[9] = l;
  14985. l = h;
  14986. h = o;
  14987. o = 0;
  14988. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[10]);
  14989. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[9]);
  14990. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[8]);
  14991. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[7]);
  14992. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[6]);
  14993. SP_ASM_SQR_ADD(l, h, o, a->dp[5]);
  14994. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  14995. t[10] = l;
  14996. l = h;
  14997. h = o;
  14998. o = 0;
  14999. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[11]);
  15000. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[10]);
  15001. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[9]);
  15002. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[8]);
  15003. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[7]);
  15004. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[6]);
  15005. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15006. t[11] = l;
  15007. l = h;
  15008. h = o;
  15009. o = 0;
  15010. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[12]);
  15011. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[11]);
  15012. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[10]);
  15013. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[9]);
  15014. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[8]);
  15015. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[7]);
  15016. SP_ASM_SQR_ADD(l, h, o, a->dp[6]);
  15017. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15018. t[12] = l;
  15019. l = h;
  15020. h = o;
  15021. o = 0;
  15022. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[13]);
  15023. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[12]);
  15024. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[11]);
  15025. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[10]);
  15026. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[9]);
  15027. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[8]);
  15028. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[7]);
  15029. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15030. t[13] = l;
  15031. l = h;
  15032. h = o;
  15033. o = 0;
  15034. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[14]);
  15035. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[13]);
  15036. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[12]);
  15037. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[11]);
  15038. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[10]);
  15039. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[9]);
  15040. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[8]);
  15041. SP_ASM_SQR_ADD(l, h, o, a->dp[7]);
  15042. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15043. t[14] = l;
  15044. l = h;
  15045. h = o;
  15046. o = 0;
  15047. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[15]);
  15048. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[14]);
  15049. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[13]);
  15050. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[12]);
  15051. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[11]);
  15052. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[10]);
  15053. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[9]);
  15054. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[8]);
  15055. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15056. t[15] = l;
  15057. l = h;
  15058. h = o;
  15059. o = 0;
  15060. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[16]);
  15061. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[15]);
  15062. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[14]);
  15063. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[13]);
  15064. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[12]);
  15065. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[11]);
  15066. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[10]);
  15067. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[9]);
  15068. SP_ASM_SQR_ADD(l, h, o, a->dp[8]);
  15069. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15070. t[16] = l;
  15071. l = h;
  15072. h = o;
  15073. o = 0;
  15074. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[17]);
  15075. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[16]);
  15076. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[15]);
  15077. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[14]);
  15078. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[13]);
  15079. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[12]);
  15080. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[11]);
  15081. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[10]);
  15082. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[9]);
  15083. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15084. t[17] = l;
  15085. l = h;
  15086. h = o;
  15087. o = 0;
  15088. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[18]);
  15089. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[17]);
  15090. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[16]);
  15091. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[15]);
  15092. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[14]);
  15093. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[13]);
  15094. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[12]);
  15095. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[11]);
  15096. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[10]);
  15097. SP_ASM_SQR_ADD(l, h, o, a->dp[9]);
  15098. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15099. t[18] = l;
  15100. l = h;
  15101. h = o;
  15102. o = 0;
  15103. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[19]);
  15104. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[18]);
  15105. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[17]);
  15106. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[16]);
  15107. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[15]);
  15108. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[14]);
  15109. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[13]);
  15110. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[12]);
  15111. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[11]);
  15112. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[10]);
  15113. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15114. t[19] = l;
  15115. l = h;
  15116. h = o;
  15117. o = 0;
  15118. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[20]);
  15119. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[19]);
  15120. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[18]);
  15121. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[17]);
  15122. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[16]);
  15123. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[15]);
  15124. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[14]);
  15125. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[13]);
  15126. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[12]);
  15127. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[11]);
  15128. SP_ASM_SQR_ADD(l, h, o, a->dp[10]);
  15129. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15130. t[20] = l;
  15131. l = h;
  15132. h = o;
  15133. o = 0;
  15134. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[21]);
  15135. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[20]);
  15136. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[19]);
  15137. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[18]);
  15138. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[17]);
  15139. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[16]);
  15140. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[15]);
  15141. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[14]);
  15142. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[13]);
  15143. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[12]);
  15144. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[11]);
  15145. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15146. t[21] = l;
  15147. l = h;
  15148. h = o;
  15149. o = 0;
  15150. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[22]);
  15151. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[21]);
  15152. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[20]);
  15153. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[19]);
  15154. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[18]);
  15155. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[17]);
  15156. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[16]);
  15157. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[15]);
  15158. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[14]);
  15159. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[13]);
  15160. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[12]);
  15161. SP_ASM_SQR_ADD(l, h, o, a->dp[11]);
  15162. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15163. t[22] = l;
  15164. l = h;
  15165. h = o;
  15166. o = 0;
  15167. SP_ASM_MUL_SET(tl, th, to, a->dp[0], a->dp[23]);
  15168. SP_ASM_MUL_ADD(tl, th, to, a->dp[1], a->dp[22]);
  15169. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[21]);
  15170. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[20]);
  15171. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[19]);
  15172. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[18]);
  15173. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[17]);
  15174. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[16]);
  15175. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[15]);
  15176. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[14]);
  15177. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[13]);
  15178. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[12]);
  15179. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15180. t[23] = l;
  15181. l = h;
  15182. h = o;
  15183. o = 0;
  15184. SP_ASM_MUL_SET(tl, th, to, a->dp[1], a->dp[23]);
  15185. SP_ASM_MUL_ADD(tl, th, to, a->dp[2], a->dp[22]);
  15186. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[21]);
  15187. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[20]);
  15188. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[19]);
  15189. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[18]);
  15190. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[17]);
  15191. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[16]);
  15192. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[15]);
  15193. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[14]);
  15194. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[13]);
  15195. SP_ASM_SQR_ADD(l, h, o, a->dp[12]);
  15196. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15197. r->dp[24] = l;
  15198. l = h;
  15199. h = o;
  15200. o = 0;
  15201. SP_ASM_MUL_SET(tl, th, to, a->dp[2], a->dp[23]);
  15202. SP_ASM_MUL_ADD(tl, th, to, a->dp[3], a->dp[22]);
  15203. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[21]);
  15204. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[20]);
  15205. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[19]);
  15206. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[18]);
  15207. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[17]);
  15208. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[16]);
  15209. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[15]);
  15210. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[14]);
  15211. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[13]);
  15212. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15213. r->dp[25] = l;
  15214. l = h;
  15215. h = o;
  15216. o = 0;
  15217. SP_ASM_MUL_SET(tl, th, to, a->dp[3], a->dp[23]);
  15218. SP_ASM_MUL_ADD(tl, th, to, a->dp[4], a->dp[22]);
  15219. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[21]);
  15220. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[20]);
  15221. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[19]);
  15222. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[18]);
  15223. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[17]);
  15224. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[16]);
  15225. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[15]);
  15226. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[14]);
  15227. SP_ASM_SQR_ADD(l, h, o, a->dp[13]);
  15228. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15229. r->dp[26] = l;
  15230. l = h;
  15231. h = o;
  15232. o = 0;
  15233. SP_ASM_MUL_SET(tl, th, to, a->dp[4], a->dp[23]);
  15234. SP_ASM_MUL_ADD(tl, th, to, a->dp[5], a->dp[22]);
  15235. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[21]);
  15236. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[20]);
  15237. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[19]);
  15238. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[18]);
  15239. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[17]);
  15240. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[16]);
  15241. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[15]);
  15242. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[14]);
  15243. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15244. r->dp[27] = l;
  15245. l = h;
  15246. h = o;
  15247. o = 0;
  15248. SP_ASM_MUL_SET(tl, th, to, a->dp[5], a->dp[23]);
  15249. SP_ASM_MUL_ADD(tl, th, to, a->dp[6], a->dp[22]);
  15250. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[21]);
  15251. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[20]);
  15252. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[19]);
  15253. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[18]);
  15254. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[17]);
  15255. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[16]);
  15256. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[15]);
  15257. SP_ASM_SQR_ADD(l, h, o, a->dp[14]);
  15258. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15259. r->dp[28] = l;
  15260. l = h;
  15261. h = o;
  15262. o = 0;
  15263. SP_ASM_MUL_SET(tl, th, to, a->dp[6], a->dp[23]);
  15264. SP_ASM_MUL_ADD(tl, th, to, a->dp[7], a->dp[22]);
  15265. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[21]);
  15266. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[20]);
  15267. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[19]);
  15268. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[18]);
  15269. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[17]);
  15270. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[16]);
  15271. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[15]);
  15272. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15273. r->dp[29] = l;
  15274. l = h;
  15275. h = o;
  15276. o = 0;
  15277. SP_ASM_MUL_SET(tl, th, to, a->dp[7], a->dp[23]);
  15278. SP_ASM_MUL_ADD(tl, th, to, a->dp[8], a->dp[22]);
  15279. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[21]);
  15280. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[20]);
  15281. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[19]);
  15282. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[18]);
  15283. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[17]);
  15284. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[16]);
  15285. SP_ASM_SQR_ADD(l, h, o, a->dp[15]);
  15286. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15287. r->dp[30] = l;
  15288. l = h;
  15289. h = o;
  15290. o = 0;
  15291. SP_ASM_MUL_SET(tl, th, to, a->dp[8], a->dp[23]);
  15292. SP_ASM_MUL_ADD(tl, th, to, a->dp[9], a->dp[22]);
  15293. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[21]);
  15294. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[20]);
  15295. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[19]);
  15296. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[18]);
  15297. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[17]);
  15298. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[16]);
  15299. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15300. r->dp[31] = l;
  15301. l = h;
  15302. h = o;
  15303. o = 0;
  15304. SP_ASM_MUL_SET(tl, th, to, a->dp[9], a->dp[23]);
  15305. SP_ASM_MUL_ADD(tl, th, to, a->dp[10], a->dp[22]);
  15306. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[21]);
  15307. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[20]);
  15308. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[19]);
  15309. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[18]);
  15310. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[17]);
  15311. SP_ASM_SQR_ADD(l, h, o, a->dp[16]);
  15312. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15313. r->dp[32] = l;
  15314. l = h;
  15315. h = o;
  15316. o = 0;
  15317. SP_ASM_MUL_SET(tl, th, to, a->dp[10], a->dp[23]);
  15318. SP_ASM_MUL_ADD(tl, th, to, a->dp[11], a->dp[22]);
  15319. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[21]);
  15320. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[20]);
  15321. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[19]);
  15322. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[18]);
  15323. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[17]);
  15324. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15325. r->dp[33] = l;
  15326. l = h;
  15327. h = o;
  15328. o = 0;
  15329. SP_ASM_MUL_SET(tl, th, to, a->dp[11], a->dp[23]);
  15330. SP_ASM_MUL_ADD(tl, th, to, a->dp[12], a->dp[22]);
  15331. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[21]);
  15332. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[20]);
  15333. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[19]);
  15334. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[18]);
  15335. SP_ASM_SQR_ADD(l, h, o, a->dp[17]);
  15336. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15337. r->dp[34] = l;
  15338. l = h;
  15339. h = o;
  15340. o = 0;
  15341. SP_ASM_MUL_SET(tl, th, to, a->dp[12], a->dp[23]);
  15342. SP_ASM_MUL_ADD(tl, th, to, a->dp[13], a->dp[22]);
  15343. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[21]);
  15344. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[20]);
  15345. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[19]);
  15346. SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[18]);
  15347. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15348. r->dp[35] = l;
  15349. l = h;
  15350. h = o;
  15351. o = 0;
  15352. SP_ASM_MUL_SET(tl, th, to, a->dp[13], a->dp[23]);
  15353. SP_ASM_MUL_ADD(tl, th, to, a->dp[14], a->dp[22]);
  15354. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[21]);
  15355. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[20]);
  15356. SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[19]);
  15357. SP_ASM_SQR_ADD(l, h, o, a->dp[18]);
  15358. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15359. r->dp[36] = l;
  15360. l = h;
  15361. h = o;
  15362. o = 0;
  15363. SP_ASM_MUL_SET(tl, th, to, a->dp[14], a->dp[23]);
  15364. SP_ASM_MUL_ADD(tl, th, to, a->dp[15], a->dp[22]);
  15365. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[21]);
  15366. SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[20]);
  15367. SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[19]);
  15368. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15369. r->dp[37] = l;
  15370. l = h;
  15371. h = o;
  15372. o = 0;
  15373. SP_ASM_MUL_SET(tl, th, to, a->dp[15], a->dp[23]);
  15374. SP_ASM_MUL_ADD(tl, th, to, a->dp[16], a->dp[22]);
  15375. SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[21]);
  15376. SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[20]);
  15377. SP_ASM_SQR_ADD(l, h, o, a->dp[19]);
  15378. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15379. r->dp[38] = l;
  15380. l = h;
  15381. h = o;
  15382. o = 0;
  15383. SP_ASM_MUL_SET(tl, th, to, a->dp[16], a->dp[23]);
  15384. SP_ASM_MUL_ADD(tl, th, to, a->dp[17], a->dp[22]);
  15385. SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[21]);
  15386. SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[20]);
  15387. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15388. r->dp[39] = l;
  15389. l = h;
  15390. h = o;
  15391. o = 0;
  15392. SP_ASM_MUL_SET(tl, th, to, a->dp[17], a->dp[23]);
  15393. SP_ASM_MUL_ADD(tl, th, to, a->dp[18], a->dp[22]);
  15394. SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[21]);
  15395. SP_ASM_SQR_ADD(l, h, o, a->dp[20]);
  15396. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15397. r->dp[40] = l;
  15398. l = h;
  15399. h = o;
  15400. o = 0;
  15401. SP_ASM_MUL_SET(tl, th, to, a->dp[18], a->dp[23]);
  15402. SP_ASM_MUL_ADD(tl, th, to, a->dp[19], a->dp[22]);
  15403. SP_ASM_MUL_ADD(tl, th, to, a->dp[20], a->dp[21]);
  15404. SP_ASM_ADD_DBL_3(l, h, o, tl, th, to);
  15405. r->dp[41] = l;
  15406. l = h;
  15407. h = o;
  15408. o = 0;
  15409. SP_ASM_MUL_ADD2(l, h, o, a->dp[19], a->dp[23]);
  15410. SP_ASM_MUL_ADD2(l, h, o, a->dp[20], a->dp[22]);
  15411. SP_ASM_SQR_ADD(l, h, o, a->dp[21]);
  15412. r->dp[42] = l;
  15413. l = h;
  15414. h = o;
  15415. o = 0;
  15416. SP_ASM_MUL_ADD2(l, h, o, a->dp[20], a->dp[23]);
  15417. SP_ASM_MUL_ADD2(l, h, o, a->dp[21], a->dp[22]);
  15418. r->dp[43] = l;
  15419. l = h;
  15420. h = o;
  15421. o = 0;
  15422. SP_ASM_MUL_ADD2(l, h, o, a->dp[21], a->dp[23]);
  15423. SP_ASM_SQR_ADD(l, h, o, a->dp[22]);
  15424. r->dp[44] = l;
  15425. l = h;
  15426. h = o;
  15427. o = 0;
  15428. SP_ASM_MUL_ADD2(l, h, o, a->dp[22], a->dp[23]);
  15429. r->dp[45] = l;
  15430. l = h;
  15431. h = o;
  15432. SP_ASM_SQR_ADD_NO(l, h, a->dp[23]);
  15433. r->dp[46] = l;
  15434. r->dp[47] = h;
  15435. XMEMCPY(r->dp, t, 24 * sizeof(sp_int_digit));
  15436. r->used = 48;
  15437. sp_clamp(r);
  15438. }
  15439. #if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_SP_NO_MALLOC)
  15440. if (t != NULL) {
  15441. XFREE(t, NULL, DYNAMIC_TYPE_BIGINT);
  15442. }
  15443. #endif
  15444. return err;
  15445. }
  15446. #endif /* SP_INT_DIGITS >= 48 */
  15447. #if SP_INT_DIGITS >= 64
  15448. /* Square a and store in r. r = a * a
  15449. *
  15450. * Karatsuba implementation.
  15451. *
  15452. * @param [in] a SP integer to square.
  15453. * @param [out] r SP integer result.
  15454. *
  15455. * @return MP_OKAY on success.
  15456. * @return MP_MEM when dynamic memory allocation fails.
  15457. */
15458. static int _sp_sqr_32(const sp_int* a, sp_int* r)
15459. {
15460. int err = MP_OKAY;
15461. unsigned int i;
15462. sp_int_digit l;
15463. sp_int_digit h;
15464. sp_int* z0; /* low partial product a0^2 (aliases r) */
15465. sp_int* z1; /* middle partial product (a0 + a1)^2 */
15466. sp_int* z2; /* high partial product a1^2 */
15467. sp_int_digit ca; /* carry out of the digit-wise a0 + a1 */
15468. DECL_SP_INT(a1, 16);
15469. DECL_SP_INT_ARRAY(z, 33, 2);
15470. ALLOC_SP_INT(a1, 16, err, NULL);
15471. ALLOC_SP_INT_ARRAY(z, 33, 2, err, NULL);
15472. if (err == MP_OKAY) {
15473. z1 = z[0];
15474. z2 = z[1];
15475. z0 = r; /* build the low half of the result in place */
15476. XMEMCPY(a1->dp, &a->dp[16], sizeof(sp_int_digit) * 16); /* a1 = top 16 digits of a */
15477. a1->used = 16;
15478. /* z2 = a1 ^ 2 */
15479. err = _sp_sqr_16(a1, z2);
15480. }
15481. if (err == MP_OKAY) {
15482. l = 0;
15483. h = 0;
15484. for (i = 0; i < 16; i++) { /* a1 = a0 + a1 (digit-wise, carry in l/h) */
15485. SP_ASM_ADDC(l, h, a1->dp[i]);
15486. SP_ASM_ADDC(l, h, a->dp[i]);
15487. a1->dp[i] = l;
15488. l = h;
15489. h = 0;
15490. }
15491. ca = l; /* carry out of a0 + a1 (0 or 1) */
15492. /* z0 = a0 ^ 2 */
15493. err = _sp_sqr_16(a, z0);
15494. }
15495. if (err == MP_OKAY) {
15496. /* z1 = (a0 + a1) ^ 2 */
15497. err = _sp_sqr_16(a1, z1);
15498. }
15499. if (err == MP_OKAY) {
15500. /* r = (z2 << 32) + ((z1 - z0 - z2) << 16) + z0 */
15501. /* r = z0 */
15502. /* r += (z1 - z0 - z2) << 16 */
15503. z1->dp[32] = ca; /* ca^2 term of (ca*B^16 + s)^2; ca is 0 or 1 */
15504. l = 0;
15505. if (ca) { /* add the 2*ca*s*B^16 cross term: z1 += 2 * a1 << 16 digits */
15506. l = z1->dp[0 + 16];
15507. h = 0;
15508. SP_ASM_ADDC(l, h, a1->dp[0]);
15509. SP_ASM_ADDC(l, h, a1->dp[0]);
15510. z1->dp[0 + 16] = l;
15511. l = h;
15512. h = 0;
15513. for (i = 1; i < 16; i++) {
15514. SP_ASM_ADDC(l, h, z1->dp[i + 16]);
15515. SP_ASM_ADDC(l, h, a1->dp[i]);
15516. SP_ASM_ADDC(l, h, a1->dp[i]);
15517. z1->dp[i + 16] = l;
15518. l = h;
15519. h = 0;
15520. }
15521. }
15522. z1->dp[32] += l;
15523. /* z1 = z1 - z0 - z2 */
15524. l = z1->dp[0];
15525. h = 0;
15526. SP_ASM_SUBB(l, h, z0->dp[0]);
15527. SP_ASM_SUBB(l, h, z2->dp[0]);
15528. z1->dp[0] = l;
15529. l = h;
15530. h = 0;
15531. for (i = 1; i < 32; i++) {
15532. l += z1->dp[i];
15533. SP_ASM_SUBB(l, h, z0->dp[i]);
15534. SP_ASM_SUBB(l, h, z2->dp[i]);
15535. z1->dp[i] = l;
15536. l = h;
15537. h = 0;
15538. }
15539. z1->dp[i] += l; /* propagate final borrow into the top digit */
15540. /* r += z1 << 16 */
15541. l = 0;
15542. h = 0;
15543. for (i = 0; i < 16; i++) {
15544. SP_ASM_ADDC(l, h, r->dp[i + 16]);
15545. SP_ASM_ADDC(l, h, z1->dp[i]);
15546. r->dp[i + 16] = l;
15547. l = h;
15548. h = 0;
15549. }
15550. for (; i < 33; i++) { /* digits above z0: nothing in r yet */
15551. SP_ASM_ADDC(l, h, z1->dp[i]);
15552. r->dp[i + 16] = l;
15553. l = h;
15554. h = 0;
15555. }
15556. /* r += z2 << 32 */
15557. l = 0;
15558. h = 0;
15559. for (i = 0; i < 17; i++) { /* overlap with digits already written from z1 */
15560. SP_ASM_ADDC(l, h, r->dp[i + 32]);
15561. SP_ASM_ADDC(l, h, z2->dp[i]);
15562. r->dp[i + 32] = l;
15563. l = h;
15564. h = 0;
15565. }
15566. for (; i < 32; i++) {
15567. SP_ASM_ADDC(l, h, z2->dp[i]);
15568. r->dp[i + 32] = l;
15569. l = h;
15570. h = 0;
15571. }
15572. r->used = 64; /* maximum possible size; clamp below */
15573. sp_clamp(r);
15574. }
15575. FREE_SP_INT_ARRAY(z, NULL);
15576. FREE_SP_INT(a1, NULL);
15577. return err;
15578. }
  15579. #endif /* SP_INT_DIGITS >= 64 */
  15580. #if SP_INT_DIGITS >= 96
  15581. /* Square a and store in r. r = a * a
  15582. *
  15583. * Karatsuba implementation.
  15584. *
  15585. * @param [in] a SP integer to square.
  15586. * @param [out] r SP integer result.
  15587. *
  15588. * @return MP_OKAY on success.
  15589. * @return MP_MEM when dynamic memory allocation fails.
  15590. */
15591. static int _sp_sqr_48(const sp_int* a, sp_int* r)
15592. {
15593. int err = MP_OKAY;
15594. unsigned int i;
15595. sp_int_digit l;
15596. sp_int_digit h;
15597. sp_int* z0; /* low partial product a0^2 (aliases r) */
15598. sp_int* z1; /* middle partial product (a0 + a1)^2 */
15599. sp_int* z2; /* high partial product a1^2 */
15600. sp_int_digit ca; /* carry out of the digit-wise a0 + a1 */
15601. DECL_SP_INT(a1, 24);
15602. DECL_SP_INT_ARRAY(z, 49, 2);
15603. ALLOC_SP_INT(a1, 24, err, NULL);
15604. ALLOC_SP_INT_ARRAY(z, 49, 2, err, NULL);
15605. if (err == MP_OKAY) {
15606. z1 = z[0];
15607. z2 = z[1];
15608. z0 = r; /* build the low half of the result in place */
15609. XMEMCPY(a1->dp, &a->dp[24], sizeof(sp_int_digit) * 24); /* a1 = top 24 digits of a */
15610. a1->used = 24;
15611. /* z2 = a1 ^ 2 */
15612. err = _sp_sqr_24(a1, z2);
15613. }
15614. if (err == MP_OKAY) {
15615. l = 0;
15616. h = 0;
15617. for (i = 0; i < 24; i++) { /* a1 = a0 + a1 (digit-wise, carry in l/h) */
15618. SP_ASM_ADDC(l, h, a1->dp[i]);
15619. SP_ASM_ADDC(l, h, a->dp[i]);
15620. a1->dp[i] = l;
15621. l = h;
15622. h = 0;
15623. }
15624. ca = l; /* carry out of a0 + a1 (0 or 1) */
15625. /* z0 = a0 ^ 2 */
15626. err = _sp_sqr_24(a, z0);
15627. }
15628. if (err == MP_OKAY) {
15629. /* z1 = (a0 + a1) ^ 2 */
15630. err = _sp_sqr_24(a1, z1);
15631. }
15632. if (err == MP_OKAY) {
15633. /* r = (z2 << 48) + ((z1 - z0 - z2) << 24) + z0 */
15634. /* r = z0 */
15635. /* r += (z1 - z0 - z2) << 24 */
15636. z1->dp[48] = ca; /* ca^2 term of (ca*B^24 + s)^2; ca is 0 or 1 */
15637. l = 0;
15638. if (ca) { /* add the 2*ca*s*B^24 cross term: z1 += 2 * a1 << 24 digits */
15639. l = z1->dp[0 + 24];
15640. h = 0;
15641. SP_ASM_ADDC(l, h, a1->dp[0]);
15642. SP_ASM_ADDC(l, h, a1->dp[0]);
15643. z1->dp[0 + 24] = l;
15644. l = h;
15645. h = 0;
15646. for (i = 1; i < 24; i++) {
15647. SP_ASM_ADDC(l, h, z1->dp[i + 24]);
15648. SP_ASM_ADDC(l, h, a1->dp[i]);
15649. SP_ASM_ADDC(l, h, a1->dp[i]);
15650. z1->dp[i + 24] = l;
15651. l = h;
15652. h = 0;
15653. }
15654. }
15655. z1->dp[48] += l;
15656. /* z1 = z1 - z0 - z2 */
15657. l = z1->dp[0];
15658. h = 0;
15659. SP_ASM_SUBB(l, h, z0->dp[0]);
15660. SP_ASM_SUBB(l, h, z2->dp[0]);
15661. z1->dp[0] = l;
15662. l = h;
15663. h = 0;
15664. for (i = 1; i < 48; i++) {
15665. l += z1->dp[i];
15666. SP_ASM_SUBB(l, h, z0->dp[i]);
15667. SP_ASM_SUBB(l, h, z2->dp[i]);
15668. z1->dp[i] = l;
15669. l = h;
15670. h = 0;
15671. }
15672. z1->dp[i] += l; /* propagate final borrow into the top digit */
15673. /* r += z1 << 24 */
15674. l = 0;
15675. h = 0;
15676. for (i = 0; i < 24; i++) {
15677. SP_ASM_ADDC(l, h, r->dp[i + 24]);
15678. SP_ASM_ADDC(l, h, z1->dp[i]);
15679. r->dp[i + 24] = l;
15680. l = h;
15681. h = 0;
15682. }
15683. for (; i < 49; i++) { /* digits above z0: nothing in r yet */
15684. SP_ASM_ADDC(l, h, z1->dp[i]);
15685. r->dp[i + 24] = l;
15686. l = h;
15687. h = 0;
15688. }
15689. /* r += z2 << 48 */
15690. l = 0;
15691. h = 0;
15692. for (i = 0; i < 25; i++) { /* overlap with digits already written from z1 */
15693. SP_ASM_ADDC(l, h, r->dp[i + 48]);
15694. SP_ASM_ADDC(l, h, z2->dp[i]);
15695. r->dp[i + 48] = l;
15696. l = h;
15697. h = 0;
15698. }
15699. for (; i < 48; i++) {
15700. SP_ASM_ADDC(l, h, z2->dp[i]);
15701. r->dp[i + 48] = l;
15702. l = h;
15703. h = 0;
15704. }
15705. r->used = 96; /* maximum possible size; clamp below */
15706. sp_clamp(r);
15707. }
15708. FREE_SP_INT_ARRAY(z, NULL);
15709. FREE_SP_INT(a1, NULL);
15710. return err;
15711. }
  15712. #endif /* SP_INT_DIGITS >= 96 */
  15713. #if SP_INT_DIGITS >= 128
  15714. /* Square a and store in r. r = a * a
  15715. *
  15716. * Karatsuba implementation.
  15717. *
  15718. * @param [in] a SP integer to square.
  15719. * @param [out] r SP integer result.
  15720. *
  15721. * @return MP_OKAY on success.
  15722. * @return MP_MEM when dynamic memory allocation fails.
  15723. */
15724. static int _sp_sqr_64(const sp_int* a, sp_int* r)
15725. {
15726. int err = MP_OKAY;
15727. unsigned int i;
15728. sp_int_digit l;
15729. sp_int_digit h;
15730. sp_int* z0; /* low partial product a0^2 (aliases r) */
15731. sp_int* z1; /* middle partial product (a0 + a1)^2 */
15732. sp_int* z2; /* high partial product a1^2 */
15733. sp_int_digit ca; /* carry out of the digit-wise a0 + a1 */
15734. DECL_SP_INT(a1, 32);
15735. DECL_SP_INT_ARRAY(z, 65, 2);
15736. ALLOC_SP_INT(a1, 32, err, NULL);
15737. ALLOC_SP_INT_ARRAY(z, 65, 2, err, NULL);
15738. if (err == MP_OKAY) {
15739. z1 = z[0];
15740. z2 = z[1];
15741. z0 = r; /* build the low half of the result in place */
15742. XMEMCPY(a1->dp, &a->dp[32], sizeof(sp_int_digit) * 32); /* a1 = top 32 digits of a */
15743. a1->used = 32;
15744. /* z2 = a1 ^ 2 */
15745. err = _sp_sqr_32(a1, z2);
15746. }
15747. if (err == MP_OKAY) {
15748. l = 0;
15749. h = 0;
15750. for (i = 0; i < 32; i++) { /* a1 = a0 + a1 (digit-wise, carry in l/h) */
15751. SP_ASM_ADDC(l, h, a1->dp[i]);
15752. SP_ASM_ADDC(l, h, a->dp[i]);
15753. a1->dp[i] = l;
15754. l = h;
15755. h = 0;
15756. }
15757. ca = l; /* carry out of a0 + a1 (0 or 1) */
15758. /* z0 = a0 ^ 2 */
15759. err = _sp_sqr_32(a, z0);
15760. }
15761. if (err == MP_OKAY) {
15762. /* z1 = (a0 + a1) ^ 2 */
15763. err = _sp_sqr_32(a1, z1);
15764. }
15765. if (err == MP_OKAY) {
15766. /* r = (z2 << 64) + ((z1 - z0 - z2) << 32) + z0 */
15767. /* r = z0 */
15768. /* r += (z1 - z0 - z2) << 32 */
15769. z1->dp[64] = ca; /* ca^2 term of (ca*B^32 + s)^2; ca is 0 or 1 */
15770. l = 0;
15771. if (ca) { /* add the 2*ca*s*B^32 cross term: z1 += 2 * a1 << 32 digits */
15772. l = z1->dp[0 + 32];
15773. h = 0;
15774. SP_ASM_ADDC(l, h, a1->dp[0]);
15775. SP_ASM_ADDC(l, h, a1->dp[0]);
15776. z1->dp[0 + 32] = l;
15777. l = h;
15778. h = 0;
15779. for (i = 1; i < 32; i++) {
15780. SP_ASM_ADDC(l, h, z1->dp[i + 32]);
15781. SP_ASM_ADDC(l, h, a1->dp[i]);
15782. SP_ASM_ADDC(l, h, a1->dp[i]);
15783. z1->dp[i + 32] = l;
15784. l = h;
15785. h = 0;
15786. }
15787. }
15788. z1->dp[64] += l;
15789. /* z1 = z1 - z0 - z2 */
15790. l = z1->dp[0];
15791. h = 0;
15792. SP_ASM_SUBB(l, h, z0->dp[0]);
15793. SP_ASM_SUBB(l, h, z2->dp[0]);
15794. z1->dp[0] = l;
15795. l = h;
15796. h = 0;
15797. for (i = 1; i < 64; i++) {
15798. l += z1->dp[i];
15799. SP_ASM_SUBB(l, h, z0->dp[i]);
15800. SP_ASM_SUBB(l, h, z2->dp[i]);
15801. z1->dp[i] = l;
15802. l = h;
15803. h = 0;
15804. }
15805. z1->dp[i] += l; /* propagate final borrow into the top digit */
15806. /* r += z1 << 32 */
15807. l = 0;
15808. h = 0;
15809. for (i = 0; i < 32; i++) {
15810. SP_ASM_ADDC(l, h, r->dp[i + 32]);
15811. SP_ASM_ADDC(l, h, z1->dp[i]);
15812. r->dp[i + 32] = l;
15813. l = h;
15814. h = 0;
15815. }
15816. for (; i < 65; i++) { /* digits above z0: nothing in r yet */
15817. SP_ASM_ADDC(l, h, z1->dp[i]);
15818. r->dp[i + 32] = l;
15819. l = h;
15820. h = 0;
15821. }
15822. /* r += z2 << 64 */
15823. l = 0;
15824. h = 0;
15825. for (i = 0; i < 33; i++) { /* overlap with digits already written from z1 */
15826. SP_ASM_ADDC(l, h, r->dp[i + 64]);
15827. SP_ASM_ADDC(l, h, z2->dp[i]);
15828. r->dp[i + 64] = l;
15829. l = h;
15830. h = 0;
15831. }
15832. for (; i < 64; i++) {
15833. SP_ASM_ADDC(l, h, z2->dp[i]);
15834. r->dp[i + 64] = l;
15835. l = h;
15836. h = 0;
15837. }
15838. r->used = 128; /* maximum possible size; clamp below */
15839. sp_clamp(r);
15840. }
15841. FREE_SP_INT_ARRAY(z, NULL);
15842. FREE_SP_INT(a1, NULL);
15843. return err;
15844. }
  15845. #endif /* SP_INT_DIGITS >= 128 */
  15846. #if SP_INT_DIGITS >= 192
  15847. /* Square a and store in r. r = a * a
  15848. *
  15849. * Karatsuba implementation.
  15850. *
  15851. * @param [in] a SP integer to square.
  15852. * @param [out] r SP integer result.
  15853. *
  15854. * @return MP_OKAY on success.
  15855. * @return MP_MEM when dynamic memory allocation fails.
  15856. */
15857. static int _sp_sqr_96(const sp_int* a, sp_int* r)
15858. {
15859. int err = MP_OKAY;
15860. unsigned int i;
15861. sp_int_digit l;
15862. sp_int_digit h;
15863. sp_int* z0; /* low partial product a0^2 (aliases r) */
15864. sp_int* z1; /* middle partial product (a0 + a1)^2 */
15865. sp_int* z2; /* high partial product a1^2 */
15866. sp_int_digit ca; /* carry out of the digit-wise a0 + a1 */
15867. DECL_SP_INT(a1, 48);
15868. DECL_SP_INT_ARRAY(z, 97, 2);
15869. ALLOC_SP_INT(a1, 48, err, NULL);
15870. ALLOC_SP_INT_ARRAY(z, 97, 2, err, NULL);
15871. if (err == MP_OKAY) {
15872. z1 = z[0];
15873. z2 = z[1];
15874. z0 = r; /* build the low half of the result in place */
15875. XMEMCPY(a1->dp, &a->dp[48], sizeof(sp_int_digit) * 48); /* a1 = top 48 digits of a */
15876. a1->used = 48;
15877. /* z2 = a1 ^ 2 */
15878. err = _sp_sqr_48(a1, z2);
15879. }
15880. if (err == MP_OKAY) {
15881. l = 0;
15882. h = 0;
15883. for (i = 0; i < 48; i++) { /* a1 = a0 + a1 (digit-wise, carry in l/h) */
15884. SP_ASM_ADDC(l, h, a1->dp[i]);
15885. SP_ASM_ADDC(l, h, a->dp[i]);
15886. a1->dp[i] = l;
15887. l = h;
15888. h = 0;
15889. }
15890. ca = l; /* carry out of a0 + a1 (0 or 1) */
15891. /* z0 = a0 ^ 2 */
15892. err = _sp_sqr_48(a, z0);
15893. }
15894. if (err == MP_OKAY) {
15895. /* z1 = (a0 + a1) ^ 2 */
15896. err = _sp_sqr_48(a1, z1);
15897. }
15898. if (err == MP_OKAY) {
15899. /* r = (z2 << 96) + ((z1 - z0 - z2) << 48) + z0 */
15900. /* r = z0 */
15901. /* r += (z1 - z0 - z2) << 48 */
15902. z1->dp[96] = ca; /* ca^2 term of (ca*B^48 + s)^2; ca is 0 or 1 */
15903. l = 0;
15904. if (ca) { /* add the 2*ca*s*B^48 cross term: z1 += 2 * a1 << 48 digits */
15905. l = z1->dp[0 + 48];
15906. h = 0;
15907. SP_ASM_ADDC(l, h, a1->dp[0]);
15908. SP_ASM_ADDC(l, h, a1->dp[0]);
15909. z1->dp[0 + 48] = l;
15910. l = h;
15911. h = 0;
15912. for (i = 1; i < 48; i++) {
15913. SP_ASM_ADDC(l, h, z1->dp[i + 48]);
15914. SP_ASM_ADDC(l, h, a1->dp[i]);
15915. SP_ASM_ADDC(l, h, a1->dp[i]);
15916. z1->dp[i + 48] = l;
15917. l = h;
15918. h = 0;
15919. }
15920. }
15921. z1->dp[96] += l;
15922. /* z1 = z1 - z0 - z2 */
15923. l = z1->dp[0];
15924. h = 0;
15925. SP_ASM_SUBB(l, h, z0->dp[0]);
15926. SP_ASM_SUBB(l, h, z2->dp[0]);
15927. z1->dp[0] = l;
15928. l = h;
15929. h = 0;
15930. for (i = 1; i < 96; i++) {
15931. l += z1->dp[i];
15932. SP_ASM_SUBB(l, h, z0->dp[i]);
15933. SP_ASM_SUBB(l, h, z2->dp[i]);
15934. z1->dp[i] = l;
15935. l = h;
15936. h = 0;
15937. }
15938. z1->dp[i] += l; /* propagate final borrow into the top digit */
15939. /* r += z1 << 48 */
15940. l = 0;
15941. h = 0;
15942. for (i = 0; i < 48; i++) {
15943. SP_ASM_ADDC(l, h, r->dp[i + 48]);
15944. SP_ASM_ADDC(l, h, z1->dp[i]);
15945. r->dp[i + 48] = l;
15946. l = h;
15947. h = 0;
15948. }
15949. for (; i < 97; i++) { /* digits above z0: nothing in r yet */
15950. SP_ASM_ADDC(l, h, z1->dp[i]);
15951. r->dp[i + 48] = l;
15952. l = h;
15953. h = 0;
15954. }
15955. /* r += z2 << 96 */
15956. l = 0;
15957. h = 0;
15958. for (i = 0; i < 49; i++) { /* overlap with digits already written from z1 */
15959. SP_ASM_ADDC(l, h, r->dp[i + 96]);
15960. SP_ASM_ADDC(l, h, z2->dp[i]);
15961. r->dp[i + 96] = l;
15962. l = h;
15963. h = 0;
15964. }
15965. for (; i < 96; i++) {
15966. SP_ASM_ADDC(l, h, z2->dp[i]);
15967. r->dp[i + 96] = l;
15968. l = h;
15969. h = 0;
15970. }
15971. r->used = 192; /* maximum possible size; clamp below */
15972. sp_clamp(r);
15973. }
15974. FREE_SP_INT_ARRAY(z, NULL);
15975. FREE_SP_INT(a1, NULL);
15976. return err;
15977. }
  15978. #endif /* SP_INT_DIGITS >= 192 */
  15979. #endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
  15980. #endif /* !WOLFSSL_SP_SMALL */
  15981. /* Square a and store in r. r = a * a
  15982. *
  15983. * @param [in] a SP integer to square.
  15984. * @param [out] r SP integer result.
  15985. *
  15986. * @return MP_OKAY on success.
  15987. * @return MP_VAL when a or r is NULL, or the result will be too big for fixed
  15988. * data length.
  15989. * @return MP_MEM when dynamic memory allocation fails.
  15990. */
int sp_sqr(const sp_int* a, sp_int* r)
{
#if defined(WOLFSSL_SP_MATH) && defined(WOLFSSL_SP_SMALL)
    /* Smallest code size: squaring is just multiplication by self. */
    return sp_mul(a, a, r);
#else
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (r == NULL)) {
        err = MP_VAL;
    }
    /* Need extra digit during calculation. */
    if ((err == MP_OKAY) && (a->used * 2 > r->size)) {
        err = MP_VAL;
    }

#if 0
    if (err == MP_OKAY) {
        sp_print(a, "a");
    }
#endif

    if (err == MP_OKAY) {
        if (a->used == 0) {
            /* 0 ^ 2 = 0 */
            _sp_zero(r);
        }
        else
        /* Dispatch to an unrolled fixed-size implementation when the digit
         * count matches one compiled in; otherwise fall through to the
         * generic _sp_sqr() at the bottom of the chain.
         */
#ifndef WOLFSSL_SP_SMALL
#if !defined(WOLFSSL_HAVE_SP_ECC) && defined(HAVE_ECC)
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 256)
        if (a->used == 4) {
            err = _sp_sqr_4(a, r);
        }
        else
#endif /* SP_WORD_SIZE == 64 */
#if (SP_WORD_SIZE == 64 && SP_INT_BITS >= 384)
#ifdef SQR_MUL_ASM
        if (a->used == 6) {
            err = _sp_sqr_6(a, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 64 */
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 256)
#ifdef SQR_MUL_ASM
        if (a->used == 8) {
            err = _sp_sqr_8(a, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 32 */
#if (SP_WORD_SIZE == 32 && SP_INT_BITS >= 384)
#ifdef SQR_MUL_ASM
        if (a->used == 12) {
            err = _sp_sqr_12(a, r);
        }
        else
#endif /* SQR_MUL_ASM */
#endif /* SP_WORD_SIZE == 32 */
#endif /* !WOLFSSL_HAVE_SP_ECC && HAVE_ECC */
#if defined(SQR_MUL_ASM) && (defined(WOLFSSL_SP_INT_LARGE_COMBA) || \
    (!defined(WOLFSSL_SP_MATH) && defined(WOLFCRYPT_HAVE_SAKKE) && \
     (SP_WORD_SIZE == 64)))
#if SP_INT_DIGITS >= 32
        if (a->used == 16) {
            err = _sp_sqr_16(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 32 */
#endif /* SQR_MUL_ASM && (WOLFSSL_SP_INT_LARGE_COMBA || !WOLFSSL_SP_MATH &&
        * WOLFCRYPT_HAVE_SAKKE && SP_WORD_SIZE == 64 */
#if defined(SQR_MUL_ASM) && defined(WOLFSSL_SP_INT_LARGE_COMBA)
#if SP_INT_DIGITS >= 48
        if (a->used == 24) {
            err = _sp_sqr_24(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 48 */
#if SP_INT_DIGITS >= 64
        if (a->used == 32) {
            err = _sp_sqr_32(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 64 */
#if SP_INT_DIGITS >= 96
        if (a->used == 48) {
            err = _sp_sqr_48(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 96 */
#if SP_INT_DIGITS >= 128
        if (a->used == 64) {
            err = _sp_sqr_64(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 128 */
#if SP_INT_DIGITS >= 192
        if (a->used == 96) {
            err = _sp_sqr_96(a, r);
        }
        else
#endif /* SP_INT_DIGITS >= 192 */
#endif /* SQR_MUL_ASM && WOLFSSL_SP_INT_LARGE_COMBA */
#endif /* !WOLFSSL_SP_SMALL */
        {
            /* Generic any-size squaring. */
            err = _sp_sqr(a, r);
        }
    }

#ifdef WOLFSSL_SP_INT_NEGATIVE
    if (err == MP_OKAY) {
        /* A square is never negative. */
        r->sign = MP_ZPOS;
    }
#endif

#if 0
    if (err == MP_OKAY) {
        sp_print(r, "rsqr");
    }
#endif

    return err;
#endif /* WOLFSSL_SP_MATH && WOLFSSL_SP_SMALL */
}
  16108. /* END SP_SQR implementations */
  16109. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH || HAVE_ECC ||
  16110. * (!NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY) */
  16111. #if defined(WOLFSSL_SP_MATH_ALL) || \
  16112. (!defined(NO_RSA) && !defined(WOLFSSL_RSA_VERIFY_ONLY) && \
  16113. !defined(WOLFSSL_RSA_PUBLIC_ONLY)) || !defined(NO_DH) || defined(HAVE_ECC)
  16114. /* Square a mod m and store in r: r = (a * a) mod m
  16115. *
  16116. * @param [in] a SP integer to square.
  16117. * @param [in] m SP integer that is the modulus.
  16118. * @param [out] r SP integer result.
  16119. *
  16120. * @return MP_OKAY on success.
  16121. * @return MP_MEM when dynamic memory allocation fails.
  16122. */
  16123. static int _sp_sqrmod(const sp_int* a, const sp_int* m, sp_int* r)
  16124. {
  16125. int err = MP_OKAY;
  16126. /* Create temporary for multiplication result. */
  16127. DECL_SP_INT(t, a->used * 2);
  16128. ALLOC_SP_INT(t, a->used * 2, err, NULL);
  16129. if (err == MP_OKAY) {
  16130. err = sp_init_size(t, a->used * 2);
  16131. }
  16132. /* Square and reduce. */
  16133. if (err == MP_OKAY) {
  16134. err = sp_sqr(a, t);
  16135. }
  16136. if (err == MP_OKAY) {
  16137. err = sp_mod(t, m, r);
  16138. }
  16139. /* Dispose of an allocated SP int. */
  16140. FREE_SP_INT(t, NULL);
  16141. return err;
  16142. }
  16143. /* Square a mod m and store in r: r = (a * a) mod m
  16144. *
  16145. * @param [in] a SP integer to square.
  16146. * @param [in] m SP integer that is the modulus.
  16147. * @param [out] r SP integer result.
  16148. *
  16149. * @return MP_OKAY on success.
  16150. * @return MP_VAL when a, m or r is NULL; or m is 0; or a squared is too big
  16151. * for fixed data length.
  16152. * @return MP_MEM when dynamic memory allocation fails.
  16153. */
  16154. int sp_sqrmod(const sp_int* a, const sp_int* m, sp_int* r)
  16155. {
  16156. int err = MP_OKAY;
  16157. /* Validate parameters. */
  16158. if ((a == NULL) || (m == NULL) || (r == NULL)) {
  16159. err = MP_VAL;
  16160. }
  16161. /* Ensure r has space for intermediate result. */
  16162. if ((err == MP_OKAY) && (r != m) && (a->used * 2 > r->size)) {
  16163. err = MP_VAL;
  16164. }
  16165. /* Ensure a is not too big. */
  16166. if ((err == MP_OKAY) && (r == m) && (a->used * 2 > SP_INT_DIGITS)) {
  16167. err = MP_VAL;
  16168. }
  16169. /* Use r as intermediate result if not same as pointer m which is needed
  16170. * after first intermediate result.
  16171. */
  16172. if ((err == MP_OKAY) && (r != m)) {
  16173. /* Square and reduce. */
  16174. err = sp_sqr(a, r);
  16175. if (err == MP_OKAY) {
  16176. err = sp_mod(r, m, r);
  16177. }
  16178. }
  16179. else if (err == MP_OKAY) {
  16180. /* Do operation with temporary. */
  16181. err = _sp_sqrmod(a, m, r);
  16182. }
  16183. return err;
  16184. }
  16185. #endif /* !WOLFSSL_RSA_VERIFY_ONLY */
  16186. /**********************
  16187. * Montgomery functions
  16188. **********************/
  16189. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_HAVE_SP_DH) || \
  16190. defined(WOLFCRYPT_HAVE_ECCSI) || defined(WOLFCRYPT_HAVE_SAKKE) || \
  16191. defined(OPENSSL_ALL)
  16192. /* Reduce a number in Montgomery form.
  16193. *
  16194. * Assumes a and m are not NULL and m is not 0.
  16195. *
  16196. * DigitMask(a,i) := mask out the 'i'th digit in place.
  16197. *
  16198. * Algorithm:
  16199. * 1. mask = (1 << (NumBits(m) % WORD_SIZE)) - 1
  16200. * 2. For i = 0..NumDigits(m)-1
  16201. * 2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK
  16202. * 2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask
  16203. * 2.3. a += mu * DigitMask(m, 0)
  16204. * 2.4. For j = 1 up to NumDigits(m)-2
  16205. * 2.4.1 a += mu * DigitMask(m, j)
  16206. * 2.5 a += mu * DigitMask(m, NumDigits(m)-1))
  16207. * 3. a >>= NumBits(m)
  16208. * 4. a = a % m
  16209. *
  16210. * @param [in,out] a SP integer to Montgomery reduce.
  16211. * @param [in] m SP integer that is the modulus.
  16212. * @param [in] mp SP integer digit that is the bottom digit of inv(-m).
  16213. *
  16214. * @return MP_OKAY on success.
  16215. */
static int _sp_mont_red(sp_int* a, const sp_int* m, sp_int_digit mp)
{
#if !defined(SQR_MUL_ASM)
    /* Portable implementation using double-width sp_int_word arithmetic. */
    unsigned int i;
    int bits;
    sp_int_word w;
    sp_int_digit mu;

#if 0
    sp_print(a, "a");
    sp_print(m, "m");
#endif

    /* Count bits in modulus. */
    bits = sp_count_bits(m);

    /* Adding numbers into m->used * 2 digits - zero out unused digits. */
    for (i = a->used; i < m->used * 2; i++) {
        a->dp[i] = 0;
    }

    /* Special case when modulus is 1 digit or less. */
    if (m->used <= 1) {
        /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
        mu = mp * a->dp[0];
        /* a += mu * m */
        w = a->dp[0];
        w += (sp_int_word)mu * m->dp[0];
        a->dp[0] = (sp_int_digit)w;
        w >>= SP_WORD_SIZE;
        w += a->dp[1];
        a->dp[1] = (sp_int_digit)w;
        w >>= SP_WORD_SIZE;
        a->dp[2] = (sp_int_digit)w;
        a->used = 3;
        /* mp is SP_WORD_SIZE */
        bits = SP_WORD_SIZE;
    }
    else {
        /* 1. mask = (1 << (NumBits(m) % WORD_SIZE)) - 1
         *    Mask when last digit of modulus doesn't have highest bit set.
         */
        sp_int_digit mask = (sp_int_digit)
            (((sp_int_digit)1 << (bits & (SP_WORD_SIZE - 1))) - 1);
        /* Overflow. */
        sp_int_word o = 0;

        /* 2. For i = 0..NumDigits(m)-1 */
        for (i = 0; i < m->used; i++) {
            unsigned int j;

            /* 2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK */
            mu = mp * a->dp[i];
            /* 2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask */
            if ((i == m->used - 1) && (mask != 0)) {
                mu &= mask;
            }

            /* 2.3. a += mu * DigitMask(m, 0) */
            w = a->dp[i];
            w += (sp_int_word)mu * m->dp[0];
            a->dp[i] = (sp_int_digit)w;
            w >>= SP_WORD_SIZE;
            /* 2.4. For j = 1 up to NumDigits(m)-2 */
            for (j = 1; j < m->used - 1; j++) {
                /* 2.4.1 a += mu * DigitMask(m, j) */
                w += a->dp[i + j];
                w += (sp_int_word)mu * m->dp[j];
                a->dp[i + j] = (sp_int_digit)w;
                w >>= SP_WORD_SIZE;
            }
            /* Handle overflow. */
            w += o;
            w += a->dp[i + j];
            o = (sp_int_digit)(w >> SP_WORD_SIZE);
            /* 2.5 a += mu * DigitMask(m, NumDigits(m)-1)) */
            w = ((sp_int_word)mu * m->dp[j]) + (sp_int_digit)w;
            a->dp[i + j] = (sp_int_digit)w;
            w >>= SP_WORD_SIZE;
            o += w;
        }
        /* Handle overflow. */
        o += a->dp[m->used * 2 - 1];
        a->dp[m->used * 2 - 1] = (sp_int_digit)o;
        o >>= SP_WORD_SIZE;
        a->dp[m->used * 2] = (sp_int_digit)o;
        a->used = m->used * 2 + 1;
    }

    /* Remove leading zeros. */
    sp_clamp(a);

    /* 3. a >>= NumBits(m) */
    (void)sp_rshb(a, bits, a);

    /* 4. a = a mod m */
    if (_sp_cmp_abs(a, m) != MP_LT) {
        _sp_sub_off(a, m, a, 0);
    }

#if 0
    sp_print(a, "rr");
#endif
    return MP_OKAY;
#else /* !SQR_MUL_ASM */
    /* Implementation using the two-register add/mul assembly macros. */
    unsigned int i;
    unsigned int j;
    int bits;
    sp_int_digit mu;
    sp_int_digit o;
    sp_int_digit mask;

#if 0
    sp_print(a, "a");
    sp_print(m, "m");
#endif

    bits = sp_count_bits(m);
    /* Mask for the top digit of mu when the modulus' top bit is not set. */
    mask = ((sp_int_digit)1 << (bits & (SP_WORD_SIZE - 1))) - 1;

    /* Adding numbers into m->used * 2 digits - zero out unused digits. */
    for (i = a->used; i < m->used * 2; i++) {
        a->dp[i] = 0;
    }

    /* Special case when modulus is 1 digit or less. */
    if (m->used <= 1) {
        sp_int_digit l;
        sp_int_digit h;

        /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
        mu = mp * a->dp[0];
        /* a += mu * m */
        l = a->dp[0];
        h = 0;
        SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
        a->dp[0] = l;
        l = h;
        h = 0;
        SP_ASM_ADDC(l, h, a->dp[1]);
        a->dp[1] = l;
        a->dp[2] = h;
        a->used = m->used * 2 + 1;
        /* mp is SP_WORD_SIZE */
        bits = SP_WORD_SIZE;
    }
#if !defined(WOLFSSL_SP_MATH) && defined(HAVE_ECC)
#if SP_WORD_SIZE == 64
#if SP_INT_DIGITS >= 8
    /* Unrolled 4-digit modulus (e.g. 256-bit) with full top word - the
     * result is shifted down as it is produced so no final sp_rshb is
     * needed in this branch.
     */
    else if ((m->used == 4) && (mask == 0)) {
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o2;

        l = 0;
        h = 0;
        o = 0;
        o2 = 0;
        /* For i = 0..NumDigits(m)-1 */
        for (i = 0; i < 4; i++) {
            /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
            mu = mp * a->dp[0];
            l = a->dp[0];
            /* a = (a + mu * m) >> WORD_SIZE */
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[1]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[1]);
            a->dp[0] = l;
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[2]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[2]);
            a->dp[1] = l;
            l = h;
            h = o2;
            o2 = 0;
            SP_ASM_ADDC_REG(l, h, o);
            SP_ASM_ADDC(l, h, a->dp[i + 3]);
            SP_ASM_MUL_ADD(l, h, o2, mu, m->dp[3]);
            a->dp[2] = l;
            o = h;
            l = h;
            h = 0;
        }
        /* Handle overflow. */
        h = o2;
        SP_ASM_ADDC(l, h, a->dp[7]);
        a->dp[3] = l;
        a->dp[4] = h;
        a->used = 5;

        /* Remove leading zeros. */
        sp_clamp(a);

        /* a = a mod m */
        if (_sp_cmp_abs(a, m) != MP_LT) {
            _sp_sub_off(a, m, a, 0);
        }

        return MP_OKAY;
    }
#endif /* SP_INT_DIGITS >= 8 */
#if SP_INT_DIGITS >= 12
    /* Unrolled 6-digit modulus (e.g. 384-bit) with full top word. */
    else if ((m->used == 6) && (mask == 0)) {
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o2;

        l = 0;
        h = 0;
        o = 0;
        o2 = 0;
        /* For i = 0..NumDigits(m)-1 */
        for (i = 0; i < 6; i++) {
            /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
            mu = mp * a->dp[0];
            l = a->dp[0];
            /* a = (a + mu * m) >> WORD_SIZE */
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[0]);
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[1]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[1]);
            a->dp[0] = l;
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[2]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[2]);
            a->dp[1] = l;
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[3]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[3]);
            a->dp[2] = l;
            l = h;
            h = 0;
            SP_ASM_ADDC(l, h, a->dp[4]);
            SP_ASM_MUL_ADD_NO(l, h, mu, m->dp[4]);
            a->dp[3] = l;
            l = h;
            h = o2;
            o2 = 0;
            SP_ASM_ADDC_REG(l, h, o);
            SP_ASM_ADDC(l, h, a->dp[i + 5]);
            SP_ASM_MUL_ADD(l, h, o2, mu, m->dp[5]);
            a->dp[4] = l;
            o = h;
            l = h;
            h = 0;
        }
        /* Handle overflow. */
        h = o2;
        SP_ASM_ADDC(l, h, a->dp[11]);
        a->dp[5] = l;
        a->dp[6] = h;
        a->used = 7;

        /* Remove leading zeros. */
        sp_clamp(a);

        /* a = a mod m */
        if (_sp_cmp_abs(a, m) != MP_LT) {
            _sp_sub_off(a, m, a, 0);
        }

        return MP_OKAY;
    }
#endif /* SP_INT_DIGITS >= 12 */
#elif SP_WORD_SIZE == 32
    /* 32-bit variant for moduli up to 12 digits with full top word -
     * shifts the result down as it goes (stores to ad[j - 1]).
     */
    else if ((m->used <= 12) && (mask == 0)) {
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o2;
        sp_int_digit* ad;
        const sp_int_digit* md;

        o = 0;
        o2 = 0;
        ad = a->dp;
        /* For i = 0..NumDigits(m)-1 */
        for (i = 0; i < m->used; i++) {
            md = m->dp;
            /* mu = (mp * DigitMask(a, i)) & WORD_MASK */
            mu = mp * ad[0];
            /* a = (a + mu * m, 0) >> WORD_SIZE */
            l = ad[0];
            h = 0;
            SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
            l = h;
            /* Process two digits per iteration, alternating l/h roles. */
            for (j = 1; j + 1 < m->used - 1; j += 2) {
                h = 0;
                SP_ASM_ADDC(l, h, ad[j]);
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
                ad[j - 1] = l;
                l = 0;
                SP_ASM_ADDC(h, l, ad[j + 1]);
                SP_ASM_MUL_ADD_NO(h, l, mu, *(md++));
                ad[j] = h;
            }
            /* Odd remaining digit. */
            for (; j < m->used - 1; j++) {
                h = 0;
                SP_ASM_ADDC(l, h, ad[j]);
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
                ad[j - 1] = l;
                l = h;
            }
            h = o2;
            o2 = 0;
            SP_ASM_ADDC_REG(l, h, o);
            SP_ASM_ADDC(l, h, ad[i + j]);
            SP_ASM_MUL_ADD(l, h, o2, mu, *md);
            ad[j - 1] = l;
            o = h;
        }
        /* Handle overflow. */
        l = o;
        h = o2;
        SP_ASM_ADDC(l, h, a->dp[m->used * 2 - 1]);
        a->dp[m->used - 1] = l;
        a->dp[m->used] = h;
        a->used = m->used + 1;

        /* Remove leading zeros. */
        sp_clamp(a);

        /* a = a mod m */
        if (_sp_cmp_abs(a, m) != MP_LT) {
            _sp_sub_off(a, m, a, 0);
        }

        return MP_OKAY;
    }
#endif /* SP_WORD_SIZE == 64 | 32 */
#endif /* !WOLFSSL_SP_MATH && HAVE_ECC */
    else {
        /* General case: any modulus size, any top-word fill. */
        sp_int_digit l;
        sp_int_digit h;
        sp_int_digit o2;
        sp_int_digit* ad;
        const sp_int_digit* md;

        o = 0;
        o2 = 0;
        ad = a->dp;
        /* 2. For i = 0..NumDigits(m)-1 */
        for (i = 0; i < m->used; i++, ad++) {
            md = m->dp;
            /* 2.1. mu = (mp * DigitMask(a, i)) & WORD_MASK */
            mu = mp * ad[0];
            /* 2.2. If i == NumDigits(m)-1 and mask != 0 then mu & = mask */
            if ((i == m->used - 1) && (mask != 0)) {
                mu &= mask;
            }

            /* 2.3 a += mu * DigitMask(m, 0) */
            l = ad[0];
            h = 0;
            SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
            ad[0] = l;
            l = h;
            /* 2.4. For j = 1 up to NumDigits(m)-2, two digits at a time */
            for (j = 1; j + 1 < m->used - 1; j += 2) {
                h = 0;
                /* 2.4.1. a += mu * DigitMask(m, j) */
                SP_ASM_ADDC(l, h, ad[j + 0]);
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
                ad[j + 0] = l;
                l = 0;
                /* 2.4.1. a += mu * DigitMask(m, j) */
                SP_ASM_ADDC(h, l, ad[j + 1]);
                SP_ASM_MUL_ADD_NO(h, l, mu, *(md++));
                ad[j + 1] = h;
            }
            /* Odd remaining digit. */
            for (; j < m->used - 1; j++) {
                h = 0;
                /* 2.4.1. a += mu * DigitMask(m, j) */
                SP_ASM_ADDC(l, h, ad[j]);
                SP_ASM_MUL_ADD_NO(l, h, mu, *(md++));
                ad[j] = l;
                l = h;
            }
            h = o2;
            o2 = 0;
            SP_ASM_ADDC_REG(l, h, o);
            /* 2.5 a += mu * DigitMask(m, NumDigits(m)-1) */
            SP_ASM_ADDC(l, h, ad[j]);
            SP_ASM_MUL_ADD(l, h, o2, mu, *md);
            ad[j] = l;
            o = h;
        }
        /* Handle overflow. */
        l = o;
        h = o2;
        SP_ASM_ADDC(l, h, a->dp[m->used * 2 - 1]);
        a->dp[m->used * 2 - 1] = l;
        a->dp[m->used * 2] = h;
        a->used = m->used * 2 + 1;
    }

    /* Remove leading zeros. */
    sp_clamp(a);
    /* 3. a >>= NumBits(m) */
    (void)sp_rshb(a, bits, a);
    /* a = a mod m */
    if (_sp_cmp_abs(a, m) != MP_LT) {
        _sp_sub_off(a, m, a, 0);
    }

#if 0
    sp_print(a, "rr");
#endif
    return MP_OKAY;
#endif /* !SQR_MUL_ASM */
}
  16597. #if !defined(WOLFSSL_RSA_VERIFY_ONLY) || \
  16598. (defined(WOLFSSL_SP_MATH_ALL) && defined(HAVE_ECC))
  16599. /* Reduce a number in Montgomery form.
  16600. *
  16601. * @param [in,out] a SP integer to Montgomery reduce.
  16602. * @param [in] m SP integer that is the modulus.
  16603. * @param [in] mp SP integer digit that is the bottom digit of inv(-m).
  16604. *
  16605. * @return MP_OKAY on success.
  16606. * @return MP_VAL when a or m is NULL or m is zero.
  16607. */
  16608. int sp_mont_red(sp_int* a, const sp_int* m, sp_int_digit mp)
  16609. {
  16610. int err;
  16611. /* Validate parameters. */
  16612. if ((a == NULL) || (m == NULL) || sp_iszero(m)) {
  16613. err = MP_VAL;
  16614. }
  16615. /* Ensure a has enough space for calculation. */
  16616. else if (a->size < m->used * 2 + 1) {
  16617. err = MP_VAL;
  16618. }
  16619. else {
  16620. /* Perform Montogomery Reduction. */
  16621. err = _sp_mont_red(a, m, mp);
  16622. }
  16623. return err;
  16624. }
  16625. #endif
  16626. /* Calculate the bottom digit of the inverse of negative m.
  16627. * (rho * m) mod 2^n = -1, where n is the number of bits in a digit.
  16628. *
  16629. * Used when performing Montgomery Reduction.
  16630. * m must be odd.
  16631. * Jeffrey Hurchalla’s method.
  16632. * https://arxiv.org/pdf/2204.04342.pdf
  16633. *
  16634. * @param [in] m SP integer that is the modulus.
  16635. * @param [out] mp SP integer digit that is the bottom digit of inv(-m).
  16636. */
  16637. static void _sp_mont_setup(const sp_int* m, sp_int_digit* rho)
  16638. {
  16639. sp_int_digit d = m->dp[0];
  16640. sp_int_digit x = (3 * d) ^ 2;
  16641. sp_int_digit y = 1 - d * x;
  16642. #if SP_WORD_SIZE >= 16
  16643. x *= 1 + y; y *= y;
  16644. #endif
  16645. #if SP_WORD_SIZE >= 32
  16646. x *= 1 + y; y *= y;
  16647. #endif
  16648. #if SP_WORD_SIZE >= 64
  16649. x *= 1 + y; y *= y;
  16650. #endif
  16651. x *= 1 + y;
  16652. /* rho = -1/m mod d, subtract x (unsigned) from 0, assign negative */
  16653. *rho = (sp_int_digit)((sp_int_sdigit)0 - (sp_int_sdigit)x);
  16654. }
  16655. /* Calculate the bottom digit of the inverse of negative m.
  16656. * (rho * m) mod 2^n = -1, where n is the number of bits in a digit.
  16657. *
  16658. * Used when performing Montgomery Reduction.
  16659. *
  16660. * @param [in] m SP integer that is the modulus.
  16661. * @param [out] mp SP integer digit that is the bottom digit of inv(-m).
  16662. *
  16663. * @return MP_OKAY on success.
  16664. * @return MP_VAL when m or rho is NULL.
  16665. */
  16666. int sp_mont_setup(const sp_int* m, sp_int_digit* rho)
  16667. {
  16668. int err = MP_OKAY;
  16669. /* Validate parameters. */
  16670. if ((m == NULL) || (rho == NULL)) {
  16671. err = MP_VAL;
  16672. }
  16673. /* Calculation only works with odd modulus. */
  16674. if ((err == MP_OKAY) && !sp_isodd(m)) {
  16675. err = MP_VAL;
  16676. }
  16677. if (err == MP_OKAY) {
  16678. /* Calculate negative of inverse mod 2^n. */
  16679. _sp_mont_setup(m, rho);
  16680. }
  16681. return err;
  16682. }
  16683. /* Calculate the normalization value of m.
  16684. * norm = 2^k - m, where k is the number of bits in m
  16685. *
  16686. * @param [out] norm SP integer that normalises numbers into Montgomery
  16687. * form.
  16688. * @param [in] m SP integer that is the modulus.
  16689. *
  16690. * @return MP_OKAY on success.
  16691. * @return MP_VAL when norm or m is NULL, or number of bits in m is maximual.
  16692. */
  16693. int sp_mont_norm(sp_int* norm, const sp_int* m)
  16694. {
  16695. int err = MP_OKAY;
  16696. unsigned int bits = 0;
  16697. /* Validate parameters. */
  16698. if ((norm == NULL) || (m == NULL)) {
  16699. err = MP_VAL;
  16700. }
  16701. if (err == MP_OKAY) {
  16702. /* Find top bit and ensure norm has enough space. */
  16703. bits = (unsigned int)sp_count_bits(m);
  16704. if (bits >= norm->size * SP_WORD_SIZE) {
  16705. err = MP_VAL;
  16706. }
  16707. }
  16708. if (err == MP_OKAY) {
  16709. /* Round up for case when m is less than a word - no advantage in using
  16710. * a smaller mask and would take more operations.
  16711. */
  16712. if (bits < SP_WORD_SIZE) {
  16713. bits = SP_WORD_SIZE;
  16714. }
  16715. /* Smallest number greater than m of form 2^n. */
  16716. _sp_zero(norm);
  16717. err = sp_set_bit(norm, (int)bits);
  16718. }
  16719. if (err == MP_OKAY) {
  16720. /* norm = 2^n % m */
  16721. err = sp_sub(norm, m, norm);
  16722. }
  16723. if ((err == MP_OKAY) && (bits == SP_WORD_SIZE)) {
  16724. /* Sub made norm one word and now finish calculation. */
  16725. norm->dp[0] %= m->dp[0];
  16726. }
  16727. if (err == MP_OKAY) {
  16728. /* Remove leading zeros. */
  16729. sp_clamp(norm);
  16730. }
  16731. return err;
  16732. }
  16733. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_HAVE_SP_DH ||
  16734. * WOLFCRYPT_HAVE_ECCSI || WOLFCRYPT_HAVE_SAKKE */
  16735. /*********************************
  16736. * To and from binary and strings.
  16737. *********************************/
  16738. /* Calculate the number of 8-bit values required to represent the
  16739. * multi-precision number.
  16740. *
  16741. * When a is NULL, return s 0.
  16742. *
  16743. * @param [in] a SP integer.
  16744. *
  16745. * @return The count of 8-bit values.
  16746. * @return 0 when a is NULL.
  16747. */
  16748. int sp_unsigned_bin_size(const sp_int* a)
  16749. {
  16750. int cnt = 0;
  16751. if (a != NULL) {
  16752. cnt = (sp_count_bits(a) + 7) / 8;
  16753. }
  16754. return cnt;
  16755. }
  16756. /* Convert a number as an array of bytes in big-endian format to a
  16757. * multi-precision number.
  16758. *
  16759. * @param [out] a SP integer.
  16760. * @param [in] in Array of bytes.
  16761. * @param [in] inSz Number of data bytes in array.
  16762. *
  16763. * @return MP_OKAY on success.
  16764. * @return MP_VAL when the number is too big to fit in an SP.
  16765. */
int sp_read_unsigned_bin(sp_int* a, const byte* in, word32 inSz)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || ((in == NULL) && (inSz > 0))) {
        err = MP_VAL;
    }

    /* Check a has enough space for number. */
    if ((err == MP_OKAY) && (inSz > (word32)a->size * SP_WORD_SIZEOF)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Load full digits at a time from in, least significant digit
         * first (in is big-endian so we walk it from the end).
         */
        int i;
        int j = 0;

        a->used = (inSz + SP_WORD_SIZEOF - 1) / SP_WORD_SIZEOF;

#if defined(BIG_ENDIAN_ORDER) && !defined(WOLFSSL_SP_INT_DIGIT_ALIGN)
        /* Data endian matches respresentation of number.
         * Directly copy if we don't have alignment issues.
         */
        for (i = (int)(inSz-1); i > SP_WORD_SIZEOF-1; i -= SP_WORD_SIZEOF) {
            a->dp[j++] = *(sp_int_digit*)(in + i - (SP_WORD_SIZEOF - 1));
        }
#else
        /* Construct digit from required number of bytes. */
        for (i = (int)(inSz-1); i >= SP_WORD_SIZEOF - 1; i -= SP_WORD_SIZEOF) {
            a->dp[j] = ((sp_int_digit)in[i - 0] << 0)
#if SP_WORD_SIZE >= 16
                     | ((sp_int_digit)in[i - 1] << 8)
#endif
#if SP_WORD_SIZE >= 32
                     | ((sp_int_digit)in[i - 2] << 16) |
                       ((sp_int_digit)in[i - 3] << 24)
#endif
#if SP_WORD_SIZE >= 64
                     | ((sp_int_digit)in[i - 4] << 32) |
                       ((sp_int_digit)in[i - 5] << 40) |
                       ((sp_int_digit)in[i - 6] << 48) |
                       ((sp_int_digit)in[i - 7] << 56)
#endif
                     ;
            j++;
        }
#endif
#if SP_WORD_SIZE >= 16
        /* Handle leftovers - inSz not a multiple of the digit size. */
        if (i >= 0) {
#ifdef BIG_ENDIAN_ORDER
            int s;

            /* Place remaining bytes into last digit. */
            a->dp[a->used - 1] = 0;
            for (s = 0; i >= 0; i--,s += 8) {
                a->dp[j] |= ((sp_int_digit)in[i]) << s;
            }
#else
            /* Cast digits to an array of bytes so we can insert directly. */
            byte *d = (byte*)a->dp;

            /* Zero out all bytes in last digit. */
            a->dp[a->used - 1] = 0;
            /* Place remaining bytes directly into digit. */
            switch (i) {
#if SP_WORD_SIZE >= 64
                case 6: d[inSz - 1 - 6] = in[6]; FALL_THROUGH;
                case 5: d[inSz - 1 - 5] = in[5]; FALL_THROUGH;
                case 4: d[inSz - 1 - 4] = in[4]; FALL_THROUGH;
                case 3: d[inSz - 1 - 3] = in[3]; FALL_THROUGH;
#endif
#if SP_WORD_SIZE >= 32
                case 2: d[inSz - 1 - 2] = in[2]; FALL_THROUGH;
                case 1: d[inSz - 1 - 1] = in[1]; FALL_THROUGH;
#endif
                case 0: d[inSz - 1 - 0] = in[0];
            }
#endif /* LITTLE_ENDIAN_ORDER */
        }
#endif
        /* Remove leading zeros. */
        sp_clamp(a);
    }

    return err;
}
  16846. /* Convert the multi-precision number to an array of bytes in big-endian format.
  16847. *
  16848. * The array must be large enough for encoded number - use mp_unsigned_bin_size
  16849. * to calculate the number of bytes required.
  16850. *
  16851. * @param [in] a SP integer.
  16852. * @param [out] out Array to put encoding into.
  16853. *
  16854. * @return MP_OKAY on success.
  16855. * @return MP_VAL when a or out is NULL.
  16856. */
  16857. int sp_to_unsigned_bin(const sp_int* a, byte* out)
  16858. {
  16859. /* Write assuming output buffer is big enough. */
  16860. return sp_to_unsigned_bin_len(a, out, sp_unsigned_bin_size(a));
  16861. }
  16862. /* Convert the multi-precision number to an array of bytes in big-endian format.
  16863. *
  16864. * The array must be large enough for encoded number - use mp_unsigned_bin_size
  16865. * to calculate the number of bytes required.
  16866. * Front-pads the output array with zeros to make number the size of the array.
  16867. *
  16868. * @param [in] a SP integer.
  16869. * @param [out] out Array to put encoding into.
  16870. * @param [in] outSz Size of the array in bytes.
  16871. *
  16872. * @return MP_OKAY on success.
  16873. * @return MP_VAL when a or out is NULL.
  16874. */
int sp_to_unsigned_bin_len(const sp_int* a, byte* out, int outSz)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (out == NULL) || (outSz < 0)) {
        err = MP_VAL;
    }

#if SP_WORD_SIZE > 8
    if (err == MP_OKAY) {
        /* Start at the end of the buffer - least significant byte. */
        int j = outSz - 1;

        if (!sp_iszero(a)) {
            unsigned int i;

            /* Put each digit in. */
            for (i = 0; (j >= 0) && (i < a->used); i++) {
                int b;
                sp_int_digit d = a->dp[i];
                /* Place each byte of a digit into the buffer. */
                for (b = 0; b < SP_WORD_SIZE; b += 8) {
                    out[j--] = (byte)d;
                    d >>= 8;
                    /* Stop if the output buffer is filled. */
                    if (j < 0) {
                        /* Fail when significant digits or bits remain that
                         * did not fit in outSz bytes.
                         */
                        if ((i < a->used - 1) || (d > 0)) {
                            err = MP_VAL;
                        }
                        break;
                    }
                }
            }
        }
        /* Front pad buffer with 0s. */
        for (; j >= 0; j--) {
            out[j] = 0;
        }
    }
#else
    /* 8-bit digits: one output byte per digit. */
    if ((err == MP_OKAY) && ((unsigned int)outSz < a->used)) {
        err = MP_VAL;
    }
    if (err == MP_OKAY) {
        unsigned int i;
        int j;

        /* Front pad with 0s then copy digits in reverse order. */
        XMEMSET(out, 0, (unsigned int)outSz - a->used);
        for (i = 0, j = outSz - 1; i < a->used; i++, j--) {
            out[j] = a->dp[i];
        }
    }
#endif
    return err;
}
  16926. #if defined(WOLFSSL_SP_MATH_ALL) && !defined(NO_RSA) && \
  16927. !defined(WOLFSSL_RSA_VERIFY_ONLY)
  16928. /* Store the number in big-endian format in array at an offset.
  16929. * The array must be large enough for encoded number - use mp_unsigned_bin_size
  16930. * to calculate the number of bytes required.
  16931. *
  16932. * @param [in] o Offset into array o start encoding.
  16933. * @param [in] a SP integer.
  16934. * @param [out] out Array to put encoding into.
  16935. *
  16936. * @return Index of next byte after data.
  16937. * @return MP_VAL when a or out is NULL.
  16938. */
  16939. int sp_to_unsigned_bin_at_pos(int o, const sp_int* a, unsigned char* out)
  16940. {
  16941. /* Get length of data that will be written. */
  16942. int len = sp_unsigned_bin_size(a);
  16943. /* Write number to buffer at offset. */
  16944. int ret = sp_to_unsigned_bin_len(a, out + o, len);
  16945. if (ret == MP_OKAY) {
  16946. /* Return offset of next byte after number. */
  16947. ret = o + len;
  16948. }
  16949. return ret;
  16950. }
  16951. #endif /* WOLFSSL_SP_MATH_ALL && !NO_RSA && !WOLFSSL_RSA_VERIFY_ONLY */
  16952. #ifdef WOLFSSL_SP_READ_RADIX_16
  16953. /* Convert hexadecimal number as string in big-endian format to a
  16954. * multi-precision number.
  16955. *
  16956. * Assumes negative sign and leading zeros have been stripped.
  16957. *
  16958. * @param [out] a SP integer.
  16959. * @param [in] in NUL terminated string.
  16960. *
  16961. * @return MP_OKAY on success.
  16962. * @return MP_VAL when radix not supported, value is negative, or a character
  16963. * is not valid.
  16964. */
static int _sp_read_radix_16(sp_int* a, const char* in)
{
    int err = MP_OKAY;
    int i;
    unsigned int s = 0;   /* Bit position of next nibble within digit. */
    unsigned int j = 0;   /* Index of digit in 'a' being filled. */
    sp_int_digit d;

    /* Make all nibbles in digit 0. */
    d = 0;
    /* Step through string a character at a time starting at end - least
     * significant byte. */
    for (i = (int)(XSTRLEN(in) - 1); i >= 0; i--) {
        /* Convert character from hex. */
        int ch = (int)HexCharToByte(in[i]);
        /* Check for invalid character. */
        if (ch < 0) {
            err = MP_VAL;
            break;
        }

        /* Check whether we have filled the digit. */
        if (s == SP_WORD_SIZE) {
            /* Store digit and move index to next in a. */
            a->dp[j++] = d;
            /* Fail if we are out of space in a. */
            if (j >= a->size) {
                err = MP_VAL;
                break;
            }
            /* Set shift back to 0 - lowest nibble. */
            s = 0;
            /* Make all nibbles in digit 0. */
            d = 0;
        }

        /* Put next nibble into digit. */
        d |= ((sp_int_digit)ch) << s;
        /* Update shift for next nibble. */
        s += 4;
    }

    if (err == MP_OKAY) {
        /* If space, store last (possibly partial) digit. The out-of-space
         * case breaks out of the loop above with MP_VAL, so j < a->size
         * holds here for valid input. */
        if (j < a->size) {
            a->dp[j] = d;
        }
        /* Update used count. */
        a->used = j + 1;
        /* Remove leading zeros. */
        sp_clamp(a);
    }

    return err;
}
  17015. #endif /* WOLFSSL_SP_READ_RADIX_16 */
  17016. #ifdef WOLFSSL_SP_READ_RADIX_10
  17017. /* Convert decimal number as string in big-endian format to a multi-precision
  17018. * number.
  17019. *
  17020. * Assumes negative sign and leading zeros have been stripped.
  17021. *
  17022. * @param [out] a SP integer.
  17023. * @param [in] in NUL terminated string.
  17024. *
  17025. * @return MP_OKAY on success.
  17026. * @return MP_VAL when radix not supported, value is negative, or a character
  17027. * is not valid.
  17028. */
  17029. static int _sp_read_radix_10(sp_int* a, const char* in)
  17030. {
  17031. int err = MP_OKAY;
  17032. int i;
  17033. char ch;
  17034. /* Start with a being zero. */
  17035. _sp_zero(a);
  17036. /* Process all characters. */
  17037. for (i = 0; in[i] != '\0'; i++) {
  17038. /* Get character. */
  17039. ch = in[i];
  17040. /* Check character is valid. */
  17041. if ((ch >= '0') && (ch <= '9')) {
  17042. /* Assume '0'..'9' are continuous valus as characters. */
  17043. ch -= '0';
  17044. }
  17045. else {
  17046. /* Return error on invalid character. */
  17047. err = MP_VAL;
  17048. break;
  17049. }
  17050. /* Multiply a by 10. */
  17051. err = _sp_mul_d(a, 10, a, 0);
  17052. if (err != MP_OKAY) {
  17053. break;
  17054. }
  17055. /* Add character value. */
  17056. err = _sp_add_d(a, (sp_int_digit)ch, a);
  17057. if (err != MP_OKAY) {
  17058. break;
  17059. }
  17060. }
  17061. return err;
  17062. }
  17063. #endif /* WOLFSSL_SP_READ_RADIX_10 */
  17064. #if defined(WOLFSSL_SP_READ_RADIX_16) || defined(WOLFSSL_SP_READ_RADIX_10)
  17065. /* Convert a number as string in big-endian format to a big number.
  17066. * Only supports base-16 (hexadecimal) and base-10 (decimal).
  17067. *
  17068. * Negative values supported when WOLFSSL_SP_INT_NEGATIVE is defined.
  17069. *
  17070. * @param [out] a SP integer.
  17071. * @param [in] in NUL terminated string.
  17072. * @param [in] radix Number of values in a digit.
  17073. *
  17074. * @return MP_OKAY on success.
  17075. * @return MP_VAL when a or in is NULL, radix not supported, value is negative,
  17076. * or a character is not valid.
  17077. */
  17078. int sp_read_radix(sp_int* a, const char* in, int radix)
  17079. {
  17080. int err = MP_OKAY;
  17081. #ifdef WOLFSSL_SP_INT_NEGATIVE
  17082. unsigned int sign = MP_ZPOS;
  17083. #endif
  17084. if ((a == NULL) || (in == NULL)) {
  17085. err = MP_VAL;
  17086. }
  17087. if (err == MP_OKAY) {
  17088. #ifndef WOLFSSL_SP_INT_NEGATIVE
  17089. if (*in == '-') {
  17090. err = MP_VAL;
  17091. }
  17092. else
  17093. #endif
  17094. {
  17095. #ifdef WOLFSSL_SP_INT_NEGATIVE
  17096. if (*in == '-') {
  17097. /* Make number negative if signed string. */
  17098. sign = MP_NEG;
  17099. in++;
  17100. }
  17101. #endif /* WOLFSSL_SP_INT_NEGATIVE */
  17102. /* Skip leading zeros. */
  17103. while (*in == '0') {
  17104. in++;
  17105. }
  17106. if (radix == 16) {
  17107. err = _sp_read_radix_16(a, in);
  17108. }
  17109. #ifdef WOLFSSL_SP_READ_RADIX_10
  17110. else if (radix == 10) {
  17111. err = _sp_read_radix_10(a, in);
  17112. }
  17113. #endif
  17114. else {
  17115. err = MP_VAL;
  17116. }
  17117. #ifdef WOLFSSL_SP_INT_NEGATIVE
  17118. /* Ensure not negative when zero. */
  17119. if (err == MP_OKAY) {
  17120. if (sp_iszero(a)) {
  17121. a->sign = MP_ZPOS;
  17122. }
  17123. else {
  17124. a->sign = sign;
  17125. }
  17126. }
  17127. #endif
  17128. }
  17129. }
  17130. return err;
  17131. }
  17132. #endif /* WOLFSSL_SP_READ_RADIX_16 || WOLFSSL_SP_READ_RADIX_10 */
  17133. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  17134. defined(WC_MP_TO_RADIX)
  17135. /* Put the big-endian, hex string encoding of a into str.
  17136. *
  17137. * Assumes str is large enough for result.
  17138. * Use sp_radix_size() to calculate required length.
  17139. *
  17140. * @param [in] a SP integer to convert.
  17141. * @param [out] str String to hold hex string result.
  17142. *
  17143. * @return MP_OKAY on success.
  17144. * @return MP_VAL when a or str is NULL.
  17145. */
int sp_tohex(const sp_int* a, char* str)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (str == NULL)) {
        err = MP_VAL;
    }
    if (err == MP_OKAY) {
        /* Quick out if number is zero. */
        if (sp_iszero(a) == MP_YES) {
    #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Make string represent complete bytes: "00" not "0". */
            *str++ = '0';
    #endif /* WC_DISABLE_RADIX_ZERO_PAD */
            *str++ = '0';
        }
        else {
            int i;
            int j;
            sp_int_digit d;

    #ifdef WOLFSSL_SP_INT_NEGATIVE
            if (a->sign == MP_NEG) {
                /* Add negative sign character. */
                *str = '-';
                str++;
            }
    #endif /* WOLFSSL_SP_INT_NEGATIVE */

            /* Start at last digit - most significant digit. */
            i = (int)(a->used - 1);
            d = a->dp[i];
    #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Find highest non-zero byte in most-significant word.
             * Number is non-zero (checked above), so a non-zero byte exists
             * before i underflows. */
            for (j = SP_WORD_SIZE - 8; j >= 0 && i >= 0; j -= 8) {
                /* When a byte at this index is not 0 break out to start
                 * writing.
                 */
                if (((d >> j) & 0xff) != 0) {
                    break;
                }
                /* Skip this digit if it was 0. */
                if (j == 0) {
                    j = SP_WORD_SIZE - 8;
                    d = a->dp[--i];
                }
            }
            /* Start with high nibble of byte. */
            j += 4;
    #else
            /* Find highest non-zero nibble in most-significant word. */
            for (j = SP_WORD_SIZE - 4; j >= 0; j -= 4) {
                /* When a nibble at this index is not 0 break out to start
                 * writing.
                 */
                if (((d >> j) & 0xf) != 0) {
                    break;
                }
                /* Skip this digit if it was 0. */
                if (j == 0) {
                    j = SP_WORD_SIZE - 4;
                    d = a->dp[--i];
                }
            }
    #endif /* WC_DISABLE_RADIX_ZERO_PAD */
            /* Write out as much as required from most-significant digit. */
            for (; j >= 0; j -= 4) {
                *(str++) = ByteToHex((byte)(d >> j));
            }
            /* Write rest of digits - all nibbles, including leading zeros. */
            for (--i; i >= 0; i--) {
                /* Get digit from memory. */
                d = a->dp[i];
                /* Write out all nibbles of digit. */
                for (j = SP_WORD_SIZE - 4; j >= 0; j -= 4) {
                    *(str++) = (char)ByteToHex((byte)(d >> j));
                }
            }
        }
        /* Terminate string. */
        *str = '\0';
    }

    return err;
}
  17228. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
  17229. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  17230. defined(WOLFSSL_KEY_GEN) || defined(HAVE_COMP_KEY) || \
  17231. defined(WC_MP_TO_RADIX)
  17232. /* Put the big-endian, decimal string encoding of a into str.
  17233. *
  17234. * Assumes str is large enough for result.
  17235. * Use sp_radix_size() to calculate required length.
  17236. *
  17237. * @param [in] a SP integer to convert.
  17238. * @param [out] str String to hold hex string result.
  17239. *
  17240. * @return MP_OKAY on success.
  17241. * @return MP_VAL when a or str is NULL.
  17242. * @return MP_MEM when dynamic memory allocation fails.
  17243. */
int sp_todecimal(const sp_int* a, char* str)
{
    int err = MP_OKAY;
    int i;
    int j;
    sp_int_digit d = 0;

    /* Validate parameters. */
    if ((a == NULL) || (str == NULL)) {
        err = MP_VAL;
    }
    /* Quick out if number is zero. */
    else if (sp_iszero(a) == MP_YES) {
        *str++ = '0';
        *str = '\0';
    }
    else if (a->used >= SP_INT_DIGITS) {
        err = MP_VAL;
    }
    else {
        /* Temporary that is repeatedly divided by 10. */
        DECL_SP_INT(t, a->used + 1);

        ALLOC_SP_INT_SIZE(t, a->used + 1, err, NULL);
        if (err == MP_OKAY) {
            _sp_copy(a, t);
        }
        if (err == MP_OKAY) {
    #ifdef WOLFSSL_SP_INT_NEGATIVE
            if (a->sign == MP_NEG) {
                /* Add negative sign character. */
                *str = '-';
                str++;
            }
    #endif /* WOLFSSL_SP_INT_NEGATIVE */

            /* Write out little endian - least significant decimal digit
             * first; reversed to big-endian below. */
            i = 0;
            do {
                /* Divide by 10 and get remainder of division. */
                (void)sp_div_d(t, 10, t, &d);
                /* Write out remainder as a character. */
                str[i++] = (char)('0' + d);
            }
            /* Keep going while we there is a value to write. */
            while (!sp_iszero(t));
            /* Terminate string. */
            str[i] = '\0';

            if (err == MP_OKAY) {
                /* Reverse string to big endian - swap ends towards middle. */
                for (j = 0; j <= (i - 1) / 2; j++) {
                    int c = (unsigned char)str[j];
                    str[j] = str[i - 1 - j];
                    str[i - 1 - j] = (char)c;
                }
            }
        }

        FREE_SP_INT(t, NULL);
    }

    return err;
}
  17302. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_KEY_GEN || HAVE_COMP_KEY */
  17303. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  17304. defined(WC_MP_TO_RADIX)
  17305. /* Put the string version, big-endian, of a in str using the given radix.
  17306. *
  17307. * @param [in] a SP integer to convert.
  17308. * @param [out] str String to hold hex string result.
  17309. * @param [in] radix Base of character.
  17310. * Valid values: MP_RADIX_HEX, MP_RADIX_DEC.
  17311. *
  17312. * @return MP_OKAY on success.
  17313. * @return MP_VAL when a or str is NULL, or radix not supported.
  17314. */
  17315. int sp_toradix(const sp_int* a, char* str, int radix)
  17316. {
  17317. int err = MP_OKAY;
  17318. /* Validate parameters. */
  17319. if ((a == NULL) || (str == NULL)) {
  17320. err = MP_VAL;
  17321. }
  17322. /* Handle base 16 if requested. */
  17323. else if (radix == MP_RADIX_HEX) {
  17324. err = sp_tohex(a, str);
  17325. }
  17326. #if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_KEY_GEN) || \
  17327. defined(HAVE_COMP_KEY)
  17328. /* Handle base 10 if requested. */
  17329. else if (radix == MP_RADIX_DEC) {
  17330. err = sp_todecimal(a, str);
  17331. }
  17332. #endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_KEY_GEN || HAVE_COMP_KEY */
  17333. else {
  17334. /* Base not supported. */
  17335. err = MP_VAL;
  17336. }
  17337. return err;
  17338. }
  17339. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
  17340. #if (defined(WOLFSSL_SP_MATH_ALL) && !defined(WOLFSSL_RSA_VERIFY_ONLY)) || \
  17341. defined(WC_MP_TO_RADIX)
  17342. /* Calculate the length of the string version, big-endian, of a using the given
  17343. * radix.
  17344. *
  17345. * @param [in] a SP integer to convert.
  17346. * @param [in] radix Base of character.
  17347. * Valid values: MP_RADIX_HEX, MP_RADIX_DEC.
  17348. * @param [out] size The number of characters in encoding.
  17349. *
  17350. * @return MP_OKAY on success.
  17351. * @return MP_VAL when a or size is NULL, or radix not supported.
  17352. */
int sp_radix_size(const sp_int* a, int radix, int* size)
{
    int err = MP_OKAY;

    /* Validate parameters. */
    if ((a == NULL) || (size == NULL)) {
        err = MP_VAL;
    }
    /* Handle base 16 if requested. */
    else if (radix == MP_RADIX_HEX) {
        if (a->used == 0) {
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* 00 and '\0' */
            *size = 2 + 1;
        #else
            /* Zero and '\0' */
            *size = 1 + 1;
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
        }
        else {
            /* Count of nibbles needed for the value's bits. */
            int cnt = (sp_count_bits(a) + 3) / 4;
        #ifndef WC_DISABLE_RADIX_ZERO_PAD
            /* Must have even number of nibbles to have complete bytes. */
            if (cnt & 1) {
                cnt++;
            }
        #endif /* WC_DISABLE_RADIX_ZERO_PAD */
        #ifdef WOLFSSL_SP_INT_NEGATIVE
            /* Add to count of characters for negative sign. */
            if (a->sign == MP_NEG) {
                cnt++;
            }
        #endif /* WOLFSSL_SP_INT_NEGATIVE */
            /* One more for \0 */
            *size = cnt + 1;
        }
    }
#if defined(WOLFSSL_SP_MATH_ALL) || defined(WOLFSSL_KEY_GEN) || \
    defined(HAVE_COMP_KEY)
    /* Handle base 10 if requested. */
    else if (radix == MP_RADIX_DEC) {
        int i;
        sp_int_digit d;

        /* quick out if its zero */
        if (sp_iszero(a) == MP_YES) {
            /* Zero and '\0' */
            *size = 1 + 1;
        }
        else {
            DECL_SP_INT(t, a->used);

            /* Temporary to be divided by 10. */
            ALLOC_SP_INT(t, a->used, err, NULL);
            if (err == MP_OKAY) {
                t->size = a->used;
                _sp_copy(a, t);
            }

            if (err == MP_OKAY) {
                /* Count number of times number can be divided by 10 -
                 * one decimal character per division. */
                for (i = 0; !sp_iszero(t); i++) {
                    (void)sp_div_d(t, 10, t, &d);
                }
            #ifdef WOLFSSL_SP_INT_NEGATIVE
                /* Add to count of characters for negative sign. */
                if (a->sign == MP_NEG) {
                    i++;
                }
            #endif /* WOLFSSL_SP_INT_NEGATIVE */
                /* One more for \0 */
                *size = i + 1;
            }

            FREE_SP_INT(t, NULL);
        }
    }
#endif /* WOLFSSL_SP_MATH_ALL || WOLFSSL_KEY_GEN || HAVE_COMP_KEY */
    else {
        /* Base not supported. */
        err = MP_VAL;
    }

    return err;
}
  17433. #endif /* (WOLFSSL_SP_MATH_ALL && !WOLFSSL_RSA_VERIFY_ONLY) || WC_MP_TO_RADIX */
  17434. /***************************************
  17435. * Prime number generation and checking.
  17436. ***************************************/
  17437. #if defined(WOLFSSL_KEY_GEN) && (!defined(NO_RSA) || !defined(NO_DH) || \
  17438. !defined(NO_DSA)) && !defined(WC_NO_RNG)
  17439. #ifndef WOLFSSL_SP_MILLER_RABIN_CNT
  17440. /* Always done 8 iterations of Miller-Rabin on check of primality when
  17441. * generating.
  17442. */
  17443. #define WOLFSSL_SP_MILLER_RABIN_CNT 8
  17444. #endif
  17445. /* Generate a random prime for RSA only.
  17446. *
  17447. * @param [out] r SP integer to hold result.
  17448. * @param [in] len Number of bytes in prime. Use -ve to indicate the two
  17449. * lowest bits must be set.
  17450. * @param [in] rng Random number generator.
  17451. * @param [in] heap Heap hint. Unused.
  17452. *
  17453. * @return MP_OKAY on success
  17454. * @return MP_VAL when r or rng is NULL, length is not supported or random
  17455. * number generator fails.
  17456. */
int sp_rand_prime(sp_int* r, int len, WC_RNG* rng, void* heap)
{
    static const byte USE_BBS = 3;
    int   err = MP_OKAY;
    byte  low_bits = 1;
    int   isPrime = MP_NO;
#if defined(WOLFSSL_SP_MATH_ALL) || defined(BIG_ENDIAN_ORDER)
    int   bits = 0;
#endif /* WOLFSSL_SP_MATH_ALL */
    unsigned int digits = 0;

    (void)heap;

    /* Check NULL parameters and 0 is not prime so 0 bytes is invalid. */
    if ((r == NULL) || (rng == NULL) || (len == 0)) {
        err = MP_VAL;
    }

    if (err == MP_OKAY) {
        /* Get type: negative length requests a Blum-Blum-Shub prime
         * (3 mod 4 - both low bits set). */
        if (len < 0) {
            low_bits = USE_BBS;
            len = -len;
        }

        /* Get number of digits required to handle required number of bytes. */
        digits = ((unsigned int)len + SP_WORD_SIZEOF - 1) / SP_WORD_SIZEOF;
        /* Ensure result has space. */
        if (r->size < digits) {
            err = MP_VAL;
        }
    }

    if (err == MP_OKAY) {
    #ifndef WOLFSSL_SP_MATH_ALL
        /* For minimal maths, support only what's in SP and needed for DH. */
    #if defined(WOLFSSL_HAVE_SP_DH) && defined(WOLFSSL_KEY_GEN)
        if (len == 32) {
        }
        else
    #endif /* WOLFSSL_HAVE_SP_DH && WOLFSSL_KEY_GEN */
        /* Generate RSA primes that are half the modulus length. */
    #ifdef WOLFSSL_SP_4096
        if (len == 256) {
            /* Support 2048-bit operations compiled in. */
        }
        else
    #endif
    #ifndef WOLFSSL_SP_NO_3072
        if (len == 192) {
            /* Support 1536-bit operations compiled in. */
        }
        else
    #endif
    #ifndef WOLFSSL_SP_NO_2048
        if (len == 128) {
            /* Support 1024-bit operations compiled in. */
        }
        else
    #endif
        {
            /* Bit length not supported in SP. */
            err = MP_VAL;
        }
    #endif /* !WOLFSSL_SP_MATH_ALL */

    #ifdef WOLFSSL_SP_INT_NEGATIVE
        /* Generated number is always positive. */
        r->sign = MP_ZPOS;
    #endif /* WOLFSSL_SP_INT_NEGATIVE */
        /* Set number of digits that will be used. */
        r->used = digits;
    #if defined(WOLFSSL_SP_MATH_ALL) || defined(BIG_ENDIAN_ORDER)
        /* Calculate number of bits in last digit. */
        bits = (len * 8) & SP_WORD_MASK;
    #endif /* WOLFSSL_SP_MATH_ALL || BIG_ENDIAN_ORDER */
    }

    /* Assume the candidate is probably prime and then test until it is proven
     * composite.
     */
    while ((err == MP_OKAY) && (isPrime == MP_NO)) {
#ifdef SHOW_GEN
        printf(".");
        fflush(stdout);
#endif /* SHOW_GEN */
        /* Generate bytes into digit array. */
        err = wc_RNG_GenerateBlock(rng, (byte*)r->dp, (word32)len);
        if (err != 0) {
            err = MP_VAL;
            break;
        }

        /* Set top bits to ensure bit length required is generated.
         * Also set second top to help ensure product of two primes is
         * going to be twice the number of bits of each.
         */
#ifdef LITTLE_ENDIAN_ORDER
        ((byte*)r->dp)[len-1]             |= 0x80 | 0x40;
#else
        ((byte*)(r->dp + r->used - 1))[0] |= 0x80 | 0x40;
#endif /* LITTLE_ENDIAN_ORDER */

#ifdef BIG_ENDIAN_ORDER
        /* Bytes were put into wrong place when less than full digit. */
        if (bits != 0) {
            r->dp[r->used - 1] >>= SP_WORD_SIZE - bits;
        }
#endif /* BIG_ENDIAN_ORDER */
#ifdef WOLFSSL_SP_MATH_ALL
        /* Mask top digit when less than a digit requested. */
        if (bits > 0) {
            r->dp[r->used - 1] &= ((sp_int_digit)1 << bits) - 1;
        }
#endif /* WOLFSSL_SP_MATH_ALL */

        /* Set mandatory low bits
         *  - bottom bit to make odd.
         *  - For BBS, second lowest too to make Blum integer (3 mod 4).
         */
        r->dp[0] |= low_bits;

        /* Running Miller-Rabin up to 3 times gives us a 2^{-80} chance
         * of a 1024-bit candidate being a false positive, when it is our
         * prime candidate. (Note 4.49 of Handbook of Applied Cryptography.)
         */
        err = sp_prime_is_prime_ex(r, WOLFSSL_SP_MILLER_RABIN_CNT, &isPrime,
            rng);
    }

    return err;
}
  17577. #endif /* WOLFSSL_KEY_GEN && (!NO_DH || !NO_DSA) && !WC_NO_RNG */
  17578. #ifdef WOLFSSL_SP_PRIME_GEN
  17579. /* Miller-Rabin test of "a" to the base of "b" as described in
  17580. * HAC pp. 139 Algorithm 4.24
  17581. *
  17582. * Sets result to 0 if definitely composite or 1 if probably prime.
  17583. * Randomly the chance of error is no more than 1/4 and often
  17584. * very much lower.
  17585. *
  17586. * a is assumed to be odd.
  17587. *
  17588. * @param [in] a SP integer to check.
  17589. * @param [in] b SP integer that is a small prime.
  17590. * @param [out] result MP_YES when number is likey prime.
  17591. * MP_NO otherwise.
  17592. * @param [in] n1 SP integer temporary.
  17593. * @param [in] r SP integer temporary.
  17594. *
  17595. * @return MP_OKAY on success.
  17596. * @return MP_MEM when dynamic memory allocation fails.
  17597. */
static int sp_prime_miller_rabin(const sp_int* a, sp_int* b, int* result,
    sp_int* n1, sp_int* r)
{
    int err = MP_OKAY;
    int s = 0;
    /* y aliases b - the base is consumed and reused as the working value. */
    sp_int* y = b;

    /* Assume not prime. */
    *result = MP_NO;

    /* Ensure small prime is 2 or more. */
    if (sp_cmp_d(b, 1) != MP_GT) {
        err = MP_VAL;
    }
    if (err == MP_OKAY) {
        /* n1 = a - 1 (a is assumed odd, so no borrow can occur.) */
        (void)sp_copy(a, n1);
        n1->dp[0]--;

        /* Set 2**s * r = n1 */
        /* Count the number of least significant bits which are zero. */
        s = sp_cnt_lsb(n1);
        /* Divide n - 1 by 2**s into r. */
        (void)sp_rshb(n1, s, r);

        /* Compute y = b**r mod a */
        err = sp_exptmod(b, r, a, y);
    }
    if (err == MP_OKAY) {
        /* Assume probably prime until shown otherwise. */
        *result = MP_YES;

        /* If y != 1 and y != n1 do */
        if ((sp_cmp_d(y, 1) != MP_EQ) && (_sp_cmp(y, n1) != MP_EQ)) {
            int j = 1;
            /* While j <= s-1 and y != n1 - repeatedly square y. */
            while ((j <= (s - 1)) && (_sp_cmp(y, n1) != MP_EQ)) {
                /* Square for bit shifted down. */
                err = sp_sqrmod(y, a, y);
                if (err != MP_OKAY) {
                    break;
                }

                /* If y == 1 then composite - non-trivial square root of 1. */
                if (sp_cmp_d(y, 1) == MP_EQ) {
                    *result = MP_NO;
                    break;
                }
                ++j;
            }

            /* If y != n1 then composite. */
            if ((*result == MP_YES) && (_sp_cmp(y, n1) != MP_EQ)) {
                *result = MP_NO;
            }
        }
    }

    return err;
}
#if SP_WORD_SIZE == 8
/* Number of pre-computed primes. First n primes - fitting in a digit. */
#define SP_PRIME_SIZE      54

/* The first 54 primes - each fits in an 8-bit digit. */
static const sp_int_digit sp_primes[SP_PRIME_SIZE] = {
    0x02, 0x03, 0x05, 0x07, 0x0B, 0x0D, 0x11, 0x13,
    0x17, 0x1D, 0x1F, 0x25, 0x29, 0x2B, 0x2F, 0x35,
    0x3B, 0x3D, 0x43, 0x47, 0x49, 0x4F, 0x53, 0x59,
    0x61, 0x65, 0x67, 0x6B, 0x6D, 0x71, 0x7F, 0x83,
    0x89, 0x8B, 0x95, 0x97, 0x9D, 0xA3, 0xA7, 0xAD,
    0xB3, 0xB5, 0xBF, 0xC1, 0xC5, 0xC7, 0xD3, 0xDF,
    0xE3, 0xE5, 0xE9, 0xEF, 0xF1, 0xFB
};
#else
/* Number of pre-computed primes. First n primes. */
#define SP_PRIME_SIZE      256

/* The first 256 primes. */
static const sp_uint16 sp_primes[SP_PRIME_SIZE] = {
    0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
    0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
    0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
    0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
    0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
    0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
    0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
    0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
    0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
    0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
    0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
    0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
    0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
    0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
    0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
    0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
    0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
    0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
    0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
    0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
    0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
    0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
    0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
    0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
    0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
    0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
    0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
    0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
    0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
    0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
    0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
    0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
};
#endif
  17701. /* Compare the first n primes with a.
  17702. *
  17703. * @param [in] a Number to check.
  17704. * @param [out] result Whether number was found to be prime.
  17705. * @return 0 when no small prime matches.
  17706. * @return 1 when small prime matches.
  17707. */
  17708. static WC_INLINE int sp_cmp_primes(const sp_int* a, int* result)
  17709. {
  17710. int i;
  17711. int haveRes = 0;
  17712. *result = MP_NO;
  17713. /* Check one digit a against primes table. */
  17714. for (i = 0; i < SP_PRIME_SIZE; i++) {
  17715. if (sp_cmp_d(a, sp_primes[i]) == MP_EQ) {
  17716. *result = MP_YES;
  17717. haveRes = 1;
  17718. break;
  17719. }
  17720. }
  17721. return haveRes;
  17722. }
  17723. /* Using composites is only faster when using 64-bit values. */
  17724. #if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE == 64)
  17725. /* Number of composites. */
  17726. #define SP_COMP_CNT 38
  17727. /* Products of small primes that fit into 64-bits. */
  17728. static sp_int_digit sp_comp[SP_COMP_CNT] = {
  17729. 0x088886ffdb344692, 0x34091fa96ffdf47b, 0x3c47d8d728a77ebb,
  17730. 0x077ab7da9d709ea9, 0x310df3e7bd4bc897, 0xe657d7a1fd5161d1,
  17731. 0x02ad3dbe0cca85ff, 0x0787f9a02c3388a7, 0x1113c5cc6d101657,
  17732. 0x2456c94f936bdb15, 0x4236a30b85ffe139, 0x805437b38eada69d,
  17733. 0x00723e97bddcd2af, 0x00a5a792ee239667, 0x00e451352ebca269,
  17734. 0x013a7955f14b7805, 0x01d37cbd653b06ff, 0x0288fe4eca4d7cdf,
  17735. 0x039fddb60d3af63d, 0x04cd73f19080fb03, 0x0639c390b9313f05,
  17736. 0x08a1c420d25d388f, 0x0b4b5322977db499, 0x0e94c170a802ee29,
  17737. 0x11f6a0e8356100df, 0x166c8898f7b3d683, 0x1babda0a0afd724b,
  17738. 0x2471b07c44024abf, 0x2d866dbc2558ad71, 0x3891410d45fb47df,
  17739. 0x425d5866b049e263, 0x51f767298e2cf13b, 0x6d9f9ece5fc74f13,
  17740. 0x7f5ffdb0f56ee64d, 0x943740d46a1bc71f, 0xaf2d7ca25cec848f,
  17741. 0xcec010484e4ad877, 0xef972c3cfafbcd25
  17742. };
  17743. /* Index of next prime after those used to create composite. */
  17744. static int sp_comp_idx[SP_COMP_CNT] = {
  17745. 15, 25, 34, 42, 50, 58, 65, 72, 79, 86, 93, 100, 106, 112, 118,
  17746. 124, 130, 136, 142, 148, 154, 160, 166, 172, 178, 184, 190, 196, 202, 208,
  17747. 214, 220, 226, 232, 238, 244, 250, 256
  17748. };
  17749. #endif
  17750. /* Determines whether any of the first n small primes divide a evenly.
  17751. *
  17752. * @param [in] a Number to check.
  17753. * @param [in, out] haveRes Boolean indicating a no prime result found.
  17754. * @param [in, out] result Whether a is known to be prime.
  17755. * @return MP_OKAY on success.
  17756. * @return Negative on failure.
  17757. */
static WC_INLINE int sp_div_primes(const sp_int* a, int* haveRes, int* result)
{
    int i;
#if !defined(WOLFSSL_SP_SMALL) && (SP_WORD_SIZE == 64)
    int j;
#endif
    sp_int_digit d;
    int err = MP_OKAY;

#if defined(WOLFSSL_SP_SMALL) || (SP_WORD_SIZE < 64)
    /* Do trial division of a with all known small primes. */
    for (i = 0; i < SP_PRIME_SIZE; i++) {
        /* Small prime divides a when remainder is 0. */
        err = sp_mod_d(a, (sp_int_digit)sp_primes[i], &d);
        if ((err != MP_OKAY) || (d == 0)) {
            /* Divisible (or error) - a is not prime. */
            *result = MP_NO;
            *haveRes = 1;
            break;
        }
    }
#else
    /* 64-bit fast path: reduce a modulo a product of several small primes
     * once, then trial divide only the single-word remainder. */
    /* Start with first prime in composite. */
    i = 0;
    for (j = 0; (!(*haveRes)) && (j < SP_COMP_CNT); j++) {
        /* Reduce a down to a single word. */
        err = sp_mod_d(a, sp_comp[j], &d);
        if ((err != MP_OKAY) || (d == 0)) {
            *result = MP_NO;
            *haveRes = 1;
            break;
        }
        /* Do trial division of d with small primes that make up composite. */
        for (; i < sp_comp_idx[j]; i++) {
            /* Small prime divides a when remainder is 0. */
            if (d % sp_primes[i] == 0) {
                *result = MP_NO;
                *haveRes = 1;
                break;
            }
        }
    }
#endif

    return err;
}
/* Check whether a is prime by checking t iterations of Miller-Rabin.
 *
 * Bases used are the first 'trials' known small primes; the caller must
 * ensure trials <= SP_PRIME_SIZE (sp_prime_is_prime validates this).
 *
 * @param [in]  a       SP integer to check.
 * @param [in]  trials  Number of trials of Miller-Rabin test to perform.
 * @param [out] result  MP_YES when number is prime.
 *                      MP_NO otherwise.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_prime_trials(const sp_int* a, int trials, int* result)
{
    int err = MP_OKAY;
    int i;
    sp_int* n1;
    sp_int* r;
    /* Two working values for Miller-Rabin, each one digit larger than a. */
    DECL_SP_INT_ARRAY(t, a->used + 1, 2);
    /* Base b needs room for a double-width intermediate. */
    DECL_SP_INT(b, a->used * 2 + 1);

    ALLOC_SP_INT_ARRAY(t, a->used + 1, 2, err, NULL);
    /* Allocate number that will hold modular exponentiation result. */
    ALLOC_SP_INT(b, a->used * 2 + 1, err, NULL);
    if (err == MP_OKAY) {
        n1 = t[0];
        r = t[1];

        _sp_init_size(n1, a->used + 1);
        _sp_init_size(r, a->used + 1);
        _sp_init_size(b, a->used * 2 + 1);

        /* Do requested number of trials of Miller-Rabin test. */
        for (i = 0; i < trials; i++) {
            /* Miller-Rabin test with known small prime as the base. */
            _sp_set(b, sp_primes[i]);
            err = sp_prime_miller_rabin(a, b, result, n1, r);
            /* Stop early on error or a definite composite result. */
            if ((err != MP_OKAY) || (*result == MP_NO)) {
                break;
            }
        }

        /* Clear temporary values. */
        sp_clear(n1);
        sp_clear(r);
        sp_clear(b);
    }

    /* Free allocated temporaries. */
    FREE_SP_INT(b, NULL);
    FREE_SP_INT_ARRAY(t, NULL);
    return err;
}
  17847. /* Check whether a is prime.
  17848. * Checks against a number of small primes and does t iterations of
  17849. * Miller-Rabin.
  17850. *
  17851. * @param [in] a SP integer to check.
  17852. * @param [in] trials Number of trials of Miller-Rabin test to perform.
  17853. * @param [out] result MP_YES when number is prime.
  17854. * MP_NO otherwise.
  17855. *
  17856. * @return MP_OKAY on success.
  17857. * @return MP_VAL when a or result is NULL, or trials is out of range.
  17858. * @return MP_MEM when dynamic memory allocation fails.
  17859. */
  17860. int sp_prime_is_prime(const sp_int* a, int trials, int* result)
  17861. {
  17862. int err = MP_OKAY;
  17863. int haveRes = 0;
  17864. /* Validate parameters. */
  17865. if ((a == NULL) || (result == NULL)) {
  17866. if (result != NULL) {
  17867. *result = MP_NO;
  17868. }
  17869. err = MP_VAL;
  17870. }
  17871. else if (a->used * 2 >= SP_INT_DIGITS) {
  17872. err = MP_VAL;
  17873. }
  17874. /* Check validity of Miller-Rabin iterations count.
  17875. * Must do at least one and need a unique pre-computed prime for each
  17876. * iteration.
  17877. */
  17878. if ((err == MP_OKAY) && ((trials <= 0) || (trials > SP_PRIME_SIZE))) {
  17879. *result = MP_NO;
  17880. err = MP_VAL;
  17881. }
  17882. /* Short-cut, 1 is not prime. */
  17883. if ((err == MP_OKAY) && sp_isone(a)) {
  17884. *result = MP_NO;
  17885. haveRes = 1;
  17886. }
  17887. SAVE_VECTOR_REGISTERS(err = _svr_ret;);
  17888. /* Check against known small primes when a has 1 digit. */
  17889. if ((err == MP_OKAY) && (!haveRes) && (a->used == 1) &&
  17890. (a->dp[0] <= sp_primes[SP_PRIME_SIZE - 1])) {
  17891. haveRes = sp_cmp_primes(a, result);
  17892. }
  17893. /* Check all small primes for even divisibility. */
  17894. if ((err == MP_OKAY) && (!haveRes)) {
  17895. err = sp_div_primes(a, &haveRes, result);
  17896. }
  17897. /* Check a number of iterations of Miller-Rabin with small primes. */
  17898. if ((err == MP_OKAY) && (!haveRes)) {
  17899. err = _sp_prime_trials(a, trials, result);
  17900. }
  17901. RESTORE_VECTOR_REGISTERS();
  17902. return err;
  17903. }
  17904. #ifndef WC_NO_RNG
/* Check whether a is prime by doing t iterations of Miller-Rabin.
 *
 * t random numbers should give a (1/4)^t chance of a false prime.
 *
 * @param [in]  a       SP integer to check.
 * @param [in]  trials  Number of iterations of Miller-Rabin test to perform.
 * @param [out] result  MP_YES when number is prime.
 *                      MP_NO otherwise.
 * @param [in]  rng     Random number generator for Miller-Rabin testing.
 *
 * @return  MP_OKAY on success.
 * @return  MP_VAL when a, result or rng is NULL.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static int _sp_prime_random_trials(const sp_int* a, int trials, int* result,
    WC_RNG* rng)
{
    int err = MP_OKAY;
    int bits = sp_count_bits(a);
    /* Number of bytes needed to cover all bits of a. */
    word32 baseSz = ((word32)bits + 7) / 8;
    /* ds: single-extra-digit temporaries (c = a - 2, Miller-Rabin n1). */
    DECL_SP_INT_ARRAY(ds, a->used + 1, 2);
    /* d: double-width temporaries (random base b, Miller-Rabin scratch r). */
    DECL_SP_INT_ARRAY(d, a->used * 2 + 1, 2);

    ALLOC_SP_INT_ARRAY(ds, a->used + 1, 2, err, NULL);
    ALLOC_SP_INT_ARRAY(d, a->used * 2 + 1, 2, err, NULL);
    if (err == MP_OKAY) {
        sp_int* c = ds[0];  /* Upper bound for base: a - 2. */
        sp_int* n1 = ds[1]; /* Scratch for Miller-Rabin. */
        sp_int* b = d[0];   /* Random base for a trial. */
        sp_int* r = d[1];   /* Scratch for Miller-Rabin. */

        _sp_init_size(c , a->used + 1);
        _sp_init_size(n1, a->used + 1);
        _sp_init_size(b , a->used * 2 + 1);
        _sp_init_size(r , a->used * 2 + 1);

        /* c = a - 2: a usable base b must satisfy 2 < b < a - 2. */
        _sp_sub_d(a, 2, c);
        /* bits becomes the count of significant bits in a's top digit. */
        bits &= SP_WORD_MASK;

        /* Keep trying random numbers until all trials complete. */
        while (trials > 0) {
            /* Generate random trial number. */
            err = wc_RNG_GenerateBlock(rng, (byte*)b->dp, baseSz);
            if (err != MP_OKAY) {
                break;
            }
            b->used = a->used;

        #ifdef BIG_ENDIAN_ORDER
            /* Fix top digit if fewer bytes than a full digit generated. */
            if (((baseSz * 8) & SP_WORD_MASK) != 0) {
                b->dp[b->used-1] >>=
                    SP_WORD_SIZE - ((baseSz * 8) & SP_WORD_MASK);
            }
        #endif /* BIG_ENDIAN_ORDER */

            /* Ensure the top word has no more bits than necessary so that
             * b is at most one bit-length larger than a allows. */
            if (bits > 0) {
                b->dp[b->used - 1] &= ((sp_int_digit)1 << bits) - 1;
                sp_clamp(b);
            }

            /* Can't use random value if it is: 0, 1, 2, a-2, a-1, >= a.
             * Retry without consuming a trial. */
            if ((sp_cmp_d(b, 2) != MP_GT) || (_sp_cmp(b, c) != MP_LT)) {
                continue;
            }

            /* Perform Miller-Rabin test with random value. */
            err = sp_prime_miller_rabin(a, b, result, n1, r);
            if ((err != MP_OKAY) || (*result == MP_NO)) {
                break;
            }

            /* Trial complete. */
            trials--;
        }

        /* Zeroize temporary values used when generating private prime. */
        sp_forcezero(n1);
        sp_forcezero(r);
        sp_forcezero(b);
        sp_forcezero(c);
    }

    FREE_SP_INT_ARRAY(d, NULL);
    FREE_SP_INT_ARRAY(ds, NULL);
    return err;
}
  17982. #endif /*!WC_NO_RNG */
  17983. /* Check whether a is prime.
  17984. * Checks against a number of small primes and does t iterations of
  17985. * Miller-Rabin.
  17986. *
  17987. * @param [in] a SP integer to check.
  17988. * @param [in] trials Number of iterations of Miller-Rabin test to perform.
  17989. * @param [out] result MP_YES when number is prime.
  17990. * MP_NO otherwise.
  17991. * @param [in] rng Random number generator for Miller-Rabin testing.
  17992. *
  17993. * @return MP_OKAY on success.
  17994. * @return MP_VAL when a, result or rng is NULL.
  17995. * @return MP_MEM when dynamic memory allocation fails.
  17996. */
  17997. int sp_prime_is_prime_ex(const sp_int* a, int trials, int* result, WC_RNG* rng)
  17998. {
  17999. int err = MP_OKAY;
  18000. int ret = MP_YES;
  18001. int haveRes = 0;
  18002. if ((a == NULL) || (result == NULL) || (rng == NULL)) {
  18003. err = MP_VAL;
  18004. }
  18005. #ifndef WC_NO_RNG
  18006. if ((err == MP_OKAY) && (a->used * 2 >= SP_INT_DIGITS)) {
  18007. err = MP_VAL;
  18008. }
  18009. #endif
  18010. #ifdef WOLFSSL_SP_INT_NEGATIVE
  18011. if ((err == MP_OKAY) && (a->sign == MP_NEG)) {
  18012. err = MP_VAL;
  18013. }
  18014. #endif
  18015. /* Ensure trials is valid. Maximum based on number of small primes
  18016. * available. */
  18017. if ((err == MP_OKAY) && ((trials <= 0) || (trials > SP_PRIME_SIZE))) {
  18018. err = MP_VAL;
  18019. }
  18020. if ((err == MP_OKAY) && sp_isone(a)) {
  18021. ret = MP_NO;
  18022. haveRes = 1;
  18023. }
  18024. SAVE_VECTOR_REGISTERS(err = _svr_ret;);
  18025. /* Check against known small primes when a has 1 digit. */
  18026. if ((err == MP_OKAY) && (!haveRes) && (a->used == 1) &&
  18027. (a->dp[0] <= (sp_int_digit)sp_primes[SP_PRIME_SIZE - 1])) {
  18028. haveRes = sp_cmp_primes(a, &ret);
  18029. }
  18030. /* Check all small primes for even divisibility. */
  18031. if ((err == MP_OKAY) && (!haveRes)) {
  18032. err = sp_div_primes(a, &haveRes, &ret);
  18033. }
  18034. #ifndef WC_NO_RNG
  18035. /* Check a number of iterations of Miller-Rabin with random large values. */
  18036. if ((err == MP_OKAY) && (!haveRes)) {
  18037. err = _sp_prime_random_trials(a, trials, &ret, rng);
  18038. }
  18039. #else
  18040. (void)trials;
  18041. #endif /* !WC_NO_RNG */
  18042. if (result != NULL) {
  18043. *result = ret;
  18044. }
  18045. RESTORE_VECTOR_REGISTERS();
  18046. return err;
  18047. }
  18048. #endif /* WOLFSSL_SP_PRIME_GEN */
  18049. #if !defined(NO_RSA) && defined(WOLFSSL_KEY_GEN)
/* Calculates the Greatest Common Divisor (GCD) of a and b into r.
 *
 * Find the largest number that divides both a and b without remainder.
 * r <= a, r <= b, a % r == 0, b % r == 0
 *
 * a and b are positive integers.
 *
 * Euclidean Algorithm:
 *  1. If a > b then a = b, b = a
 *  2. u = a
 *  3. v = b % a
 *  4. While v != 0
 *   4.1. t = u % v
 *   4.2. u <= v, v <= t, t <= u
 *  5. r = u
 *
 * @param [in]  a  SP integer of first operand.
 * @param [in]  b  SP integer of second operand.
 * @param [out] r  SP integer to hold result.
 *
 * @return  MP_OKAY on success.
 * @return  MP_MEM when dynamic memory allocation fails.
 */
static WC_INLINE int _sp_gcd(const sp_int* a, const sp_int* b, sp_int* r)
{
    int err = MP_OKAY;
    sp_int* u = NULL;
    sp_int* v = NULL;
    sp_int* t = NULL;
    /* Used for swapping sp_ints. */
    sp_int* s;
    /* Determine maximum digit length numbers will reach. */
    unsigned int used = (a->used >= b->used) ? a->used + 1 : b->used + 1;
    DECL_SP_INT_ARRAY(d, used, 3);

    SAVE_VECTOR_REGISTERS(err = _svr_ret;);

    ALLOC_SP_INT_ARRAY(d, used, 3, err, NULL);
    if (err == MP_OKAY) {
        u = d[0];
        v = d[1];
        t = d[2];
        _sp_init_size(u, used);
        _sp_init_size(v, used);
        _sp_init_size(t, used);

        /* 1. If a > b then a = b, b = a.
         *    Make a <= b by swapping the (local) pointers only.
         */
        if (_sp_cmp(a, b) == MP_GT) {
            const sp_int* tmp;
            tmp = a;
            a = b;
            b = tmp;
        }

        /* 2. u = a */
        _sp_copy(a, u);
        /* 3. v = b mod a */
        if (a->used == 1) {
            /* Single-digit modulus - use the faster digit reduction. */
            err = sp_mod_d(b, a->dp[0], &v->dp[0]);
            v->used = (v->dp[0] != 0);
        }
        else {
            err = sp_mod(b, a, v);
        }
    }

    /* 4. While v != 0 */
    /* Keep reducing larger by smaller until smaller is 0 or u and v both one
     * digit. (err == MP_OKAY guards the u/v dereferences when allocation
     * failed above.)
     */
    while ((err == MP_OKAY) && (!sp_iszero(v)) && (u->used > 1)) {
        /* u' = v, v' = u mod v */
        /* 4.1 t = u mod v */
        if (v->used == 1) {
            err = sp_mod_d(u, v->dp[0], &t->dp[0]);
            t->used = (t->dp[0] != 0);
        }
        else {
            err = sp_mod(u, v, t);
        }
        /* 4.2. u <= v, v <= t, t <= u - rotate pointers, no copying. */
        s = u; u = v; v = t; t = s;
    }
    /* Only one digit remaining in u and v - plain C modulo suffices. */
    while ((err == MP_OKAY) && (!sp_iszero(v))) {
        /* u' = v, v' = u mod v */
        /* 4.1 t = u mod v */
        t->dp[0] = u->dp[0] % v->dp[0];
        t->used = (t->dp[0] != 0);
        /* 4.2. u <= v, v <= t, t <= u */
        s = u; u = v; v = t; t = s;
    }
    if (err == MP_OKAY) {
        /* 5. r = u */
        _sp_copy(u, r);
    }

    FREE_SP_INT_ARRAY(d, NULL);

    RESTORE_VECTOR_REGISTERS();

    return err;
}
  18147. /* Calculates the Greatest Common Denominator (GCD) of a and b into r.
  18148. *
  18149. * Find the largest number that divides both a and b without remainder.
  18150. * r <= a, r <= b, a % r == 0, b % r == 0
  18151. *
  18152. * a and b are positive integers.
  18153. *
  18154. * @param [in] a SP integer of first operand.
  18155. * @param [in] b SP integer of second operand.
  18156. * @param [out] r SP integer to hold result.
  18157. *
  18158. * @return MP_OKAY on success.
  18159. * @return MP_VAL when a, b or r is NULL or too large.
  18160. * @return MP_MEM when dynamic memory allocation fails.
  18161. */
  18162. int sp_gcd(const sp_int* a, const sp_int* b, sp_int* r)
  18163. {
  18164. int err = MP_OKAY;
  18165. /* Validate parameters. */
  18166. if ((a == NULL) || (b == NULL) || (r == NULL)) {
  18167. err = MP_VAL;
  18168. }
  18169. /* Check that we have space in numbers to do work. */
  18170. else if ((a->used >= SP_INT_DIGITS) || (b->used >= SP_INT_DIGITS)) {
  18171. err = MP_VAL;
  18172. }
  18173. /* Check that r is large enough to hold maximum sized result. */
  18174. else if (((a->used <= b->used) && (r->size < a->used)) ||
  18175. ((b->used < a->used) && (r->size < b->used))) {
  18176. err = MP_VAL;
  18177. }
  18178. #ifdef WOLFSSL_SP_INT_NEGATIVE
  18179. /* Algorithm doesn't work with negative numbers. */
  18180. else if ((a->sign == MP_NEG) || (b->sign == MP_NEG)) {
  18181. err = MP_VAL;
  18182. }
  18183. #endif
  18184. else if (sp_iszero(a)) {
  18185. /* GCD of 0 and 0 is undefined - all integers divide 0. */
  18186. if (sp_iszero(b)) {
  18187. err = MP_VAL;
  18188. }
  18189. else {
  18190. /* GCD of 0 and b is b - b divides 0. */
  18191. err = sp_copy(b, r);
  18192. }
  18193. }
  18194. else if (sp_iszero(b)) {
  18195. /* GCD of 0 and a is a - a divides 0. */
  18196. err = sp_copy(a, r);
  18197. }
  18198. else {
  18199. /* Calculate GCD. */
  18200. err = _sp_gcd(a, b, r);
  18201. }
  18202. return err;
  18203. }
  18204. #endif /* WOLFSSL_SP_MATH_ALL && !NO_RSA && WOLFSSL_KEY_GEN */
  18205. #if !defined(NO_RSA) && defined(WOLFSSL_KEY_GEN) && \
  18206. (!defined(WC_RSA_BLINDING) || defined(HAVE_FIPS) || defined(HAVE_SELFTEST))
  18207. /* Calculates the Lowest Common Multiple (LCM) of a and b and stores in r.
  18208. * Smallest number divisible by both numbers.
  18209. *
  18210. * a and b are positive integers.
  18211. *
  18212. * lcm(a, b) = (a / gcd(a, b)) * b
  18213. * Divide the common divisor from a and multiply by b.
  18214. *
  18215. * Algorithm:
  18216. * 1. t0 = gcd(a, b)
  18217. * 2. If a > b then
  18218. * 2.1. t1 = a / t0
  18219. * 2.2. r = b * t1
  18220. * 3. Else
  18221. * 3.1. t1 = b / t0
  18222. * 3.2. r = a * t1
  18223. *
  18224. * @param [in] a SP integer of first operand.
  18225. * @param [in] b SP integer of second operand.
  18226. * @param [out] r SP integer to hold result.
  18227. *
  18228. * @return MP_OKAY on success.
  18229. * @return MP_MEM when dynamic memory allocation fails.
  18230. */
  18231. static int _sp_lcm(const sp_int* a, const sp_int* b, sp_int* r)
  18232. {
  18233. int err = MP_OKAY;
  18234. /* Determine maximum digit length numbers will reach. */
  18235. unsigned int used = ((a->used >= b->used) ? a->used + 1: b->used + 1);
  18236. DECL_SP_INT_ARRAY(t, used, 2);
  18237. ALLOC_SP_INT_ARRAY(t, used, 2, err, NULL);
  18238. if (err == MP_OKAY) {
  18239. _sp_init_size(t[0], used);
  18240. _sp_init_size(t[1], used);
  18241. SAVE_VECTOR_REGISTERS(err = _svr_ret;);
  18242. if (err == MP_OKAY) {
  18243. /* 1. t0 = gcd(a, b) */
  18244. err = sp_gcd(a, b, t[0]);
  18245. }
  18246. if (err == MP_OKAY) {
  18247. /* Divide the greater by the common divisor and multiply by other
  18248. * to operate on the smallest length numbers.
  18249. */
  18250. /* 2. If a > b then */
  18251. if (_sp_cmp_abs(a, b) == MP_GT) {
  18252. /* 2.1. t1 = a / t0 */
  18253. err = sp_div(a, t[0], t[1], NULL);
  18254. if (err == MP_OKAY) {
  18255. /* 2.2. r = b * t1 */
  18256. err = sp_mul(b, t[1], r);
  18257. }
  18258. }
  18259. /* 3. Else */
  18260. else {
  18261. /* 3.1. t1 = b / t0 */
  18262. err = sp_div(b, t[0], t[1], NULL);
  18263. if (err == MP_OKAY) {
  18264. /* 3.2. r = a * t1 */
  18265. err = sp_mul(a, t[1], r);
  18266. }
  18267. }
  18268. }
  18269. RESTORE_VECTOR_REGISTERS();
  18270. }
  18271. FREE_SP_INT_ARRAY(t, NULL);
  18272. return err;
  18273. }
  18274. /* Calculates the Lowest Common Multiple (LCM) of a and b and stores in r.
  18275. * Smallest number divisible by both numbers.
  18276. *
  18277. * a and b are positive integers.
  18278. *
  18279. * @param [in] a SP integer of first operand.
  18280. * @param [in] b SP integer of second operand.
  18281. * @param [out] r SP integer to hold result.
  18282. *
  18283. * @return MP_OKAY on success.
  18284. * @return MP_VAL when a, b or r is NULL; or a or b is zero.
  18285. * @return MP_MEM when dynamic memory allocation fails.
  18286. */
  18287. int sp_lcm(const sp_int* a, const sp_int* b, sp_int* r)
  18288. {
  18289. int err = MP_OKAY;
  18290. /* Validate parameters. */
  18291. if ((a == NULL) || (b == NULL) || (r == NULL)) {
  18292. err = MP_VAL;
  18293. }
  18294. #ifdef WOLFSSL_SP_INT_NEGATIVE
  18295. /* Ensure a and b are positive. */
  18296. else if ((a->sign == MP_NEG) || (b->sign >= MP_NEG)) {
  18297. err = MP_VAL;
  18298. }
  18299. #endif
  18300. /* Ensure r has space for maximumal result. */
  18301. else if (r->size < a->used + b->used) {
  18302. err = MP_VAL;
  18303. }
  18304. /* LCM of 0 and any number is undefined as 0 is not in the set of values
  18305. * being used.
  18306. */
  18307. if ((err == MP_OKAY) && (mp_iszero(a) || mp_iszero(b))) {
  18308. err = MP_VAL;
  18309. }
  18310. if (err == MP_OKAY) {
  18311. /* Do operation. */
  18312. err = _sp_lcm(a, b, r);
  18313. }
  18314. return err;
  18315. }
  18316. #endif /* WOLFSSL_SP_MATH_ALL && !NO_RSA && WOLFSSL_KEY_GEN */
/* Returns the run time settings.
 *
 * @return  Settings value - the compile-time CTC_SETTINGS value, allowing
 *          callers to verify the library was built with expected options.
 */
word32 CheckRunTimeSettings(void)
{
    return CTC_SETTINGS;
}
/* Returns the fast math settings.
 *
 * @return  Setting - number of bits in a digit (SP_WORD_SIZE), so callers
 *          can verify the built digit size matches their expectation.
 */
word32 CheckRunTimeFastMath(void)
{
    return SP_WORD_SIZE;
}
  18333. #ifdef WOLFSSL_CHECK_MEM_ZERO
#ifdef WOLFSSL_CHECK_MEM_ZERO
/* Add an MP to check.
 *
 * Registers the full digit buffer of sp (sp->size digits, not just the
 * currently used ones) with the memory-zeroization checker.
 *
 * @param [in] name  Name of address to check.
 * @param [in] sp    sp_int that needs to be checked.
 */
void sp_memzero_add(const char* name, sp_int* sp)
{
    wc_MemZero_Add(name, sp->dp, sp->size * sizeof(sp_digit));
}
/* Check the memory in the data pointer for memory that must be zero.
 *
 * Verifies the full digit buffer of sp (sp->size digits) against the
 * registered must-be-zero regions.
 *
 * @param [in] sp  sp_int that needs to be checked.
 */
void sp_memzero_check(sp_int* sp)
{
    wc_MemZero_Check(sp->dp, sp->size * sizeof(sp_digit));
}
  18351. #endif /* WOLFSSL_CHECK_MEM_ZERO */
  18352. #if (!defined(WOLFSSL_SMALL_STACK) && !defined(SP_ALLOC)) || \
  18353. defined(WOLFSSL_SP_NO_MALLOC)
  18354. #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \
  18355. !defined(WOLFSSL_SP_NO_DYN_STACK)
  18356. #pragma GCC diagnostic pop
  18357. #endif
  18358. #endif
  18359. #endif /* WOLFSSL_SP_MATH || WOLFSSL_SP_MATH_ALL */