#! /usr/bin/env perl
# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#========================================================================
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
#========================================================================
#
# Approach: assume we don't want to reload constants, so reserve ~half of
# the vector register file for constants
#
# The main loop acts on four 16B blocks per iteration, then does a modulo
# reduction of the intermediate hashes accumulated from those 4 blocks:
#
#  ____________________________________________________
# |                                                    |
# | PRE                                                |
# |____________________________________________________|
# |                |                |                  |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# |                                                    |
# |                       MODULO                       |
# |____________________________________________________|
#
# PRE:
#     Ensure the previously generated intermediate hash is aligned and
#     merged with the result for GHASH 4k+0
#      EXT low_acc, low_acc, low_acc, #8
#      EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block:
#     Increment and byte-reverse the counter in scalar registers, then
#     transfer it to SIMD registers
#      REV ctr32, rev_ctr32
#      ORR ctr64, constctr96_top32, ctr32, LSL #32
#      INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
#      INS ctr_next.d[1], ctr64
#      ADD rev_ctr32, #1
#
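#     (Aside, a sanity model rather than anything the module emits: the
#     counter block is just the 12-byte IV with a 32-bit big-endian
#     counter appended, which is why only rev_ctr32 changes from block to
#     block. A standalone Perl sketch, with a hypothetical
#     next_ctr_block() helper:
#
#          sub next_ctr_block {
#              my ($iv12, $rev_ctr32) = @_;           # 12-byte IV, host-order counter
#              return $iv12 . pack("N", $rev_ctr32);  # "N" = 32-bit big-endian, i.e. the REV
#          }
#          my @ctr_blocks = map { next_ctr_block("\x00" x 12, $_) } 1 .. 4;
#     )
#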
# AES block:
#     Do AES encryption/decryption on CTR block X and EOR it with input
#     block X. The example below assumes a 256-bit key (14 rounds).
#     A small trick used here: load the input into scalar registers and
#     EOR it with the last round key before transferring it to SIMD.
#     Given how constrained we are for ASIMD registers, this is quite
#     important.
#
# Encrypt:
#      LDR input_low, [ input_ptr ], #8
#      LDR input_high, [ input_ptr ], #8
#      EOR input_low, k14_low
#      EOR input_high, k14_high
#      INS res_curr.d[0], input_low
#      INS res_curr.d[1], input_high
#      AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k13
#      EOR res_curr, res_curr, ctr_curr
#      ST1 { res_curr.16b }, [ output_ptr ], #16
#
# Decrypt:
#      AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#      AESE ctr_curr, k13
#      LDR res_curr, [ input_ptr ], #16
#      EOR res_curr, res_curr, ctr_curr
#      MOV output_low, res_curr.d[0]
#      MOV output_high, res_curr.d[1]
#      EOR output_low, k14_low
#      EOR output_high, k14_high
#      STP output_low, output_high, [ output_ptr ], #16
#
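#     (Why folding k14 into the input is sound, as a throwaway model: EOR
#     is associative and commutative, so
#     input ^ (state ^ k14) == (input ^ k14) ^ state. In Perl, with
#     random 32-bit lanes standing in for the real 64-bit halves:
#
#          my ($state, $k14, $in) = map { int(rand(2**32)) } 1 .. 3;
#          die "xor fold broken"
#              unless (($in ^ $k14) ^ $state) == ($in ^ ($state ^ $k14));
#     )
#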
# GHASH block X:
#     Do a 128b Karatsuba polynomial multiplication on the block.
#     We only have 64b->128b polynomial multipliers, so naively we would
#     need four 64b multiplies to generate the 128b multiplication:
#      Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
#     The idea behind Karatsuba multiplication is that we can get away
#     with just 3 64b multiplies:
#      Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
#
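#     (The identity is easy to sanity-check at a toy width - 8-bit halves
#     instead of 64-bit - with a naive carry-less multiply. A standalone
#     Perl sketch, not part of the generated code:
#
#          sub clmul {
#              my ($a, $b) = @_;
#              my $r = 0;
#              for my $i (0 .. 15) { $r ^= ($b << $i) if ($a >> $i) & 1; }
#              return $r;
#          }
#          my ($Ah, $Al, $Bh, $Bl) = map { int(rand(256)) } 1 .. 4;
#          my $schoolbook = (clmul($Ah, $Bh) << 16) ^ clmul($Al, $Bl)
#                         ^ ((clmul($Ah, $Bl) ^ clmul($Al, $Bh)) << 8);
#          my $karatsuba  = (clmul($Ah, $Bh) << 16) ^ clmul($Al, $Bl)
#                         ^ ((clmul($Ah ^ $Al, $Bh ^ $Bl)
#                             ^ clmul($Ah, $Bh) ^ clmul($Al, $Bl)) << 8);
#          die "karatsuba mismatch" unless $schoolbook == $karatsuba;
#     )
#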
#     There is some complication here because the bit order of GHASH's
#     PMULL is reversed compared to elsewhere, so we are multiplying with
#     "twisted" powers of H.
#
# Note: We can PMULL directly into acc_x in the first GHASH of the loop
# Note: When scheduling for big cores we want to split the processing
#     over two loop iterations - otherwise the critical path latency
#     dominates performance
#
#     This has a knock-on effect on register pressure, so we have to be a
#     bit more clever with our temporary registers than indicated here
#      REV64 res_curr, res_curr
#      INS t_m.d[0], res_curr.d[1]
#      EOR t_m.8B, t_m.8B, res_curr.8B
#      PMULL2 t_h, res_curr, HX
#      PMULL t_l, res_curr, HX
#      PMULL t_m, t_m, HX_k
#      EOR acc_h, acc_h, t_h
#      EOR acc_l, acc_l, t_l
#      EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators (~representing the sum of 256b
#     multiplication results) from GHASH and do a modulo reduction on them.
#     There is some complication here because the bit order of GHASH's
#     PMULL is reversed compared to elsewhere, so we are doing the modulo
#     with a reversed constant.
#      EOR acc_m, acc_m, acc_h
#      EOR acc_m, acc_m, acc_l // Finish off Karatsuba processing
#      PMULL t_mod, acc_h, mod_constant
#      EXT acc_h, acc_h, acc_h, #8
#      EOR acc_m, acc_m, acc_h
#      EOR acc_m, acc_m, t_mod
#      PMULL acc_h, acc_m, mod_constant
#      EXT acc_m, acc_m, acc_m, #8
#      EOR acc_l, acc_l, acc_h
#      EOR acc_l, acc_l, acc_m
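#
# (For reference only - not the method used below: what the GHASH and
# MODULO steps compute is the GF(2^128) product from NIST SP 800-38D,
# i.e. multiplication modulo x^128 + x^7 + x^2 + x + 1 in GCM's
# reflected bit order. A slow but simple Perl model using Math::BigInt,
# handy for cross-checking test vectors:
#
#      use Math::BigInt;
#      my $R = Math::BigInt->new("0xE1000000000000000000000000000000");
#      sub gf128_mul {                    # X, Y: 128-bit Math::BigInt, MSB-first
#          my ($X, $Y) = @_;
#          my ($Z, $V) = (Math::BigInt->bzero(), $Y->copy());
#          for my $i (reverse 0 .. 127) { # spec bit 0 is the integer MSB
#              $Z->bxor($V) if $X->copy()->brsft($i)->band(1)->is_one();
#              my $lsb = $V->is_odd();
#              $V->brsft(1);
#              $V->bxor($R) if $lsb;
#          }
#          return $Z;
#      }
# )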
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
$input_ptr="x0";    # argument block
$bit_length="x1";
$output_ptr="x2";
$current_tag="x3";

$counter="x16";
$cc="x8";
{
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));

my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));

my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));

my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));

my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));

my $t0="v8";
my $t0d="d8";

my ($t1,$t2,$t3)=map("v$_",(28..30));
my ($t1d,$t2d,$t3d)=map("d$_",(28..30));

my $t4="v8";
my $t4d="d8";
my $t5="v28";
my $t5d="d28";
my $t6="v31";
my $t6d="d31";

my $t7="v4";
my $t7d="d4";
my $t8="v29";
my $t8d="d29";
my $t9="v30";
my $t9d="d30";

my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));

my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v31";

my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) \$_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) \$_byte a,b,c,0xf2
#endif
.text
___
#########################################################################################
# size_t aes_gcm_enc_128_kernel(const uint8_t *plaintext,
#                               uint64_t plaintext_length,
#                               uint8_t *ciphertext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
        AARCH64_VALID_CALL_TARGET
        cbz x1, .L128_enc_ret
        stp x19, x20, [sp, #-112]!
        mov x16, x4
        mov x8, x5
        stp x21, x22, [sp, #16]
        stp x23, x24, [sp, #32]
        stp d8, d9, [sp, #48]
        stp d10, d11, [sp, #64]
        stp d12, d13, [sp, #80]
        stp d14, d15, [sp, #96]
        ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
        rev $ctr96_b64x, $ctr96_b64x
        rev $ctr96_t32x, $ctr96_t32x
#endif
        ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
#ifdef __AARCH64EB__
        ror $rk10_l, $rk10_l, #32
        ror $rk10_h, $rk10_h, #32
#endif
        ld1 {$acc_lb}, [$current_tag]
        ext $acc_lb, $acc_lb, $acc_lb, #8
        rev64 $acc_lb, $acc_lb
        lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
        mov $len, $main_end_input_ptr
        ld1 {$rk0s}, [$cc], #16 @ load rk0
        add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
        sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
        lsr $rctr32x, $ctr96_t32x, #32
        ldr $h4q, [$current_tag, #112] @ load h4l | h4h
#ifndef __AARCH64EB__
        ext $h4b, $h4b, $h4b, #8
#endif
        fmov $ctr1d, $ctr96_b64x @ CTR block 1
        rev $rctr32w, $rctr32w @ rev_ctr32
        add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
        orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
        ld1 {$rk1s}, [$cc], #16 @ load rk1
        rev $ctr32w, $rctr32w @ CTR block 1
        add $rctr32w, $rctr32w, #1 @ CTR block 1
        fmov $ctr3d, $ctr96_b64x @ CTR block 3
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
        ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
        fmov $ctr1.d[1], $ctr32x @ CTR block 1
        rev $ctr32w, $rctr32w @ CTR block 2
        fmov $ctr2d, $ctr96_b64x @ CTR block 2
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
        add $rctr32w, $rctr32w, #1 @ CTR block 2
        fmov $ctr2.d[1], $ctr32x @ CTR block 2
        rev $ctr32w, $rctr32w @ CTR block 3
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
        ld1 {$rk2s}, [$cc], #16 @ load rk2
        add $rctr32w, $rctr32w, #1 @ CTR block 3
        fmov $ctr3.d[1], $ctr32x @ CTR block 3
        ldr $h3q, [$current_tag, #80] @ load h3l | h3h
#ifndef __AARCH64EB__
        ext $h3b, $h3b, $h3b, #8
#endif
        aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
        ld1 {$rk3s}, [$cc], #16 @ load rk3
        aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
        ldr $h1q, [$current_tag, #32] @ load h1l | h1h
#ifndef __AARCH64EB__
        ext $h1b, $h1b, $h1b, #8
#endif
        aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
        ld1 {$rk4s}, [$cc], #16 @ load rk4
        aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
        ld1 {$rk5s}, [$cc], #16 @ load rk5
        aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
        trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
        aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
        ld1 {$rk6s}, [$cc], #16 @ load rk6
        aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
        ld1 {$rk7s}, [$cc], #16 @ load rk7
        aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
        trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
        aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
        ld1 {$rk8s}, [$cc], #16 @ load rk8
        aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
        ldr $h2q, [$current_tag, #64] @ load h2l | h2h
#ifndef __AARCH64EB__
        ext $h2b, $h2b, $h2b, #8
#endif
        aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
        aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
        eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
        aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
        aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
        aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
        ld1 {$rk9s}, [$cc], #16 @ load rk9
        aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
        and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
        trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
        aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
        add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
        aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
        cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
        aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
        aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
        aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
        aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
        aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
        aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
        aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
        trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
        aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
        aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
        aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
        aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
        aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
        aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
        aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
        aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
        aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
        aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
        aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
        aese $ctr2b, $rk9 @ AES block 2 - round 9
        aese $ctr0b, $rk9 @ AES block 0 - round 9
        eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
        aese $ctr1b, $rk9 @ AES block 1 - round 9
        aese $ctr3b, $rk9 @ AES block 3 - round 9
        b.ge .L128_enc_tail @ handle tail
        ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l0, $input_l0
        rev $input_h0, $input_h0
#endif
        ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l2, $input_l2
        rev $input_h2, $input_h2
#endif
        ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l1, $input_l1
        rev $input_h1, $input_h1
#endif
        ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l3, $input_l3
        rev $input_h3, $input_h3
#endif
        eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
        eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
        eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
        fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
        eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
        eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
        fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
        fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
        eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
        eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
        fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
        fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
        eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
        rev $ctr32w, $rctr32w @ CTR block 4
        fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
        eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
        fmov $ctr0d, $ctr96_b64x @ CTR block 4
        add $rctr32w, $rctr32w, #1 @ CTR block 4
        fmov $ctr0.d[1], $ctr32x @ CTR block 4
        rev $ctr32w, $rctr32w @ CTR block 5
        eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
        fmov $ctr1d, $ctr96_b64x @ CTR block 5
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
        add $rctr32w, $rctr32w, #1 @ CTR block 5
        add $input_ptr, $input_ptr, #64 @ AES input_ptr update
        fmov $ctr1.d[1], $ctr32x @ CTR block 5
        fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
        rev $ctr32w, $rctr32w @ CTR block 6
        st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
        fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
        add $rctr32w, $rctr32w, #1 @ CTR block 6
        eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
        st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
        fmov $ctr2d, $ctr96_b64x @ CTR block 6
        cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
        fmov $ctr2.d[1], $ctr32x @ CTR block 6
        rev $ctr32w, $rctr32w @ CTR block 7
        st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
        eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
        st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
        b.ge .L128_enc_prepretail @ do prepretail
.L128_enc_main_loop: @ main loop start
        ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l3, $input_l3
        rev $input_h3, $input_h3
#endif
        rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
        rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
        aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
        fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
        ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
        rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
        aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
        fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
        aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
        mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
        aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
        mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
        aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
        eor $res0b, $res0b, $acc_lb @ PRE 1
        aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
        eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
        pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
        eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
        ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l0, $input_l0
        rev $input_h0, $input_h0
#endif
        aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
        rev $ctr32w, $rctr32w @ CTR block 4k+8
        eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
        mov $t0d, $res0.d[1] @ GHASH block 4k - mid
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
        pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
        mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
        aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
        pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
        eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
        aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
        aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
        eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
        pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
        pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
        rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
        pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
        pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
        ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
        pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
        eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
        eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
        mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
        aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
        eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
        aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
        eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
        aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
        eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
        pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
        aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
        eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
        pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
        pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
        movi $mod_constant.8b, #0xc2
        pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
        eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
        aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
        aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
        shl $mod_constantd, $mod_constantd, #56 @ mod_constant
        aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
        eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
        aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
        ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l1, $input_l1
        rev $input_h1, $input_h1
#endif
        aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
        eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
        aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
        ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
        rev $input_l2, $input_l2
        rev $input_h2, $input_h2
#endif
        pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
        eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
        aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
        eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
        aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
        eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
        aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
        eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
        aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
        eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
        fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
        aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
        fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
        add $input_ptr, $input_ptr, #64 @ AES input_ptr update
        fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
        ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
        aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
        fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
        aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
        eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
        aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
        eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
        aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
        fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
        aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
        fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
        aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
        cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
        aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
        eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
        aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
        eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
        eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
        aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
        fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
        aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
        fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
        aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
        eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
        fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
        aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
        fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
        rev $ctr32w, $rctr32w @ CTR block 4k+9
        eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
        aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
        eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
        fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
        pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
        fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
        rev $ctr32w, $rctr32w @ CTR block 4k+10
        aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
        st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
        eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
        aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
        add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
        ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
        fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
        eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
        st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
        fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
        st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
        rev $ctr32w, $rctr32w @ CTR block 4k+11
        orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
        eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
        eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
        st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
        b.lt .L128_enc_main_loop
.L128_enc_prepretail: @ PREPRETAIL
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
eor $acc_lb, $acc_lb, $acc_mb
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
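@ Between one and four blocks remain, the last possibly partial. The
@ compares against 48, 32 and 16 bytes below pick how many full blocks to
@ finish; the running tag is fed into the first of them and then zeroed so
@ it is only folded in once.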
.L128_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
cmp $main_end_input_ptr, #48
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L128_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
mov $ctr3b, $ctr2b
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_h.8b, #0
movi $acc_m.8b, #0
b.gt .L128_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L128_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
fmov $res1d, $input_l0 @ AES final-2 block - mov low
movi $t0.8b, #0 @ suppress further partial tag feed in
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
.L128_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk10_l, $rk10_h, lt
csel $input_h0, $rk10_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
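@ Worked example: a 5 byte final block gives bit_length = 40, so the
@ negate and mask sequence above yields 88; the variable lsr shifts by
@ 88 mod 64 = 24, leaving a 40-bit mask, and since 88 >= 64 the csels
@ route that mask to the low half and zero to the high half, so exactly
@ five bytes survive the and.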
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
#else
mov $ctr32w, $rctr32w
#endif
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
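@ The high, mid and low accumulators now hold a 256-bit Karatsuba product;
@ the MODULO sequence below folds it back to 128 bits with two pmulls
@ against 0xc2 shifted into the top byte, the bit-reflected encoding of
@ the GHASH polynomial g(x) = x^128 + x^7 + x^2 + x + 1.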
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr] @ store all 16B
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
___
#########################################################################################
# size_t aes_gcm_dec_128_kernel(const uint8_t * ciphertext,
#                               uint64_t plaintext_length,
#                               uint8_t * plaintext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
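# Note that, despite the prototype above, the length argument in x1 is
# interpreted as a bit count: the kernel returns immediately when it is
# zero and derives the byte length with a lsr #3 below, so a caller would
# pass plaintext_length * 8.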
$code.=<<___;
.global aes_gcm_dec_128_kernel
.type aes_gcm_dec_128_kernel,%function
.align 4
aes_gcm_dec_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
rev $ctr96_b64x, $ctr96_b64x
rev $ctr96_t32x, $ctr96_t32x
#endif
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
#ifdef __AARCH64EB__
ror $rk10_h, $rk10_h, #32
ror $rk10_l, $rk10_l, #32
#endif
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ld1 {$rk0s}, [$cc], #16 @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
#endif
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
ld1 {$rk1s}, [$cc], #16 @ load rk1
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 {$rk2s}, [$cc], #16 @ load rk2
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
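@ The counter's low 32 bits are kept byte-reversed in a scalar register so
@ that each increment is a single add; for every block the value is
@ re-reversed and orred into the top word alongside the 96-bit IV prefix
@ to rebuild the full counter block.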
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ld1 {$rk3s}, [$cc], #16 @ load rk3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
#endif
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
#endif
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
#endif
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr1b, $rk9 @ AES block 1 - round 9
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
b.ge .L128_dec_tail @ handle tail
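@ In the decrypt path the ciphertext is both the CTR input and the GHASH
@ input: each block is loaded once, xored against the keystream to give
@ plaintext in the counter registers, and the loaded copy is byte-reversed
@ for GHASH, so results leave through general registers rather than st1.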
ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev64 $res0b, $res0b @ GHASH block 0
rev $ctr32w, $rctr32w @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
rev64 $res1b, $res1b @ GHASH block 1
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
#ifdef __AARCH64EB__
rev $output_l1, $output_l1
#endif
fmov $ctr1d, $ctr96_b64x @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
#ifdef __AARCH64EB__
rev $output_h1, $output_h1
#endif
eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
b.ge .L128_dec_prepretail @ do prepretail
.L128_dec_main_loop: @ main loop start
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
#ifdef __AARCH64EB__
rev $output_l3, $output_l3
#endif
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
#ifdef __AARCH64EB__
rev $output_h2, $output_h2
#endif
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
#ifdef __AARCH64EB__
rev $output_h3, $output_h3
#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
#ifdef __AARCH64EB__
rev $output_l2, $output_l2
#endif
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+3 - load ciphertext
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
#ifdef __AARCH64EB__
rev $output_h1, $output_h1
#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
#ifdef __AARCH64EB__
rev $output_l1, $output_l1
#endif
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
b.lt .L128_dec_main_loop
.L128_dec_prepretail: @ PREPRETAIL
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
#ifdef __AARCH64EB__
rev $output_l3, $output_l3
#endif
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
#ifdef __AARCH64EB__
rev $output_l2, $output_l2
#endif
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
#ifdef __AARCH64EB__
rev $output_h3, $output_h3
#endif
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
#ifdef __AARCH64EB__
rev $output_h2, $output_h2
#endif
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L128_dec_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
cmp $main_end_input_ptr, #48
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
b.gt .L128_dec_blocks_more_than_3
mov $ctr3b, $ctr2b
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
movi $acc_h.8b, #0
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
cmp $main_end_input_ptr, #32
b.gt .L128_dec_blocks_more_than_2
cmp $main_end_input_ptr, #16
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
b.gt .L128_dec_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_dec_blocks_less_than_1
.L128_dec_blocks_more_than_3: @ blocks left > 3
rev64 $res0b, $res1b @ GHASH final-3 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
.L128_dec_blocks_more_than_2: @ blocks left > 2
rev64 $res0b, $res1b @ GHASH final-2 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
.L128_dec_blocks_more_than_1: @ blocks left > 1
rev64 $res0b, $res1b @ GHASH final-1 block
ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $ctr0b, $res1b, $ctr3b @ AES final block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
mov $output_l0, $ctr0.d[0] @ AES final block - mov low
mov $output_h0, $ctr0.d[1] @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L128_dec_blocks_less_than_1: @ blocks left <= 1
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $ctr96_b64x, $rk10_h, xzr, lt
csel $ctr32x, $rk10_l, $rk10_h, lt
fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
mov $ctr0.d[1], $ctr96_b64x
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
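@ The block beyond the message end must keep its existing bytes: load what
@ is already at the output, keep the message bytes from the computed
@ plaintext and the remaining bytes from memory, then orr the halves back
@ together before the final stp.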
ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
and $output_h0, $output_h0, $ctr96_b64x
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
and $output_l0, $output_l0, $ctr32x
#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
#else
mov $ctr32w, $rctr32w
#endif
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
orr $output_l0, $output_l0, $end_input_ptr
str $ctr32w, [$counter, #12] @ store the updated counter
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
___
}
{
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
my $t0="v8";
my $t0d="d8";
my $t3="v4";
my $t3d="d4";
my ($t1,$t2)=map("v$_",(30..31));
my ($t1d,$t2d)=map("d$_",(30..31));
my $t4="v30";
my $t4d="d30";
my $t5="v8";
my $t5d="d8";
my $t6="v31";
my $t6d="d31";
my $t7="v5";
my $t7d="d5";
my $t8="v6";
my $t8d="d6";
my $t9="v30";
my $t9d="d30";
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v31";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";
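# The temporaries above alias one another freely (v8 doubles as $t0, $t5
# and the modulo constant; v30 as $t1, $t4 and $t9; v31 as $t2, $t6 and
# $mod_t): their live ranges never overlap, which keeps the whole kernel
# within v0-v31.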
#########################################################################################
# size_t aes_gcm_enc_192_kernel(const uint8_t * plaintext,
#                               uint64_t plaintext_length,
#                               uint8_t * ciphertext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
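# The 192-bit kernel below follows the same structure as the 128-bit one,
# with two extra rounds per block (rk10 and rk11) and the final-round key
# rk12 loaded from offset 192 of the key schedule.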
  1530. $code.=<<___;
  1531. .global aes_gcm_enc_192_kernel
  1532. .type aes_gcm_enc_192_kernel,%function
  1533. .align 4
  1534. aes_gcm_enc_192_kernel:
  1535. AARCH64_VALID_CALL_TARGET
  1536. cbz x1, .L192_enc_ret
  1537. stp x19, x20, [sp, #-112]!
  1538. mov x16, x4
  1539. mov x8, x5
  1540. stp x21, x22, [sp, #16]
  1541. stp x23, x24, [sp, #32]
  1542. stp d8, d9, [sp, #48]
  1543. stp d10, d11, [sp, #64]
  1544. stp d12, d13, [sp, #80]
  1545. stp d14, d15, [sp, #96]
  1546. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  1547. #ifdef __AARCH64EB__
  1548. rev $ctr96_b64x, $ctr96_b64x
  1549. rev $ctr96_t32x, $ctr96_t32x
  1550. #endif
  1551. ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
  1552. #ifdef __AARCH64EB__
  1553. ror $rk12_l, $rk12_l, #32
  1554. ror $rk12_h, $rk12_h, #32
  1555. #endif
  1556. ld1 {$rk0s}, [$cc], #16 @ load rk0
  1557. ld1 {$rk1s}, [$cc], #16 @ load rk1
  1558. ld1 {$rk2s}, [$cc], #16 @ load rk2
  1559. lsr $rctr32x, $ctr96_t32x, #32
  1560. ld1 {$rk3s}, [$cc], #16 @ load rk3
  1561. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  1562. ld1 {$rk4s}, [$cc], #16 @ load rk4
  1563. rev $rctr32w, $rctr32w @ rev_ctr32
  1564. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  1565. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  1566. rev $ctr32w, $rctr32w @ CTR block 1
  1567. add $rctr32w, $rctr32w, #1 @ CTR block 1
  1568. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  1569. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  1570. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  1571. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  1572. rev $ctr32w, $rctr32w @ CTR block 2
  1573. add $rctr32w, $rctr32w, #1 @ CTR block 2
  1574. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  1575. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  1576. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  1577. rev $ctr32w, $rctr32w @ CTR block 3
  1578. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  1579. ld1 {$rk5s}, [$cc], #16 @ load rk5
  1580. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  1581. ld1 {$rk6s}, [$cc], #16 @ load rk6
  1582. ld1 {$rk7s}, [$cc], #16 @ load rk7
  1583. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1584. ld1 { $acc_lb}, [$current_tag]
  1585. ext $acc_lb, $acc_lb, $acc_lb, #8
  1586. rev64 $acc_lb, $acc_lb
  1587. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1588. ld1 {$rk8s}, [$cc], #16 @ load rk8
  1589. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1590. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1591. #ifndef __AARCH64EB__
  1592. ext $h4b, $h4b, $h4b, #8
  1593. #endif
  1594. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1595. ld1 {$rk9s}, [$cc], #16 @ load rk9
  1596. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1597. ld1 {$rk10s}, [$cc], #16 @ load rk10
  1598. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1599. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1600. #ifndef __AARCH64EB__
  1601. ext $h1b, $h1b, $h1b, #8
  1602. #endif
  1603. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1604. ld1 {$rk11s}, [$cc], #16 @ load rk11
  1605. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1606. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1607. #ifndef __AARCH64EB__
  1608. ext $h3b, $h3b, $h3b, #8
  1609. #endif
  1610. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1611. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1612. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1613. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1614. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  1615. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1616. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1617. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  1618. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1619. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1620. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1621. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1622. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1623. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1624. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1625. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1626. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1627. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1628. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1629. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1630. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1631. #ifndef __AARCH64EB__
  1632. ext $h2b, $h2b, $h2b, #8
  1633. #endif
  1634. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1635. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1636. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1637. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1638. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  1639. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1640. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1641. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1642. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  1643. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1644. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1645. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1646. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  1647. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  1648. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  1649. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  1650. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  1651. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  1652. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  1653. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1654. mov $len, $main_end_input_ptr
  1655. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  1656. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1657. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  1658. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr2b, $rk11 @ AES block 2 - round 11
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk11 @ AES block 1 - round 11
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk11 @ AES block 0 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk11 @ AES block 3 - round 11
b.ge .L192_enc_tail @ handle tail
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
#ifdef __AARCH64EB__
rev $input_l2, $input_l2
rev $input_h2, $input_h2
#endif
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
#ifdef __AARCH64EB__
rev $input_l3, $input_l3
rev $input_h3, $input_h3
#endif
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
#ifdef __AARCH64EB__
rev $input_l1, $input_l1
rev $input_h1, $input_h1
#endif
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L192_enc_prepretail @ do prepretail
.L192_enc_main_loop: @ main loop start
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
rev $input_l1, $input_l1
rev $input_h1, $input_h1
#endif
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
rev $input_l2, $input_l2
rev $input_h2, $input_h2
#endif
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
rev $input_l3, $input_l3
rev $input_h3, $input_h3
#endif
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk12_h @ AES block 4k+7 - round 12 high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $input_l3, $input_l3, $rk12_l @ AES block 4k+7 - round 12 low
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
b.lt .L192_enc_main_loop
.L192_enc_prepretail: @ PREPRETAIL
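@ fold the last four results into the GHASH accumulator while running the
@ AES rounds for the keystream blocks that the tail below will consume;
@ no plaintext is loaded here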
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
eor $acc_lb, $acc_lb, $acc_mb
.L192_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
cmp $main_end_input_ptr, #48
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
b.gt .L192_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_m.8b, #0
mov $ctr3b, $ctr2b
movi $acc_h.8b, #0
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_l.8b, #0
b.gt .L192_enc_blocks_more_than_2
sub $rctr32w, $rctr32w, #1
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
b.gt .L192_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L192_enc_blocks_less_than_1
.L192_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L192_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L192_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
rev64 $res0b, $res1b @ GHASH final-1 block
eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
.L192_enc_blocks_less_than_1: @ blocks left <= 1
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
#else
mov $ctr32w, $rctr32w
#endif
and $bit_length, $bit_length, #127 @ bit_length %= 128
sub $bit_length, $bit_length, #128 @ bit_length -= 128
mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk12_l, $rk12_h, lt
csel $input_h0, $rk12_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
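@ e.g. a 5-byte tail: bit_length % 128 = 40, so the shift is
@ (128 - 40) & 127 = 88; 88 >= 64 selects a zero high lane, and the low
@ lane is all-ones lsr (88 & 63) = 0x000000ffffffffff, keeping exactly
@ the 40 input bits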
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
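@ the 256-bit GHASH product held as acc_h:acc_m:acc_l is now reduced
@ modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1; in the
@ bit-reflected representation used here that takes two pmulls by the
@ constant 0xc2 << 56, folding high into mid and then mid into low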
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
___
#########################################################################################
# size_t aes_gcm_dec_192_kernel(const uint8_t * ciphertext,
#                               uint64_t plaintext_length,
#                               uint8_t * plaintext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
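# A minimal caller sketch in C (names other than the kernel itself are
# illustrative; the key-schedule and Xi layouts are whatever the
# surrounding library prepared). Note that, despite the parameter name
# above, the length argument is treated by the code below as a length in
# bits: it is shifted right by 3 to obtain byte_len.
#
#   #include <stdint.h>
#
#   extern size_t aes_gcm_dec_192_kernel(const uint8_t *ciphertext,
#                                        uint64_t length_in_bits,
#                                        uint8_t *plaintext, uint64_t *Xi,
#                                        unsigned char ivec[16],
#                                        const void *key);
#
#   size_t decrypt_part(const uint8_t *ct, size_t ct_len, uint8_t *pt,
#                       uint64_t *Xi, unsigned char ivec[16],
#                       const void *key_schedule) {
#       /* sketch only: key_schedule must hold the 13 round keys of a
#          192-bit AES key in the layout this kernel loads (rk0..rk12) */
#       return aes_gcm_dec_192_kernel(ct, (uint64_t)ct_len * 8, pt, Xi,
#                                     ivec, key_schedule);
#   }
#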
$code.=<<___;
.global aes_gcm_dec_192_kernel
.type aes_gcm_dec_192_kernel,%function
.align 4
aes_gcm_dec_192_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L192_dec_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
rev $ctr96_b64x, $ctr96_b64x
rev $ctr96_t32x, $ctr96_t32x
#endif
ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
#ifdef __AARCH64EB__
ror $rk12_l, $rk12_l, #32
ror $rk12_h, $rk12_h, #32
#endif
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
ld1 {$rk0s}, [$cc], #16 @ load rk0
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ld1 {$rk1s}, [$cc], #16 @ load rk1
lsr $rctr32x, $ctr96_t32x, #32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w @ writing the W register zero-extends, clearing the top 32 bits of ctr96_t32x
fmov $ctr3d, $ctr96_b64x @ CTR block 3
rev $rctr32w, $rctr32w @ rev_ctr32
fmov $ctr1d, $ctr96_b64x @ CTR block 1
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
ld1 {$rk2s}, [$cc], #16 @ load rk2
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 {$rk3s}, [$cc], #16 @ load rk3
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
#endif
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
#endif
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
#endif
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr3b, $rk11 @ AES block 3 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr2b, $rk11 @ AES block 2 - round 11
aese $ctr1b, $rk11 @ AES block 1 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk11 @ AES block 0 - round 11
b.ge .L192_dec_tail @ handle tail
ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
rev $ctr32w, $rctr32w @ CTR block 4
ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext
mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
rev64 $res0b, $res0b @ GHASH block 0
fmov $ctr0d, $ctr96_b64x @ CTR block 4
rev64 $res1b, $res1b @ GHASH block 1
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
#ifdef __AARCH64EB__
rev $output_l1, $output_l1
#endif
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1d, $ctr96_b64x @ CTR block 5
eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
#ifdef __AARCH64EB__
rev $output_h1, $output_h1
#endif
add $rctr32w, $rctr32w, #1 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
rev $ctr32w, $rctr32w @ CTR block 6
eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
b.ge .L192_dec_prepretail @ do prepretail
.L192_dec_main_loop: @ main loop start
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
rev64 $res3b, $res3b @ GHASH block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
rev $ctr32w, $rctr32w @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
rev $output_h2, $output_h2
#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
rev $output_l2, $output_l2
#endif
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
movi $mod_constant.8b, #0xc2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
rev $output_l3, $output_l3
#endif
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
rev $ctr32w, $rctr32w @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
rev $output_h3, $output_h3
#endif
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
rev64 $res1b, $res1b @ GHASH block 4k+5
aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
#ifdef __AARCH64EB__
rev $output_l0, $output_l0
#endif
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
#ifdef __AARCH64EB__
rev $output_l1, $output_l1
#endif
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
#ifdef __AARCH64EB__
rev $output_h1, $output_h1
#endif
eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
#ifdef __AARCH64EB__
rev $output_h0, $output_h0
#endif
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
rev64 $res0b, $res0b @ GHASH block 4k+4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
b.lt .L192_dec_main_loop
.L192_dec_prepretail: @ PREPRETAIL
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
eor $res0b, $res0b, $acc_lb @ PRE 1
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
rev64 $res2b, $res2b @ GHASH block 4k+2
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
#ifdef __AARCH64EB__
rev $output_h3, $output_h3
#endif
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
#ifdef __AARCH64EB__
rev $output_l2, $output_l2
#endif
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
#ifdef __AARCH64EB__
rev $output_h2, $output_h2
#endif
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
#ifdef __AARCH64EB__
rev $output_l3, $output_l3
#endif
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
rev64 $res3b, $res3b @ GHASH block 4k+3
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk11
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
aese $ctr2b, $rk11
aese $ctr1b, $rk11
aese $ctr3b, $rk11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2647. .L192_dec_tail: @ TAIL
  2648. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  2649. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  2650. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  2651. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  2652. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  2653. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  2654. cmp $main_end_input_ptr, #48
  2655. eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
  2656. #ifdef __AARCH64EB__
  2657. rev $output_h0, $output_h0
  2658. #endif
  2659. eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
  2660. #ifdef __AARCH64EB__
  2661. rev $output_l0, $output_l0
  2662. #endif
  2663. b.gt .L192_dec_blocks_more_than_3
  2664. movi $acc_l.8b, #0
  2665. movi $acc_h.8b, #0
  2666. mov $ctr3b, $ctr2b
  2667. mov $ctr2b, $ctr1b
  2668. sub $rctr32w, $rctr32w, #1
  2669. movi $acc_m.8b, #0
  2670. cmp $main_end_input_ptr, #32
  2671. b.gt .L192_dec_blocks_more_than_2
  2672. mov $ctr3b, $ctr1b
  2673. cmp $main_end_input_ptr, #16
  2674. sub $rctr32w, $rctr32w, #1
  2675. b.gt .L192_dec_blocks_more_than_1
  2676. sub $rctr32w, $rctr32w, #1
  2677. b .L192_dec_blocks_less_than_1
  2678. .L192_dec_blocks_more_than_3: @ blocks left > 3
  2679. rev64 $res0b, $res1b @ GHASH final-3 block
  2680. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  2681. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  2682. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2683. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  2684. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  2685. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  2686. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  2687. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  2688. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  2689. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  2690. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  2691. eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
  2692. #ifdef __AARCH64EB__
  2693. rev $output_l0, $output_l0
  2694. #endif
  2695. movi $t0.8b, #0 @ suppress further partial tag feed in
  2696. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  2697. eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
  2698. #ifdef __AARCH64EB__
  2699. rev $output_h0, $output_h0
  2700. #endif
  2701. .L192_dec_blocks_more_than_2: @ blocks left > 2
  2702. rev64 $res0b, $res1b @ GHASH final-2 block
  2703. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  2704. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2705. movi $t0.8b, #0 @ suppress further partial tag feed in
  2706. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  2707. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  2708. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  2709. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  2710. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  2711. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  2712. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  2713. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  2714. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  2715. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  2716. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  2717. eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
  2718. #ifdef __AARCH64EB__
  2719. rev $output_h0, $output_h0
  2720. #endif
  2721. eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
  2722. #ifdef __AARCH64EB__
  2723. rev $output_l0, $output_l0
  2724. #endif
  2725. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  2726. .L192_dec_blocks_more_than_1: @ blocks left > 1
  2727. rev64 $res0b, $res1b @ GHASH final-1 block
  2728. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2729. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  2730. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  2731. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  2732. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  2733. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  2734. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  2735. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  2736. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  2737. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  2738. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  2739. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  2740. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  2741. movi $t0.8b, #0 @ suppress further partial tag feed in
  2742. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  2743. eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
  2744. #ifdef __AARCH64EB__
  2745. rev $output_h0, $output_h0
  2746. #endif
  2747. eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
  2748. #ifdef __AARCH64EB__
  2749. rev $output_l0, $output_l0
  2750. #endif
  2751. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  2752. .L192_dec_blocks_less_than_1: @ blocks left <= 1
  2753. mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
  2754. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  2755. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2756. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  2757. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  2758. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2759. mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
  2760. lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
  2761. cmp $bit_length, #64
  2762. csel $ctr32x, $rk12_l, $rk12_h, lt
  2763. csel $ctr96_b64x, $rk12_h, xzr, lt
  2764. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  2765. and $output_l0, $output_l0, $ctr32x
  2766. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  2767. orr $output_l0, $output_l0, $end_input_ptr
  2768. mov $ctr0.d[1], $ctr96_b64x
  2769. #ifndef __AARCH64EB__
  2770. rev $ctr32w, $rctr32w
  2771. #else
  2772. mov $ctr32w, $rctr32w
  2773. #endif
  2774. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  2775. str $ctr32w, [$counter, #12] @ store the updated counter
  2776. rev64 $res0b, $res1b @ GHASH final block
  2777. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2778. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  2779. and $output_h0, $output_h0, $ctr96_b64x
  2780. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  2781. mov $t0d, $res0.d[1] @ GHASH final block - mid
  2782. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  2783. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  2784. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  2785. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  2786. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  2787. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
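@ MODULO reduction (commentary, a sketch of the math, not original text):
@ GHASH accumulates in GF(2^128) modulo g = x^128 + x^7 + x^2 + x + 1.
@ With the bit-reversed operand layout used here, the byte 0xc2 shifted
@ left by 56 (0xc200000000000000) encodes the low terms of g, so the
@ 256-bit karatsuba result acc_h:acc_m:acc_l is folded back to 128 bits
@ with two carry-less multiplications by that constant: first the top
@ 64 bits are aligned with the middle, then the middle with the low.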
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
orr $output_h0, $output_h0, $main_end_input_ptr
stp $output_l0, $output_h0, [$output_ptr]
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L192_dec_ret:
mov w0, #0x0
ret
.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
___
}
{
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
my $t0="v8";
my $t0d="d8";
my $t1="v4";
my $t1d="d4";
my $t2="v8";
my $t2d="d8";
my $t3="v4";
my $t3d="d4";
my $t4="v4";
my $t4d="d4";
my $t5="v5";
my $t5d="d5";
my $t6="v8";
my $t6d="d8";
my $t7="v5";
my $t7d="d5";
my $t8="v6";
my $t8d="d6";
my $t9="v4";
my $t9d="d4";
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v7";
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";
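# Note: the temporaries above deliberately alias a few vector registers
# (v4 doubles as t1, t3, t4 and t9; v8 as t0, t2, t6 and mod_constant).
# Their live ranges in the interleaved AES/GHASH schedule below do not
# overlap, which keeps the whole kernel inside the v0-v31 register file.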
#########################################################################################
# size_t aes_gcm_enc_256_kernel(const uint8_t * plaintext,
#                               uint64_t plaintext_length,
#                               uint8_t * ciphertext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
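# A hedged usage sketch (illustrative commentary, not from the original
# source). The length argument is a bit count: the kernel shifts it right
# by 3 for the byte length, which it also returns. ivec holds the 16-byte
# counter block whose final 32-bit word is updated in place, and Xi points
# at the running tag, with the precomputed powers of H read from fixed
# offsets (32, 64, 80, 112 bytes) beyond it. Buffer and variable names
# below are hypothetical:
#
#     uint8_t tag_and_htable[128];   /* tag at 0, H powers following  */
#     uint8_t ivec[16];              /* IV || 32-bit BE counter       */
#     uint8_t rks[15 * 16];          /* AES-256 round keys rk0..rk14  */
#     size_t n = aes_gcm_enc_256_kernel(pt, nbytes * 8, ct,
#                                       (uint64_t *)tag_and_htable,
#                                       ivec, rks);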
$code.=<<___;
.global aes_gcm_enc_256_kernel
.type aes_gcm_enc_256_kernel,%function
.align 4
aes_gcm_enc_256_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L256_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
rev $ctr96_b64x, $ctr96_b64x
rev $ctr96_t32x, $ctr96_t32x
#endif
ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
#ifdef __AARCH64EB__
ror $rk14_l, $rk14_l, #32
ror $rk14_h, $rk14_h, #32
#endif
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
ld1 {$rk0s}, [$cc], #16 @ load rk0
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
ld1 {$rk1s}, [$cc], #16 @ load rk1
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
lsr $rctr32x, $ctr96_t32x, #32
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
rev $rctr32w, $rctr32w @ rev_ctr32
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
fmov $ctr1d, $ctr96_b64x @ CTR block 1
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
rev $ctr32w, $rctr32w @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
ld1 {$rk2s}, [$cc], #16 @ load rk2
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
ld1 {$rk3s}, [$cc], #16 @ load rk3
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ld1 {$rk4s}, [$cc], #16 @ load rk4
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ld1 {$rk5s}, [$cc], #16 @ load rk5
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ld1 {$rk6s}, [$cc], #16 @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
#ifndef __AARCH64EB__
ext $h3b, $h3b, $h3b, #8
#endif
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ld1 {$rk7s}, [$cc], #16 @ load rk7
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
ld1 {$rk8s}, [$cc], #16 @ load rk8
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
#ifndef __AARCH64EB__
ext $h2b, $h2b, $h2b, #8
#endif
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
ld1 {$rk9s}, [$cc], #16 @ load rk9
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
#ifndef __AARCH64EB__
ext $h4b, $h4b, $h4b, #8
#endif
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
ld1 {$rk10s}, [$cc], #16 @ load rk10
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
ld1 {$rk11s}, [$cc], #16 @ load rk11
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
add $rctr32w, $rctr32w, #1 @ CTR block 3
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
ld1 { $acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
ld1 {$rk12s}, [$cc], #16 @ load rk12
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
#ifndef __AARCH64EB__
ext $h1b, $h1b, $h1b, #8
#endif
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
ld1 {$rk13s}, [$cc], #16 @ load rk13
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
aese $ctr2b, $rk13 @ AES block 2 - round 13
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
aese $ctr1b, $rk13 @ AES block 1 - round 13
aese $ctr0b, $rk13 @ AES block 0 - round 13
aese $ctr3b, $rk13 @ AES block 3 - round 13
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
b.ge .L256_enc_tail @ handle tail
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
#ifdef __AARCH64EB__
rev $input_l1, $input_l1
rev $input_h1, $input_h1
#endif
rev $ctr32w, $rctr32w @ CTR block 4
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
#ifdef __AARCH64EB__
rev $input_l3, $input_l3
rev $input_h3, $input_h3
#endif
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
#ifdef __AARCH64EB__
rev $input_l2, $input_l2
rev $input_h2, $input_h2
#endif
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
fmov $ctr1.d[1], $ctr32x @ CTR block 5
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 6
fmov $ctr2d, $ctr96_b64x @ CTR block 6
fmov $ctr2.d[1], $ctr32x @ CTR block 6
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
rev $ctr32w, $rctr32w @ CTR block 7
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L256_enc_prepretail @ do prepretail
.L256_enc_main_loop: @ main loop start
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
#ifdef __AARCH64EB__
rev $input_l3, $input_l3
rev $input_h3, $input_h3
#endif
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
#ifdef __AARCH64EB__
rev $input_l2, $input_l2
rev $input_h2, $input_h2
#endif
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
#ifdef __AARCH64EB__
rev $input_l1, $input_l1
rev $input_h1, $input_h1
#endif
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
rev $ctr32w, $rctr32w @ CTR block 4k+8
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
rev $ctr32w, $rctr32w @ CTR block 4k+10
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
rev $ctr32w, $rctr32w @ CTR block 4k+11
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
b.lt .L256_enc_main_loop
.L256_enc_prepretail: @ PREPRETAIL
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
movi $mod_constant.8b, #0xc2
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
eor $acc_lb, $acc_lb, $acc_mb
.L256_enc_tail: @ TAIL
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
cmp $main_end_input_ptr, #48
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L256_enc_blocks_more_than_3
cmp $main_end_input_ptr, #32
mov $ctr3b, $ctr2b
movi $acc_l.8b, #0
movi $acc_h.8b, #0
sub $rctr32w, $rctr32w, #1
mov $ctr2b, $ctr1b
movi $acc_m.8b, #0
b.gt .L256_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
sub $rctr32w, $rctr32w, #1
cmp $main_end_input_ptr, #16
b.gt .L256_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L256_enc_blocks_less_than_1
.L256_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
rev64 $res0b, $res1b @ GHASH final-3 block
eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
fmov $res1d, $input_l0 @ AES final-2 block - mov low
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
.L256_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
rev64 $res0b, $res1b @ GHASH final-2 block
eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
eor $res0b, $res0b, $t0.16b @ feed in partial tag
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
movi $t0.8b, #0 @ suppress further partial tag feed in
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L256_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
#ifdef __AARCH64EB__
rev $input_l0, $input_l0
rev $input_h0, $input_h0
#endif
eor $res0b, $res0b, $t0.16b @ feed in partial tag
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
fmov $res1d, $input_l0 @ AES final block - mov low
fmov $res1.d[1], $input_h0 @ AES final block - mov high
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $res1b, $res1b, $ctr3b @ AES final block - result
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
.L256_enc_blocks_less_than_1: @ blocks left <= 1
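@ Worked example for the masking below (commentary, not original text):
@ a 9-byte final block gives bit_length mod 128 = 72, so the sub/neg/and
@ sequence leaves bit_length = 56, the number of missing bits. All-ones
@ shifted right by 56 is 0xff, and since 56 < 64 the low-half mask stays
@ all-ones while the high-half mask becomes 0xff: the 8 low bytes and one
@ high byte of the block survive, and the remaining bytes are taken from
@ the existing data loaded from the output pointer.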
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk14_l, $rk14_h, lt
csel $input_h0, $rk14_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
mov $t0d, $res0.d[1] @ GHASH final block - mid
#ifndef __AARCH64EB__
rev $ctr32w, $rctr32w
#else
mov $ctr32w, $rctr32w
#endif
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
str $ctr32w, [$counter, #12] @ store the updated counter
st1 { $res1b}, [$output_ptr] @ store all 16B
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L256_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
___
{
my $t8="v4";
my $t8d="d4";
my $t9="v6";
my $t9d="d6";
#########################################################################################
# size_t aes_gcm_dec_256_kernel(const uint8_t * ciphertext,
#                               uint64_t plaintext_length,
#                               uint8_t * plaintext,
#                               uint64_t *Xi,
#                               unsigned char ivec[16],
#                               const void *key);
#
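# The decryption kernel below mirrors the encryption kernel above with the
# same interleaved AES-CTR/GHASH schedule; the difference is that GHASH
# consumes the ciphertext exactly as loaded, before the keystream XOR,
# rather than the freshly produced ciphertext.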
  3546. $code.=<<___;
  3547. .global aes_gcm_dec_256_kernel
  3548. .type aes_gcm_dec_256_kernel,%function
  3549. .align 4
  3550. aes_gcm_dec_256_kernel:
  3551. AARCH64_VALID_CALL_TARGET
  3552. cbz x1, .L256_dec_ret
  3553. stp x19, x20, [sp, #-112]!
  3554. mov x16, x4
  3555. mov x8, x5
  3556. stp x21, x22, [sp, #16]
  3557. stp x23, x24, [sp, #32]
  3558. stp d8, d9, [sp, #48]
  3559. stp d10, d11, [sp, #64]
  3560. stp d12, d13, [sp, #80]
  3561. stp d14, d15, [sp, #96]
  3562. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  3563. mov $len, $main_end_input_ptr
  3564. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  3565. #ifdef __AARCH64EB__
  3566. rev $ctr96_b64x, $ctr96_b64x
  3567. rev $ctr96_t32x, $ctr96_t32x
  3568. #endif
  3569. ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
  3570. #ifdef __AARCH64EB__
  3571. ror $rk14_h, $rk14_h, #32
  3572. ror $rk14_l, $rk14_l, #32
  3573. #endif
  3574. ld1 {$rk0s}, [$cc], #16 @ load rk0
  3575. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  3576. ld1 {$rk1s}, [$cc], #16 @ load rk1
  3577. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  3578. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  3579. ld1 {$rk2s}, [$cc], #16 @ load rk2
  3580. lsr $rctr32x, $ctr96_t32x, #32
  3581. ld1 {$rk3s}, [$cc], #16 @ load rk3
  3582. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  3583. ld1 {$rk4s}, [$cc], #16 @ load rk4
  3584. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  3585. rev $rctr32w, $rctr32w @ rev_ctr32
  3586. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  3587. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  3588. rev $ctr32w, $rctr32w @ CTR block 1
  3589. add $rctr32w, $rctr32w, #1 @ CTR block 1
  3590. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  3591. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  3592. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  3593. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  3594. rev $ctr32w, $rctr32w @ CTR block 2
    add $rctr32w, $rctr32w, #1 @ CTR block 2
    fmov $ctr2d, $ctr96_b64x @ CTR block 2
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
    fmov $ctr2.d[1], $ctr32x @ CTR block 2
    rev $ctr32w, $rctr32w @ CTR block 3
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
    ld1 {$rk5s}, [$cc], #16 @ load rk5
    fmov $ctr3.d[1], $ctr32x @ CTR block 3
    add $rctr32w, $rctr32w, #1 @ CTR block 3
    ld1 {$rk6s}, [$cc], #16 @ load rk6
    ld1 {$rk7s}, [$cc], #16 @ load rk7
    ld1 {$rk8s}, [$cc], #16 @ load rk8
    aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
    ldr $h3q, [$current_tag, #80] @ load h3l | h3h
#ifndef __AARCH64EB__
    ext $h3b, $h3b, $h3b, #8
#endif
    aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
    ldr $h4q, [$current_tag, #112] @ load h4l | h4h
#ifndef __AARCH64EB__
    ext $h4b, $h4b, $h4b, #8
#endif
    aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
    ldr $h2q, [$current_tag, #64] @ load h2l | h2h
#ifndef __AARCH64EB__
    ext $h2b, $h2b, $h2b, #8
#endif
    aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
    ld1 {$rk9s}, [$cc], #16 @ load rk9
    aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
    aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
    ld1 { $acc_lb}, [$current_tag]
    ext $acc_lb, $acc_lb, $acc_lb, #8
    rev64 $acc_lb, $acc_lb
    aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
    ld1 {$rk10s}, [$cc], #16 @ load rk10
    aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
    ld1 {$rk11s}, [$cc], #16 @ load rk11
    aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
    ldr $h1q, [$current_tag, #32] @ load h1l | h1h
#ifndef __AARCH64EB__
    ext $h1b, $h1b, $h1b, #8
#endif
    aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
    ld1 {$rk12s}, [$cc], #16 @ load rk12
    aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
    aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
    aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
    aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
    aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
    cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
    aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
    aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
    aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
    aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
    aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
    aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
    aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
    aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
    aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
    aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
    aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
    aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
    aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
    aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
    aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
    aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
    aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
    aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
    aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
    aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
    aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
    aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
    ld1 {$rk13s}, [$cc], #16 @ load rk13
    aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
    aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
    aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
    aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
    aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
    aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
    aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
    aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
    aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
    aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
    aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
    trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
    trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
    trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
    trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
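    @ The trn1/trn2 pairs above pack the hash powers two per register; the
    @ eors below then build the Karatsuba "mid" keys h34k = (h4h^h4l)|(h3h^h3l)
    @ and h12k = (h2h^h2l)|(h1h^h1l), so each GHASH middle partial product
    @ later needs only a single pmull.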
    aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
    aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
    aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
    aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
    eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
    aese $ctr1b, $rk13 @ AES block 1 - round 13
    aese $ctr2b, $rk13 @ AES block 2 - round 13
    eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
    aese $ctr3b, $rk13 @ AES block 3 - round 13
    aese $ctr0b, $rk13 @ AES block 0 - round 13
    b.ge .L256_dec_tail @ handle tail
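    @ More than four blocks remain: decrypt blocks 0-3 with the keystream just
    @ computed, store the first results, and refill the counter registers so
    @ that four keystream blocks are always in flight for the main loop.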
    ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext
    rev $ctr32w, $rctr32w @ CTR block 4
    eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
    eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
    rev64 $res1b, $res1b @ GHASH block 1
    ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext
    mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
    mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
    rev64 $res0b, $res0b @ GHASH block 0
    add $rctr32w, $rctr32w, #1 @ CTR block 4
    fmov $ctr0d, $ctr96_b64x @ CTR block 4
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
    fmov $ctr0.d[1], $ctr32x @ CTR block 4
    rev $ctr32w, $rctr32w @ CTR block 5
    add $rctr32w, $rctr32w, #1 @ CTR block 5
    mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
    mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
    eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
    eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
    fmov $ctr1d, $ctr96_b64x @ CTR block 5
    ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext
    fmov $ctr1.d[1], $ctr32x @ CTR block 5
    rev $ctr32w, $rctr32w @ CTR block 6
    add $rctr32w, $rctr32w, #1 @ CTR block 6
    eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l1, $output_l1
#endif
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
    eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h1, $output_h1
#endif
    stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
    eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
    cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
    b.ge .L256_dec_prepretail @ do prepretail
.L256_dec_main_loop: @ main loop start
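    @ Each iteration pushes blocks 4k+4..4k+7 through all fourteen AES-256
    @ rounds while GHASHing the previous four ciphertext blocks against the
    @ powers h1..h4, gathering separate high/low/mid Karatsuba partial
    @ products that are reduced once per iteration.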
    mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
    ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
    eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
    aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
    mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
    aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
    fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
    fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
    eor $res0b, $res0b, $acc_lb @ PRE 1
    rev $ctr32w, $rctr32w @ CTR block 4k+7
    aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
    mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
    aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
    mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
    pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
    mov $t0d, $res0.d[1] @ GHASH block 4k - mid
    fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
    aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
    aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
    fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
    aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
    eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
    aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
    eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h2, $output_h2
#endif
    aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
    mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
    aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
    rev64 $res2b, $res2b @ GHASH block 4k+2
    aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
    eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l2, $output_l2
#endif
    aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
    stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
    pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
    pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
    aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
    rev64 $res3b, $res3b @ GHASH block 4k+3
    pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
    eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l3, $output_l3
#endif
    pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
    eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h3, $output_h3
#endif
    eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
    aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
    aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
    mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
    aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
    eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
    aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
    add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
    aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
    mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
    aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
    eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
    pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
    aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
    eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
    aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
    aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
    eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
    pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
    rev $ctr32w, $rctr32w @ CTR block 4k+8
    aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
    ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
    aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
    add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
    aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
    aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
    eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
    aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
    pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
    mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
    aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
    pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
    aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
    eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
    aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
    pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
    eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
    pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
    aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
    eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
    aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
    aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
    eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
    aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
    pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
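    @ The reduction constant 0xc2, shifted into the top byte just below,
    @ represents the GCM polynomial x^128 + x^7 + x^2 + x + 1 in the
    @ bit-reflected representation used here; two pmulls by it fold the
    @ 256-bit Karatsuba result back down to 128 bits.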
    movi $mod_constant.8b, #0xc2
    aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
    eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
    aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
    aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
    shl $mod_constantd, $mod_constantd, #56 @ mod_constant
    aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
    eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
    aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
    pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
    eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
    aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
    ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
    aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
    ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
    aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
    eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
    aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
    ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext
    aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
    eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
    aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
    stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
    aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
    eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
    aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
    ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext
    aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
    ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext
    aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
    mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
    aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
    eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
    aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
    mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
    aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
    fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
    aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
    fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
    pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
    eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
    rev $ctr32w, $rctr32w @ CTR block 4k+9
    aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
    cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
    add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
    eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
    mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
    eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
    eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
    aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
    mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
    fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
    ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
    fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
    rev $ctr32w, $rctr32w @ CTR block 4k+10
    add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
    aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
    rev64 $res1b, $res1b @ GHASH block 4k+5
    eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h1, $output_h1
#endif
    stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
    eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l1, $output_l1
#endif
    stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
    rev64 $res0b, $res0b @ GHASH block 4k+4
    eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
    b.lt .L256_dec_main_loop
.L256_dec_prepretail: @ PREPRETAIL
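    @ PREPRETAIL: run the pending counter blocks through all fourteen rounds
    @ and finish GHASHing the four ciphertext blocks already loaded, without
    @ loading any further input; the tail code below consumes the resulting
    @ keystream one block at a time.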
    ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
    mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
    eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
    aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
    mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
    aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
    fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
    fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
    rev $ctr32w, $rctr32w @ CTR block 4k+7
    eor $res0b, $res0b, $acc_lb @ PRE 1
    rev64 $res2b, $res2b @ GHASH block 4k+2
    orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
    mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
    aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
    mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
    pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
    mov $t0d, $res0.d[1] @ GHASH block 4k - mid
    fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
    pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
    fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
    aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
    mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
    aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
    eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
    pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
    aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
    rev64 $res3b, $res3b @ GHASH block 4k+3
    aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
    pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
    eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
    pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
    aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
    mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
    aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
    aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
    eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
    aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
    aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
    mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
    aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
    eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
    pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
    aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
    aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
    eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
    pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
    aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
    eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
    aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
    pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
    eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
    pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
    aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
    ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
    aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
    aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
    eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
    pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
    aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
    mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
    aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
    pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
    aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
    eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
    aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
    aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
    eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
    aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
    aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
    movi $mod_constant.8b, #0xc2
    aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
    eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
    pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
    aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
    eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
    aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
    aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
    eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
    aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
    aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
    eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
    aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
    aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
    shl $mod_constantd, $mod_constantd, #56 @ mod_constant
    aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
    aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
    eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
    pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
    aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
    ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
    aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
    aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
    eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
    aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
    aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
    aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
    eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h2, $output_h2
#endif
    aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
    eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l3, $output_l3
#endif
    aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
    eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
    aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
    add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
    aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
    eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l2, $output_l2
#endif
    aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
    pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
    eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h3, $output_h3
#endif
    aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
    stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
    aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
    ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
    aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
    stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
    aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
    eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
    aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
    aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
    aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
    aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
    eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
.L256_dec_tail: @ TAIL
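    @ TAIL: one to four blocks remain, the last possibly partial. Each
    @ blocks_more_than_N label peels one full block and shuffles the unused
    @ keystream registers down; the counter is walked back for any keystream
    @ blocks that turn out not to be needed.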
    sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
    ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
    eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
    mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
    mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
    ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
    cmp $main_end_input_ptr, #48
    eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
    b.gt .L256_dec_blocks_more_than_3
    sub $rctr32w, $rctr32w, #1
    mov $ctr3b, $ctr2b
    movi $acc_m.8b, #0
    movi $acc_l.8b, #0
    cmp $main_end_input_ptr, #32
    movi $acc_h.8b, #0
    mov $ctr2b, $ctr1b
    b.gt .L256_dec_blocks_more_than_2
    sub $rctr32w, $rctr32w, #1
    mov $ctr3b, $ctr1b
    cmp $main_end_input_ptr, #16
    b.gt .L256_dec_blocks_more_than_1
    sub $rctr32w, $rctr32w, #1
    b .L256_dec_blocks_less_than_1
.L256_dec_blocks_more_than_3: @ blocks left > 3
    rev64 $res0b, $res1b @ GHASH final-3 block
    ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
    stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
    mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
    eor $res0b, $res0b, $t0.16b @ feed in partial tag
    eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
    mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
    mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
    mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
    eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
    movi $t0.8b, #0 @ suppress further partial tag feed in
    pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
    pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
    eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
    eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
.L256_dec_blocks_more_than_2: @ blocks left > 2
    rev64 $res0b, $res1b @ GHASH final-2 block
    ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
    eor $res0b, $res0b, $t0.16b @ feed in partial tag
    stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
    eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
    mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
    pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
    pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
    eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
    mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
    mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
    eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
    movi $t0.8b, #0 @ suppress further partial tag feed in
    pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
    eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
    eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
    eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
.L256_dec_blocks_more_than_1: @ blocks left > 1
    stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
    rev64 $res0b, $res1b @ GHASH final-1 block
    ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
    eor $res0b, $res0b, $t0.16b @ feed in partial tag
    movi $t0.8b, #0 @ suppress further partial tag feed in
    mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
    eor $ctr0b, $res1b, $ctr3b @ AES final block - result
    pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
    eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
    pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
    mov $output_l0, $ctr0.d[0] @ AES final block - mov low
    ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
    mov $output_h0, $ctr0.d[1] @ AES final block - mov high
    pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
    eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
#ifdef __AARCH64EB__
    rev $output_l0, $output_l0
#endif
    eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
    eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
    eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
    eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
#ifdef __AARCH64EB__
    rev $output_h0, $output_h0
#endif
.L256_dec_blocks_less_than_1: @ blocks left <= 1
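    @ Build a byte mask for the partial final block from the residual bit
    @ length: e.g. a 40-bit (5-byte) tail gives 128-40 = 88, hence a low-lane
    @ mask of 0xffffffffff and a zero high lane. The mask both preserves the
    @ bytes already present at the output past the message end and zeroes the
    @ invalid bits of the last ciphertext block before it enters GHASH.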
    and $bit_length, $bit_length, #127 @ bit_length %= 128
    mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
    sub $bit_length, $bit_length, #128 @ bit_length -= 128
    mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
    ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
    neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
    and $bit_length, $bit_length, #127 @ bit_length %= 128
    lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
    cmp $bit_length, #64
    csel $ctr32x, $rk14_l, $rk14_h, lt
    csel $ctr96_b64x, $rk14_h, xzr, lt
    fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
    and $output_l0, $output_l0, $ctr32x
    mov $ctr0.d[1], $ctr96_b64x
    bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
#ifndef __AARCH64EB__
    rev $ctr32w, $rctr32w
#else
    mov $ctr32w, $rctr32w
#endif
    bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
    orr $output_l0, $output_l0, $end_input_ptr
    and $output_h0, $output_h0, $ctr96_b64x
    orr $output_h0, $output_h0, $main_end_input_ptr
    and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
    rev64 $res0b, $res1b @ GHASH final block
    eor $res0b, $res0b, $t0.16b @ feed in partial tag
    pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
    mov $t0d, $res0.d[1] @ GHASH final block - mid
    eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
    pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
    pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
    eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
    eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
    eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
    movi $mod_constant.8b, #0xc2
    eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
    shl $mod_constantd, $mod_constantd, #56 @ mod_constant
    eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
    pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
    ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
    eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
    eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
    pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
    ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
    eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
    stp $output_l0, $output_h0, [$output_ptr]
    str $ctr32w, [$counter, #12] @ store the updated counter
    eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
    ext $acc_lb, $acc_lb, $acc_lb, #8
    rev64 $acc_lb, $acc_lb
    mov x0, $len
    st1 { $acc_l.16b }, [$current_tag]
    ldp x21, x22, [sp, #16]
    ldp x23, x24, [sp, #32]
    ldp d8, d9, [sp, #48]
    ldp d10, d11, [sp, #64]
    ldp d12, d13, [sp, #80]
    ldp d14, d15, [sp, #96]
    ldp x19, x20, [sp], #112
    ret
.L256_dec_ret:
    mov w0, #0x0
    ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___
if ($flavour =~ /64/) {			######## 64-bit code
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                          $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
        s/@\s/\/\//o;			# old->new style commentary
        print $_,"\n";
    }
} else {				######## 32-bit code
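    # pmull/pmull2 on the p64 lanes have no ARMv7 mnemonic that older
    # assemblers accept, so unvpmullp64 below emits the raw little-endian
    # instruction encoding instead (see its comment on the .inst directive).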
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            $word |= 0x00010001 if ($mnemonic =~ "2");
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                    $word&0xff,($word>>8)&0xff,
                    ($word>>16)&0xff,($word>>24)&0xff,
                    $mnemonic,$arg;
        }
    }
    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
        s/\/\/\s?/@ /o;				# new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o	or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo	or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo	or
        s/^(\s+)b\./$1b/o				or
        s/^(\s+)ret/$1bx\tlr/o;

        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }

        s/__AARCH64E([BL])__/__ARME$1__/go;
        print $_,"\n";
    }
}
close STDOUT or die "error closing STDOUT: $!"; # enforce flush