aes-gcm-armv8_64.pl 272 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722
  1. #! /usr/bin/env perl
  2. # Copyright 2019-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #========================================================================
  10. # Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
  11. # derived from https://github.com/ARM-software/AArch64cryptolib, original
  12. # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
  13. # licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
  14. # obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
  15. #========================================================================
  16. #
  17. # Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
  18. #
  19. # main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
  20. #
  21. # ____________________________________________________
  22. # | |
  23. # | PRE |
  24. # |____________________________________________________|
  25. # | | | |
  26. # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
  27. # |________________|________________|__________________|
  28. # | | | |
  29. # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
  30. # |________________|________________|__________________|
  31. # | | | |
  32. # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
  33. # |________________|________________|__________________|
  34. # | | | |
  35. # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
  36. # |________________|____(mostly)____|__________________|
  37. # | |
  38. # | MODULO |
  39. # |____________________________________________________|
  40. #
  41. # PRE:
  42. # Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
  43. # EXT low_acc, low_acc, low_acc, #8
  44. # EOR res_curr (4k+0), res_curr (4k+0), low_acc
  45. #
  46. # CTR block:
  47. # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
  48. # REV ctr32, rev_ctr32
  49. # ORR ctr64, constctr96_top32, ctr32, LSL #32
  50. # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
  51. # INS ctr_next.d[1], ctr64X
  52. # ADD rev_ctr32, #1
  53. #
  54. # AES block:
  55. # Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below as an example.
  56. # A small trick is used here: load the input into scalar registers, EOR it with the last round key, and then transfer it to the SIMD registers.
  57. # Given how constrained we are in ASIMD registers, this is quite important.
  58. #
  59. # Encrypt:
  60. # LDR input_low, [ input_ptr ], #8
  61. # LDR input_high, [ input_ptr ], #8
  62. # EOR input_low, k14_low
  63. # EOR input_high, k14_high
  64. # INS res_curr.d[0], input_low
  65. # INS res_curr.d[1], input_high
  66. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  67. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  68. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  69. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  70. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  71. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  72. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  73. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  74. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  75. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  76. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  77. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  78. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  79. # AESE ctr_curr, k13
  80. # EOR res_curr, res_curr, ctr_curr
  81. # ST1 { res_curr.16b }, [ output_ptr ], #16
  82. #
  83. # Decrypt:
  84. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  85. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  86. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  87. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  88. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  89. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  90. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  91. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  92. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  93. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  94. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  95. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  96. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  97. # AESE ctr_curr, k13
  98. # LDR res_curr, [ input_ptr ], #16
  99. # EOR res_curr, res_curr, ctr_curr
  100. # MOV output_low, res_curr.d[0]
  101. # MOV output_high, res_curr.d[1]
  102. # EOR output_low, k14_low
  103. # EOR output_high, k14_high
  104. # STP output_low, output_high, [ output_ptr ], #16
  105. #
  106. # GHASH block X:
  107. # do 128b karatsuba polynomial multiplication on block
  108. # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
  109. #
  110. # multiplication:
  111. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
  112. #
  113. # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
  114. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
  115. #
  116. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
  117. # multiplying with "twisted" powers of H
  118. #
  119. # Note: We can PMULL directly into the acc_x in first GHASH of the loop
  120. # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
  121. # path latency dominates the performance
  122. #
  123. # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
  124. # than indicated here
  125. # REV64 res_curr, res_curr
  126. # INS t_m.d[0], res_curr.d[1]
  127. # EOR t_m.8B, t_m.8B, res_curr.8B
  128. # PMULL2 t_h, res_curr, HX
  129. # PMULL t_l, res_curr, HX
  130. # PMULL t_m, t_m, HX_k
  131. # EOR acc_h, acc_h, t_h
  132. # EOR acc_l, acc_l, t_l
  133. # EOR acc_m, acc_m, t_m
  134. #
  135. # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
  136. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
  137. # with a reversed constant
  138. # EOR acc_m, acc_m, acc_h
  139. # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
  140. # PMULL t_mod, acc_h, mod_constant
  141. # EXT acc_h, acc_h, acc_h, #8
  142. # EOR acc_m, acc_m, acc_h
  143. # EOR acc_m, acc_m, t_mod
  144. # PMULL acc_h, acc_m, mod_constant
  145. # EXT acc_m, acc_m, acc_m, #8
  146. # EOR acc_l, acc_l, acc_h
  147. # EOR acc_l, acc_l, acc_m
  # Command-line handling for the perlasm generator.
  #   $output  - last argument, taken only if it ends in ".<word chars>"
  #              (i.e. looks like a file name with an extension), else undef.
  #   $flavour - first remaining argument, taken only if it contains no dot,
  #              else undef.
  $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

  # Find the arm-xlate.pl translator: first in the directory this script
  # lives in, then in ../../perlasm relative to it.
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
  die "can't locate arm-xlate.pl";

  # Pipe everything printed to STDOUT through the translator.  The output
  # name is quoted so a path containing spaces reaches the translator as one
  # argument, and a failure to start the pipe is fatal instead of being
  # silently ignored.
  open OUT,"| \"$^X\" $xlate $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;
  # Register names interpolated into the generated assembly.
  # x0..x3 are used directly for the input pointer, the bit length, the
  # output pointer and the current tag.
  ($input_ptr, $bit_length, $output_ptr, $current_tag) = map { "x$_" } 0 .. 3;
  # The kernel prologue copies x4 into x16 and x5 into x8; the code then
  # refers to those copies as $counter and $cc.
  ($counter, $cc) = ("x16", "x8");
  162. {
  # ------------------------------------------------------------------
  # Symbolic names for the registers used by the generated assembly.
  # Several names deliberately alias the same physical register; the
  # aliases are listed next to each other so the sharing is visible.
  # ------------------------------------------------------------------

  # Scalar registers x4-x7; $output_l0/$output_h0 alias $input_l0/$input_h0.
  my ($end_input_ptr, $main_end_input_ptr, $input_l0, $input_h0) = map { "x$_" } 4 .. 7;
  my ($output_l0, $output_h0) = map { "x$_" } 6 .. 7;

  # Scalar registers x19-x24; the output names alias the input names.
  my ($input_l1, $input_h1, $input_l2, $input_h2, $input_l3, $input_h3) = map { "x$_" } 19 .. 24;
  my ($output_l1, $output_h1, $output_l2, $output_h2, $output_l3, $output_h3) = map { "x$_" } 19 .. 24;

  # Scalar registers x9-x15, plus the 32-bit (w) views of x9, x11 and x12.
  my ($ctr32x, $ctr96_b64x, $ctr96_t32x, $rctr32x, $rk10_l, $rk10_h, $len) = map { "x$_" } 9 .. 15;
  my $ctr32w = "w9";
  my ($ctr96_t32w, $rctr32w) = map { "w$_" } 11 .. 12;

  # Vector registers v0-v7 in plain, .16b, d and q spellings.
  my ($ctr0, $ctr1, $ctr2, $ctr3, $res0, $res1, $res2, $res3) = map { "v$_" } 0 .. 7;
  my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $res0b, $res1b, $res2b, $res3b) = map { "v$_.16b" } 0 .. 7;
  my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $res0d, $res1d, $res2d, $res3d) = map { "d$_" } 0 .. 7;
  my ($res0q, $res1q, $res2q, $res3q) = map { "q$_" } 4 .. 7;

  # $ctr_t0..$ctr_t3 are further names for v4-v7 (same registers as $res0..$res3).
  my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3) = map { "v$_" } 4 .. 7;
  my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b) = map { "v$_.16b" } 4 .. 7;
  my ($ctr_t0d, $ctr_t1d, $ctr_t2d, $ctr_t3d) = map { "d$_" } 4 .. 7;

  # Accumulators in v9-v11.
  my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 9 .. 11;
  my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 9 .. 11;
  my ($acc_hd, $acc_md, $acc_ld) = map { "d$_" } 9 .. 11;

  # Hash-key registers v12-v17 (only v12-v15 get q and .16b spellings).
  my ($h1, $h2, $h3, $h4, $h12k, $h34k) = map { "v$_" } 12 .. 17;
  my ($h1q, $h2q, $h3q, $h4q) = map { "q$_" } 12 .. 15;
  my ($h1b, $h2b, $h3b, $h4b) = map { "v$_.16b" } 12 .. 15;

  # Temporaries.  Note the sharing: t0/t4/mod_constant are all v8,
  # t1/t5 are v28, t2/t8 are v29, t3/t9 are v30, t6/mod_t are v31, t7 is v4.
  my ($t0, $t0d) = ("v8", "d8");
  my ($t1, $t2, $t3) = map { "v$_" } 28 .. 30;
  my ($t1d, $t2d, $t3d) = map { "d$_" } 28 .. 30;
  my ($t4, $t4d) = ("v8", "d8");
  my ($t5, $t5d) = ("v28", "d28");
  my ($t6, $t6d) = ("v31", "d31");
  my ($t7, $t7d) = ("v4", "d4");
  my ($t8, $t8d) = ("v29", "d29");
  my ($t9, $t9d) = ("v30", "d30");
  my ($mod_constant, $mod_constantd) = ("v8", "d8");
  my $mod_t = "v31";

  # Round keys rk0..rk9 in v18-v27, with extra spellings for rk2, rk3 and rk4.
  my ($rk0, $rk1, $rk2, $rk3, $rk4, $rk5, $rk6, $rk7, $rk8, $rk9) = map { "v$_.16b" } 18 .. 27;
  my ($rk0q, $rk1q, $rk2q, $rk3q, $rk4q, $rk5q, $rk6q, $rk7q, $rk8q, $rk9q) = map { "q$_" } 18 .. 27;
  my ($rk2q1, $rk3q1) = ("v20.1q", "v21.1q");
  my ($rk4v, $rk4d) = ("v22", "d22");
  # The 32-bit INST() macro below interpolates $_byte, but nothing earlier in
  # this script assigns it, so the macro body would be emitted without any
  # data directive.  Default it to ".byte" and leave a value set elsewhere
  # untouched.
  $_byte = ".byte" unless defined $_byte;

  # Start the generated assembly: include the architecture header and open
  # the __ARM_MAX_ARCH__ guard.
  $code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=8
___
  # 64-bit flavours: enable the crypto extension and switch to .text.
  $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
  # Any other flavour: NEON setup plus the INST() byte-encoding macro, whose
  # byte order differs between the Thumb-2 and ARM (.code 32) variants.
  $code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif
.text
___
  225. #########################################################################################
  226. # size_t aes_gcm_enc_128_kernel(const unsigned char *in,
  227. # size_t len,
  228. # unsigned char *out,
  229. # const void *key,
  230. # unsigned char ivec[16],
  231. # u64 *Xi);
  232. #
  # Appends the aes_gcm_enc_128_kernel routine to $code: AES-128 in counter
  # mode interleaved with GHASH, four 16-byte blocks per main-loop pass.
  #
  # Layout of the emitted routine, in order:
  #   entry       returns 0 immediately when x1 is zero, before any stack use.
  #   prologue    builds a 112-byte frame holding x19-x24 and d8-d15, and
  #               copies x4 to x16 and x5 to x8.
  #   setup       loads the counter block from [$counter], rk0-rk9 into vector
  #               registers and rk10 into the GPR pair $rk10_l/$rk10_h from
  #               [$cc], the running tag from [$current_tag], and the values
  #               commented as h1-h4 from offsets 32/64/80/112 off
  #               $current_tag. $h12k/$h34k are formed by XORing the two
  #               halves of each h value (operands for the GHASH "mid"
  #               products). AES rounds 0-9 for the first four counter blocks
  #               are issued here as well.
  #               $main_end_input_ptr = $input_ptr + ((byte_len - 1) & ~63),
  #               so at least one byte is always left for the tail.
  #   first four  (skipped when nothing precedes the tail) the input words are
  #               XORed with rk10 in general-purpose registers, moved to
  #               vector registers and XORed with the round-9 AES state; this
  #               applies the last round key and the plaintext XOR together.
  #   main loop   GHASHes the previous four output blocks while running the
  #               AES rounds for the next four, then reduces the accumulator
  #               using the constant 0xc2 << 56.
  #   prepretail  GHASH plus reduction for the last stored group and AES
  #               rounds for the next counters, with no loads or stores.
  #   tail        $main_end_input_ptr is reused as the number of bytes left;
  #               comparisons against 48/32/16 choose the entry point, and
  #               $rctr32w is decremented once per step down. The round-key
  #               registers v18 and v20-v22 are reused as scratch here (no
  #               aese follows). The final block is masked according to
  #               bit_length % 128, and bif merges in the bytes already
  #               present at the output so that bytes past a partial block
  #               are preserved by the 16-byte store.
  #   epilogue    stores the byte-reversed $rctr32w at [$counter, #12], writes
  #               the tag back to [$current_tag], returns $len (bit_length >> 3)
  #               in x0 and restores the saved registers.
  #
  # NOTE(review): the length is consumed through $bit_length and shifted right
  # by 3 to obtain byte_len, so the second argument appears to be a bit count
  # rather than the "size_t len" shown in the prototype comment above - confirm
  # with the C caller.
  # NOTE(review): $counter, $cc and $current_tag are defined outside this
  # excerpt. If they are x16, x8 and x3 (as the x4->x16 / x5->x8 copies
  # suggest), arguments 4-6 arrive as Xi, ivec, key, which is not the order
  # listed in the prototype comment above - confirm against the C declaration.
  233. $code.=<<___;
  234. .global aes_gcm_enc_128_kernel
  235. .type aes_gcm_enc_128_kernel,%function
  236. .align 4
  237. aes_gcm_enc_128_kernel:
  238. cbz x1, .L128_enc_ret
  239. stp x19, x20, [sp, #-112]!
  240. mov x16, x4
  241. mov x8, x5
  242. stp x21, x22, [sp, #16]
  243. stp x23, x24, [sp, #32]
  244. stp d8, d9, [sp, #48]
  245. stp d10, d11, [sp, #64]
  246. stp d12, d13, [sp, #80]
  247. stp d14, d15, [sp, #96]
  248. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  249. ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
  250. ld1 {$acc_lb}, [$current_tag]
  251. ext $acc_lb, $acc_lb, $acc_lb, #8
  252. rev64 $acc_lb, $acc_lb
  253. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  254. mov $len, $main_end_input_ptr
  255. ldr $rk9q, [$cc, #144] @ load rk9
  256. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  257. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  258. lsr $rctr32x, $ctr96_t32x, #32
  259. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  260. ext $h4b, $h4b, $h4b, #8
  261. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  262. rev $rctr32w, $rctr32w @ rev_ctr32
  263. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  264. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  265. ldr $rk0q, [$cc, #0] @ load rk0
  266. rev $ctr32w, $rctr32w @ CTR block 1
  267. add $rctr32w, $rctr32w, #1 @ CTR block 1
  268. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  269. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  270. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  271. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  272. rev $ctr32w, $rctr32w @ CTR block 2
  273. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  274. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  275. add $rctr32w, $rctr32w, #1 @ CTR block 2
  276. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  277. rev $ctr32w, $rctr32w @ CTR block 3
  278. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  279. ldr $rk1q, [$cc, #16] @ load rk1
  280. add $rctr32w, $rctr32w, #1 @ CTR block 3
  281. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  282. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  283. ext $h3b, $h3b, $h3b, #8
  284. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  285. ldr $rk2q, [$cc, #32] @ load rk2
  286. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  287. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  288. ext $h1b, $h1b, $h1b, #8
  289. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  290. ldr $rk8q, [$cc, #128] @ load rk8
  291. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  292. ldr $rk3q, [$cc, #48] @ load rk3
  293. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  294. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  295. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  296. ldr $rk6q, [$cc, #96] @ load rk6
  297. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  298. ldr $rk7q, [$cc, #112] @ load rk7
  299. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  300. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  301. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  302. ldr $rk5q, [$cc, #80] @ load rk5
  303. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  304. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  305. ext $h2b, $h2b, $h2b, #8
  306. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  307. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  308. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  309. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  310. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  311. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  312. ldr $rk4q, [$cc, #64] @ load rk4
  313. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  314. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  315. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  316. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  317. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  318. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  319. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  320. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  321. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  322. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  323. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  324. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  325. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  326. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  327. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  328. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  329. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  330. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  331. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  332. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  333. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  334. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  335. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  336. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  337. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  338. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  339. aese $ctr2b, $rk9 @ AES block 2 - round 9
  340. aese $ctr0b, $rk9 @ AES block 0 - round 9
  341. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  342. aese $ctr1b, $rk9 @ AES block 1 - round 9
  343. aese $ctr3b, $rk9 @ AES block 3 - round 9
  344. b.ge .L128_enc_tail @ handle tail
  345. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
  346. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
  347. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
  348. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
  349. eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
  350. eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
  351. eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
  352. fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
  353. eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
  354. eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
  355. fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
  356. fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
  357. eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
  358. eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
  359. fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
  360. fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
  361. eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
  362. rev $ctr32w, $rctr32w @ CTR block 4
  363. fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
  364. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  365. eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
  366. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  367. add $rctr32w, $rctr32w, #1 @ CTR block 4
  368. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  369. rev $ctr32w, $rctr32w @ CTR block 5
  370. eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
  371. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  372. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  373. add $rctr32w, $rctr32w, #1 @ CTR block 5
  374. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  375. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  376. fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
  377. rev $ctr32w, $rctr32w @ CTR block 6
  378. st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
  379. fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
  380. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  381. add $rctr32w, $rctr32w, #1 @ CTR block 6
  382. eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
  383. st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
  384. fmov $ctr2d, $ctr96_b64x @ CTR block 6
  385. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  386. fmov $ctr2.d[1], $ctr32x @ CTR block 6
  387. rev $ctr32w, $rctr32w @ CTR block 7
  388. st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
  389. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
  390. eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
  391. st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
  392. b.ge .L128_enc_prepretail @ do prepretail
  393. .L128_enc_main_loop: @ main loop start
  394. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
  395. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  396. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  397. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  398. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  399. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  400. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  401. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  402. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  403. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  404. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  405. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  406. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  407. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  408. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  409. eor $res0b, $res0b, $acc_lb @ PRE 1
  410. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  411. eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
  412. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  413. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  414. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
  415. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  416. rev $ctr32w, $rctr32w @ CTR block 4k+8
  417. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  418. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  419. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  420. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  421. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  422. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  423. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  424. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  425. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  426. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  427. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  428. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  429. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  430. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  431. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  432. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  433. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  434. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  435. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  436. eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
  437. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  438. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  439. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  440. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  441. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  442. eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
  443. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  444. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  445. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  446. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  447. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  448. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  449. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  450. movi $mod_constant.8b, #0xc2
  451. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  452. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  453. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  454. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  455. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  456. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  457. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  458. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  459. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
  460. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  461. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  462. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  463. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
  464. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  465. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  466. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  467. eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
  468. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  469. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  470. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  471. eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
  472. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  473. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  474. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  475. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  476. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  477. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  478. fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
  479. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  480. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  481. fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
  482. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  483. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  484. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  485. eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
  486. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  487. fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
  488. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  489. fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
  490. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  491. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  492. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  493. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  494. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  495. eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
  496. eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
  497. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  498. fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
  499. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  500. fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
  501. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  502. eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  503. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  504. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  505. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  506. rev $ctr32w, $rctr32w @ CTR block 4k+9
  507. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  508. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  509. eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
  510. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  511. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  512. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  513. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  514. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  515. rev $ctr32w, $rctr32w @ CTR block 4k+10
  516. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  517. st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
  518. eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
  519. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  520. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  521. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  522. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  523. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
  524. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  525. st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
  526. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
  527. st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
  528. rev $ctr32w, $rctr32w @ CTR block 4k+11
  529. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
  530. eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
  531. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  532. st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
  533. b.lt .L128_enc_main_loop
  534. .L128_enc_prepretail: @ PREPRETAIL
  535. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  536. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  537. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  538. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  539. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  540. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  541. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  542. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  543. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  544. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  545. eor $res0b, $res0b, $acc_lb @ PRE 1
  546. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  547. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  548. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  549. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  550. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  551. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  552. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  553. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  554. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  555. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  556. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  557. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  558. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  559. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  560. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  561. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  562. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  563. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  564. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  565. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  566. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  567. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  568. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  569. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  570. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  571. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  572. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  573. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  574. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  575. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  576. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  577. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  578. movi $mod_constant.8b, #0xc2
  579. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  580. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  581. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  582. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  583. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  584. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  585. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  586. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  587. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  588. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  589. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  590. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  591. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  592. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  593. pmull $t1.1q, $acc_h.1d, $mod_constant.1d
  594. eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
  595. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  596. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  597. ext $acc_hb, $acc_hb, $acc_hb, #8
  598. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  599. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  600. eor $acc_mb, $acc_mb, $acc_lb
  601. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  602. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  603. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  604. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  605. eor $acc_mb, $acc_mb, $t1.16b
  606. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  607. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  608. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  609. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  610. eor $acc_mb, $acc_mb, $acc_hb
  611. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  612. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  613. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  614. pmull $t1.1q, $acc_m.1d, $mod_constant.1d
  615. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  616. ext $acc_mb, $acc_mb, $acc_mb, #8
  617. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  618. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  619. eor $acc_lb, $acc_lb, $t1.16b
  620. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  621. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  622. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  623. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  624. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  625. eor $acc_lb, $acc_lb, $acc_mb
  626. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  627. .L128_enc_tail: @ TAIL
  628. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  629. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
  630. cmp $main_end_input_ptr, #48
  631. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  632. eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
  633. eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
  634. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  635. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  636. eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  637. b.gt .L128_enc_blocks_more_than_3
  638. sub $rctr32w, $rctr32w, #1
  639. movi $acc_l.8b, #0
  640. mov $ctr3b, $ctr2b
  641. cmp $main_end_input_ptr, #32
  642. mov $ctr2b, $ctr1b
  643. movi $acc_h.8b, #0
  644. movi $acc_m.8b, #0
  645. b.gt .L128_enc_blocks_more_than_2
  646. mov $ctr3b, $ctr1b
  647. cmp $main_end_input_ptr, #16
  648. sub $rctr32w, $rctr32w, #1
  649. b.gt .L128_enc_blocks_more_than_1
  650. sub $rctr32w, $rctr32w, #1
  651. b .L128_enc_blocks_less_than_1
  652. .L128_enc_blocks_more_than_3: @ blocks left > 3
  653. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  654. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
  655. rev64 $res0b, $res1b @ GHASH final-3 block
  656. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  657. eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
  658. eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
  659. fmov $res1d, $input_l0 @ AES final-2 block - mov low
  660. movi $t0.8b, #0 @ suppress further partial tag feed in
  661. fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
  662. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  663. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  664. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  665. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  666. eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
  667. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  668. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  669. .L128_enc_blocks_more_than_2: @ blocks left > 2
  670. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  671. rev64 $res0b, $res1b @ GHASH final-2 block
  672. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
  673. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  674. eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
  675. fmov $res1d, $input_l0 @ AES final-1 block - mov low
  676. eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
  677. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  678. fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
  679. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  680. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  681. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  682. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  683. eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
  684. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  685. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  686. movi $t0.8b, #0 @ suppress further partial tag feed in
  687. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  688. .L128_enc_blocks_more_than_1: @ blocks left > 1
  689. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  690. rev64 $res0b, $res1b @ GHASH final-1 block
  691. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
  692. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  693. eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
  694. eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
  695. fmov $res1d, $input_l0 @ AES final block - mov low
  696. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  697. fmov $res1.d[1], $input_h0 @ AES final block - mov high
  698. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  699. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  700. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  701. eor $res1b, $res1b, $ctr3b @ AES final block - result
  702. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  703. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  704. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  705. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  706. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  707. movi $t0.8b, #0 @ suppress further partial tag feed in
  708. .L128_enc_blocks_less_than_1: @ blocks left <= 1
  709. and $bit_length, $bit_length, #127 @ bit_length %= 128
  710. mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
  711. mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
  712. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  713. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  714. and $bit_length, $bit_length, #127 @ bit_length %= 128
  715. lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
  716. cmp $bit_length, #64
  717. csel $input_l0, $rk10_l, $rk10_h, lt
  718. csel $input_h0, $rk10_h, xzr, lt
  719. fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
  720. fmov $ctr0.d[1], $input_h0
  721. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  722. rev64 $res0b, $res1b @ GHASH final block
  723. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  724. mov $t0d, $res0.d[1] @ GHASH final block - mid
  725. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  726. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  727. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  728. rev $ctr32w, $rctr32w
  729. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  730. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  731. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  732. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  733. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  734. movi $mod_constant.8b, #0xc2
  735. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  736. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  737. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  738. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  739. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  740. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  741. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  742. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  743. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  744. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  745. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  746. st1 { $res1b}, [$output_ptr] @ store all 16B
  747. str $ctr32w, [$counter, #12] @ store the updated counter
  748. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  749. ext $acc_lb, $acc_lb, $acc_lb, #8
  750. rev64 $acc_lb, $acc_lb
  751. mov x0, $len
  752. st1 { $acc_l.16b }, [$current_tag]
  753. ldp x21, x22, [sp, #16]
  754. ldp x23, x24, [sp, #32]
  755. ldp d8, d9, [sp, #48]
  756. ldp d10, d11, [sp, #64]
  757. ldp d12, d13, [sp, #80]
  758. ldp d14, d15, [sp, #96]
  759. ldp x19, x20, [sp], #112
  760. ret
  761. .L128_enc_ret:
  762. mov w0, #0x0
  763. ret
  764. .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
  765. ___
  766. #########################################################################################
  767. # size_t aes_gcm_dec_128_kernel(const unsigned char *in,
  768. # size_t len,
  769. # unsigned char *out,
  770. # const void *key,
  771. # unsigned char ivec[16],
  772. # u64 *Xi);
  773. #
  774. $code.=<<___;
  775. .global aes_gcm_dec_128_kernel
  776. .type aes_gcm_dec_128_kernel,%function
  777. .align 4
  778. aes_gcm_dec_128_kernel:
  779. cbz x1, .L128_dec_ret
  780. stp x19, x20, [sp, #-112]!
  781. mov x16, x4
  782. mov x8, x5
  783. stp x21, x22, [sp, #16]
  784. stp x23, x24, [sp, #32]
  785. stp d8, d9, [sp, #48]
  786. stp d10, d11, [sp, #64]
  787. stp d12, d13, [sp, #80]
  788. stp d14, d15, [sp, #96]
  789. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  790. mov $len, $main_end_input_ptr
  791. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  792. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  793. ldr $rk0q, [$cc, #0] @ load rk0
  794. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  795. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  796. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  797. ext $h2b, $h2b, $h2b, #8
  798. lsr $rctr32x, $ctr96_t32x, #32
  799. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  800. ldr $rk1q, [$cc, #16] @ load rk1
  801. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  802. rev $rctr32w, $rctr32w @ rev_ctr32
  803. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  804. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  805. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  806. rev $ctr32w, $rctr32w @ CTR block 1
  807. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  808. ldr $rk2q, [$cc, #32] @ load rk2
  809. add $rctr32w, $rctr32w, #1 @ CTR block 1
  810. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  811. rev $ctr32w, $rctr32w @ CTR block 2
  812. add $rctr32w, $rctr32w, #1 @ CTR block 2
  813. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  814. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  815. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  816. rev $ctr32w, $rctr32w @ CTR block 3
  817. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  818. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  819. add $rctr32w, $rctr32w, #1 @ CTR block 3
  820. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  821. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  822. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  823. ldr $rk3q, [$cc, #48] @ load rk3
  824. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  825. ldr $rk6q, [$cc, #96] @ load rk6
  826. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  827. ldr $rk7q, [$cc, #112] @ load rk7
  828. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  829. ldr $rk4q, [$cc, #64] @ load rk4
  830. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  831. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  832. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  833. ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
  834. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  835. ld1 { $acc_lb}, [$current_tag]
  836. ext $acc_lb, $acc_lb, $acc_lb, #8
  837. rev64 $acc_lb, $acc_lb
  838. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  839. ldr $rk5q, [$cc, #80] @ load rk5
  840. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  841. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  842. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  843. ldr $rk9q, [$cc, #144] @ load rk9
  844. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  845. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  846. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  847. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  848. ext $h3b, $h3b, $h3b, #8
  849. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  850. ldr $rk8q, [$cc, #128] @ load rk8
  851. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  852. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  853. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  854. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  855. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  856. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  857. ext $h1b, $h1b, $h1b, #8
  858. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  859. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  860. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  861. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  862. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  863. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  864. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  865. ext $h4b, $h4b, $h4b, #8
  866. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  867. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  868. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  869. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  870. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  871. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  872. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  873. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  874. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  875. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  876. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  877. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  878. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  879. aese $ctr2b, $rk9 @ AES block 2 - round 9
  880. aese $ctr3b, $rk9 @ AES block 3 - round 9
  881. aese $ctr0b, $rk9 @ AES block 0 - round 9
  882. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  883. aese $ctr1b, $rk9 @ AES block 1 - round 9
  884. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  885. b.ge .L128_dec_tail @ handle tail
  886. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  887. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  888. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  889. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  890. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  891. rev64 $res0b, $res0b @ GHASH block 0
  892. rev $ctr32w, $rctr32w @ CTR block 4
  893. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  894. add $rctr32w, $rctr32w, #1 @ CTR block 4
  895. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  896. rev64 $res1b, $res1b @ GHASH block 1
  897. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  898. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  899. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  900. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  901. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  902. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  903. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  904. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  905. rev $ctr32w, $rctr32w @ CTR block 5
  906. eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
  907. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  908. add $rctr32w, $rctr32w, #1 @ CTR block 5
  909. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  910. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  911. rev $ctr32w, $rctr32w @ CTR block 6
  912. add $rctr32w, $rctr32w, #1 @ CTR block 6
  913. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  914. eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
  915. eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
  916. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  917. eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
  918. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  919. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  920. b.ge .L128_dec_prepretail @ do prepretail
  921. .L128_dec_main_loop: @ main loop start
  922. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  923. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  924. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  925. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  926. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  927. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  928. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  929. rev64 $res2b, $res2b @ GHASH block 4k+2
  930. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  931. rev $ctr32w, $rctr32w @ CTR block 4k+7
  932. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  933. eor $res0b, $res0b, $acc_lb @ PRE 1
  934. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  935. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  936. rev64 $res3b, $res3b @ GHASH block 4k+3
  937. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  938. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  939. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  940. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  941. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  942. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  943. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  944. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  945. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  946. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  947. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  948. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  949. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  950. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  951. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  952. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  953. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  954. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  955. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  956. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  957. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  958. eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
  959. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  960. eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
  961. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  962. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  963. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  964. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  965. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  966. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  967. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  968. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  969. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  970. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  971. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  972. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  973. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  974. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  975. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  976. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  977. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  978. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  979. eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
  980. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  981. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  982. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  983. eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
  984. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  985. movi $mod_constant.8b, #0xc2
  986. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  987. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  988. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  989. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  990. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  991. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  992. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  993. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  994. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  995. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  996. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  997. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  998. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  999. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1000. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1001. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1002. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1003. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  1004. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1005. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1006. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1007. rev $ctr32w, $rctr32w @ CTR block 4k+8
  1008. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1009. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  1010. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1011. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  1012. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  1013. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1014. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1015. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  1016. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1017. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  1018. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1019. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  1020. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  1021. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1022. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  1023. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1024. ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext
  1025. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1026. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1027. rev64 $res1b, $res1b @ GHASH block 4k+5
  1028. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1029. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  1030. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1031. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  1032. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1033. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  1034. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1035. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  1036. rev $ctr32w, $rctr32w @ CTR block 4k+9
  1037. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  1038. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  1039. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1040. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1041. eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
  1042. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1043. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  1044. eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
  1045. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  1046. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  1047. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  1048. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  1049. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  1050. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1051. rev64 $res0b, $res0b @ GHASH block 4k+4
  1052. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1053. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  1054. rev $ctr32w, $rctr32w @ CTR block 4k+10
  1055. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  1056. eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
  1057. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  1058. eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
  1059. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  1060. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  1061. b.lt L128_dec_main_loop
  1062. .L128_dec_prepretail: @ PREPRETAIL
  1063. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1064. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  1065. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1066. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1067. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  1068. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1069. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  1070. eor $res0b, $res0b, $acc_lb @ PRE 1
  1071. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  1072. rev64 $res2b, $res2b @ GHASH block 4k+2
  1073. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1074. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  1075. rev $ctr32w, $rctr32w @ CTR block 4k+7
  1076. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  1077. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1078. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1079. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1080. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  1081. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1082. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1083. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1084. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  1085. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1086. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1087. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  1088. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1089. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  1090. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1091. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1092. rev64 $res3b, $res3b @ GHASH block 4k+3
  1093. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1094. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1095. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1096. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1097. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1098. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1099. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1100. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1101. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1102. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1103. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1104. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1105. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1106. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1107. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1108. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1109. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1110. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1111. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1112. movi $mod_constant.8b, #0xc2
  1113. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1114. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1115. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1116. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1117. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1118. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1119. eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
  1120. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1121. eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
  1122. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1123. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1124. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1125. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1126. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1127. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1128. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1129. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1130. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1131. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1132. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1133. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1134. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1135. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1136. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1137. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1138. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1139. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1140. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1141. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1142. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1143. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1144. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1145. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1146. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1147. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1148. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1149. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1150. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  1151. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1152. eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
  1153. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1154. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1155. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1156. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1157. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1158. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1159. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1160. eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
  1161. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  1162. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  1163. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  1164. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  1165. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  1166. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  1167. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1168. .L128_dec_tail: @ TAIL
  1169. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1170. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  1171. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  1172. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  1173. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  1174. cmp $main_end_input_ptr, #48
  1175. eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
  1176. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1177. eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
  1178. b.gt .L128_dec_blocks_more_than_3
  1179. mov $ctr3b, $ctr2b
  1180. sub $rctr32w, $rctr32w, #1
  1181. movi $acc_l.8b, #0
  1182. movi $acc_h.8b, #0
  1183. mov $ctr2b, $ctr1b
  1184. movi $acc_m.8b, #0
  1185. cmp $main_end_input_ptr, #32
  1186. b.gt .L128_dec_blocks_more_than_2
  1187. cmp $main_end_input_ptr, #16
  1188. mov $ctr3b, $ctr1b
  1189. sub $rctr32w, $rctr32w, #1
  1190. b.gt .L128_dec_blocks_more_than_1
  1191. sub $rctr32w, $rctr32w, #1
  1192. b .L128_dec_blocks_less_than_1
  1193. .L128_dec_blocks_more_than_3: @ blocks left > 3
  1194. rev64 $res0b, $res1b @ GHASH final-3 block
  1195. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  1196. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1197. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  1198. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  1199. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  1200. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  1201. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  1202. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1203. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  1204. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1205. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1206. movi $t0.8b, #0 @ suppress further partial tag feed in
  1207. eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
  1208. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  1209. eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
  1210. .L128_dec_blocks_more_than_2: @ blocks left > 2
  1211. rev64 $res0b, $res1b @ GHASH final-2 block
  1212. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  1213. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1214. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  1215. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  1216. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  1217. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1218. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1219. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  1220. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  1221. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1222. movi $t0.8b, #0 @ suppress further partial tag feed in
  1223. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1224. eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
  1225. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1226. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1227. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1228. eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
  1229. .L128_dec_blocks_more_than_1: @ blocks left > 1
  1230. rev64 $res0b, $res1b @ GHASH final-1 block
  1231. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  1232. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1233. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  1234. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  1235. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1236. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  1237. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  1238. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  1239. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1240. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1241. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1242. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1243. movi $t0.8b, #0 @ suppress further partial tag feed in
  1244. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1245. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1246. eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
  1247. eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
  1248. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1249. .L128_dec_blocks_less_than_1: @ blocks left <= 1
  1250. mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
  1251. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1252. mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
  1253. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1254. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1255. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1256. lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
  1257. cmp $bit_length, #64
  1258. csel $ctr96_b64x, $rk10_h, xzr, lt
  1259. csel $ctr32x, $rk10_l, $rk10_h, lt
  1260. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  1261. mov $ctr0.d[1], $ctr96_b64x
  1262. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1263. rev64 $res0b, $res1b @ GHASH final block
  1264. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1265. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  1266. and $output_h0, $output_h0, $ctr96_b64x
  1267. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1268. mov $t0d, $res0.d[1] @ GHASH final block - mid
  1269. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1270. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1271. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1272. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1273. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  1274. and $output_l0, $output_l0, $ctr32x
  1275. rev $ctr32w, $rctr32w
  1276. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1277. movi $mod_constant.8b, #0xc2
  1278. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1279. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  1280. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1281. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1282. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1283. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1284. orr $output_l0, $output_l0, $end_input_ptr
  1285. str $ctr32w, [$counter, #12] @ store the updated counter
  1286. orr $output_h0, $output_h0, $main_end_input_ptr
  1287. stp $output_l0, $output_h0, [$output_ptr]
  1288. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1289. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1290. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1291. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1292. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1293. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1294. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1295. ext $acc_lb, $acc_lb, $acc_lb, #8
  1296. rev64 $acc_lb, $acc_lb
  1297. mov x0, $len
  1298. st1 { $acc_l.16b }, [$current_tag]
  1299. ldp x21, x22, [sp, #16]
  1300. ldp x23, x24, [sp, #32]
  1301. ldp d8, d9, [sp, #48]
  1302. ldp d10, d11, [sp, #64]
  1303. ldp d12, d13, [sp, #80]
  1304. ldp d14, d15, [sp, #96]
  1305. ldp x19, x20, [sp], #112
  1306. ret
  1307. .L128_dec_ret:
  1308. mov w0, #0x0
  1309. ret
  1310. .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
  1311. ___
  1312. }
  1313. {
  # Register aliases for the 192-bit encrypt kernel.  Each lexical holds the
  # textual name of an AArch64 register and is interpolated into the assembly
  # heredoc that follows.  Several names deliberately share one register, so
  # the groups below note which aliases overlap.
  #
  # General-purpose registers.
  # x4/x5 hold the end-of-input pointers; the kernel copies the incoming x4/x5
  # to x16/x8 at entry before these are written.  x6/x7 carry the low/high
  # halves of block 0.
  1314. my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
  # Blocks 1..3 use x19..x24; the input_* and output_* names are the same
  # registers (these are saved/restored by the kernel's prologue/epilogue).
  1315. my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
  1316. my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
  # output_l0/output_h0 alias input_l0/input_h0 (x6/x7).
  1317. my ($output_l0,$output_h0)=map("x$_",(6..7));
  # w9 is the 32-bit view of $ctr32x (x9).
  1318. my $ctr32w="w9";
  # x9..x15: counter scratch, the two halves of the counter block, the
  # byte-reversed 32-bit counter, the final round key (rk12) kept in a pair of
  # general registers, and the byte length that is returned in x0.
  1319. my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
  # 32-bit views of $ctr96_t32x (x11) and $rctr32x (x12).
  1320. my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
  #
  # SIMD registers.  The b / bare / d / q suffixed names are different views
  # (.16b arrangement, bare vN, dN, qN) of the same physical register.
  # v0..v3: counter blocks 0..3; v4..v7: result blocks 0..3.
  1321. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
  1322. my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
  1323. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
  1324. my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
  # v9..v11: GHASH accumulator, high / mid / low parts.
  1325. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
  1326. my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
  1327. my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
  # v12..v15: hash key values h1..h4; v16/v17: the combined h12k / h34k values
  # used for the Karatsuba "mid" products.
  1328. my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
  1329. my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
  1330. my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
  # GHASH temporaries.  These overlap each other and the result registers:
  #   v8  : $t0, $t5, $mod_constant
  #   v30 : $t1, $t4, $t9
  #   v31 : $t2, $t6, $mod_t
  #   v4  : $t3 (same register as $res0 / $ctr_t0)
  #   v5  : $t7 (same register as $res1 / $ctr_t1)
  #   v6  : $t8 (same register as $res2 / $ctr_t2)
  1331. my $t0="v8";
  1332. my $t0d="d8";
  1333. my $t3="v4";
  1334. my $t3d="d4";
  1335. my ($t1,$t2)=map("v$_",(30..31));
  1336. my ($t1d,$t2d)=map("d$_",(30..31));
  1337. my $t4="v30";
  1338. my $t4d="d30";
  1339. my $t5="v8";
  1340. my $t5d="d8";
  1341. my $t6="v31";
  1342. my $t6d="d31";
  1343. my $t7="v5";
  1344. my $t7d="d5";
  1345. my $t8="v6";
  1346. my $t8d="d6";
  1347. my $t9="v30";
  1348. my $t9d="d30";
  # v4..v7 again, named as staging registers for the input data that is moved
  # in from general registers and XORed with the AES output.
  1349. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
  1350. my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
  1351. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
  # Reduction constant (built with movi #0xc2 then shl #56) and its temporary.
  1352. my $mod_constantd="d8";
  1353. my $mod_constant="v8";
  1354. my $mod_t="v31";
  # v18..v29: round keys rk0..rk11 (rk12 lives in $rk12_l/$rk12_h above).
  1355. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
  1356. my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
  # Extra views of v20/v21/v22 (the rk2/rk3/rk4 registers); the tail code
  # reuses them as GHASH scratch for the final blocks.
  1357. my $rk2q1="v20.1q";
  1358. my $rk3q1="v21.1q";
  1359. my $rk4v="v22";
  1360. my $rk4d="d22";
  1361. #########################################################################################
  1362. # size_t aes_gcm_enc_192_kernel(const unsigned char *in,
  1363. # size_t len,
  1364. # unsigned char *out,
  1365. # const void *key,
  1366. # unsigned char ivec[16],
  1367. # u64 *Xi);
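  1368. #
  # NOTE(review): the prototype above may not match how the body uses its
  # arguments.  The code copies x4 and x5 (5th and 6th arguments) aside at
  # entry, then loads the counter block and the round keys through the saved
  # pointers, and it loads/stores the tag through a separate pointer --
  # presumably making the real order (in, len, out, Xi, ivec, key).  The length
  # is also shifted right by 3 to obtain a byte count, so it looks like len is
  # expected in bits.  Confirm against the C declaration and caller.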
  1369. $code.=<<___;
  1370. .global aes_gcm_enc_192_kernel
  1371. .type aes_gcm_enc_192_kernel,%function
  1372. .align 4
  1373. aes_gcm_enc_192_kernel:
  1374. cbz x1, .L192_enc_ret
  1375. stp x19, x20, [sp, #-112]!
  1376. mov x16, x4
  1377. mov x8, x5
  1378. stp x21, x22, [sp, #16]
  1379. stp x23, x24, [sp, #32]
  1380. stp d8, d9, [sp, #48]
  1381. stp d10, d11, [sp, #64]
  1382. stp d12, d13, [sp, #80]
  1383. stp d14, d15, [sp, #96]
  1384. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  1385. ldr $rk5q, [$cc, #80] @ load rk5
  1386. ldr $rk4q, [$cc, #64] @ load rk4
  1387. ldr $rk8q, [$cc, #128] @ load rk8
  1388. lsr $rctr32x, $ctr96_t32x, #32
  1389. ldr $rk6q, [$cc, #96] @ load rk6
  1390. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  1391. ldr $rk7q, [$cc, #112] @ load rk7
  1392. rev $rctr32w, $rctr32w @ rev_ctr32
  1393. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  1394. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  1395. rev $ctr32w, $rctr32w @ CTR block 1
  1396. add $rctr32w, $rctr32w, #1 @ CTR block 1
  1397. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  1398. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  1399. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  1400. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  1401. rev $ctr32w, $rctr32w @ CTR block 2
  1402. add $rctr32w, $rctr32w, #1 @ CTR block 2
  1403. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  1404. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  1405. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  1406. rev $ctr32w, $rctr32w @ CTR block 3
  1407. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  1408. ldr $rk0q, [$cc, #0] @ load rk0
  1409. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  1410. ldr $rk3q, [$cc, #48] @ load rk3
  1411. ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
  1412. ldr $rk1q, [$cc, #16] @ load rk1
  1413. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1414. ld1 { $acc_lb}, [$current_tag]
  1415. ext $acc_lb, $acc_lb, $acc_lb, #8
  1416. rev64 $acc_lb, $acc_lb
  1417. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1418. ldr $rk11q, [$cc, #176] @ load rk11
  1419. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1420. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1421. ext $h4b, $h4b, $h4b, #8
  1422. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1423. ldr $rk2q, [$cc, #32] @ load rk2
  1424. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1425. ldr $rk10q, [$cc, #160] @ load rk10
  1426. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1427. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1428. ext $h1b, $h1b, $h1b, #8
  1429. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1430. ldr $rk9q, [$cc, #144] @ load rk9
  1431. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1432. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1433. ext $h3b, $h3b, $h3b, #8
  1434. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1435. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1436. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1437. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1438. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  1439. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1440. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1441. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  1442. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1443. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1444. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1445. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1446. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1447. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1448. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1449. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1450. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1451. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1452. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1453. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1454. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1455. ext $h2b, $h2b, $h2b, #8
  1456. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1457. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1458. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1459. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1460. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  1461. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1462. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1463. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1464. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  1465. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1466. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1467. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1468. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  1469. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  1470. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  1471. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  1472. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  1473. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  1474. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  1475. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1476. mov $len, $main_end_input_ptr
  1477. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  1478. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1479. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  1480. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1481. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  1482. aese $ctr2b, $rk11 @ AES block 2 - round 11
  1483. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1484. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1485. aese $ctr1b, $rk11 @ AES block 1 - round 11
  1486. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  1487. aese $ctr0b, $rk11 @ AES block 0 - round 11
  1488. add $rctr32w, $rctr32w, #1 @ CTR block 3
  1489. aese $ctr3b, $rk11 @ AES block 3 - round 11
  1490. b.ge .L192_enc_tail @ handle tail
  1491. rev $ctr32w, $rctr32w @ CTR block 4
  1492. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
  1493. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  1494. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
  1495. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
  1496. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
  1497. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1498. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1499. eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
  1500. eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
  1501. eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
  1502. fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
  1503. eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
  1504. fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
  1505. eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
  1506. eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
  1507. fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
  1508. eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
  1509. fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
  1510. eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
  1511. fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
  1512. add $rctr32w, $rctr32w, #1 @ CTR block 4
  1513. eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
  1514. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  1515. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  1516. rev $ctr32w, $rctr32w @ CTR block 5
  1517. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  1518. add $rctr32w, $rctr32w, #1 @ CTR block 5
  1519. fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
  1520. st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
  1521. fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
  1522. eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
  1523. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  1524. st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
  1525. fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
  1526. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  1527. rev $ctr32w, $rctr32w @ CTR block 6
  1528. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  1529. add $rctr32w, $rctr32w, #1 @ CTR block 6
  1530. eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
  1531. fmov $ctr2d, $ctr96_b64x @ CTR block 6
  1532. fmov $ctr2.d[1], $ctr32x @ CTR block 6
  1533. rev $ctr32w, $rctr32w @ CTR block 7
  1534. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
  1535. st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
  1536. eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
  1537. st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
  1538. b.ge .L192_enc_prepretail @ do prepretail
  1539. .L192_enc_main_loop: @ main loop start
  1540. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1541. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  1542. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1543. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
  1544. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1545. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  1546. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  1547. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1548. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  1549. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1550. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  1551. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
  1552. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1553. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
  1554. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1555. eor $res0b, $res0b, $acc_lb @ PRE 1
  1556. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1557. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1558. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  1559. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1560. eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
  1561. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1562. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1563. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1564. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1565. eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
  1566. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1567. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1568. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1569. eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
  1570. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1571. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1572. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1573. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1574. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1575. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1576. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1577. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1578. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1579. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1580. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1581. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1582. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1583. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1584. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1585. eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
  1586. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1587. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1588. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  1589. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1590. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1591. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1592. eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
  1593. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1594. eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
  1595. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1596. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1597. rev $ctr32w, $rctr32w @ CTR block 4k+8
  1598. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1599. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  1600. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1601. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1602. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1603. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
  1604. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1605. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1606. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1607. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1608. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1609. movi $mod_constant.8b, #0xc2
  1610. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1611. eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
  1612. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1613. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1614. eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
  1615. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1616. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1617. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1618. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1619. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1620. fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
  1621. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1622. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1623. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1624. fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
  1625. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1626. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1627. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1628. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1629. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  1630. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1631. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  1632. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1633. fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
  1634. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1635. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1636. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  1637. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1638. fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
  1639. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1640. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1641. fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
  1642. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1643. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  1644. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1645. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1646. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1647. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  1648. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  1649. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1650. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  1651. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  1652. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  1653. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  1654. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1655. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  1656. eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  1657. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  1658. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  1659. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  1660. rev $ctr32w, $rctr32w @ CTR block 4k+9
  1661. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1662. fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
  1663. st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
  1664. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  1665. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  1666. eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
  1667. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  1668. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  1669. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  1670. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  1671. rev $ctr32w, $rctr32w @ CTR block 4k+10
  1672. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  1673. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1674. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  1675. st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
  1676. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  1677. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  1678. eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
  1679. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
  1680. st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
  1681. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
  1682. rev $ctr32w, $rctr32w @ CTR block 4k+11
  1683. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1684. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
  1685. eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
  1686. st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
  1687. b.lt .L192_enc_main_loop
  1688. .L192_enc_prepretail: @ PREPRETAIL
  1689. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1690. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  1691. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  1692. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1693. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  1694. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1695. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  1696. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1697. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  1698. eor $res0b, $res0b, $acc_lb @ PRE 1
  1699. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1700. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1701. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  1702. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1703. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1704. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1705. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1706. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  1707. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1708. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1709. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1710. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1711. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1712. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1713. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1714. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1715. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1716. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1717. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1718. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1719. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1720. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1721. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1722. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1723. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1724. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1725. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1726. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1727. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1728. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1729. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1730. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1731. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1732. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1733. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1734. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1735. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1736. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1737. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1738. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1739. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1740. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1741. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1742. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1743. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1744. movi $mod_constant.8b, #0xc2
  1745. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1746. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1747. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1748. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1749. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1750. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1751. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1752. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1753. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1754. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1755. eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
  1756. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1757. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1758. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1759. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1760. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1761. eor $acc_mb, $acc_mb, $acc_lb
  1762. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1763. pmull $t1.1q, $acc_h.1d, $mod_constant.1d
  1764. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1765. ext $acc_hb, $acc_hb, $acc_hb, #8
  1766. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1767. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1768. eor $acc_mb, $acc_mb, $t1.16b
  1769. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1770. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1771. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  1772. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1773. eor $acc_mb, $acc_mb, $acc_hb
  1774. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  1775. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  1776. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  1777. pmull $t1.1q, $acc_m.1d, $mod_constant.1d
  1778. ext $acc_mb, $acc_mb, $acc_mb, #8
  1779. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  1780. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  1781. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  1782. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  1783. eor $acc_lb, $acc_lb, $t1.16b
  1784. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  1785. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  1786. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  1787. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  1788. eor $acc_lb, $acc_lb, $acc_mb
  1789. .L192_enc_tail: @ TAIL
  1790. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1791. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
  1792. eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
  1793. eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
  1794. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  1795. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  1796. cmp $main_end_input_ptr, #48
  1797. eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  1798. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1799. b.gt .L192_enc_blocks_more_than_3
  1800. sub $rctr32w, $rctr32w, #1
  1801. movi $acc_m.8b, #0
  1802. mov $ctr3b, $ctr2b
  1803. movi $acc_h.8b, #0
  1804. cmp $main_end_input_ptr, #32
  1805. mov $ctr2b, $ctr1b
  1806. movi $acc_l.8b, #0
  1807. b.gt .L192_enc_blocks_more_than_2
  1808. sub $rctr32w, $rctr32w, #1
  1809. mov $ctr3b, $ctr1b
  1810. cmp $main_end_input_ptr, #16
  1811. b.gt .L192_enc_blocks_more_than_1
  1812. sub $rctr32w, $rctr32w, #1
  1813. b .L192_enc_blocks_less_than_1
  1814. .L192_enc_blocks_more_than_3: @ blocks left > 3
  1815. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  1816. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
  1817. rev64 $res0b, $res1b @ GHASH final-3 block
  1818. eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
  1819. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1820. eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
  1821. fmov $res1d, $input_l0 @ AES final-2 block - mov low
  1822. fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
  1823. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  1824. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1825. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  1826. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1827. movi $t0.8b, #0 @ suppress further partial tag feed in
  1828. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1829. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  1830. eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
  1831. .L192_enc_blocks_more_than_2: @ blocks left > 2
  1832. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  1833. rev64 $res0b, $res1b @ GHASH final-2 block
  1834. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
  1835. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1836. eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
  1837. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1838. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  1839. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1840. eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
  1841. fmov $res1d, $input_l0 @ AES final-1 block - mov low
  1842. fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
  1843. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1844. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1845. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1846. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1847. movi $t0.8b, #0 @ suppress further partial tag feed in
  1848. eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
  1849. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1850. .L192_enc_blocks_more_than_1: @ blocks left > 1
  1851. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  1852. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
  1853. rev64 $res0b, $res1b @ GHASH final-1 block
  1854. eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
  1855. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1856. movi $t0.8b, #0 @ suppress further partial tag feed in
  1857. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  1858. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1859. eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
  1860. fmov $res1d, $input_l0 @ AES final block - mov low
  1861. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1862. fmov $res1.d[1], $input_h0 @ AES final block - mov high
  1863. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1864. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1865. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1866. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1867. eor $res1b, $res1b, $ctr3b @ AES final block - result
  1868. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1869. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1870. .L192_enc_blocks_less_than_1: @ blocks left <= 1
  1871. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  1872. rev $ctr32w, $rctr32w
  1873. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1874. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1875. mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
  1876. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1877. mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
  1878. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1879. lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
  1880. cmp $bit_length, #64
  1881. csel $input_l0, $rk12_l, $rk12_h, lt
  1882. csel $input_h0, $rk12_h, xzr, lt
  1883. fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
  1884. fmov $ctr0.d[1], $input_h0
  1885. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1886. rev64 $res0b, $res1b @ GHASH final block
  1887. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1888. mov $t0d, $res0.d[1] @ GHASH final block - mid
  1889. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1890. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1891. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1892. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1893. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1894. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1895. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1896. movi $mod_constant.8b, #0xc2
  1897. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1898. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1899. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  1900. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1901. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1902. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1903. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1904. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1905. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1906. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1907. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  1908. str $ctr32w, [$counter, #12] @ store the updated counter
  1909. st1 { $res1b}, [$output_ptr] @ store all 16B
  1910. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1911. ext $acc_lb, $acc_lb, $acc_lb, #8
  1912. rev64 $acc_lb, $acc_lb
  1913. mov x0, $len
  1914. st1 { $acc_l.16b }, [$current_tag]
  1915. ldp x21, x22, [sp, #16]
  1916. ldp x23, x24, [sp, #32]
  1917. ldp d8, d9, [sp, #48]
  1918. ldp d10, d11, [sp, #64]
  1919. ldp d12, d13, [sp, #80]
  1920. ldp d14, d15, [sp, #96]
  1921. ldp x19, x20, [sp], #112
  1922. ret
  1923. .L192_enc_ret:
  1924. mov w0, #0x0
  1925. ret
  1926. .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
  1927. ___
  1928. #########################################################################################
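  1929. # size_t aes_gcm_dec_192_kernel(const unsigned char *in,
  1930. # size_t len, (length in BITS; byte_len = len >> 3, return value is in bytes)
  1931. # unsigned char *out,
  1932. # u64 *Xi, (x3: current tag, hash key powers at fixed offsets)
  1933. # unsigned char ivec[16], (x4: counter block, copied to x16 on entry)
  1934. # const void *key); (x5: round keys, copied to x8 on entry)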
  1935. #
  1936. $code.=<<___;
  1937. .global aes_gcm_dec_192_kernel
  1938. .type aes_gcm_dec_192_kernel,%function
  1939. .align 4
  1940. aes_gcm_dec_192_kernel:
  1941. cbz x1, .L192_dec_ret
  1942. stp x19, x20, [sp, #-112]!
  1943. mov x16, x4
  1944. mov x8, x5
  1945. stp x21, x22, [sp, #16]
  1946. stp x23, x24, [sp, #32]
  1947. stp d8, d9, [sp, #48]
  1948. stp d10, d11, [sp, #64]
  1949. stp d12, d13, [sp, #80]
  1950. stp d14, d15, [sp, #96]
  1951. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1952. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  1953. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  1954. ldr $rk0q, [$cc, #0] @ load rk0
  1955. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1956. mov $len, $main_end_input_ptr
  1957. ldr $rk2q, [$cc, #32] @ load rk2
  1958. lsr $rctr32x, $ctr96_t32x, #32
  1959. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  1960. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  1961. rev $rctr32w, $rctr32w @ rev_ctr32
  1962. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  1963. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  1964. ldr $rk1q, [$cc, #16] @ load rk1
  1965. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1966. rev $ctr32w, $rctr32w @ CTR block 1
  1967. add $rctr32w, $rctr32w, #1 @ CTR block 1
  1968. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  1969. ldr $rk3q, [$cc, #48] @ load rk3
  1970. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  1971. rev $ctr32w, $rctr32w @ CTR block 2
  1972. add $rctr32w, $rctr32w, #1 @ CTR block 2
  1973. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  1974. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  1975. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  1976. rev $ctr32w, $rctr32w @ CTR block 3
  1977. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1978. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  1979. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  1980. ldr $rk8q, [$cc, #128] @ load rk8
  1981. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1982. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1983. ldr $rk11q, [$cc, #176] @ load rk11
  1984. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1985. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1986. ext $h4b, $h4b, $h4b, #8
  1987. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1988. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1989. ext $h2b, $h2b, $h2b, #8
  1990. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1991. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1992. ext $h3b, $h3b, $h3b, #8
  1993. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1994. ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
  1995. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1996. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1997. ext $h1b, $h1b, $h1b, #8
  1998. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1999. ldr $rk10q, [$cc, #160] @ load rk10
  2000. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2001. ldr $rk9q, [$cc, #144] @ load rk9
  2002. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2003. ldr $rk7q, [$cc, #112] @ load rk7
  2004. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2005. ldr $rk4q, [$cc, #64] @ load rk4
  2006. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2007. ld1 { $acc_lb}, [$current_tag]
  2008. ext $acc_lb, $acc_lb, $acc_lb, #8
  2009. rev64 $acc_lb, $acc_lb
  2010. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2011. add $rctr32w, $rctr32w, #1 @ CTR block 3
  2012. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2013. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  2014. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2015. ldr $rk5q, [$cc, #80] @ load rk5
  2016. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2017. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  2018. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2019. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2020. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  2021. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2022. ldr $rk6q, [$cc, #96] @ load rk6
  2023. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2024. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2025. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2026. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2027. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2028. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2029. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2030. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2031. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2032. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2033. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2034. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2035. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2036. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2037. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2038. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2039. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2040. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2041. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2042. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2043. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2044. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2045. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  2046. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2047. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  2048. aese $ctr3b, $rk11 @ AES block 3 - round 11
  2049. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2050. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2051. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2052. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  2053. aese $ctr2b, $rk11 @ AES block 2 - round 11
  2054. aese $ctr1b, $rk11 @ AES block 1 - round 11
  2055. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  2056. aese $ctr0b, $rk11 @ AES block 0 - round 11
  2057. b.ge .L192_dec_tail @ handle tail
  2058. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  2059. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  2060. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  2061. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  2062. rev $ctr32w, $rctr32w @ CTR block 4
  2063. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  2064. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  2065. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  2066. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  2067. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  2068. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  2069. add $rctr32w, $rctr32w, #1 @ CTR block 4
  2070. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  2071. rev64 $res0b, $res0b @ GHASH block 0
  2072. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2073. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  2074. rev64 $res1b, $res1b @ GHASH block 1
  2075. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2076. eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
  2077. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  2078. rev $ctr32w, $rctr32w @ CTR block 5
  2079. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  2080. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  2081. eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
  2082. add $rctr32w, $rctr32w, #1 @ CTR block 5
  2083. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  2084. eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
  2085. rev $ctr32w, $rctr32w @ CTR block 6
  2086. eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
  2087. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  2088. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  2089. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  2090. add $rctr32w, $rctr32w, #1 @ CTR block 6
  2091. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  2092. b.ge .L192_dec_prepretail @ do prepretail
  2093. .L192_dec_main_loop: @ main loop start
  2094. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2095. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2096. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2097. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  2098. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  2099. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  2100. rev64 $res3b, $res3b @ GHASH block 4k+3
  2101. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2102. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  2103. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2104. eor $res0b, $res0b, $acc_lb @ PRE 1
  2105. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2106. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  2107. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2108. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  2109. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2110. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  2111. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2112. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  2113. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2114. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2115. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2116. rev $ctr32w, $rctr32w @ CTR block 4k+7
  2117. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2118. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  2119. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  2120. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2121. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2122. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2123. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2124. eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
  2125. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2126. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2127. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2128. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2129. rev64 $res2b, $res2b @ GHASH block 4k+2
  2130. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2131. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2132. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2133. eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
  2134. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2135. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2136. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2137. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2138. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2139. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2140. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2141. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2142. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2143. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2144. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2145. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2146. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2147. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2148. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2149. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2150. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2151. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2152. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2153. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2154. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2155. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2156. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2157. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2158. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2159. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2160. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2161. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2162. movi $mod_constant.8b, #0xc2
  2163. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2164. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2165. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2166. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2167. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2168. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2169. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2170. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2171. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2172. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2173. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2174. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2175. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2176. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2177. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2178. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2179. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2180. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  2181. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2182. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2183. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2184. ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
  2185. eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
  2186. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2187. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2188. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  2189. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  2190. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2191. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2192. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2193. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  2194. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  2195. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  2196. rev $ctr32w, $rctr32w @ CTR block 4k+8
  2197. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2198. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  2199. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2200. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2201. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2202. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2203. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  2204. eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
  2205. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  2206. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2207. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  2208. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2209. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2210. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  2211. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  2212. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  2213. rev64 $res1b, $res1b @ GHASH block 4k+5
  2214. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  2215. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  2216. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2217. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  2218. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  2219. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  2220. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2221. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  2222. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  2223. rev $ctr32w, $rctr32w @ CTR block 4k+9
  2224. eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
  2225. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  2226. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2227. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  2228. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  2229. eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
  2230. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  2231. rev $ctr32w, $rctr32w @ CTR block 4k+10
  2232. eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
  2233. eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
  2234. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  2235. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2236. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  2237. rev64 $res0b, $res0b @ GHASH block 4k+4
  2238. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  2239. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  2240. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  2241. b.lt .L192_dec_main_loop
  2242. .L192_dec_prepretail: @ PREPRETAIL
  2243. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  2244. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2245. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  2246. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2247. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  2248. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2249. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2250. eor $res0b, $res0b, $acc_lb @ PRE 1
  2251. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  2252. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2253. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  2254. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2255. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  2256. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2257. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2258. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  2259. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2260. rev64 $res2b, $res2b @ GHASH block 4k+2
  2261. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2262. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  2263. rev $ctr32w, $rctr32w @ CTR block 4k+7
  2264. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  2265. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2266. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2267. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2268. eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
  2269. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  2270. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2271. eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
  2272. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2273. eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
  2274. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2275. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2276. eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
  2277. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  2278. rev64 $res3b, $res3b @ GHASH block 4k+3
  2279. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  2280. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2281. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2282. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2283. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  2284. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2285. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2286. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2287. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2288. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2289. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2290. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2291. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2292. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2293. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2294. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2295. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2296. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2297. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2298. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2299. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2300. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2301. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2302. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2303. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2304. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2305. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2306. movi $mod_constant.8b, #0xc2
  2307. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2308. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2309. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2310. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2311. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2312. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2313. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2314. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2315. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2316. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2317. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2318. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2319. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2320. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2321. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2322. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2323. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2324. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2325. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2326. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2327. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2328. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2329. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2330. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2331. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2332. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2333. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2334. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2335. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2336. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2337. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2338. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2339. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2340. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2341. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2342. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2343. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2344. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2345. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2346. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2347. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2348. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2349. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2350. aese $ctr0b, $rk11
  2351. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2352. aese $ctr2b, $rk11
  2353. aese $ctr1b, $rk11
  2354. aese $ctr3b, $rk11
  2355. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2356. .L192_dec_tail: @ TAIL
  2357. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  2358. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  2359. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  2360. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  2361. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  2362. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  2363. cmp $main_end_input_ptr, #48
  2364. eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
  2365. eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
  2366. b.gt .L192_dec_blocks_more_than_3
  2367. movi $acc_l.8b, #0
  2368. movi $acc_h.8b, #0
  2369. mov $ctr3b, $ctr2b
  2370. mov $ctr2b, $ctr1b
  2371. sub $rctr32w, $rctr32w, #1
  2372. movi $acc_m.8b, #0
  2373. cmp $main_end_input_ptr, #32
  2374. b.gt .L192_dec_blocks_more_than_2
  2375. mov $ctr3b, $ctr1b
  2376. cmp $main_end_input_ptr, #16
  2377. sub $rctr32w, $rctr32w, #1
  2378. b.gt .L192_dec_blocks_more_than_1
  2379. sub $rctr32w, $rctr32w, #1
  2380. b .L192_dec_blocks_less_than_1
  2381. .L192_dec_blocks_more_than_3: @ blocks left > 3
  2382. rev64 $res0b, $res1b @ GHASH final-3 block
  2383. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  2384. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  2385. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2386. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  2387. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  2388. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  2389. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  2390. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  2391. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  2392. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  2393. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  2394. eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
  2395. movi $t0.8b, #0 @ suppress further partial tag feed in
  2396. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  2397. eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
  2398. .L192_dec_blocks_more_than_2: @ blocks left > 2
  2399. rev64 $res0b, $res1b @ GHASH final-2 block
  2400. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  2401. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2402. movi $t0.8b, #0 @ suppress further partial tag feed in
  2403. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  2404. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  2405. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  2406. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  2407. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  2408. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  2409. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  2410. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  2411. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  2412. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  2413. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  2414. eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
  2415. eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
  2416. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  2417. .L192_dec_blocks_more_than_1: @ blocks left > 1
  2418. rev64 $res0b, $res1b @ GHASH final-1 block
  2419. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2420. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  2421. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  2422. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  2423. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  2424. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  2425. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  2426. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  2427. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  2428. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  2429. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  2430. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  2431. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  2432. movi $t0.8b, #0 @ suppress further partial tag feed in
  2433. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  2434. eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
  2435. eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
  2436. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  2437. .L192_dec_blocks_less_than_1: @ blocks left <= 1
  2438. mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
  2439. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  2440. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2441. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  2442. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  2443. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2444. mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
  2445. lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
  2446. cmp $bit_length, #64
  2447. csel $ctr32x, $rk12_l, $rk12_h, lt
  2448. csel $ctr96_b64x, $rk12_h, xzr, lt
  2449. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  2450. and $output_l0, $output_l0, $ctr32x
  2451. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  2452. orr $output_l0, $output_l0, $end_input_ptr
  2453. mov $ctr0.d[1], $ctr96_b64x
  2454. rev $ctr32w, $rctr32w
  2455. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  2456. str $ctr32w, [$counter, #12] @ store the updated counter
  2457. rev64 $res0b, $res1b @ GHASH final block
  2458. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2459. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  2460. and $output_h0, $output_h0, $ctr96_b64x
  2461. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  2462. mov $t0d, $res0.d[1] @ GHASH final block - mid
  2463. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  2464. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  2465. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  2466. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  2467. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  2468. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  2469. movi $mod_constant.8b, #0xc2
  2470. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2471. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2472. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2473. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2474. orr $output_h0, $output_h0, $main_end_input_ptr
  2475. stp $output_l0, $output_h0, [$output_ptr]
  2476. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2477. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2478. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2479. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2480. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2481. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2482. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2483. ext $acc_lb, $acc_lb, $acc_lb, #8
  2484. rev64 $acc_lb, $acc_lb
  2485. mov x0, $len
  2486. st1 { $acc_l.16b }, [$current_tag]
  2487. ldp x21, x22, [sp, #16]
  2488. ldp x23, x24, [sp, #32]
  2489. ldp d8, d9, [sp, #48]
  2490. ldp d10, d11, [sp, #64]
  2491. ldp d12, d13, [sp, #80]
  2492. ldp d14, d15, [sp, #96]
  2493. ldp x19, x20, [sp], #112
  2494. ret
  2495. .L192_dec_ret:
  2496. mov w0, #0x0
  2497. ret
  2498. .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
  2499. ___
  2500. }
  2501. {
  # Register-name aliases interpolated into the aes_gcm_enc_256_kernel assembly template below; note that several names share one physical register.
  2502. my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); # x4..x7
  2503. my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); # x19..x24
  2504. my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); # same x19..x24 as the $input_l1..$input_h3 names
  2505. my ($output_l0,$output_h0)=map("x$_",(6..7)); # same x6/x7 as $input_l0/$input_h0
  2506. my $ctr32w="w9"; # 32-bit view of x9 ($ctr32x)
  2507. my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15)); # x9..x15
  2508. my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); # 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x)
  2509. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); # v0..v7 as 16 x byte
  2510. my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); # v0..v7, arrangement supplied at the use site
  2511. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); # low 64-bit scalar views of v0..v7
  2512. my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); # 128-bit scalar views of v4..v7
  2513. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); # v9..v11 as 16 x byte
  2514. my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); # v9..v11
  2515. my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); # low 64-bit scalar views of v9..v11
  2516. my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); # v12..v17
  2517. my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); # 128-bit scalar views of v12..v15
  2518. my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); # v12..v15 as 16 x byte
  2519. my $t0="v8"; # v8 is also $t2, $t6 and $mod_constant
  2520. my $t0d="d8";
  2521. my $t1="v4"; # v4 is also $t3, $t4, $t9, $res0 and $ctr_t0
  2522. my $t1d="d4";
  2523. my $t2="v8";
  2524. my $t2d="d8";
  2525. my $t3="v4";
  2526. my $t3d="d4";
  2527. my $t4="v4";
  2528. my $t4d="d4";
  2529. my $t5="v5"; # v5 is also $t7, $res1 and $ctr_t1
  2530. my $t5d="d5";
  2531. my $t6="v8";
  2532. my $t6d="d8";
  2533. my $t7="v5";
  2534. my $t7d="d5";
  2535. my $t8="v6"; # v6 is also $res2 and $ctr_t2
  2536. my $t8d="d6";
  2537. my $t9="v4";
  2538. my $t9d="d4";
  2539. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); # same v4..v7 as $res0..$res3
  2540. my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
  2541. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
  2542. my $mod_constantd="d8";
  2543. my $mod_constant="v8";
  2544. my $mod_t="v7"; # v7 is also $res3 and $ctr_t3
  2545. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); # v18..v31 as 16 x byte
  2546. my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); # 128-bit scalar views of v18..v31
  2547. my $rk2q1="v20.1q"; # .1q view of v20, the register named $rk2
  2548. my $rk3q1="v21.1q"; # .1q view of v21, the register named $rk3
  2549. my $rk4v="v22"; # v22, the register named $rk4
  2550. my $rk4d="d22"; # low 64-bit scalar view of v22
  2551. #########################################################################################
  2552. # size_t aes_gcm_enc_256_kernel(const unsigned char *in,
  2553. # size_t len,
  2554. # unsigned char *out,
  2555. # const void *key,
  2556. # unsigned char ivec[16],
  2557. # u64 *Xi);
  2558. #
  2559. $code.=<<___;
  2560. .global aes_gcm_enc_256_kernel
  2561. .type aes_gcm_enc_256_kernel,%function
  2562. .align 4
  2563. aes_gcm_enc_256_kernel:
  2564. cbz x1, .L256_enc_ret
  2565. stp x19, x20, [sp, #-112]!
  2566. mov x16, x4
  2567. mov x8, x5
  2568. stp x21, x22, [sp, #16]
  2569. stp x23, x24, [sp, #32]
  2570. stp d8, d9, [sp, #48]
  2571. stp d10, d11, [sp, #64]
  2572. stp d12, d13, [sp, #80]
  2573. stp d14, d15, [sp, #96]
  2574. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  2575. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  2576. mov $len, $main_end_input_ptr
  2577. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  2578. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  2579. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2580. ldr $rk0q, [$cc, #0] @ load rk0
  2581. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2582. ldr $rk7q, [$cc, #112] @ load rk7
  2583. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2584. lsr $rctr32x, $ctr96_t32x, #32
  2585. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  2586. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  2587. rev $rctr32w, $rctr32w @ rev_ctr32
  2588. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  2589. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  2590. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  2591. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  2592. rev $ctr32w, $rctr32w @ CTR block 1
  2593. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  2594. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  2595. add $rctr32w, $rctr32w, #1 @ CTR block 1
  2596. ldr $rk1q, [$cc, #16] @ load rk1
  2597. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  2598. rev $ctr32w, $rctr32w @ CTR block 2
  2599. add $rctr32w, $rctr32w, #1 @ CTR block 2
  2600. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  2601. ldr $rk2q, [$cc, #32] @ load rk2
  2602. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  2603. rev $ctr32w, $rctr32w @ CTR block 3
  2604. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  2605. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  2606. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  2607. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  2608. ldr $rk3q, [$cc, #48] @ load rk3
  2609. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  2610. ldr $rk6q, [$cc, #96] @ load rk6
  2611. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  2612. ldr $rk5q, [$cc, #80] @ load rk5
  2613. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  2614. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2615. ext $h3b, $h3b, $h3b, #8
  2616. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  2617. ldr $rk13q, [$cc, #208] @ load rk13
  2618. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  2619. ldr $rk4q, [$cc, #64] @ load rk4
  2620. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2621. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2622. ext $h2b, $h2b, $h2b, #8
  2623. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  2624. ldr $rk12q, [$cc, #192] @ load rk12
  2625. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  2626. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2627. ext $h4b, $h4b, $h4b, #8
  2628. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2629. ldr $rk11q, [$cc, #176] @ load rk11
  2630. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2631. ldr $rk8q, [$cc, #128] @ load rk8
  2632. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2633. add $rctr32w, $rctr32w, #1 @ CTR block 3
  2634. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2635. ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
  2636. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2637. ld1 { $acc_lb}, [$current_tag]
  2638. ext $acc_lb, $acc_lb, $acc_lb, #8
  2639. rev64 $acc_lb, $acc_lb
  2640. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2641. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2642. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2643. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2644. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2645. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2646. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2647. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2648. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2649. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  2650. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2651. ldr $rk9q, [$cc, #144] @ load rk9
  2652. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2653. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2654. ext $h1b, $h1b, $h1b, #8
  2655. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2656. ldr $rk10q, [$cc, #160] @ load rk10
  2657. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2658. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  2659. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2660. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2661. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2662. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  2663. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2664. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2665. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2666. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2667. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2668. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2669. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2670. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2671. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2672. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2673. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2674. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  2675. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  2676. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2677. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  2678. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  2679. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  2680. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  2681. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  2682. aese $ctr2b, $rk13 @ AES block 2 - round 13
  2683. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  2684. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  2685. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  2686. aese $ctr1b, $rk13 @ AES block 1 - round 13
  2687. aese $ctr0b, $rk13 @ AES block 0 - round 13
  2688. aese $ctr3b, $rk13 @ AES block 3 - round 13
  2689. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  2690. b.ge .L256_enc_tail @ handle tail
  2691. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
  2692. rev $ctr32w, $rctr32w @ CTR block 4
  2693. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
  2694. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
  2695. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
  2696. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2697. eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
  2698. eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
  2699. fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
  2700. eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
  2701. eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
  2702. eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
  2703. fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
  2704. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks (flags are consumed by the b.ge closing this section)
  2705. fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
  2706. eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
  2707. eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
  2708. fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
  2709. fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
  2710. add $rctr32w, $rctr32w, #1 @ CTR block 4
  2711. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  2712. fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
  2713. eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
  2714. fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
  2715. eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
  2716. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  2717. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  2718. rev $ctr32w, $rctr32w @ CTR block 5
  2719. add $rctr32w, $rctr32w, #1 @ CTR block 5
  2720. eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
  2721. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  2722. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  2723. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  2724. rev $ctr32w, $rctr32w @ CTR block 6
  2725. st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
  2726. fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
  2727. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  2728. eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
  2729. st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
  2730. add $rctr32w, $rctr32w, #1 @ CTR block 6
  2731. fmov $ctr2d, $ctr96_b64x @ CTR block 6
  2732. fmov $ctr2.d[1], $ctr32x @ CTR block 6
  2733. st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
  2734. rev $ctr32w, $rctr32w @ CTR block 7
  2735. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
  2736. eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
  2737. st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
  2738. b.ge .L256_enc_prepretail @ do prepretail when input_ptr >= main_end_input_ptr (flags from the cmp above; nothing in between sets flags)
  2739. .L256_enc_main_loop: @ main loop start
  2740. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2741. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  2742. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2743. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  2744. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2745. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2746. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2747. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  2748. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2749. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
  2750. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2751. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
  2752. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2753. eor $res0b, $res0b, $acc_lb @ PRE 1
  2754. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2755. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2756. eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
  2757. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2758. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2759. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2760. eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
  2761. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2762. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2763. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  2764. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2765. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2766. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2767. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2768. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2769. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  2770. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2771. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2772. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  2773. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2774. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2775. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2776. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2777. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2778. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2779. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2780. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2781. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2782. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2783. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2784. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2785. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2786. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2787. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2788. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2789. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2790. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2791. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2792. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2793. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2794. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2795. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2796. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2797. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2798. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2799. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2800. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2801. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2802. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2803. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
  2804. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2805. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2806. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2807. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2808. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2809. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2810. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2811. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2812. eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
  2813. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2814. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2815. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2816. eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
  2817. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2818. movi $mod_constant.8b, #0xc2
  2819. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2820. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2821. fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
  2822. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2823. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
  2824. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2825. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2826. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2827. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2828. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2829. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2830. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2831. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2832. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  2833. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  2834. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2835. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  2836. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2837. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2838. rev $ctr32w, $rctr32w @ CTR block 4k+8
  2839. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2840. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2841. eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
  2842. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  2843. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2844. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2845. eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
  2846. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  2847. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  2848. eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
  2849. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  2850. eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
  2851. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  2852. eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
  2853. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  2854. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  2855. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  2856. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  2857. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2858. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  2859. fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
  2860. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  2861. fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
  2862. fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
  2863. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2864. fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
  2865. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2866. eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  2867. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  2868. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  2869. rev $ctr32w, $rctr32w @ CTR block 4k+9
  2870. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  2871. eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
  2872. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  2873. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  2874. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  2875. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  2876. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  2877. rev $ctr32w, $rctr32w @ CTR block 4k+10
  2878. st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
  2879. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  2880. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  2881. fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
  2882. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2883. st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
  2884. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  2885. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  2886. eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
  2887. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
  2888. st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
  2889. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
  2890. rev $ctr32w, $rctr32w @ CTR block 4k+11
  2891. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2892. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
  2893. eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
  2894. st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
  2895. b.lt .L256_enc_main_loop @ iterate while input_ptr < main_end_input_ptr (flags from the LOOP CONTROL cmp; nothing in between sets flags)
  2896. .L256_enc_prepretail: @ PREPRETAIL
  2897. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2898. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  2899. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2900. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  2901. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2902. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  2903. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  2904. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2905. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2906. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2907. eor $res0b, $res0b, $acc_lb @ PRE 1
  2908. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  2909. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2910. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2911. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2912. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2913. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2914. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2915. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2916. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2917. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2918. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2919. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2920. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2921. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2922. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2923. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2924. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2925. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2926. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2927. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2928. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2929. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2930. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2931. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2932. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2933. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2934. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  2935. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2936. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2937. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2938. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  2939. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2940. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2941. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2942. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2943. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2944. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2945. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2946. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2947. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2948. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2949. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2950. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2951. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2952. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2953. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2954. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2955. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2956. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2957. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2958. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2959. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2960. movi $mod_constant.8b, #0xc2
  2961. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2962. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2963. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2964. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2965. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2966. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2967. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2968. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2969. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2970. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2971. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2972. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2973. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2974. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2975. eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
  2976. pmull $t1.1q, $acc_h.1d, $mod_constant.1d
  2977. ext $acc_hb, $acc_hb, $acc_hb, #8
  2978. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2979. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2980. eor $acc_mb, $acc_mb, $acc_lb
  2981. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2982. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2983. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2984. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  2985. eor $acc_mb, $acc_mb, $t1.16b
  2986. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2987. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2988. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  2989. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  2990. eor $acc_mb, $acc_mb, $acc_hb
  2991. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  2992. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2993. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  2994. pmull $t1.1q, $acc_m.1d, $mod_constant.1d
  2995. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  2996. ext $acc_mb, $acc_mb, $acc_mb, #8
  2997. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  2998. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  2999. eor $acc_lb, $acc_lb, $t1.16b
  3000. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3001. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3002. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3003. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3004. eor $acc_lb, $acc_lb, $acc_mb
  3005. .L256_enc_tail: @ TAIL
  3006. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3007. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3008. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
  3009. eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3010. eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3011. cmp $main_end_input_ptr, #48
  3012. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  3013. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  3014. eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  3015. b.gt .L256_enc_blocks_more_than_3
  3016. cmp $main_end_input_ptr, #32
  3017. mov $ctr3b, $ctr2b
  3018. movi $acc_l.8b, #0
  3019. movi $acc_h.8b, #0
  3020. sub $rctr32w, $rctr32w, #1
  3021. mov $ctr2b, $ctr1b
  3022. movi $acc_m.8b, #0
  3023. b.gt .L256_enc_blocks_more_than_2
  3024. mov $ctr3b, $ctr1b
  3025. sub $rctr32w, $rctr32w, #1
  3026. cmp $main_end_input_ptr, #16
  3027. b.gt .L256_enc_blocks_more_than_1
  3028. sub $rctr32w, $rctr32w, #1
  3029. b .L256_enc_blocks_less_than_1
  3030. .L256_enc_blocks_more_than_3: @ blocks left > 3
  3031. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  3032. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
  3033. rev64 $res0b, $res1b @ GHASH final-3 block
  3034. eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
  3035. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3036. eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
  3037. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  3038. fmov $res1d, $input_l0 @ AES final-2 block - mov low
  3039. fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
  3040. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3041. movi $t0.8b, #0 @ suppress further partial tag feed in
  3042. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  3043. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3044. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3045. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  3046. eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
  3047. .L256_enc_blocks_more_than_2: @ blocks left > 2
  3048. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  3049. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
  3050. rev64 $res0b, $res1b @ GHASH final-2 block
  3051. eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
  3052. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3053. fmov $res1d, $input_l0 @ AES final-1 block - mov low
  3054. eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
  3055. fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
  3056. movi $t0.8b, #0 @ suppress further partial tag feed in
  3057. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3058. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  3059. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3060. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3061. eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
  3062. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3063. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3064. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3065. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3066. .L256_enc_blocks_more_than_1: @ blocks left > 1
  3067. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  3068. rev64 $res0b, $res1b @ GHASH final-1 block
  3069. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
  3070. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3071. movi $t0.8b, #0 @ suppress further partial tag feed in
  3072. eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
  3073. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  3074. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3075. eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
  3076. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3077. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3078. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3079. fmov $res1d, $input_l0 @ AES final block - mov low
  3080. fmov $res1.d[1], $input_h0 @ AES final block - mov high
  3081. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3082. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3083. eor $res1b, $res1b, $ctr3b @ AES final block - result
  3084. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3085. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3086. .L256_enc_blocks_less_than_1: @ blocks left <= 1
  3087. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3088. mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
  3089. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3090. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3091. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  3092. mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
  3093. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3094. lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
  3095. cmp $bit_length, #64
  3096. csel $input_l0, $rk14_l, $rk14_h, lt
  3097. csel $input_h0, $rk14_h, xzr, lt
  3098. fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
  3099. fmov $ctr0.d[1], $input_h0
  3100. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3101. rev64 $res0b, $res1b @ GHASH final block
  3102. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3103. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  3104. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3105. mov $t0d, $res0.d[1] @ GHASH final block - mid
  3106. rev $ctr32w, $rctr32w
  3107. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3108. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3109. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3110. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3111. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3112. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3113. movi $mod_constant.8b, #0xc2
  3114. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3115. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3116. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3117. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3118. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3119. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3120. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3121. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3122. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3123. str $ctr32w, [$counter, #12] @ store the updated counter
  3124. st1 { $res1b}, [$output_ptr] @ store all 16B
  3125. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  3126. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3127. ext $acc_lb, $acc_lb, $acc_lb, #8
  3128. rev64 $acc_lb, $acc_lb
  3129. mov x0, $len
  3130. st1 { $acc_l.16b }, [$current_tag]
  3131. ldp x21, x22, [sp, #16]
  3132. ldp x23, x24, [sp, #32]
  3133. ldp d8, d9, [sp, #48]
  3134. ldp d10, d11, [sp, #64]
  3135. ldp d12, d13, [sp, #80]
  3136. ldp d14, d15, [sp, #96]
  3137. ldp x19, x20, [sp], #112
  3138. ret
  3139. .L256_enc_ret:
  3140. mov w0, #0x0
  3141. ret
  3142. .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
  3143. ___
  3144. {
  3145. my $t8="v4"; # vector register name interpolated into the decrypt kernel text below (e.g. "$t8.1q" receives the GHASH block 4k+3 low product)
  3146. my $t8d="d4"; # d-register spelling with the same register number as $t8
  3147. my $t9="v6"; # vector register name interpolated below for the GHASH block 4k+3 mid term and the MODULO karatsuba tidy-up XOR
  3148. my $t9d="d6"; # d-register spelling with the same register number as $t9 (target of "mov $t9d, $res3.d[1]" below)
  3149. #########################################################################################
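  3150. # size_t aes_gcm_dec_256_kernel(const unsigned char *in,
  3151. # size_t len, /* consumed as a bit count: the prologue derives byte_len as (length >> 3) */
  3152. # unsigned char *out,
  3153. # const void *key,
  3154. # unsigned char ivec[16],
  3155. # u64 *Xi);
  3156. # NOTE(review): the prologue copies x4 into x16 and x5 into x8 before use; confirm that the key/ivec/Xi order shown above matches the register each one is actually read from.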
  3157. $code.=<<___;
  3158. .global aes_gcm_dec_256_kernel
  3159. .type aes_gcm_dec_256_kernel,%function
  3160. .align 4
  3161. aes_gcm_dec_256_kernel:
  3162. cbz x1, .L256_dec_ret
  3163. stp x19, x20, [sp, #-112]!
  3164. mov x16, x4
  3165. mov x8, x5
  3166. stp x21, x22, [sp, #16]
  3167. stp x23, x24, [sp, #32]
  3168. stp d8, d9, [sp, #48]
  3169. stp d10, d11, [sp, #64]
  3170. stp d12, d13, [sp, #80]
  3171. stp d14, d15, [sp, #96]
  3172. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  3173. mov $len, $main_end_input_ptr
  3174. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  3175. ldr $rk8q, [$cc, #128] @ load rk8
  3176. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  3177. ldr $rk7q, [$cc, #112] @ load rk7
  3178. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  3179. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  3180. ldr $rk6q, [$cc, #96] @ load rk6
  3181. lsr $rctr32x, $ctr96_t32x, #32
  3182. ldr $rk5q, [$cc, #80] @ load rk5
  3183. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  3184. ldr $rk3q, [$cc, #48] @ load rk3
  3185. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  3186. rev $rctr32w, $rctr32w @ rev_ctr32
  3187. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  3188. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  3189. rev $ctr32w, $rctr32w @ CTR block 1
  3190. add $rctr32w, $rctr32w, #1 @ CTR block 1
  3191. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  3192. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  3193. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  3194. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  3195. rev $ctr32w, $rctr32w @ CTR block 2
  3196. add $rctr32w, $rctr32w, #1 @ CTR block 2
  3197. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  3198. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  3199. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  3200. rev $ctr32w, $rctr32w @ CTR block 3
  3201. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  3202. ldr $rk0q, [$cc, #0] @ load rk0
  3203. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  3204. add $rctr32w, $rctr32w, #1 @ CTR block 3
  3205. ldr $rk4q, [$cc, #64] @ load rk4
  3206. ldr $rk13q, [$cc, #208] @ load rk13
  3207. ldr $rk1q, [$cc, #16] @ load rk1
  3208. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  3209. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3210. ext $h3b, $h3b, $h3b, #8
  3211. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  3212. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3213. ext $h4b, $h4b, $h4b, #8
  3214. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  3215. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3216. ext $h2b, $h2b, $h2b, #8
  3217. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  3218. ldr $rk2q, [$cc, #32] @ load rk2
  3219. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  3220. ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
  3221. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  3222. ld1 { $acc_lb}, [$current_tag]
  3223. ext $acc_lb, $acc_lb, $acc_lb, #8
  3224. rev64 $acc_lb, $acc_lb
  3225. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  3226. ldr $rk9q, [$cc, #144] @ load rk9
  3227. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  3228. ldr $rk12q, [$cc, #192] @ load rk12
  3229. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  3230. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3231. ext $h1b, $h1b, $h1b, #8
  3232. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  3233. ldr $rk10q, [$cc, #160] @ load rk10
  3234. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  3235. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  3236. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  3237. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  3238. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  3239. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  3240. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  3241. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  3242. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  3243. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  3244. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  3245. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  3246. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  3247. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  3248. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  3249. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  3250. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  3251. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  3252. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  3253. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  3254. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  3255. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  3256. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  3257. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  3258. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  3259. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  3260. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  3261. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  3262. ldr $rk11q, [$cc, #176] @ load rk11
  3263. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  3264. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  3265. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  3266. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  3267. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  3268. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  3269. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  3270. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  3271. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  3272. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  3273. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  3274. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  3275. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  3276. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  3277. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  3278. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  3279. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  3280. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  3281. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  3282. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  3283. aese $ctr1b, $rk13 @ AES block 1 - round 13
  3284. aese $ctr2b, $rk13 @ AES block 2 - round 13
  3285. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  3286. aese $ctr3b, $rk13 @ AES block 3 - round 13
  3287. aese $ctr0b, $rk13 @ AES block 0 - round 13
  3288. b.ge .L256_dec_tail @ handle tail
  3289. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  3290. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  3291. rev $ctr32w, $rctr32w @ CTR block 4
  3292. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  3293. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  3294. rev64 $res1b, $res1b @ GHASH block 1
  3295. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  3296. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  3297. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  3298. rev64 $res0b, $res0b @ GHASH block 0
  3299. add $rctr32w, $rctr32w, #1 @ CTR block 4
  3300. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  3301. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  3302. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  3303. rev $ctr32w, $rctr32w @ CTR block 5
  3304. add $rctr32w, $rctr32w, #1 @ CTR block 5
  3305. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  3306. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  3307. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  3308. eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
  3309. eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
  3310. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  3311. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  3312. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  3313. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  3314. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  3315. rev $ctr32w, $rctr32w @ CTR block 6
  3316. add $rctr32w, $rctr32w, #1 @ CTR block 6
  3317. eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
  3318. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  3319. eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
  3320. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  3321. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  3322. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3323. b.ge .L256_dec_prepretail @ do prepretail
  3324. .L256_dec_main_loop: @ main loop start
  3325. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  3326. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3327. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  3328. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  3329. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  3330. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  3331. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  3332. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  3333. eor $res0b, $res0b, $acc_lb @ PRE 1
  3334. rev $ctr32w, $rctr32w @ CTR block 4k+7
  3335. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  3336. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  3337. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  3338. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  3339. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  3340. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  3341. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  3342. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  3343. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  3344. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  3345. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  3346. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  3347. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  3348. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  3349. eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
  3350. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  3351. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  3352. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  3353. rev64 $res2b, $res2b @ GHASH block 4k+2
  3354. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  3355. eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
  3356. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  3357. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  3358. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  3359. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  3360. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  3361. rev64 $res3b, $res3b @ GHASH block 4k+3
  3362. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  3363. eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
  3364. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  3365. eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
  3366. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  3367. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  3368. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  3369. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  3370. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  3371. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  3372. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  3373. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  3374. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  3375. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  3376. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  3377. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  3378. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  3379. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  3380. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  3381. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  3382. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  3383. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  3384. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  3385. rev $ctr32w, $rctr32w @ CTR block 4k+8
  3386. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  3387. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  3388. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  3389. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  3390. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  3391. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  3392. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  3393. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  3394. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  3395. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  3396. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  3397. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  3398. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  3399. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  3400. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  3401. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  3402. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  3403. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  3404. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  3405. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  3406. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  3407. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  3408. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  3409. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  3410. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  3411. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  3412. movi $mod_constant.8b, #0xc2
  3413. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  3414. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  3415. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  3416. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  3417. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3418. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  3419. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  3420. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  3421. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3422. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3423. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  3424. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  3425. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3426. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3427. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  3428. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3429. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  3430. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  3431. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  3432. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  3433. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  3434. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  3435. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  3436. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3437. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  3438. ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
  3439. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  3440. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  3441. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  3442. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  3443. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  3444. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3445. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  3446. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  3447. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  3448. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3449. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  3450. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  3451. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  3452. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3453. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  3454. rev $ctr32w, $rctr32w @ CTR block 4k+9
  3455. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3456. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  3457. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  3458. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  3459. eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3460. eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3461. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  3462. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  3463. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3464. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  3465. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  3466. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  3467. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3468. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  3469. rev $ctr32w, $rctr32w @ CTR block 4k+10
  3470. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  3471. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3472. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  3473. rev64 $res1b, $res1b @ GHASH block 4k+5
  3474. eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
  3475. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  3476. eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
  3477. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  3478. rev64 $res0b, $res0b @ GHASH block 4k+4
  3479. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3480. b.lt .L256_dec_main_loop
  3481. .L256_dec_prepretail: @ PREPRETAIL
  3482. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3483. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  3484. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  3485. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  3486. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  3487. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  3488. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  3489. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  3490. rev $ctr32w, $rctr32w @ CTR block 4k+7
  3491. eor $res0b, $res0b, $acc_lb @ PRE 1
  3492. rev64 $res2b, $res2b @ GHASH block 4k+2
  3493. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  3494. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  3495. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  3496. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  3497. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  3498. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  3499. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  3500. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  3501. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  3502. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  3503. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  3504. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  3505. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  3506. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  3507. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  3508. rev64 $res3b, $res3b @ GHASH block 4k+3
  3509. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  3510. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  3511. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  3512. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  3513. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  3514. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  3515. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  3516. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  3517. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  3518. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  3519. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  3520. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  3521. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  3522. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  3523. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  3524. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  3525. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  3526. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  3527. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  3528. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  3529. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  3530. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  3531. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  3532. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  3533. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  3534. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  3535. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  3536. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  3537. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  3538. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  3539. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  3540. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  3541. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  3542. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  3543. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  3544. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  3545. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  3546. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  3547. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  3548. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  3549. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  3550. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  3551. movi $mod_constant.8b, #0xc2
  3552. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  3553. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  3554. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  3555. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  3556. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  3557. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  3558. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  3559. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  3560. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  3561. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  3562. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3563. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  3564. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  3565. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3566. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  3567. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  3568. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3569. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3570. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  3571. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3572. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  3573. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  3574. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3575. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  3576. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  3577. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  3578. eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
  3579. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  3580. eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
  3581. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  3582. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3583. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  3584. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  3585. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  3586. eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
  3587. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3588. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3589. eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
  3590. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  3591. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  3592. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  3593. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3594. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  3595. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  3596. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  3597. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3598. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  3599. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3600. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3601. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3602. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3603. .L256_dec_tail: @ TAIL
  3604. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3605. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  3606. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  3607. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  3608. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  3609. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3610. cmp $main_end_input_ptr, #48
  3611. eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3612. eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3613. b.gt .L256_dec_blocks_more_than_3
  3614. sub $rctr32w, $rctr32w, #1
  3615. mov $ctr3b, $ctr2b
  3616. movi $acc_m.8b, #0
  3617. movi $acc_l.8b, #0
  3618. cmp $main_end_input_ptr, #32
  3619. movi $acc_h.8b, #0
  3620. mov $ctr2b, $ctr1b
  3621. b.gt .L256_dec_blocks_more_than_2
  3622. sub $rctr32w, $rctr32w, #1
  3623. mov $ctr3b, $ctr1b
  3624. cmp $main_end_input_ptr, #16
  3625. b.gt .L256_dec_blocks_more_than_1
  3626. sub $rctr32w, $rctr32w, #1
  3627. b .L256_dec_blocks_less_than_1
  3628. .L256_dec_blocks_more_than_3: @ blocks left > 3
  3629. rev64 $res0b, $res1b @ GHASH final-3 block
  3630. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  3631. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  3632. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  3633. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3634. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  3635. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  3636. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  3637. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  3638. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3639. movi $t0.8b, #0 @ suppress further partial tag feed in
  3640. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3641. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  3642. eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
  3643. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3644. eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
  3645. .L256_dec_blocks_more_than_2: @ blocks left > 2
  3646. rev64 $res0b, $res1b @ GHASH final-2 block
  3647. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  3648. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3649. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  3650. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  3651. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  3652. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3653. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3654. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3655. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  3656. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  3657. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3658. movi $t0.8b, #0 @ suppress further partial tag feed in
  3659. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3660. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3661. eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
  3662. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3663. eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
  3664. .L256_dec_blocks_more_than_1: @ blocks left > 1
  3665. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  3666. rev64 $res0b, $res1b @ GHASH final-1 block
  3667. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  3668. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3669. movi $t0.8b, #0 @ suppress further partial tag feed in
  3670. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  3671. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  3672. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3673. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3674. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3675. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  3676. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3677. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  3678. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3679. eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
  3680. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3681. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3682. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3683. eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
  3684. .L256_dec_blocks_less_than_1: @ blocks left <= 1
  3685. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3686. mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
  3687. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3688. mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
  3689. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  3690. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3691. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3692. lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
  3693. cmp $bit_length, #64
  3694. csel $ctr32x, $rk14_l, $rk14_h, lt
  3695. csel $ctr96_b64x, $rk14_h, xzr, lt
  3696. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  3697. and $output_l0, $output_l0, $ctr32x
  3698. mov $ctr0.d[1], $ctr96_b64x
  3699. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  3700. rev $ctr32w, $rctr32w
  3701. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  3702. orr $output_l0, $output_l0, $end_input_ptr
  3703. and $output_h0, $output_h0, $ctr96_b64x
  3704. orr $output_h0, $output_h0, $main_end_input_ptr
  3705. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3706. rev64 $res0b, $res1b @ GHASH final block
  3707. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3708. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3709. mov $t0d, $res0.d[1] @ GHASH final block - mid
  3710. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3711. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3712. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3713. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3714. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3715. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3716. movi $mod_constant.8b, #0xc2
  3717. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3718. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3719. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3720. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3721. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3722. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3723. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3724. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3725. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3726. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3727. stp $output_l0, $output_h0, [$output_ptr]
  3728. str $ctr32w, [$counter, #12] @ store the updated counter
  3729. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3730. ext $acc_lb, $acc_lb, $acc_lb, #8
  3731. rev64 $acc_lb, $acc_lb
  3732. mov x0, $len
  3733. st1 { $acc_l.16b }, [$current_tag]
  3734. ldp x21, x22, [sp, #16]
  3735. ldp x23, x24, [sp, #32]
  3736. ldp d8, d9, [sp, #48]
  3737. ldp d10, d11, [sp, #64]
  3738. ldp d12, d13, [sp, #80]
  3739. ldp d14, d15, [sp, #96]
  3740. ldp x19, x20, [sp], #112
  3741. ret
  3742. .L256_dec_ret:
  3743. mov w0, #0x0
  3744. ret
  3745. .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
  3746. ___
  3747. }
  3748. }
  # Append the trailer of the generated assembly to $code: an identification
  # string, an alignment directive, and the "#endif" that closes a
  # preprocessor conditional opened earlier in the emitted text.
  # Single-quoted pieces are used so that '@' needs no escaping.
  $code .= join("\n",
      '.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"',
      '.align 2',
      '#endif',
  ) . "\n";
  3754. if ($flavour =~ /64/) { ######## 64-bit code
  # unvmov($operands)
  #
  # Rewrites a legacy operand pair of the form "qD#lo|hi,qS#lo|hi" into an
  # AArch64 "ins vD.d[x],vS.d[y]" instruction string.
  #
  #   - register numbers below 8 are kept, numbers 8 and above are shifted
  #     up by 8 (q8 -> v16, ...)
  #   - "lo" selects lane 0, "hi" selects lane 1
  #
  # Returns the instruction text, or a false value (empty string) when
  # $operands does not have the expected shape.
  sub unvmov {
      my ($operands) = @_;

      my $recognised =
          $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/;
      return $recognised unless $recognised;

      my ($dst, $dst_half, $src, $src_half) = ($1, $2, $3, $4);

      my $dst_reg  = $dst < 8 ? $dst : $dst + 8;
      my $src_reg  = $src < 8 ? $src : $src + 8;
      my $dst_lane = $dst_half eq "lo" ? 0 : 1;
      my $src_lane = $src_half eq "lo" ? 0 : 1;

      return sprintf "ins v%d.d[%d],v%d.d[%d]",
                     $dst_reg, $dst_lane, $src_reg, $src_lane;
  }
  # 64-bit output pass: walk the accumulated assembly text line by line,
  # turn the first '@' that is followed by a whitespace character into
  # '//' (old-style -> new-style comment marker), and print the line.
  for my $line (split /\n/, $code) {
      $line =~ s{@\s}{//};
      print $line, "\n";
  }
  3765. } else { ######## 32-bit code
  # unvdup32($arg)
  #
  # Rewrites the operands of a "vdup.32 qD,qS[i]" pseudo-instruction (i in
  # 0..3) into the form that names the 64-bit half register explicitly:
  #
  #     vdup.32 qD,d(2*S + i/2)[i%2]
  #
  # $arg is the operand text captured by the caller's
  # s/vdup\.32\s+(.*)/unvdup32($1)/e substitution, and the return value
  # replaces the whole matched instruction.
  #
  # If the operands do not have the "qD,qS[i]" shape, the instruction is
  # returned unchanged.  Returning a false value here (as this helper used
  # to) would make the caller's substitution silently delete the
  # instruction from the generated assembly.
  sub unvdup32 {
      my $arg = shift;

      if ($arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/) {
          my ($dst, $src, $lane) = ($1, $2, $3);
          return sprintf "vdup.32 q%d,d%d[%d]",
                         $dst, 2*$src + ($lane >> 1), $lane & 1;
      }

      # Not a form we know how to rewrite: leave it for the assembler.
      return "vdup.32 $arg";
  }
  # unvpmullp64($mnemonic, $arg)
  #
  # Hand-assembles a "pmull.p64 qD,qN,qM" / "pmull2.p64 qD,qN,qM"
  # pseudo-instruction into a raw 32-bit instruction word and returns it
  # as an "INST(b0,b1,b2,b3)" line, with the original mnemonic and operands
  # appended as an '@' comment.
  #
  #   $mnemonic  "pmull" or "pmull2", as captured by the caller's
  #              s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/e
  #   $arg       operand text following the mnemonic
  #
  # The word starts from the base value 0xf2a00e00 and each q-register
  # number is split into its low three bits and bit 3, which are ORed into
  # separate positions.  A mnemonic containing "2" additionally sets
  # 0x00010001.
  # NOTE(review): the bit positions look like the D:Vd / N:Vn / M:Vm
  # fields of the A32 VMULL.P64 encoding, with the extra bits selecting
  # the odd (high) d-register halves of the sources; confirm against the
  # architecture manual.
  #
  # If the operands are not three q registers, the instruction text is
  # returned as "<mnemonic>.p64 <operands>" instead of a false value, so
  # the caller's substitution no longer deletes the instruction silently.
  # The optional leading "v" is dropped by the caller's pattern and cannot
  # be restored here.
  sub unvpmullp64 {
      my ($mnemonic, $arg) = @_;

      if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/) {
          my ($qd, $qn, $qm) = ($1, $2, $3);

          my $word = 0xf2a00e00
                   | (($qd & 7) << 13) | (($qd & 8) << 19)
                   | (($qn & 7) << 17) | (($qn & 8) << 4)
                   | (($qm & 7) << 1)  | (($qm & 8) << 2);
          $word |= 0x00010001 if $mnemonic =~ /2/;

          # The word is emitted byte by byte, least significant byte
          # first, since ARMv7 instructions are always encoded
          # little-endian.  The correct solution is to use the .inst
          # directive, but older assemblers don't implement it :-(
          # INST is not defined in this part of the file; presumably it is
          # a macro supplied by the emitted assembly prologue.
          return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                         $word & 0xff, ($word >> 8) & 0xff,
                         ($word >> 16) & 0xff, ($word >> 24) & 0xff,
                         $mnemonic, $arg;
      }

      # Not a form we know how to encode: leave it for the assembler.
      return "$mnemonic.p64 $arg";
  }
  # 32-bit output pass: convert each line of the accumulated assembly text
  # from the new-style (AArch64-like) spelling to the old-style one and
  # print it.
  #
  # Rewrites in @exclusive_rewrites are tried in order and only the first
  # one that changes the line is applied.  Each entry edits its argument
  # in place through the @_ alias and returns true when it matched.
  my @exclusive_rewrites = (
      sub { $_[0] =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/ },
      sub { $_[0] =~ s/vdup\.32\s+(.*)/unvdup32($1)/ge },
      sub { $_[0] =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/ge },
      sub { $_[0] =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge },
      sub { $_[0] =~ s/^(\s+)b\./$1b/ },
      sub { $_[0] =~ s/^(\s+)ret/$1bx\tlr/ },
  );

  for my $line (split /\n/, $code) {
      # new->old registers: wN/xN -> rN, vN.<arrangement> -> qN
      $line =~ s/\b[wx]([0-9]+)\b/r$1/g;
      # NOTE(review): only single-digit vN registers match this pattern;
      # v10 and above are left untranslated.  Confirm that is intended.
      $line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;

      # new->old style commentary (first "//" on the line only)
      $line =~ s{//\s?}{@ };

      # fix up remaining new-style suffixes: "],#imm" becomes "]!"
      $line =~ s/\],#[0-9]+/]!/;

      for my $rewrite (@exclusive_rewrites) {
          last if $rewrite->($line);
      }

      # A conditional "mov.<cc>" becomes "mov<cc>" preceded by an "it <cc>"
      # line of its own.
      if ($line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
          print " it $2\n";
      }

      print $line, "\n";
  }
  3804. }
  # Close STDOUT explicitly so that buffered output is flushed and a
  # failed write is reported instead of being lost at exit.
  unless (close STDOUT) {
      die "error closing STDOUT: $!";
  }