aes-gcm-armv8_64.pl 272 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
73778377937803781378237833784378537863787378837893790379137923793379437953796379737983799380038013802380338043805380638073808380938103811381238133814381538163817381838193820382138223823382438253826382738283829383038313832383338343835383638373838383938403841384238433844384538463847384838493850385138523853385438553856385738583859386038613862386338643865386638673868386938703871387238733874387538763877387838793880388138823883388438853886388738883889389038913892389338943895389638973898389939003901390239033904390539063907390839093910391139123913391439153916391739183919392039213922392339243925392639273928392939303931393239333934393539363937393839393940394139423943394439453946394739483949395039513952395339543955395639573958395939603961396239633964396539663967396839693970397139723973397439753976397739783979398039813982398339843985398639873988398939903991399239933994399539963997399839994000400140024003400440054006400740084009401040114012401340144015401640174018401940204021402240234024402540264027402840294030403140324033403440354036403740384039404040414042404340444045404640474048404940504051405240534054405540564057405840594060406140624063406440654066406740684069407040714072407340744075407640774078407940804081408240834084408540864087408840894090409140924093409440954096409740984099410041014102410341044105410641074108410941104111411241134114411541164117411841194120412141224123412441254126412741284129413041314132413341344135413641374138413941404141414241434144414541464147414841494150415141524153415441554156415741584159416041614162416341644165416641674168416941704171417241734174417541764177417841794180418141824183418441854186418741884189419041914192419341944195419641974198419942004201420242034204420542064207420842094210421142124213421442154216421742184219422042214222422342244225422642274228422942304231423242334234423542364237423842394240424142424243424442454246424742484249425042514252425342544255425642574258425942604261426242634264426542664267426842694270427142724273427442754276427
74278427942804281428242834284428542864287428842894290429142924293429442954296429742984299430043014302430343044305430643074308430943104311431243134314431543164317431843194320432143224323432443254326432743284329433043314332433343344335433643374338433943404341434243434344434543464347434843494350435143524353435443554356435743584359436043614362436343644365436643674368436943704371437243734374437543764377437843794380438143824383438443854386438743884389439043914392439343944395439643974398439944004401440244034404440544064407440844094410441144124413441444154416441744184419442044214422442344244425442644274428442944304431443244334434443544364437443844394440444144424443444444454446444744484449445044514452445344544455445644574458445944604461446244634464446544664467446844694470447144724473447444754476447744784479448044814482448344844485448644874488448944904491449244934494449544964497449844994500450145024503450445054506450745084509451045114512451345144515451645174518451945204521452245234524452545264527452845294530453145324533453445354536453745384539454045414542454345444545454645474548454945504551455245534554455545564557455845594560456145624563456445654566456745684569457045714572457345744575457645774578457945804581458245834584458545864587458845894590459145924593459445954596459745984599460046014602460346044605460646074608460946104611461246134614461546164617461846194620462146224623462446254626462746284629463046314632463346344635463646374638463946404641464246434644464546464647464846494650465146524653465446554656465746584659466046614662466346644665466646674668466946704671467246734674467546764677467846794680468146824683468446854686468746884689469046914692469346944695469646974698469947004701470247034704470547064707470847094710471147124713471447154716471747184719472047214722472347244725472647274728472947304731473247334734473547364737473847394740474147424743474447454746474747484749475047514752475347544755475647574758475947604761476247634764476547664767476847694770477147724773477447754776477
74778477947804781478247834784478547864787478847894790479147924793479447954796479747984799480048014802480348044805480648074808480948104811481248134814481548164817481848194820482148224823482448254826482748284829483048314832483348344835483648374838483948404841484248434844484548464847484848494850485148524853485448554856485748584859486048614862486348644865486648674868486948704871487248734874487548764877487848794880488148824883488448854886488748884889489048914892489348944895489648974898489949004901490249034904490549064907490849094910491149124913491449154916491749184919492049214922492349244925492649274928492949304931493249334934493549364937493849394940494149424943494449454946494749484949495049514952495349544955495649574958495949604961496249634964496549664967496849694970497149724973497449754976497749784979498049814982498349844985498649874988498949904991499249934994499549964997499849995000500150025003500450055006500750085009501050115012501350145015501650175018501950205021502250235024502550265027502850295030503150325033503450355036503750385039504050415042504350445045504650475048504950505051505250535054505550565057505850595060506150625063506450655066506750685069507050715072507350745075507650775078507950805081508250835084508550865087508850895090509150925093509450955096509750985099510051015102510351045105510651075108510951105111511251135114511551165117511851195120512151225123512451255126512751285129513051315132513351345135513651375138513951405141514251435144514551465147514851495150515151525153515451555156515751585159516051615162516351645165516651675168516951705171517251735174517551765177517851795180518151825183518451855186518751885189519051915192519351945195519651975198519952005201520252035204520552065207520852095210521152125213521452155216521752185219522052215222522352245225522652275228522952305231523252335234523552365237523852395240524152425243524452455246524752485249525052515252525352545255525652575258525952605261526252635264526552665267526852695270527152725273527452755276527
75278527952805281528252835284528552865287528852895290529152925293529452955296529752985299530053015302530353045305530653075308530953105311531253135314531553165317531853195320532153225323532453255326532753285329533053315332533353345335533653375338533953405341534253435344534553465347534853495350535153525353535453555356535753585359536053615362536353645365536653675368536953705371537253735374537553765377537853795380538153825383538453855386538753885389539053915392539353945395539653975398539954005401540254035404540554065407540854095410541154125413541454155416541754185419542054215422542354245425542654275428542954305431543254335434543554365437543854395440544154425443544454455446544754485449545054515452545354545455545654575458545954605461546254635464546554665467546854695470547154725473547454755476547754785479548054815482548354845485548654875488548954905491549254935494549554965497549854995500550155025503550455055506550755085509551055115512551355145515551655175518551955205521552255235524552555265527552855295530553155325533553455355536553755385539554055415542554355445545554655475548554955505551555255535554555555565557555855595560556155625563556455655566556755685569557055715572557355745575557655775578557955805581558255835584558555865587558855895590559155925593559455955596559755985599560056015602560356045605560656075608560956105611561256135614561556165617561856195620562156225623562456255626562756285629563056315632563356345635563656375638563956405641564256435644564556465647564856495650565156525653565456555656565756585659566056615662566356645665566656675668566956705671567256735674567556765677567856795680568156825683568456855686568756885689569056915692569356945695569656975698569957005701570257035704570557065707570857095710571157125713571457155716571757185719572057215722572357245725572657275728
  1. #! /usr/bin/env perl
  2. # Copyright 2019-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. #========================================================================
  10. # Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
  11. # derived from https://github.com/ARM-software/AArch64cryptolib, original
  12. # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
  13. # licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
  14. # obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
  15. #========================================================================
  16. #
  17. # Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
  18. #
  19. # main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
  20. #
  21. # ____________________________________________________
  22. # | |
  23. # | PRE |
  24. # |____________________________________________________|
  25. # | | | |
  26. # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
  27. # |________________|________________|__________________|
  28. # | | | |
  29. # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
  30. # |________________|________________|__________________|
  31. # | | | |
  32. # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
  33. # |________________|________________|__________________|
  34. # | | | |
  35. # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
  36. # |________________|____(mostly)____|__________________|
  37. # | |
  38. # | MODULO |
  39. # |____________________________________________________|
  40. #
  41. # PRE:
  42. # Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 4k+0
  43. # EXT low_acc, low_acc, low_acc, #8
  44. # EOR res_curr (4k+0), res_curr (4k+0), low_acc
  45. #
  46. # CTR block:
  47. # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
  48. # REV ctr32, rev_ctr32
  49. # ORR ctr64, constctr96_top32, ctr32, LSL #32
  50. # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
  51. # INS ctr_next.d[1], ctr64X
  52. # ADD rev_ctr32, #1
  53. #
  54. # AES block:
  55. # Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key is used in the example below.
  56. # A small trick is used here: load the input into scalar registers, EOR it with the last round key, and only then transfer it to the SIMD registers.
  57. # Given how constrained we are in ASIMD registers, this is quite important.
  58. #
  59. # Encrypt:
  60. # LDR input_low, [ input_ptr ], #8
  61. # LDR input_high, [ input_ptr ], #8
  62. # EOR input_low, k14_low
  63. # EOR input_high, k14_high
  64. # INS res_curr.d[0], input_low
  65. # INS res_curr.d[1], input_high
  66. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  67. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  68. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  69. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  70. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  71. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  72. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  73. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  74. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  75. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  76. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  77. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  78. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  79. # AESE ctr_curr, k13
  80. # EOR res_curr, res_curr, ctr_curr
  81. # ST1 { res_curr.16b }, [ output_ptr ], #16
  82. #
  83. # Decrypt:
  84. # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
  85. # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
  86. # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
  87. # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
  88. # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
  89. # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
  90. # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
  91. # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
  92. # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
  93. # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
  94. # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
  95. # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
  96. # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
  97. # AESE ctr_curr, k13
  98. # LDR res_curr, [ input_ptr ], #16
  99. # EOR res_curr, res_curr, ctr_curr
  100. # MOV output_low, res_curr.d[0]
  101. # MOV output_high, res_curr.d[1]
  102. # EOR output_low, k14_low
  103. # EOR output_high, k14_high
  104. # STP output_low, output_high, [ output_ptr ], #16
  105. #
  106. # GHASH block X:
  107. # do 128b karatsuba polynomial multiplication on block
  108. # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
  109. #
  110. # multiplication:
  111. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
  112. #
  113. # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
  114. # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
  115. #
  116. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
  117. # multiplying with "twisted" powers of H
  118. #
  119. # Note: We can PMULL directly into the acc_x in first GHASH of the loop
  120. # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
  121. # path latency dominates the performance
  122. #
  123. # This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
  124. # than indicated here
  125. # REV64 res_curr, res_curr
  126. # INS t_m.d[0], res_curr.d[1]
  127. # EOR t_m.8B, t_m.8B, res_curr.8B
  128. # PMULL2 t_h, res_curr, HX
  129. # PMULL t_l, res_curr, HX
  130. # PMULL t_m, t_m, HX_k
  131. # EOR acc_h, acc_h, t_h
  132. # EOR acc_l, acc_l, t_l
  133. # EOR acc_m, acc_m, t_m
  134. #
  135. # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
  136. # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
  137. # with a reversed constant
  138. # EOR acc_m, acc_m, acc_h
  139. # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
  140. # PMULL t_mod, acc_h, mod_constant
  141. # EXT acc_h, acc_h, acc_h, #8
  142. # EOR acc_m, acc_m, acc_h
  143. # EOR acc_m, acc_m, t_mod
  144. # PMULL acc_h, acc_m, mod_constant
  145. # EXT acc_m, acc_m, acc_m, #8
  146. # EOR acc_l, acc_l, acc_h
  147. # EOR acc_l, acc_l, acc_m
  # Command line: an optional leading flavour (first argument that contains
  # no '.') and an optional trailing output file (last argument ending in
  # ".<word>"). Either may be absent, in which case it stays undef.
  148. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  149. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  # Locate the arm-xlate.pl translator: first next to this script, then in
  # ../../perlasm/ relative to it. $dir keeps its trailing path separator.
  150. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  151. ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  152. ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
  153. die "can't locate arm-xlate.pl";
  # Pipe everything printed to STDOUT through arm-xlate.pl, run with the same
  # perl binary ($^X). Check the result of open: if the pipe cannot be
  # started, stop here instead of silently printing into a dead handle.
  154. open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
  155. *STDOUT=*OUT;
  # Register names that are interpolated into the assembly templates below.
  # x0-x3 are used as they arrive; the other two are working copies.
  156. $input_ptr="x0"; #argument block
  157. $bit_length="x1"; # length in bits: the kernel shifts it right by 3 to get the byte length
  158. $output_ptr="x2";
  159. $current_tag="x3"; # the accumulator is loaded from here; h4 is read at offset #112 from the same base
  160. $counter="x16"; # the 128-bit encrypt kernel copies x4 here on entry and reads the counter block through it
  161. $cc="x8"; # the 128-bit encrypt kernel copies x5 here on entry; round keys are loaded relative to it
  162. {
  # Lexical register aliases for the assembly templates in this scope.
  # Several names below deliberately map to the same physical register, so
  # aliased names cannot hold live values at the same time.
  #
  # Scalar registers x4-x7: the two end-of-input pointers and the pair used
  # for block 0 (low/high halves, judging by the names).
  163. my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
  # x19-x24: the input_* and output_* names on the next two lines are two
  # views of the same six registers.
  164. my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
  165. my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
  # output_l0/output_h0 share x6/x7 with input_l0/input_h0.
  166. my ($output_l0,$output_h0)=map("x$_",(6..7));
  # w9 is the 32-bit view of x9, which is named ctr32x on the next line.
  167. my $ctr32w="w9";
  # x9-x15 in order: ctr32x, ctr96_b64x, ctr96_t32x, rctr32x, rk10_l, rk10_h, len.
  # rk10 lives in a scalar pair (x13/x14) rather than in a vector register.
  168. my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
  # 32-bit views of ctr96_t32x (x11) and rctr32x (x12).
  169. my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
  # v0-v3 are the counter blocks and v4-v7 the result blocks; the next four
  # lines name the same registers as .16b vectors, bare vN, dN and qN operands.
  170. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
  171. my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
  172. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
  173. my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
  # v9-v11: the acc_h / acc_m / acc_l accumulators described in the file
  # header, again in three operand forms.
  174. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
  175. my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
  176. my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
  # v12-v15: h1..h4; v16/v17: h12k/h34k.
  # NOTE(review): h12k/h34k presumably correspond to the HX_k operand of the
  # header's Karatsuba pseudo-code - confirm against the code that loads them.
  177. my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
  178. my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
  179. my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
  # Temporaries t0-t9. Physical registers are reused:
  #   v8  : t0, t4 (and mod_constant, declared further down)
  #   v28 : t1, t5      v29 : t2, t8      v30 : t3, t9
  #   v31 : t6 (and mod_t, declared further down)
  #   v4  : t7 (the same register as res0 and ctr_t0)
  180. my $t0="v8";
  181. my $t0d="d8";
  182. my ($t1,$t2,$t3)=map("v$_",(28..30));
  183. my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
  184. my $t4="v8";
  185. my $t4d="d8";
  186. my $t5="v28";
  187. my $t5d="d28";
  188. my $t6="v31";
  189. my $t6d="d31";
  190. my $t7="v4";
  191. my $t7d="d4";
  192. my $t8="v29";
  193. my $t8d="d29";
  194. my $t9="v30";
  195. my $t9d="d30";
  # ctr_t0-ctr_t3 are v4-v7, i.e. the same registers as res0-res3.
  196. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
  197. my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
  198. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
  # Reduction constant and its scratch register; they share v8 and v31 with
  # the temporaries listed above.
  199. my $mod_constantd="d8";
  200. my $mod_constant="v8";
  201. my $mod_t="v31";
  # Round keys rk0-rk9 occupy v18-v27 (as .16b vectors and as qN operands).
  202. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
  203. my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
  # Extra operand spellings for rk2 (v20), rk3 (v21) and rk4 (v22).
  204. my $rk2q1="v20.1q";
  205. my $rk3q1="v21.1q";
  206. my $rk4v="v22";
  207. my $rk4d="d22";
  # Start the generated assembly: include arm_arch.h and open a preprocessor
  # guard so the body is assembled only when __ARM_MAX_ARCH__ is at least 8.
  # The guard is not closed in this prologue; the matching #endif is expected
  # later in the generated output.
  208. $code=<<___;
  209. #include "arm_arch.h"
  210. #if __ARM_MAX_ARCH__>=8
  211. ___
  # Flavours matching /64/: enable the crypto extension and switch to .text.
  212. $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
  # All other flavours: select the NEON FPU, Thumb-2 or ARM (.code 32) mode,
  # and define an INST() macro that emits an instruction as raw bytes; the
  # byte order and the fixed byte differ between the two modes.
  # NOTE(review): $_byte is interpolated into this heredoc but is not assigned
  # anywhere in the visible part of the file - confirm it is defined before
  # this point, otherwise it expands to an empty string.
  213. $code.=<<___ if ($flavour !~ /64/);
  214. .fpu neon
  215. #ifdef __thumb2__
  216. .syntax unified
  217. .thumb
  218. # define INST(a,b,c,d) $_byte c,0xef,a,b
  219. #else
  220. .code 32
  221. # define INST(a,b,c,d) $_byte a,b,c,0xf2
  222. #endif
  223. .text
  224. ___
  #########################################################################################
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
#                               size_t len,
#                               unsigned char *out,
#                               const void *key,
#                               unsigned char ivec[16],
#                               u64 *Xi);
#
# AES-128-GCM encryption kernel: AES-CTR encryption of the input, interleaved
# with the GHASH update over the ciphertext it produces.
#
# What the code below does:
#  - returns 0 immediately when x1 is zero; otherwise returns the input
#    length in bytes ($bit_length >> 3, kept in $len);
#  - saves and restores x19-x24 and d8-d15 in a 112-byte stack frame;
#  - loads rk0..rk9 into vector registers from [$cc, #0 .. #144] and rk10
#    into the register pair $rk10_l/$rk10_h; the last round key is XORed
#    into the plaintext in general-purpose registers before that value is
#    XORed with the round-9 AES state;
#  - loads the running tag from [$current_tag] and the hash key values
#    h1..h4 from offsets #32, #64, #80 and #112 of the same pointer;
#  - on exit stores the 32-bit block counter at [$counter, #12] and the
#    updated tag at [$current_tag].
#
# NOTE(review): `len` is consumed as a BIT count (it is shifted right by 3
# to obtain bytes and reduced modulo 128 for the final block mask).
# NOTE(review): x4 and x5 are copied to x16 and x8 on entry, presumably
# because $counter/$cc are aliased to those registers; check the prototype
# above against the aliases defined earlier in this file and against the C
# caller - the round keys are read through $cc and the tag/hash keys through
# $current_tag.
# NOTE: the tail always loads 16 bytes of input for the final block and
# loads/stores a full 16 bytes at the output position, even when the final
# block is partial.  Callers must make sure those bytes are accessible.
#
# The assembly is emitted as several consecutive heredocs purely so that the
# sections can be commented here; the concatenated text is one function.

# --- Prologue -------------------------------------------------------------
# Builds counter blocks 0..3, runs AES rounds 0..9 on them and prepares the
# combined hash keys $h12k/$h34k.  If no full 64-byte chunk has to be handled
# by the main path (at least one byte is always left to the tail) it branches
# to the tail; otherwise it encrypts and stores the first four blocks and
# sets up the next counter blocks.
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type aes_gcm_enc_128_kernel,%function
.align 4
aes_gcm_enc_128_kernel:
AARCH64_VALID_CALL_TARGET
cbz x1, .L128_enc_ret
stp x19, x20, [sp, #-112]!
mov x16, x4
mov x8, x5
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp d8, d9, [sp, #48]
stp d10, d11, [sp, #64]
stp d12, d13, [sp, #80]
stp d14, d15, [sp, #96]
ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
ld1 {$acc_lb}, [$current_tag]
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
mov $len, $main_end_input_ptr
ldr $rk9q, [$cc, #144] @ load rk9
add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
lsr $rctr32x, $ctr96_t32x, #32
ldr $h4q, [$current_tag, #112] @ load h4l | h4h
ext $h4b, $h4b, $h4b, #8
fmov $ctr1d, $ctr96_b64x @ CTR block 1
rev $rctr32w, $rctr32w @ rev_ctr32
add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
ldr $rk0q, [$cc, #0] @ load rk0
rev $ctr32w, $rctr32w @ CTR block 1
add $rctr32w, $rctr32w, #1 @ CTR block 1
fmov $ctr3d, $ctr96_b64x @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
fmov $ctr1.d[1], $ctr32x @ CTR block 1
rev $ctr32w, $rctr32w @ CTR block 2
fmov $ctr2d, $ctr96_b64x @ CTR block 2
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
add $rctr32w, $rctr32w, #1 @ CTR block 2
fmov $ctr2.d[1], $ctr32x @ CTR block 2
rev $ctr32w, $rctr32w @ CTR block 3
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
ldr $rk1q, [$cc, #16] @ load rk1
add $rctr32w, $rctr32w, #1 @ CTR block 3
fmov $ctr3.d[1], $ctr32x @ CTR block 3
ldr $h3q, [$current_tag, #80] @ load h3l | h3h
ext $h3b, $h3b, $h3b, #8
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
ldr $rk2q, [$cc, #32] @ load rk2
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
ldr $h1q, [$current_tag, #32] @ load h1l | h1h
ext $h1b, $h1b, $h1b, #8
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
ldr $rk8q, [$cc, #128] @ load rk8
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
ldr $rk3q, [$cc, #48] @ load rk3
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
ldr $rk6q, [$cc, #96] @ load rk6
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
ldr $rk7q, [$cc, #112] @ load rk7
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
ldr $rk5q, [$cc, #80] @ load rk5
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
ldr $h2q, [$current_tag, #64] @ load h2l | h2h
ext $h2b, $h2b, $h2b, #8
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
ldr $rk4q, [$cc, #64] @ load rk4
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
aese $ctr2b, $rk9 @ AES block 2 - round 9
aese $ctr0b, $rk9 @ AES block 0 - round 9
eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
aese $ctr1b, $rk9 @ AES block 1 - round 9
aese $ctr3b, $rk9 @ AES block 3 - round 9
b.ge .L128_enc_tail @ handle tail
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
rev $ctr32w, $rctr32w @ CTR block 4
fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4
add $rctr32w, $rctr32w, #1 @ CTR block 4
fmov $ctr0.d[1], $ctr32x @ CTR block 4
rev $ctr32w, $rctr32w @ CTR block 5
eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
fmov $ctr1d, $ctr96_b64x @ CTR block 5
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
add $rctr32w, $rctr32w, #1 @ CTR block 5
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr1.d[1], $ctr32x @ CTR block 5
fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
rev $ctr32w, $rctr32w @ CTR block 6
st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
add $rctr32w, $rctr32w, #1 @ CTR block 6
eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
fmov $ctr2d, $ctr96_b64x @ CTR block 6
cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
fmov $ctr2.d[1], $ctr32x @ CTR block 6
rev $ctr32w, $rctr32w @ CTR block 7
st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
b.ge .L128_enc_prepretail @ do prepretail
___

# --- Main loop --------------------------------------------------------------
# Each iteration hashes the four ciphertext blocks produced by the previous
# step (still held in $res0..$res3), reduces the GHASH accumulators, and
# encrypts and stores the next 64 bytes.  It repeats while
# $input_ptr < $main_end_input_ptr.
$code.=<<___;
.L128_enc_main_loop: @ main loop start
ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $res0b, $res0b, $acc_lb @ PRE 1
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
rev $ctr32w, $rctr32w @ CTR block 4k+8
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
rev $ctr32w, $rctr32w @ CTR block 4k+11
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
b.lt .L128_enc_main_loop
___

# --- Prepretail -------------------------------------------------------------
# Hashes the last four ciphertext blocks that were stored, reduces the
# accumulators, and runs AES rounds 0..9 on the counter blocks the tail will
# consume.  No input is loaded and nothing is stored here.
$code.=<<___;
.L128_enc_prepretail: @ PREPRETAIL
rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
eor $res0b, $res0b, $acc_lb @ PRE 1
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
pmull $t1.1q, $acc_h.1d, $mod_constant.1d
eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
ext $acc_hb, $acc_hb, $acc_hb, #8
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
eor $acc_mb, $acc_mb, $acc_lb
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t1.16b
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
eor $acc_mb, $acc_mb, $acc_hb
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
pmull $t1.1q, $acc_m.1d, $mod_constant.1d
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
ext $acc_mb, $acc_mb, $acc_mb, #8
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $acc_lb, $acc_lb, $t1.16b
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
eor $acc_lb, $acc_lb, $acc_mb
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
___

# --- Tail: full blocks before the last one ----------------------------------
# $main_end_input_ptr is reused as the number of bytes still to process.
# The first remaining block is always loaded and encrypted; the comparisons
# against 48/32/16 bytes select how many of the final-3/final-2/final-1
# handlers run.  Each handler stores one full ciphertext block, folds it into
# the GHASH accumulators and prepares the next block.  On the shorter paths
# the keystream registers are shuffled down and $rctr32w is decremented,
# presumably to undo the counter increments of keystream blocks that were
# prepared but are not consumed.
$code.=<<___;
.L128_enc_tail: @ TAIL
sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
cmp $main_end_input_ptr, #48
ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
b.gt .L128_enc_blocks_more_than_3
sub $rctr32w, $rctr32w, #1
movi $acc_l.8b, #0
mov $ctr3b, $ctr2b
cmp $main_end_input_ptr, #32
mov $ctr2b, $ctr1b
movi $acc_h.8b, #0
movi $acc_m.8b, #0
b.gt .L128_enc_blocks_more_than_2
mov $ctr3b, $ctr1b
cmp $main_end_input_ptr, #16
sub $rctr32w, $rctr32w, #1
b.gt .L128_enc_blocks_more_than_1
sub $rctr32w, $rctr32w, #1
b .L128_enc_blocks_less_than_1
.L128_enc_blocks_more_than_3: @ blocks left > 3
st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
rev64 $res0b, $res1b @ GHASH final-3 block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
fmov $res1d, $input_l0 @ AES final-2 block - mov low
movi $t0.8b, #0 @ suppress further partial tag feed in
fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
.L128_enc_blocks_more_than_2: @ blocks left > 2
st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
rev64 $res0b, $res1b @ GHASH final-2 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
fmov $res1d, $input_l0 @ AES final-1 block - mov low
eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
.L128_enc_blocks_more_than_1: @ blocks left > 1
st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
rev64 $res0b, $res1b @ GHASH final-1 block
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
eor $res0b, $res0b, $t0.16b @ feed in partial tag
eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
fmov $res1d, $input_l0 @ AES final block - mov low
pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
fmov $res1.d[1], $input_h0 @ AES final block - mov high
mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
eor $res1b, $res1b, $ctr3b @ AES final block - result
ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
movi $t0.8b, #0 @ suppress further partial tag feed in
___

# --- Tail: last (possibly partial) block, reduction and epilogue -------------
# Builds a byte mask from the bit length modulo 128, clears the ciphertext
# bytes beyond the end of the message before they are hashed, merges the
# bytes already present at the output position back in (bif) and stores a
# full 16 bytes.  Then the final GHASH reduction is done, the counter and the
# tag are written back, the saved registers are restored and $len is
# returned.  .L128_enc_ret is the early exit taken when x1 is zero.
$code.=<<___;
.L128_enc_blocks_less_than_1: @ blocks left <= 1
and $bit_length, $bit_length, #127 @ bit_length %= 128
mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
sub $bit_length, $bit_length, #128 @ bit_length -= 128
neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
and $bit_length, $bit_length, #127 @ bit_length %= 128
lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
cmp $bit_length, #64
csel $input_l0, $rk10_l, $rk10_h, lt
csel $input_h0, $rk10_h, xzr, lt
fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
fmov $ctr0.d[1], $input_h0
and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
rev64 $res0b, $res1b @ GHASH final block
eor $res0b, $res0b, $t0.16b @ feed in partial tag
mov $t0d, $res0.d[1] @ GHASH final block - mid
pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
rev $ctr32w, $rctr32w
pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
movi $mod_constant.8b, #0xc2
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
st1 { $res1b}, [$output_ptr] @ store all 16B
str $ctr32w, [$counter, #12] @ store the updated counter
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
ext $acc_lb, $acc_lb, $acc_lb, #8
rev64 $acc_lb, $acc_lb
mov x0, $len
st1 { $acc_l.16b }, [$current_tag]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp d8, d9, [sp, #48]
ldp d10, d11, [sp, #64]
ldp d12, d13, [sp, #80]
ldp d14, d15, [sp, #96]
ldp x19, x20, [sp], #112
ret
.L128_enc_ret:
mov w0, #0x0
ret
.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
___
  767. #########################################################################################
  768. # size_t aes_gcm_dec_128_kernel(const unsigned char *in,
  769. # size_t len,
  770. # unsigned char *out,
  771. # const void *key,
  772. # unsigned char ivec[16],
  773. # u64 *Xi);
  774. #
  775. $code.=<<___;
  776. .global aes_gcm_dec_128_kernel
  777. .type aes_gcm_dec_128_kernel,%function
  778. .align 4
  779. aes_gcm_dec_128_kernel:
  780. AARCH64_VALID_CALL_TARGET
  781. cbz x1, .L128_dec_ret
  782. stp x19, x20, [sp, #-112]!
  783. mov x16, x4
  784. mov x8, x5
  785. stp x21, x22, [sp, #16]
  786. stp x23, x24, [sp, #32]
  787. stp d8, d9, [sp, #48]
  788. stp d10, d11, [sp, #64]
  789. stp d12, d13, [sp, #80]
  790. stp d14, d15, [sp, #96]
  791. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  792. mov $len, $main_end_input_ptr
  793. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  794. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  795. ldr $rk0q, [$cc, #0] @ load rk0
  796. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  797. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  798. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  799. ext $h2b, $h2b, $h2b, #8
  800. lsr $rctr32x, $ctr96_t32x, #32
  801. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  802. ldr $rk1q, [$cc, #16] @ load rk1
  803. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  804. rev $rctr32w, $rctr32w @ rev_ctr32
  805. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  806. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  807. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  808. rev $ctr32w, $rctr32w @ CTR block 1
  809. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  810. ldr $rk2q, [$cc, #32] @ load rk2
  811. add $rctr32w, $rctr32w, #1 @ CTR block 1
  812. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  813. rev $ctr32w, $rctr32w @ CTR block 2
  814. add $rctr32w, $rctr32w, #1 @ CTR block 2
  815. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  816. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  817. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  818. rev $ctr32w, $rctr32w @ CTR block 3
  819. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  820. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  821. add $rctr32w, $rctr32w, #1 @ CTR block 3
  822. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  823. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  824. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  825. ldr $rk3q, [$cc, #48] @ load rk3
  826. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  827. ldr $rk6q, [$cc, #96] @ load rk6
  828. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  829. ldr $rk7q, [$cc, #112] @ load rk7
  830. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  831. ldr $rk4q, [$cc, #64] @ load rk4
  832. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  833. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  834. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  835. ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
  836. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  837. ld1 { $acc_lb}, [$current_tag]
  838. ext $acc_lb, $acc_lb, $acc_lb, #8
  839. rev64 $acc_lb, $acc_lb
  840. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  841. ldr $rk5q, [$cc, #80] @ load rk5
  842. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  843. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  844. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  845. ldr $rk9q, [$cc, #144] @ load rk9
  846. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  847. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  848. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  849. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  850. ext $h3b, $h3b, $h3b, #8
  851. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  852. ldr $rk8q, [$cc, #128] @ load rk8
  853. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  854. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  855. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  856. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  857. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  858. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  859. ext $h1b, $h1b, $h1b, #8
  860. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  861. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  862. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  863. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  864. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  865. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  866. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  867. ext $h4b, $h4b, $h4b, #8
  868. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  869. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  870. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  871. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  872. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  873. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  874. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  875. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  876. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  877. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  878. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  879. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  880. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  881. aese $ctr2b, $rk9 @ AES block 2 - round 9
  882. aese $ctr3b, $rk9 @ AES block 3 - round 9
  883. aese $ctr0b, $rk9 @ AES block 0 - round 9
  884. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  885. aese $ctr1b, $rk9 @ AES block 1 - round 9
  886. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  887. b.ge .L128_dec_tail @ handle tail
  888. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  889. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  890. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  891. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  892. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  893. rev64 $res0b, $res0b @ GHASH block 0
  894. rev $ctr32w, $rctr32w @ CTR block 4
  895. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  896. add $rctr32w, $rctr32w, #1 @ CTR block 4
  897. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  898. rev64 $res1b, $res1b @ GHASH block 1
  899. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  900. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  901. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  902. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  903. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  904. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  905. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  906. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  907. rev $ctr32w, $rctr32w @ CTR block 5
  908. eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
  909. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  910. add $rctr32w, $rctr32w, #1 @ CTR block 5
  911. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  912. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  913. rev $ctr32w, $rctr32w @ CTR block 6
  914. add $rctr32w, $rctr32w, #1 @ CTR block 6
  915. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  916. eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
  917. eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
  918. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  919. eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
  920. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  921. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  922. b.ge .L128_dec_prepretail @ do prepretail
  923. .L128_dec_main_loop: @ main loop start
  924. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  925. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  926. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  927. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  928. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  929. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  930. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  931. rev64 $res2b, $res2b @ GHASH block 4k+2
  932. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  933. rev $ctr32w, $rctr32w @ CTR block 4k+7
  934. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  935. eor $res0b, $res0b, $acc_lb @ PRE 1
  936. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  937. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  938. rev64 $res3b, $res3b @ GHASH block 4k+3
  939. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  940. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  941. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  942. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  943. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  944. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  945. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  946. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  947. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  948. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  949. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  950. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  951. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  952. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  953. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  954. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  955. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  956. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  957. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  958. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  959. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  960. eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
  961. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  962. eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
  963. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  964. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  965. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  966. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  967. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  968. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  969. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  970. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  971. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  972. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  973. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  974. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  975. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  976. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  977. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  978. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  979. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  980. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  981. eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
  982. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  983. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  984. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  985. eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
  986. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  987. movi $mod_constant.8b, #0xc2
  988. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  989. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  990. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  991. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  992. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  993. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  994. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  995. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  996. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  997. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  998. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  999. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  1000. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1001. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1002. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1003. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1004. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1005. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  1006. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1007. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1008. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1009. rev $ctr32w, $rctr32w @ CTR block 4k+8
  1010. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1011. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  1012. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1013. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  1014. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  1015. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1016. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1017. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  1018. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1019. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  1020. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1021. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  1022. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  1023. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1024. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  1025. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1026. ldr $res3q, [$input_ptr, #48] @ AES block 4k+3 - load ciphertext
  1027. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1028. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1029. rev64 $res1b, $res1b @ GHASH block 4k+5
  1030. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1031. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  1032. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1033. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  1034. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1035. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  1036. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1037. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  1038. rev $ctr32w, $rctr32w @ CTR block 4k+9
  1039. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  1040. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  1041. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1042. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1043. eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
  1044. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1045. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  1046. eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
  1047. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  1048. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  1049. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  1050. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  1051. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  1052. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1053. rev64 $res0b, $res0b @ GHASH block 4k+4
  1054. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1055. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  1056. rev $ctr32w, $rctr32w @ CTR block 4k+10
  1057. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  1058. eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
  1059. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  1060. eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
  1061. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  1062. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  1063. b.lt L128_dec_main_loop
  1064. .L128_dec_prepretail: @ PREPRETAIL
  1065. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1066. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  1067. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1068. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1069. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  1070. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1071. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  1072. eor $res0b, $res0b, $acc_lb @ PRE 1
  1073. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  1074. rev64 $res2b, $res2b @ GHASH block 4k+2
  1075. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1076. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  1077. rev $ctr32w, $rctr32w @ CTR block 4k+7
  1078. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  1079. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1080. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1081. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1082. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  1083. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1084. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1085. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1086. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  1087. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1088. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1089. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  1090. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1091. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  1092. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1093. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1094. rev64 $res3b, $res3b @ GHASH block 4k+3
  1095. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1096. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1097. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1098. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1099. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1100. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1101. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1102. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1103. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1104. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1105. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1106. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1107. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1108. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1109. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1110. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1111. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1112. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1113. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1114. movi $mod_constant.8b, #0xc2
  1115. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1116. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1117. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1118. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1119. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1120. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1121. eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
  1122. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1123. eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
  1124. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1125. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1126. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1127. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1128. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1129. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1130. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1131. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1132. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1133. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1134. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1135. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1136. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1137. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1138. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1139. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1140. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1141. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1142. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1143. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1144. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1145. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1146. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1147. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1148. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1149. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1150. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1151. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1152. aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
  1153. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1154. eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
  1155. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1156. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1157. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1158. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1159. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1160. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1161. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1162. eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
  1163. aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
  1164. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  1165. aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
  1166. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  1167. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  1168. aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
  1169. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1170. .L128_dec_tail: @ TAIL
  1171. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1172. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  1173. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  1174. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  1175. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  1176. cmp $main_end_input_ptr, #48
  1177. eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
  1178. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1179. eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
  1180. b.gt .L128_dec_blocks_more_than_3
  1181. mov $ctr3b, $ctr2b
  1182. sub $rctr32w, $rctr32w, #1
  1183. movi $acc_l.8b, #0
  1184. movi $acc_h.8b, #0
  1185. mov $ctr2b, $ctr1b
  1186. movi $acc_m.8b, #0
  1187. cmp $main_end_input_ptr, #32
  1188. b.gt .L128_dec_blocks_more_than_2
  1189. cmp $main_end_input_ptr, #16
  1190. mov $ctr3b, $ctr1b
  1191. sub $rctr32w, $rctr32w, #1
  1192. b.gt .L128_dec_blocks_more_than_1
  1193. sub $rctr32w, $rctr32w, #1
  1194. b .L128_dec_blocks_less_than_1
  1195. .L128_dec_blocks_more_than_3: @ blocks left > 3
  1196. rev64 $res0b, $res1b @ GHASH final-3 block
  1197. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  1198. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1199. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  1200. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  1201. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  1202. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  1203. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  1204. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1205. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  1206. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1207. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1208. movi $t0.8b, #0 @ suppress further partial tag feed in
  1209. eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
  1210. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  1211. eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
  1212. .L128_dec_blocks_more_than_2: @ blocks left > 2
  1213. rev64 $res0b, $res1b @ GHASH final-2 block
  1214. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  1215. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1216. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  1217. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  1218. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  1219. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1220. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1221. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  1222. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  1223. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1224. movi $t0.8b, #0 @ suppress further partial tag feed in
  1225. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1226. eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
  1227. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1228. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1229. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1230. eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
  1231. .L128_dec_blocks_more_than_1: @ blocks left > 1
  1232. rev64 $res0b, $res1b @ GHASH final-1 block
  1233. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  1234. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1235. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  1236. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  1237. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1238. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  1239. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  1240. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  1241. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1242. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1243. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1244. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1245. movi $t0.8b, #0 @ suppress further partial tag feed in
  1246. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1247. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1248. eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
  1249. eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
  1250. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1251. .L128_dec_blocks_less_than_1: @ blocks left <= 1
  1252. mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
  1253. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1254. mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
  1255. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1256. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1257. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1258. lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
  1259. cmp $bit_length, #64
  1260. csel $ctr96_b64x, $rk10_h, xzr, lt
  1261. csel $ctr32x, $rk10_l, $rk10_h, lt
  1262. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  1263. mov $ctr0.d[1], $ctr96_b64x
  1264. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1265. rev64 $res0b, $res1b @ GHASH final block
  1266. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1267. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  1268. and $output_h0, $output_h0, $ctr96_b64x
  1269. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1270. mov $t0d, $res0.d[1] @ GHASH final block - mid
  1271. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1272. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1273. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1274. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1275. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  1276. and $output_l0, $output_l0, $ctr32x
  1277. rev $ctr32w, $rctr32w
  1278. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1279. movi $mod_constant.8b, #0xc2
  1280. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1281. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  1282. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1283. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1284. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1285. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1286. orr $output_l0, $output_l0, $end_input_ptr
  1287. str $ctr32w, [$counter, #12] @ store the updated counter
  1288. orr $output_h0, $output_h0, $main_end_input_ptr
  1289. stp $output_l0, $output_h0, [$output_ptr]
  1290. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1291. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1292. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1293. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1294. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1295. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  1296. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1297. ext $acc_lb, $acc_lb, $acc_lb, #8
  1298. rev64 $acc_lb, $acc_lb
  1299. mov x0, $len
  1300. st1 { $acc_l.16b }, [$current_tag]
  1301. ldp x21, x22, [sp, #16]
  1302. ldp x23, x24, [sp, #32]
  1303. ldp d8, d9, [sp, #48]
  1304. ldp d10, d11, [sp, #64]
  1305. ldp d12, d13, [sp, #80]
  1306. ldp d14, d15, [sp, #96]
  1307. ldp x19, x20, [sp], #112
  1308. ret
  1309. .L128_dec_ret:
  1310. mov w0, #0x0
  1311. ret
  1312. .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
  1313. ___
  1314. }
  1315. {
  # Symbolic register names used by the 192-bit encrypt kernel generated below.
  # Each Perl variable holds the textual name of one AArch64 register (or one
  # view of it) and is interpolated into the assembly heredoc. Several names
  # deliberately map to the SAME physical register, so writing through one
  # name destroys the value held under every other alias of that register.
  #
  # General-purpose registers x4-x7: input end pointers and block-0 data.
  1316. my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
  # x19-x24: low/high 64-bit halves of blocks 1-3. The input_* and output_*
  # names below share these registers; the kernel prologue saves x19-x24 on
  # the stack before using them.
  1317. my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
  1318. my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
  # output_l0/output_h0 alias input_l0/input_h0 (x6/x7).
  1319. my ($output_l0,$output_h0)=map("x$_",(6..7));
  # w9 is the 32-bit view of x9 ($ctr32x).
  1320. my $ctr32w="w9";
  # x9-x15: counter-block assembly scratch (x9), the two 64-bit words loaded
  # from the counter (x10, x11), the byte-reversed 32-bit counter (x12), the
  # last round key rk12 kept as two 64-bit halves (x13 low, x14 high), and
  # $len (x15), which the kernel sets to the byte length of the input.
  1321. my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
  # 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x).
  1322. my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
  # v0-v3: the four counter/keystream blocks; v4-v7: the four result blocks.
  # The b / bare / d / q suffixed names are the .16b, unqualified, low-64-bit
  # and 128-bit spellings of the same vector registers.
  1323. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
  1324. my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
  1325. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
  1326. my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
  # v9-v11: GHASH accumulator, high / middle / low parts.
  1327. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
  1328. my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
  1329. my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
  # v12-v15: hash-key values h1..h4; v16/v17: the combined h12k / h34k values
  # the kernel uses as multiplicands for the GHASH "mid" products.
  1330. my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
  1331. my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
  1332. my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
  # GHASH temporaries. Physical register sharing:
  #   v8  : $t0, $t5, $mod_constant
  #   v30 : $t1, $t4, $t9
  #   v31 : $t2, $t6, $mod_t
  #   v4  : $t3 (also $res0 / $ctr_t0)
  #   v5  : $t7 (also $res1 / $ctr_t1)
  #   v6  : $t8 (also $res2 / $ctr_t2)
  1333. my $t0="v8";
  1334. my $t0d="d8";
  1335. my $t3="v4";
  1336. my $t3d="d4";
  1337. my ($t1,$t2)=map("v$_",(30..31));
  1338. my ($t1d,$t2d)=map("d$_",(30..31));
  1339. my $t4="v30";
  1340. my $t4d="d30";
  1341. my $t5="v8";
  1342. my $t5d="d8";
  1343. my $t6="v31";
  1344. my $t6d="d31";
  1345. my $t7="v5";
  1346. my $t7d="d5";
  1347. my $t8="v6";
  1348. my $t8d="d6";
  1349. my $t9="v30";
  1350. my $t9d="d30";
  # v4-v7 again, under the names used while moving input data from the
  # general-purpose registers into vectors (same registers as $res0..$res3).
  1351. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
  1352. my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
  1353. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
  # Reduction constant and reduction temporary (v8 and v31, shared as listed
  # in the GHASH temporaries table above).
  1354. my $mod_constantd="d8";
  1355. my $mod_constant="v8";
  1356. my $mod_t="v31";
  # v18-v29: AES round keys rk0..rk11 (rk12 lives in x13/x14, see above).
  1357. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
  1358. my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
  # Alternate spellings of the rk2 / rk3 / rk4 registers (v20, v21, v22). The
  # tail code writes GHASH partial products through these names, which
  # overwrites the corresponding round keys.
  1359. my $rk2q1="v20.1q";
  1360. my $rk3q1="v21.1q";
  1361. my $rk4v="v22";
  1362. my $rk4d="d22";
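  1363. #########################################################################################
  1364. # size_t aes_gcm_enc_192_kernel(const unsigned char *in,
  1365. # size_t len,
  1366. # unsigned char *out,
  1367. # const void *key,
  1368. # unsigned char ivec[16],
  1369. # u64 *Xi);
  1370. #
  # NOTE(review): the kernel body derives its byte count as $bit_length >> 3,
  # so the length argument appears to be expressed in bits rather than bytes;
  # confirm the unit (and the argument-to-register mapping) against the caller.
  #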
  1371. $code.=<<___;
  1372. .global aes_gcm_enc_192_kernel
  1373. .type aes_gcm_enc_192_kernel,%function
  1374. .align 4
  1375. aes_gcm_enc_192_kernel:
  1376. AARCH64_VALID_CALL_TARGET
  1377. cbz x1, .L192_enc_ret
  1378. stp x19, x20, [sp, #-112]!
  1379. mov x16, x4
  1380. mov x8, x5
  1381. stp x21, x22, [sp, #16]
  1382. stp x23, x24, [sp, #32]
  1383. stp d8, d9, [sp, #48]
  1384. stp d10, d11, [sp, #64]
  1385. stp d12, d13, [sp, #80]
  1386. stp d14, d15, [sp, #96]
  1387. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  1388. ldr $rk5q, [$cc, #80] @ load rk5
  1389. ldr $rk4q, [$cc, #64] @ load rk4
  1390. ldr $rk8q, [$cc, #128] @ load rk8
  1391. lsr $rctr32x, $ctr96_t32x, #32
  1392. ldr $rk6q, [$cc, #96] @ load rk6
  1393. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  1394. ldr $rk7q, [$cc, #112] @ load rk7
  1395. rev $rctr32w, $rctr32w @ rev_ctr32
  1396. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  1397. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  1398. rev $ctr32w, $rctr32w @ CTR block 1
  1399. add $rctr32w, $rctr32w, #1 @ CTR block 1
  1400. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  1401. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  1402. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  1403. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  1404. rev $ctr32w, $rctr32w @ CTR block 2
  1405. add $rctr32w, $rctr32w, #1 @ CTR block 2
  1406. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  1407. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  1408. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  1409. rev $ctr32w, $rctr32w @ CTR block 3
  1410. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  1411. ldr $rk0q, [$cc, #0] @ load rk0
  1412. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  1413. ldr $rk3q, [$cc, #48] @ load rk3
  1414. ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
  1415. ldr $rk1q, [$cc, #16] @ load rk1
  1416. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1417. ld1 { $acc_lb}, [$current_tag]
  1418. ext $acc_lb, $acc_lb, $acc_lb, #8
  1419. rev64 $acc_lb, $acc_lb
  1420. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1421. ldr $rk11q, [$cc, #176] @ load rk11
  1422. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1423. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1424. ext $h4b, $h4b, $h4b, #8
  1425. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1426. ldr $rk2q, [$cc, #32] @ load rk2
  1427. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1428. ldr $rk10q, [$cc, #160] @ load rk10
  1429. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1430. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  1431. ext $h1b, $h1b, $h1b, #8
  1432. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1433. ldr $rk9q, [$cc, #144] @ load rk9
  1434. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  1435. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1436. ext $h3b, $h3b, $h3b, #8
  1437. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1438. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  1439. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  1440. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  1441. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  1442. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  1443. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  1444. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  1445. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  1446. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  1447. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  1448. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  1449. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  1450. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  1451. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  1452. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  1453. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  1454. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  1455. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  1456. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  1457. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1458. ext $h2b, $h2b, $h2b, #8
  1459. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  1460. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  1461. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  1462. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  1463. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  1464. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  1465. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  1466. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  1467. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  1468. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  1469. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  1470. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  1471. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  1472. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  1473. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  1474. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  1475. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  1476. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  1477. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  1478. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1479. mov $len, $main_end_input_ptr
  1480. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  1481. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  1482. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  1483. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  1484. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  1485. aese $ctr2b, $rk11 @ AES block 2 - round 11
  1486. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1487. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  1488. aese $ctr1b, $rk11 @ AES block 1 - round 11
  1489. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  1490. aese $ctr0b, $rk11 @ AES block 0 - round 11
  1491. add $rctr32w, $rctr32w, #1 @ CTR block 3
  1492. aese $ctr3b, $rk11 @ AES block 3 - round 11
  1493. b.ge .L192_enc_tail @ handle tail
  1494. rev $ctr32w, $rctr32w @ CTR block 4
  1495. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
  1496. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  1497. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
  1498. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
  1499. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
  1500. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1501. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  1502. eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
  1503. eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
  1504. eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
  1505. fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
  1506. eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
  1507. fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
  1508. eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
  1509. eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
  1510. fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
  1511. eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
  1512. fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
  1513. eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
  1514. fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
  1515. add $rctr32w, $rctr32w, #1 @ CTR block 4
  1516. eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
  1517. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  1518. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  1519. rev $ctr32w, $rctr32w @ CTR block 5
  1520. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  1521. add $rctr32w, $rctr32w, #1 @ CTR block 5
  1522. fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
  1523. st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
  1524. fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
  1525. eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
  1526. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  1527. st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
  1528. fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
  1529. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  1530. rev $ctr32w, $rctr32w @ CTR block 6
  1531. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  1532. add $rctr32w, $rctr32w, #1 @ CTR block 6
  1533. eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
  1534. fmov $ctr2d, $ctr96_b64x @ CTR block 6
  1535. fmov $ctr2.d[1], $ctr32x @ CTR block 6
  1536. rev $ctr32w, $rctr32w @ CTR block 7
  1537. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
  1538. st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
  1539. eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
  1540. st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
  1541. b.ge .L192_enc_prepretail @ do prepretail
  1542. .L192_enc_main_loop: @ main loop start
  1543. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1544. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  1545. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1546. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
  1547. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1548. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  1549. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  1550. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1551. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  1552. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1553. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  1554. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
  1555. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1556. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
  1557. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1558. eor $res0b, $res0b, $acc_lb @ PRE 1
  1559. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1560. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1561. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  1562. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1563. eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
  1564. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1565. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1566. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1567. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1568. eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
  1569. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1570. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1571. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1572. eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
  1573. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1574. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1575. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1576. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1577. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1578. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1579. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1580. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1581. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1582. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1583. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1584. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1585. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1586. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1587. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1588. eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
  1589. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1590. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1591. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  1592. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1593. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1594. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1595. eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
  1596. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1597. eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
  1598. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1599. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1600. rev $ctr32w, $rctr32w @ CTR block 4k+8
  1601. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1602. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  1603. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1604. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1605. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1606. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
  1607. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1608. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1609. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1610. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  1611. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1612. movi $mod_constant.8b, #0xc2
  1613. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1614. eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
  1615. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1616. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1617. eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
  1618. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1619. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1620. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1621. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1622. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1623. fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
  1624. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1625. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1626. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1627. fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
  1628. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1629. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1630. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1631. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  1632. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  1633. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1634. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  1635. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1636. fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
  1637. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1638. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1639. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  1640. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1641. fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
  1642. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1643. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1644. fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
  1645. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1646. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  1647. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1648. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1649. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1650. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  1651. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  1652. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1653. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  1654. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  1655. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  1656. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  1657. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1658. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  1659. eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  1660. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  1661. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  1662. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  1663. rev $ctr32w, $rctr32w @ CTR block 4k+9
  1664. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1665. fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
  1666. st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
  1667. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  1668. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  1669. eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
  1670. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  1671. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  1672. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  1673. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  1674. rev $ctr32w, $rctr32w @ CTR block 4k+10
  1675. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  1676. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1677. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  1678. st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
  1679. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  1680. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  1681. eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
  1682. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
  1683. st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
  1684. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
  1685. rev $ctr32w, $rctr32w @ CTR block 4k+11
  1686. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1687. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
  1688. eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
  1689. st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
  1690. b.lt .L192_enc_main_loop
  1691. .L192_enc_prepretail: @ PREPRETAIL
  1692. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  1693. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  1694. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  1695. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  1696. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  1697. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  1698. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  1699. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  1700. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  1701. eor $res0b, $res0b, $acc_lb @ PRE 1
  1702. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  1703. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  1704. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  1705. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  1706. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  1707. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  1708. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  1709. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  1710. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  1711. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  1712. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  1713. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  1714. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  1715. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  1716. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  1717. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  1718. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  1719. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  1720. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  1721. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  1722. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  1723. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  1724. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  1725. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  1726. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  1727. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  1728. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  1729. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  1730. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  1731. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  1732. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  1733. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  1734. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  1735. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  1736. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  1737. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  1738. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  1739. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  1740. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  1741. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  1742. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  1743. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  1744. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  1745. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  1746. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  1747. movi $mod_constant.8b, #0xc2
  1748. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  1749. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  1750. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  1751. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  1752. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  1753. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  1754. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  1755. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  1756. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  1757. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  1758. eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
  1759. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  1760. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  1761. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1762. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  1763. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  1764. eor $acc_mb, $acc_mb, $acc_lb
  1765. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  1766. pmull $t1.1q, $acc_h.1d, $mod_constant.1d
  1767. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  1768. ext $acc_hb, $acc_hb, $acc_hb, #8
  1769. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  1770. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  1771. eor $acc_mb, $acc_mb, $t1.16b
  1772. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  1773. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  1774. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  1775. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  1776. eor $acc_mb, $acc_mb, $acc_hb
  1777. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  1778. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  1779. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  1780. pmull $t1.1q, $acc_m.1d, $mod_constant.1d
  1781. ext $acc_mb, $acc_mb, $acc_mb, #8
  1782. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  1783. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  1784. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  1785. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  1786. eor $acc_lb, $acc_lb, $t1.16b
  1787. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  1788. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  1789. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  1790. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  1791. eor $acc_lb, $acc_lb, $acc_mb
  1792. .L192_enc_tail: @ TAIL
  1793. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  1794. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
  1795. eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
  1796. eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
  1797. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  1798. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  1799. cmp $main_end_input_ptr, #48
  1800. eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  1801. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  1802. b.gt .L192_enc_blocks_more_than_3
  1803. sub $rctr32w, $rctr32w, #1
  1804. movi $acc_m.8b, #0
  1805. mov $ctr3b, $ctr2b
  1806. movi $acc_h.8b, #0
  1807. cmp $main_end_input_ptr, #32
  1808. mov $ctr2b, $ctr1b
  1809. movi $acc_l.8b, #0
  1810. b.gt .L192_enc_blocks_more_than_2
  1811. sub $rctr32w, $rctr32w, #1
  1812. mov $ctr3b, $ctr1b
  1813. cmp $main_end_input_ptr, #16
  1814. b.gt .L192_enc_blocks_more_than_1
  1815. sub $rctr32w, $rctr32w, #1
  1816. b .L192_enc_blocks_less_than_1
  1817. .L192_enc_blocks_more_than_3: @ blocks left > 3
  1818. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  1819. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
  1820. rev64 $res0b, $res1b @ GHASH final-3 block
  1821. eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
  1822. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1823. eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
  1824. fmov $res1d, $input_l0 @ AES final-2 block - mov low
  1825. fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
  1826. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  1827. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  1828. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  1829. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  1830. movi $t0.8b, #0 @ suppress further partial tag feed in
  1831. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  1832. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  1833. eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
  1834. .L192_enc_blocks_more_than_2: @ blocks left > 2
  1835. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  1836. rev64 $res0b, $res1b @ GHASH final-2 block
  1837. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
  1838. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1839. eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
  1840. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  1841. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  1842. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  1843. eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
  1844. fmov $res1d, $input_l0 @ AES final-1 block - mov low
  1845. fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
  1846. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  1847. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  1848. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  1849. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  1850. movi $t0.8b, #0 @ suppress further partial tag feed in
  1851. eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
  1852. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  1853. .L192_enc_blocks_more_than_1: @ blocks left > 1
  1854. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  1855. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
  1856. rev64 $res0b, $res1b @ GHASH final-1 block
  1857. eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
  1858. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1859. movi $t0.8b, #0 @ suppress further partial tag feed in
  1860. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  1861. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  1862. eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
  1863. fmov $res1d, $input_l0 @ AES final block - mov low
  1864. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  1865. fmov $res1.d[1], $input_h0 @ AES final block - mov high
  1866. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  1867. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  1868. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  1869. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  1870. eor $res1b, $res1b, $ctr3b @ AES final block - result
  1871. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  1872. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  1873. .L192_enc_blocks_less_than_1: @ blocks left <= 1
  1874. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  1875. rev $ctr32w, $rctr32w
  1876. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1877. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  1878. mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
  1879. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  1880. mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
  1881. and $bit_length, $bit_length, #127 @ bit_length %= 128
  1882. lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
  1883. cmp $bit_length, #64
  1884. csel $input_l0, $rk12_l, $rk12_h, lt
  1885. csel $input_h0, $rk12_h, xzr, lt
  1886. fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
  1887. fmov $ctr0.d[1], $input_h0
  1888. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  1889. rev64 $res0b, $res1b @ GHASH final block
  1890. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  1891. mov $t0d, $res0.d[1] @ GHASH final block - mid
  1892. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  1893. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  1894. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  1895. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  1896. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  1897. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  1898. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  1899. movi $mod_constant.8b, #0xc2
  1900. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  1901. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  1902. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  1903. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  1904. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  1905. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  1906. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  1907. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  1908. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  1909. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  1910. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  1911. str $ctr32w, [$counter, #12] @ store the updated counter
  1912. st1 { $res1b}, [$output_ptr] @ store all 16B
  1913. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  1914. ext $acc_lb, $acc_lb, $acc_lb, #8
  1915. rev64 $acc_lb, $acc_lb
  1916. mov x0, $len
  1917. st1 { $acc_l.16b }, [$current_tag]
  1918. ldp x21, x22, [sp, #16]
  1919. ldp x23, x24, [sp, #32]
  1920. ldp d8, d9, [sp, #48]
  1921. ldp d10, d11, [sp, #64]
  1922. ldp d12, d13, [sp, #80]
  1923. ldp d14, d15, [sp, #96]
  1924. ldp x19, x20, [sp], #112
  1925. ret
  1926. .L192_enc_ret:
  1927. mov w0, #0x0
  1928. ret
  1929. .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
  1930. ___
  1931. #########################################################################################
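  1932. # size_t aes_gcm_dec_192_kernel(const unsigned char *in,
  1933. # size_t len, /* length in BITS: the kernel shifts it right by 3 to obtain byte_len */
  1934. # unsigned char *out,
  1935. # const void *key,
  1936. # unsigned char ivec[16],
  1937. # u64 *Xi);
  1938. # NOTE(review): on entry the kernel copies x4 and x5 (5th and 6th arguments) into x16 and x8; presumably x3 = Xi, x4 = ivec, x5 = key, i.e. key and Xi are swapped relative to this prototype - confirm against the register assignments at the top of the file and the C declaration.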
  1939. $code.=<<___;
  1940. .global aes_gcm_dec_192_kernel
  1941. .type aes_gcm_dec_192_kernel,%function
  1942. .align 4
  1943. aes_gcm_dec_192_kernel:
  1944. AARCH64_VALID_CALL_TARGET
  1945. cbz x1, .L192_dec_ret
  1946. stp x19, x20, [sp, #-112]!
  1947. mov x16, x4
  1948. mov x8, x5
  1949. stp x21, x22, [sp, #16]
  1950. stp x23, x24, [sp, #32]
  1951. stp d8, d9, [sp, #48]
  1952. stp d10, d11, [sp, #64]
  1953. stp d12, d13, [sp, #80]
  1954. stp d14, d15, [sp, #96]
  1955. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  1956. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  1957. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  1958. ldr $rk0q, [$cc, #0] @ load rk0
  1959. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  1960. mov $len, $main_end_input_ptr
  1961. ldr $rk2q, [$cc, #32] @ load rk2
  1962. lsr $rctr32x, $ctr96_t32x, #32
  1963. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  1964. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  1965. rev $rctr32w, $rctr32w @ rev_ctr32
  1966. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  1967. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  1968. ldr $rk1q, [$cc, #16] @ load rk1
  1969. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  1970. rev $ctr32w, $rctr32w @ CTR block 1
  1971. add $rctr32w, $rctr32w, #1 @ CTR block 1
  1972. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  1973. ldr $rk3q, [$cc, #48] @ load rk3
  1974. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  1975. rev $ctr32w, $rctr32w @ CTR block 2
  1976. add $rctr32w, $rctr32w, #1 @ CTR block 2
  1977. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  1978. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  1979. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  1980. rev $ctr32w, $rctr32w @ CTR block 3
  1981. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  1982. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  1983. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  1984. ldr $rk8q, [$cc, #128] @ load rk8
  1985. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  1986. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  1987. ldr $rk11q, [$cc, #176] @ load rk11
  1988. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  1989. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  1990. ext $h4b, $h4b, $h4b, #8
  1991. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  1992. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  1993. ext $h2b, $h2b, $h2b, #8
  1994. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  1995. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  1996. ext $h3b, $h3b, $h3b, #8
  1997. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  1998. ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
  1999. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  2000. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2001. ext $h1b, $h1b, $h1b, #8
  2002. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  2003. ldr $rk10q, [$cc, #160] @ load rk10
  2004. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2005. ldr $rk9q, [$cc, #144] @ load rk9
  2006. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2007. ldr $rk7q, [$cc, #112] @ load rk7
  2008. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2009. ldr $rk4q, [$cc, #64] @ load rk4
  2010. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2011. ld1 { $acc_lb}, [$current_tag]
  2012. ext $acc_lb, $acc_lb, $acc_lb, #8
  2013. rev64 $acc_lb, $acc_lb
  2014. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2015. add $rctr32w, $rctr32w, #1 @ CTR block 3
  2016. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2017. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  2018. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2019. ldr $rk5q, [$cc, #80] @ load rk5
  2020. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2021. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  2022. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2023. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2024. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  2025. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2026. ldr $rk6q, [$cc, #96] @ load rk6
  2027. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2028. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2029. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2030. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2031. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2032. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2033. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2034. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2035. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2036. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2037. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2038. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2039. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2040. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2041. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2042. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2043. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2044. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2045. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2046. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2047. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2048. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2049. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  2050. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2051. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  2052. aese $ctr3b, $rk11 @ AES block 3 - round 11
  2053. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2054. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2055. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2056. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  2057. aese $ctr2b, $rk11 @ AES block 2 - round 11
  2058. aese $ctr1b, $rk11 @ AES block 1 - round 11
  2059. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  2060. aese $ctr0b, $rk11 @ AES block 0 - round 11
  2061. b.ge .L192_dec_tail @ handle tail
  2062. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  2063. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  2064. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  2065. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  2066. rev $ctr32w, $rctr32w @ CTR block 4
  2067. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  2068. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  2069. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  2070. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  2071. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  2072. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  2073. add $rctr32w, $rctr32w, #1 @ CTR block 4
  2074. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  2075. rev64 $res0b, $res0b @ GHASH block 0
  2076. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2077. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  2078. rev64 $res1b, $res1b @ GHASH block 1
  2079. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2080. eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
  2081. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  2082. rev $ctr32w, $rctr32w @ CTR block 5
  2083. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  2084. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  2085. eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
  2086. add $rctr32w, $rctr32w, #1 @ CTR block 5
  2087. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  2088. eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
  2089. rev $ctr32w, $rctr32w @ CTR block 6
  2090. eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
  2091. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  2092. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  2093. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  2094. add $rctr32w, $rctr32w, #1 @ CTR block 6
  2095. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  2096. b.ge .L192_dec_prepretail @ do prepretail
  2097. .L192_dec_main_loop: @ main loop start
  2098. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2099. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2100. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2101. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  2102. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  2103. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  2104. rev64 $res3b, $res3b @ GHASH block 4k+3
  2105. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2106. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  2107. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2108. eor $res0b, $res0b, $acc_lb @ PRE 1
  2109. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2110. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  2111. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2112. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  2113. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2114. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  2115. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2116. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  2117. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2118. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2119. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2120. rev $ctr32w, $rctr32w @ CTR block 4k+7
  2121. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2122. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  2123. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  2124. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2125. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2126. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2127. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2128. eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
  2129. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2130. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2131. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2132. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2133. rev64 $res2b, $res2b @ GHASH block 4k+2
  2134. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2135. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2136. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2137. eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
  2138. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2139. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2140. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2141. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2142. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2143. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2144. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2145. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2146. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2147. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2148. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2149. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2150. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2151. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2152. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2153. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2154. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2155. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2156. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2157. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2158. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2159. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2160. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2161. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2162. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2163. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2164. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2165. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2166. movi $mod_constant.8b, #0xc2
  2167. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2168. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2169. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2170. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2171. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2172. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2173. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2174. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2175. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2176. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2177. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2178. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2179. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2180. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2181. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2182. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2183. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2184. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  2185. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2186. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2187. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2188. ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
  2189. eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
  2190. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2191. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2192. aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
  2193. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  2194. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2195. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2196. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2197. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  2198. aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
  2199. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  2200. rev $ctr32w, $rctr32w @ CTR block 4k+8
  2201. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2202. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  2203. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2204. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2205. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2206. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2207. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  2208. eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
  2209. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  2210. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2211. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  2212. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2213. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2214. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  2215. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  2216. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  2217. rev64 $res1b, $res1b @ GHASH block 4k+5
  2218. aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
  2219. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  2220. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2221. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  2222. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  2223. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  2224. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2225. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  2226. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  2227. rev $ctr32w, $rctr32w @ CTR block 4k+9
  2228. eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
  2229. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  2230. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2231. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  2232. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  2233. eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
  2234. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  2235. rev $ctr32w, $rctr32w @ CTR block 4k+10
  2236. eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
  2237. eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
  2238. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  2239. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2240. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  2241. rev64 $res0b, $res0b @ GHASH block 4k+4
  2242. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  2243. aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
  2244. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  2245. b.lt .L192_dec_main_loop
  2246. .L192_dec_prepretail: @ PREPRETAIL
  2247. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  2248. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2249. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  2250. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2251. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  2252. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2253. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2254. eor $res0b, $res0b, $acc_lb @ PRE 1
  2255. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  2256. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2257. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  2258. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2259. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  2260. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2261. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2262. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  2263. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2264. rev64 $res2b, $res2b @ GHASH block 4k+2
  2265. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2266. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  2267. rev $ctr32w, $rctr32w @ CTR block 4k+7
  2268. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  2269. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2270. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2271. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2272. eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
  2273. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  2274. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2275. eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
  2276. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2277. eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
  2278. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2279. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2280. eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
  2281. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  2282. rev64 $res3b, $res3b @ GHASH block 4k+3
  2283. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  2284. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2285. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2286. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2287. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  2288. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2289. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2290. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2291. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2292. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2293. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2294. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2295. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2296. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2297. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2298. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2299. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2300. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2301. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2302. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2303. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2304. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2305. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2306. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2307. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2308. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2309. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2310. movi $mod_constant.8b, #0xc2
  2311. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2312. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2313. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2314. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2315. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2316. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2317. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2318. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2319. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2320. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2321. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2322. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2323. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2324. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2325. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2326. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2327. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2328. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2329. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2330. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2331. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2332. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2333. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2334. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2335. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2336. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2337. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2338. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2339. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2340. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2341. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2342. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2343. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2344. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2345. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2346. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2347. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2348. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2349. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2350. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2351. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2352. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2353. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2354. aese $ctr0b, $rk11
  2355. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2356. aese $ctr2b, $rk11
  2357. aese $ctr1b, $rk11
  2358. aese $ctr3b, $rk11
  2359. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2360. .L192_dec_tail: @ TAIL
  2361. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  2362. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  2363. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  2364. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  2365. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  2366. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  2367. cmp $main_end_input_ptr, #48
  2368. eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
  2369. eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
  2370. b.gt .L192_dec_blocks_more_than_3
  2371. movi $acc_l.8b, #0
  2372. movi $acc_h.8b, #0
  2373. mov $ctr3b, $ctr2b
  2374. mov $ctr2b, $ctr1b
  2375. sub $rctr32w, $rctr32w, #1
  2376. movi $acc_m.8b, #0
  2377. cmp $main_end_input_ptr, #32
  2378. b.gt .L192_dec_blocks_more_than_2
  2379. mov $ctr3b, $ctr1b
  2380. cmp $main_end_input_ptr, #16
  2381. sub $rctr32w, $rctr32w, #1
  2382. b.gt .L192_dec_blocks_more_than_1
  2383. sub $rctr32w, $rctr32w, #1
  2384. b .L192_dec_blocks_less_than_1
  2385. .L192_dec_blocks_more_than_3: @ blocks left > 3
  2386. rev64 $res0b, $res1b @ GHASH final-3 block
  2387. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  2388. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  2389. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2390. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  2391. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  2392. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  2393. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  2394. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  2395. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  2396. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  2397. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  2398. eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
  2399. movi $t0.8b, #0 @ suppress further partial tag feed in
  2400. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  2401. eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
  2402. .L192_dec_blocks_more_than_2: @ blocks left > 2
  2403. rev64 $res0b, $res1b @ GHASH final-2 block
  2404. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  2405. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2406. movi $t0.8b, #0 @ suppress further partial tag feed in
  2407. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  2408. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  2409. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  2410. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  2411. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  2412. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  2413. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  2414. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  2415. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  2416. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  2417. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  2418. eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
  2419. eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
  2420. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  2421. .L192_dec_blocks_more_than_1: @ blocks left > 1
  2422. rev64 $res0b, $res1b @ GHASH final-1 block
  2423. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2424. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  2425. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  2426. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  2427. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  2428. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  2429. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  2430. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  2431. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  2432. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  2433. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  2434. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  2435. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  2436. movi $t0.8b, #0 @ suppress further partial tag feed in
  2437. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  2438. eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
  2439. eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
  2440. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  2441. .L192_dec_blocks_less_than_1: @ blocks left <= 1
  2442. mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
  2443. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  2444. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2445. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  2446. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  2447. and $bit_length, $bit_length, #127 @ bit_length %= 128
  2448. mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
  2449. lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
  2450. cmp $bit_length, #64
  2451. csel $ctr32x, $rk12_l, $rk12_h, lt
  2452. csel $ctr96_b64x, $rk12_h, xzr, lt
  2453. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  2454. and $output_l0, $output_l0, $ctr32x
  2455. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  2456. orr $output_l0, $output_l0, $end_input_ptr
  2457. mov $ctr0.d[1], $ctr96_b64x
  2458. rev $ctr32w, $rctr32w
  2459. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  2460. str $ctr32w, [$counter, #12] @ store the updated counter
  2461. rev64 $res0b, $res1b @ GHASH final block
  2462. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  2463. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  2464. and $output_h0, $output_h0, $ctr96_b64x
  2465. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  2466. mov $t0d, $res0.d[1] @ GHASH final block - mid
  2467. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  2468. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  2469. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  2470. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  2471. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  2472. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  2473. movi $mod_constant.8b, #0xc2
  2474. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2475. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2476. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2477. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2478. orr $output_h0, $output_h0, $main_end_input_ptr
  2479. stp $output_l0, $output_h0, [$output_ptr]
  2480. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2481. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2482. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  2483. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2484. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  2485. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2486. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2487. ext $acc_lb, $acc_lb, $acc_lb, #8
  2488. rev64 $acc_lb, $acc_lb
  2489. mov x0, $len
  2490. st1 { $acc_l.16b }, [$current_tag]
  2491. ldp x21, x22, [sp, #16]
  2492. ldp x23, x24, [sp, #32]
  2493. ldp d8, d9, [sp, #48]
  2494. ldp d10, d11, [sp, #64]
  2495. ldp d12, d13, [sp, #80]
  2496. ldp d14, d15, [sp, #96]
  2497. ldp x19, x20, [sp], #112
  2498. ret
  2499. .L192_dec_ret:
  2500. mov w0, #0x0
  2501. ret
  2502. .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
  2503. ___
  2504. }
  2505. {
  2506. my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
  2507. my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
  2508. my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
  2509. my ($output_l0,$output_h0)=map("x$_",(6..7));
  2510. my $ctr32w="w9";
  2511. my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
  2512. my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
  2513. my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
  2514. my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
  2515. my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
  2516. my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
  2517. my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
  2518. my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
  2519. my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
  2520. my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
  2521. my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
  2522. my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
  2523. my $t0="v8";
  2524. my $t0d="d8";
  2525. my $t1="v4";
  2526. my $t1d="d4";
  2527. my $t2="v8";
  2528. my $t2d="d8";
  2529. my $t3="v4";
  2530. my $t3d="d4";
  2531. my $t4="v4";
  2532. my $t4d="d4";
  2533. my $t5="v5";
  2534. my $t5d="d5";
  2535. my $t6="v8";
  2536. my $t6d="d8";
  2537. my $t7="v5";
  2538. my $t7d="d5";
  2539. my $t8="v6";
  2540. my $t8d="d6";
  2541. my $t9="v4";
  2542. my $t9d="d4";
  2543. my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
  2544. my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
  2545. my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
  2546. my $mod_constantd="d8";
  2547. my $mod_constant="v8";
  2548. my $mod_t="v7";
  2549. my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
  2550. my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
  2551. my $rk2q1="v20.1q";
  2552. my $rk3q1="v21.1q";
  2553. my $rk4v="v22";
  2554. my $rk4d="d22";
  2555. #########################################################################################
  2556. # size_t aes_gcm_enc_256_kernel(const unsigned char *in,
  2557. # size_t len,
  2558. # unsigned char *out,
  2559. # const void *key,
  2560. # unsigned char ivec[16],
  2561. # u64 *Xi);
  2562. #
  2563. $code.=<<___;
  2564. .global aes_gcm_enc_256_kernel
  2565. .type aes_gcm_enc_256_kernel,%function
  2566. .align 4
  2567. aes_gcm_enc_256_kernel:
  2568. AARCH64_VALID_CALL_TARGET
  2569. cbz x1, .L256_enc_ret
  2570. stp x19, x20, [sp, #-112]!
  2571. mov x16, x4
  2572. mov x8, x5
  2573. stp x21, x22, [sp, #16]
  2574. stp x23, x24, [sp, #32]
  2575. stp d8, d9, [sp, #48]
  2576. stp d10, d11, [sp, #64]
  2577. stp d12, d13, [sp, #80]
  2578. stp d14, d15, [sp, #96]
  2579. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  2580. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  2581. mov $len, $main_end_input_ptr
  2582. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  2583. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  2584. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  2585. ldr $rk0q, [$cc, #0] @ load rk0
  2586. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  2587. ldr $rk7q, [$cc, #112] @ load rk7
  2588. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  2589. lsr $rctr32x, $ctr96_t32x, #32
  2590. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  2591. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  2592. rev $rctr32w, $rctr32w @ rev_ctr32
  2593. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  2594. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  2595. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  2596. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  2597. rev $ctr32w, $rctr32w @ CTR block 1
  2598. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  2599. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  2600. add $rctr32w, $rctr32w, #1 @ CTR block 1
  2601. ldr $rk1q, [$cc, #16] @ load rk1
  2602. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  2603. rev $ctr32w, $rctr32w @ CTR block 2
  2604. add $rctr32w, $rctr32w, #1 @ CTR block 2
  2605. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  2606. ldr $rk2q, [$cc, #32] @ load rk2
  2607. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  2608. rev $ctr32w, $rctr32w @ CTR block 3
  2609. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  2610. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  2611. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  2612. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  2613. ldr $rk3q, [$cc, #48] @ load rk3
  2614. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  2615. ldr $rk6q, [$cc, #96] @ load rk6
  2616. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  2617. ldr $rk5q, [$cc, #80] @ load rk5
  2618. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  2619. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  2620. ext $h3b, $h3b, $h3b, #8
  2621. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  2622. ldr $rk13q, [$cc, #208] @ load rk13
  2623. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  2624. ldr $rk4q, [$cc, #64] @ load rk4
  2625. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  2626. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  2627. ext $h2b, $h2b, $h2b, #8
  2628. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  2629. ldr $rk12q, [$cc, #192] @ load rk12
  2630. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  2631. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  2632. ext $h4b, $h4b, $h4b, #8
  2633. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  2634. ldr $rk11q, [$cc, #176] @ load rk11
  2635. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  2636. ldr $rk8q, [$cc, #128] @ load rk8
  2637. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  2638. add $rctr32w, $rctr32w, #1 @ CTR block 3
  2639. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  2640. ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
  2641. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  2642. ld1 { $acc_lb}, [$current_tag]
  2643. ext $acc_lb, $acc_lb, $acc_lb, #8
  2644. rev64 $acc_lb, $acc_lb
  2645. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  2646. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  2647. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  2648. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  2649. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  2650. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  2651. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  2652. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  2653. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  2654. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  2655. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  2656. ldr $rk9q, [$cc, #144] @ load rk9
  2657. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  2658. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  2659. ext $h1b, $h1b, $h1b, #8
  2660. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  2661. ldr $rk10q, [$cc, #160] @ load rk10
  2662. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  2663. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  2664. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  2665. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  2666. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  2667. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  2668. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  2669. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  2670. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  2671. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  2672. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  2673. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  2674. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  2675. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  2676. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  2677. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  2678. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  2679. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  2680. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  2681. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  2682. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  2683. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  2684. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  2685. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  2686. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  2687. aese $ctr2b, $rk13 @ AES block 2 - round 13
  2688. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  2689. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  2690. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  2691. aese $ctr1b, $rk13 @ AES block 1 - round 13
  2692. aese $ctr0b, $rk13 @ AES block 0 - round 13
  2693. aese $ctr3b, $rk13 @ AES block 3 - round 13
  2694. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  2695. b.ge .L256_enc_tail @ handle tail
  2696. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
  2697. rev $ctr32w, $rctr32w @ CTR block 4
  2698. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
  2699. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
  2700. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
  2701. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2702. eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
  2703. eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
  2704. fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
  2705. eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
  2706. eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
  2707. eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
  2708. fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
  2709. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  2710. fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
  2711. eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
  2712. eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
  2713. fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
  2714. fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
  2715. add $rctr32w, $rctr32w, #1 @ CTR block 4
  2716. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  2717. fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
  2718. eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
  2719. fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
  2720. eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
  2721. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  2722. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  2723. rev $ctr32w, $rctr32w @ CTR block 5
  2724. add $rctr32w, $rctr32w, #1 @ CTR block 5
  2725. eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
  2726. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  2727. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  2728. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  2729. rev $ctr32w, $rctr32w @ CTR block 6
  2730. st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
  2731. fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
  2732. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  2733. eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
  2734. st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
  2735. add $rctr32w, $rctr32w, #1 @ CTR block 6
  2736. fmov $ctr2d, $ctr96_b64x @ CTR block 6
  2737. fmov $ctr2.d[1], $ctr32x @ CTR block 6
  2738. st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
  2739. rev $ctr32w, $rctr32w @ CTR block 7
  2740. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
  2741. eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
  2742. st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
  2743. b.ge .L256_enc_prepretail @ input_ptr >= main_end_input_ptr (cmp above) - do prepretail
  2744. .L256_enc_main_loop: @ main loop start
  2745. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2746. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  2747. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2748. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  2749. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2750. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2751. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2752. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  2753. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2754. ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
  2755. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2756. ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
  2757. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2758. eor $res0b, $res0b, $acc_lb @ PRE 1
  2759. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2760. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2761. eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
  2762. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2763. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2764. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2765. eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
  2766. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2767. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2768. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  2769. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2770. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2771. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2772. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2773. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2774. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  2775. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2776. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2777. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  2778. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2779. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2780. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2781. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2782. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2783. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2784. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2785. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2786. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2787. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2788. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2789. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2790. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2791. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2792. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2793. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2794. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2795. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2796. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2797. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2798. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2799. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2800. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2801. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2802. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2803. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2804. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2805. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2806. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2807. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2808. ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
  2809. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2810. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2811. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2812. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2813. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2814. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2815. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2816. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2817. eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
  2818. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2819. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2820. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2821. eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
  2822. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2823. movi $mod_constant.8b, #0xc2
  2824. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2825. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2826. fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
  2827. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2828. ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
  2829. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2830. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2831. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2832. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2833. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2834. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2835. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2836. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2837. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  2838. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  2839. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  2840. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  2841. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  2842. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  2843. rev $ctr32w, $rctr32w @ CTR block 4k+8
  2844. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  2845. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2846. eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
  2847. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  2848. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  2849. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2850. eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
  2851. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  2852. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  2853. eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
  2854. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  2855. eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
  2856. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  2857. eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
  2858. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  2859. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  2860. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  2861. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  2862. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  2863. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  2864. fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
  2865. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  2866. fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
  2867. fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
  2868. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  2869. fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
  2870. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  2871. eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  2872. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  2873. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  2874. rev $ctr32w, $rctr32w @ CTR block 4k+9
  2875. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  2876. eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
  2877. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  2878. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  2879. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  2880. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  2881. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  2882. rev $ctr32w, $rctr32w @ CTR block 4k+10
  2883. st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
  2884. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  2885. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  2886. fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
  2887. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  2888. st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
  2889. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  2890. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  2891. eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
  2892. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
  2893. st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
  2894. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
  2895. rev $ctr32w, $rctr32w @ CTR block 4k+11
  2896. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  2897. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
  2898. eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
  2899. st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
  2900. b.lt L256_enc_main_loop
  2901. .L256_enc_prepretail: @ PREPRETAIL
  2902. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  2903. rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
  2904. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  2905. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
  2906. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  2907. rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
  2908. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
  2909. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  2910. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  2911. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  2912. eor $res0b, $res0b, $acc_lb @ PRE 1
  2913. rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
  2914. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  2915. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  2916. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  2917. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  2918. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  2919. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  2920. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  2921. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  2922. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  2923. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  2924. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  2925. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  2926. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  2927. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  2928. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  2929. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  2930. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  2931. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  2932. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  2933. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  2934. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  2935. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  2936. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  2937. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  2938. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  2939. rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
  2940. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  2941. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  2942. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  2943. add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
  2944. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  2945. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  2946. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  2947. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  2948. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  2949. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  2950. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  2951. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  2952. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  2953. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  2954. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  2955. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  2956. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  2957. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  2958. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  2959. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  2960. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  2961. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  2962. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  2963. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  2964. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  2965. movi $mod_constant.8b, #0xc2
  2966. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  2967. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  2968. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  2969. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  2970. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  2971. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  2972. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  2973. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  2974. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  2975. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  2976. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  2977. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  2978. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  2979. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  2980. eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
  2981. pmull $t1.1q, $acc_h.1d, $mod_constant.1d
  2982. ext $acc_hb, $acc_hb, $acc_hb, #8
  2983. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  2984. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  2985. eor $acc_mb, $acc_mb, $acc_lb
  2986. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  2987. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  2988. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  2989. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  2990. eor $acc_mb, $acc_mb, $t1.16b
  2991. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  2992. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  2993. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  2994. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  2995. eor $acc_mb, $acc_mb, $acc_hb
  2996. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  2997. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  2998. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  2999. pmull $t1.1q, $acc_m.1d, $mod_constant.1d
  3000. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  3001. ext $acc_mb, $acc_mb, $acc_mb, #8
  3002. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  3003. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  3004. eor $acc_lb, $acc_lb, $t1.16b
  3005. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3006. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3007. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3008. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3009. eor $acc_lb, $acc_lb, $acc_mb
  3010. .L256_enc_tail: @ TAIL
  3011. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3012. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3013. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
  3014. eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3015. eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3016. cmp $main_end_input_ptr, #48
  3017. fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
  3018. fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
  3019. eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
  3020. b.gt .L256_enc_blocks_more_than_3
  3021. cmp $main_end_input_ptr, #32
  3022. mov $ctr3b, $ctr2b
  3023. movi $acc_l.8b, #0
  3024. movi $acc_h.8b, #0
  3025. sub $rctr32w, $rctr32w, #1
  3026. mov $ctr2b, $ctr1b
  3027. movi $acc_m.8b, #0
  3028. b.gt .L256_enc_blocks_more_than_2
  3029. mov $ctr3b, $ctr1b
  3030. sub $rctr32w, $rctr32w, #1
  3031. cmp $main_end_input_ptr, #16
  3032. b.gt .L256_enc_blocks_more_than_1
  3033. sub $rctr32w, $rctr32w, #1
  3034. b .L256_enc_blocks_less_than_1
  3035. .L256_enc_blocks_more_than_3: @ blocks left > 3
  3036. st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
  3037. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
  3038. rev64 $res0b, $res1b @ GHASH final-3 block
  3039. eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
  3040. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3041. eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
  3042. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  3043. fmov $res1d, $input_l0 @ AES final-2 block - mov low
  3044. fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
  3045. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3046. movi $t0.8b, #0 @ suppress further partial tag feed in
  3047. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  3048. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3049. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3050. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  3051. eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
  3052. .L256_enc_blocks_more_than_2: @ blocks left > 2
  3053. st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
  3054. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
  3055. rev64 $res0b, $res1b @ GHASH final-2 block
  3056. eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
  3057. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3058. fmov $res1d, $input_l0 @ AES final-1 block - mov low
  3059. eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
  3060. fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
  3061. movi $t0.8b, #0 @ suppress further partial tag feed in
  3062. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3063. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  3064. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3065. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3066. eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
  3067. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3068. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3069. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3070. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3071. .L256_enc_blocks_more_than_1: @ blocks left > 1
  3072. st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
  3073. rev64 $res0b, $res1b @ GHASH final-1 block
  3074. ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
  3075. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3076. movi $t0.8b, #0 @ suppress further partial tag feed in
  3077. eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
  3078. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  3079. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3080. eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
  3081. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3082. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3083. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3084. fmov $res1d, $input_l0 @ AES final block - mov low
  3085. fmov $res1.d[1], $input_h0 @ AES final block - mov high
  3086. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3087. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3088. eor $res1b, $res1b, $ctr3b @ AES final block - result
  3089. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3090. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3091. .L256_enc_blocks_less_than_1: @ blocks left <= 1
  3092. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3093. mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
  3094. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3095. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3096. ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
  3097. mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
  3098. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3099. lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
  3100. cmp $bit_length, #64
  3101. csel $input_l0, $rk14_l, $rk14_h, lt
  3102. csel $input_h0, $rk14_h, xzr, lt
  3103. fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
  3104. fmov $ctr0.d[1], $input_h0
  3105. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3106. rev64 $res0b, $res1b @ GHASH final block
  3107. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3108. bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
  3109. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3110. mov $t0d, $res0.d[1] @ GHASH final block - mid
  3111. rev $ctr32w, $rctr32w
  3112. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3113. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3114. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3115. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3116. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3117. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3118. movi $mod_constant.8b, #0xc2
  3119. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3120. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3121. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3122. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3123. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3124. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3125. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3126. pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3127. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3128. str $ctr32w, [$counter, #12] @ store the updated counter
  3129. st1 { $res1b}, [$output_ptr] @ store all 16B
  3130. eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
  3131. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3132. ext $acc_lb, $acc_lb, $acc_lb, #8
  3133. rev64 $acc_lb, $acc_lb
  3134. mov x0, $len
  3135. st1 { $acc_l.16b }, [$current_tag]
  3136. ldp x21, x22, [sp, #16]
  3137. ldp x23, x24, [sp, #32]
  3138. ldp d8, d9, [sp, #48]
  3139. ldp d10, d11, [sp, #64]
  3140. ldp d12, d13, [sp, #80]
  3141. ldp d14, d15, [sp, #96]
  3142. ldp x19, x20, [sp], #112
  3143. ret
  3144. .L256_enc_ret:
  3145. mov w0, #0x0
  3146. ret
  3147. .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
  3148. ___
  3149. {
  3150. my $t8="v4";
  3151. my $t8d="d4";
  3152. my $t9="v6";
  3153. my $t9d="d6";
  3154. #########################################################################################
  3155. # size_t aes_gcm_dec_256_kernel(const unsigned char *in,
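  3156. # size_t len, /* NOTE(review): appears to be a length in BITS - the body computes byte_len as $bit_length >> 3; confirm $bit_length maps to this argument */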
  3157. # unsigned char *out,
  3158. # const void *key,
  3159. # unsigned char ivec[16],
  3160. # u64 *Xi);
  3161. #
  3162. $code.=<<___;
  3163. .global aes_gcm_dec_256_kernel
  3164. .type aes_gcm_dec_256_kernel,%function
  3165. .align 4
  3166. aes_gcm_dec_256_kernel:
  3167. AARCH64_VALID_CALL_TARGET
  3168. cbz x1, .L256_dec_ret
  3169. stp x19, x20, [sp, #-112]!
  3170. mov x16, x4
  3171. mov x8, x5
  3172. stp x21, x22, [sp, #16]
  3173. stp x23, x24, [sp, #32]
  3174. stp d8, d9, [sp, #48]
  3175. stp d10, d11, [sp, #64]
  3176. stp d12, d13, [sp, #80]
  3177. stp d14, d15, [sp, #96]
  3178. lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
  3179. mov $len, $main_end_input_ptr
  3180. ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
  3181. ldr $rk8q, [$cc, #128] @ load rk8
  3182. sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
  3183. ldr $rk7q, [$cc, #112] @ load rk7
  3184. and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
  3185. add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
  3186. ldr $rk6q, [$cc, #96] @ load rk6
  3187. lsr $rctr32x, $ctr96_t32x, #32
  3188. ldr $rk5q, [$cc, #80] @ load rk5
  3189. orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
  3190. ldr $rk3q, [$cc, #48] @ load rk3
  3191. add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
  3192. rev $rctr32w, $rctr32w @ rev_ctr32
  3193. add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
  3194. fmov $ctr3d, $ctr96_b64x @ CTR block 3
  3195. rev $ctr32w, $rctr32w @ CTR block 1
  3196. add $rctr32w, $rctr32w, #1 @ CTR block 1
  3197. fmov $ctr1d, $ctr96_b64x @ CTR block 1
  3198. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
  3199. ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
  3200. fmov $ctr1.d[1], $ctr32x @ CTR block 1
  3201. rev $ctr32w, $rctr32w @ CTR block 2
  3202. add $rctr32w, $rctr32w, #1 @ CTR block 2
  3203. fmov $ctr2d, $ctr96_b64x @ CTR block 2
  3204. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
  3205. fmov $ctr2.d[1], $ctr32x @ CTR block 2
  3206. rev $ctr32w, $rctr32w @ CTR block 3
  3207. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
  3208. ldr $rk0q, [$cc, #0] @ load rk0
  3209. fmov $ctr3.d[1], $ctr32x @ CTR block 3
  3210. add $rctr32w, $rctr32w, #1 @ CTR block 3
  3211. ldr $rk4q, [$cc, #64] @ load rk4
  3212. ldr $rk13q, [$cc, #208] @ load rk13
  3213. ldr $rk1q, [$cc, #16] @ load rk1
  3214. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
  3215. ldr $h3q, [$current_tag, #80] @ load h3l | h3h
  3216. ext $h3b, $h3b, $h3b, #8
  3217. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
  3218. ldr $h4q, [$current_tag, #112] @ load h4l | h4h
  3219. ext $h4b, $h4b, $h4b, #8
  3220. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
  3221. ldr $h2q, [$current_tag, #64] @ load h2l | h2h
  3222. ext $h2b, $h2b, $h2b, #8
  3223. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
  3224. ldr $rk2q, [$cc, #32] @ load rk2
  3225. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
  3226. ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
  3227. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
  3228. ld1 { $acc_lb}, [$current_tag]
  3229. ext $acc_lb, $acc_lb, $acc_lb, #8
  3230. rev64 $acc_lb, $acc_lb
  3231. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
  3232. ldr $rk9q, [$cc, #144] @ load rk9
  3233. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
  3234. ldr $rk12q, [$cc, #192] @ load rk12
  3235. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
  3236. ldr $h1q, [$current_tag, #32] @ load h1l | h1h
  3237. ext $h1b, $h1b, $h1b, #8
  3238. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
  3239. ldr $rk10q, [$cc, #160] @ load rk10
  3240. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
  3241. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
  3242. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
  3243. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
  3244. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
  3245. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
  3246. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
  3247. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
  3248. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
  3249. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
  3250. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
  3251. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
  3252. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
  3253. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
  3254. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
  3255. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
  3256. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
  3257. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
  3258. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
  3259. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
  3260. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
  3261. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
  3262. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
  3263. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
  3264. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
  3265. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
  3266. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
  3267. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
  3268. ldr $rk11q, [$cc, #176] @ load rk11
  3269. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
  3270. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
  3271. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
  3272. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
  3273. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
  3274. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
  3275. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
  3276. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
  3277. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
  3278. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
  3279. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
  3280. trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
  3281. trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
  3282. trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
  3283. trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
  3284. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
  3285. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
  3286. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
  3287. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
  3288. eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
  3289. aese $ctr1b, $rk13 @ AES block 1 - round 13
  3290. aese $ctr2b, $rk13 @ AES block 2 - round 13
  3291. eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
  3292. aese $ctr3b, $rk13 @ AES block 3 - round 13
  3293. aese $ctr0b, $rk13 @ AES block 0 - round 13
  3294. b.ge .L256_dec_tail @ handle tail
  3295. ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
  3296. ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
  3297. rev $ctr32w, $rctr32w @ CTR block 4
  3298. eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
  3299. eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
  3300. rev64 $res1b, $res1b @ GHASH block 1
  3301. ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
  3302. mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
  3303. mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
  3304. rev64 $res0b, $res0b @ GHASH block 0
  3305. add $rctr32w, $rctr32w, #1 @ CTR block 4
  3306. fmov $ctr0d, $ctr96_b64x @ CTR block 4
  3307. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
  3308. fmov $ctr0.d[1], $ctr32x @ CTR block 4
  3309. rev $ctr32w, $rctr32w @ CTR block 5
  3310. add $rctr32w, $rctr32w, #1 @ CTR block 5
  3311. mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
  3312. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
  3313. mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
  3314. eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
  3315. eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
  3316. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
  3317. fmov $ctr1d, $ctr96_b64x @ CTR block 5
  3318. ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
  3319. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  3320. fmov $ctr1.d[1], $ctr32x @ CTR block 5
  3321. rev $ctr32w, $rctr32w @ CTR block 6
  3322. add $rctr32w, $rctr32w, #1 @ CTR block 6
  3323. eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
  3324. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
  3325. eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
  3326. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
  3327. eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
  3328. cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
  3329. b.ge .L256_dec_prepretail @ do prepretail
  3330. .L256_dec_main_loop: @ main loop start
  3331. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  3332. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3333. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  3334. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  3335. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  3336. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  3337. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  3338. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  3339. eor $res0b, $res0b, $acc_lb @ PRE 1
  3340. rev $ctr32w, $rctr32w @ CTR block 4k+7
  3341. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  3342. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  3343. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  3344. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  3345. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  3346. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  3347. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  3348. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  3349. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  3350. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  3351. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  3352. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  3353. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  3354. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  3355. eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
  3356. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  3357. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  3358. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  3359. rev64 $res2b, $res2b @ GHASH block 4k+2
  3360. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  3361. eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
  3362. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  3363. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  3364. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  3365. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  3366. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  3367. rev64 $res3b, $res3b @ GHASH block 4k+3
  3368. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  3369. eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
  3370. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  3371. eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
  3372. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  3373. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  3374. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  3375. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  3376. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  3377. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  3378. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  3379. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  3380. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  3381. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  3382. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  3383. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  3384. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  3385. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  3386. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  3387. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  3388. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  3389. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  3390. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  3391. rev $ctr32w, $rctr32w @ CTR block 4k+8
  3392. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  3393. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  3394. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  3395. add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
  3396. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  3397. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  3398. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  3399. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  3400. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  3401. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  3402. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  3403. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  3404. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  3405. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  3406. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  3407. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  3408. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
  3409. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  3410. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  3411. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  3412. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  3413. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  3414. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  3415. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  3416. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  3417. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  3418. movi $mod_constant.8b, #0xc2
  3419. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  3420. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  3421. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  3422. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  3423. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3424. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  3425. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  3426. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  3427. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3428. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3429. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  3430. ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
  3431. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3432. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3433. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  3434. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3435. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  3436. ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
  3437. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  3438. eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
  3439. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  3440. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  3441. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  3442. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3443. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  3444. ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
  3445. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  3446. ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
  3447. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  3448. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  3449. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  3450. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3451. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  3452. add $input_ptr, $input_ptr, #64 @ AES input_ptr update
  3453. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  3454. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3455. fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
  3456. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  3457. fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
  3458. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3459. eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
  3460. rev $ctr32w, $rctr32w @ CTR block 4k+9
  3461. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3462. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
  3463. cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
  3464. add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
  3465. eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3466. eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3467. mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
  3468. eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
  3469. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3470. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  3471. mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
  3472. fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
  3473. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3474. fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
  3475. rev $ctr32w, $rctr32w @ CTR block 4k+10
  3476. add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
  3477. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3478. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
  3479. rev64 $res1b, $res1b @ GHASH block 4k+5
  3480. eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
  3481. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
  3482. eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
  3483. stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
  3484. rev64 $res0b, $res0b @ GHASH block 4k+4
  3485. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3486. b.lt .L256_dec_main_loop
  3487. .L256_dec_prepretail: @ PREPRETAIL
  3488. ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
  3489. mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
  3490. eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
  3491. aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
  3492. mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
  3493. aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
  3494. fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
  3495. fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
  3496. rev $ctr32w, $rctr32w @ CTR block 4k+7
  3497. eor $res0b, $res0b, $acc_lb @ PRE 1
  3498. rev64 $res2b, $res2b @ GHASH block 4k+2
  3499. orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
  3500. mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
  3501. aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
  3502. mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
  3503. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
  3504. mov $t0d, $res0.d[1] @ GHASH block 4k - mid
  3505. fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
  3506. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
  3507. fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
  3508. aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
  3509. mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
  3510. aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
  3511. eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
  3512. pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
  3513. aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
  3514. rev64 $res3b, $res3b @ GHASH block 4k+3
  3515. aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
  3516. pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
  3517. eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
  3518. pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
  3519. aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
  3520. mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
  3521. aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
  3522. aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
  3523. eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
  3524. aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
  3525. aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
  3526. mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
  3527. aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
  3528. eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
  3529. pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
  3530. aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
  3531. aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
  3532. eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
  3533. pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
  3534. aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
  3535. eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
  3536. aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
  3537. pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
  3538. eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
  3539. pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
  3540. aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
  3541. ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
  3542. aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
  3543. aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
  3544. eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
  3545. pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
  3546. aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
  3547. mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
  3548. aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
  3549. pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
  3550. aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
  3551. eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
  3552. aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
  3553. aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
  3554. eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
  3555. aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
  3556. aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
  3557. movi $mod_constant.8b, #0xc2
  3558. aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
  3559. eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
  3560. pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
  3561. aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
  3562. eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
  3563. aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
  3564. aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
  3565. eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
  3566. aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
  3567. aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
  3568. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3569. aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
  3570. aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
  3571. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3572. aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
  3573. aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
  3574. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3575. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3576. aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
  3577. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3578. aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
  3579. aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
  3580. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3581. aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
  3582. aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
  3583. aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
  3584. eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
  3585. aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
  3586. eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
  3587. aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
  3588. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3589. aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
  3590. add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
  3591. aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
  3592. eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
  3593. aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
  3594. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3595. eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
  3596. aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
  3597. stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
  3598. aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
  3599. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3600. aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
  3601. stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
  3602. aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
  3603. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3604. aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
  3605. aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
  3606. aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
  3607. aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
  3608. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3609. .L256_dec_tail: @ TAIL
  3610. sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
  3611. ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
  3612. eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
  3613. mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
  3614. mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
  3615. ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
  3616. cmp $main_end_input_ptr, #48
  3617. eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
  3618. eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
  3619. b.gt .L256_dec_blocks_more_than_3
  3620. sub $rctr32w, $rctr32w, #1
  3621. mov $ctr3b, $ctr2b
  3622. movi $acc_m.8b, #0
  3623. movi $acc_l.8b, #0
  3624. cmp $main_end_input_ptr, #32
  3625. movi $acc_h.8b, #0
  3626. mov $ctr2b, $ctr1b
  3627. b.gt .L256_dec_blocks_more_than_2
  3628. sub $rctr32w, $rctr32w, #1
  3629. mov $ctr3b, $ctr1b
  3630. cmp $main_end_input_ptr, #16
  3631. b.gt .L256_dec_blocks_more_than_1
  3632. sub $rctr32w, $rctr32w, #1
  3633. b .L256_dec_blocks_less_than_1
  3634. .L256_dec_blocks_more_than_3: @ blocks left > 3
  3635. rev64 $res0b, $res1b @ GHASH final-3 block
  3636. ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
  3637. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
  3638. mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
  3639. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3640. eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
  3641. mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
  3642. mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
  3643. mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
  3644. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
  3645. movi $t0.8b, #0 @ suppress further partial tag feed in
  3646. pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
  3647. pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
  3648. eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
  3649. pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
  3650. eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
  3651. .L256_dec_blocks_more_than_2: @ blocks left > 2
  3652. rev64 $res0b, $res1b @ GHASH final-2 block
  3653. ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
  3654. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3655. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
  3656. eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
  3657. mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
  3658. pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
  3659. pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
  3660. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
  3661. mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
  3662. mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
  3663. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
  3664. movi $t0.8b, #0 @ suppress further partial tag feed in
  3665. pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
  3666. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
  3667. eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
  3668. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
  3669. eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
  3670. .L256_dec_blocks_more_than_1: @ blocks left > 1
  3671. stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
  3672. rev64 $res0b, $res1b @ GHASH final-1 block
  3673. ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
  3674. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3675. movi $t0.8b, #0 @ suppress further partial tag feed in
  3676. mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
  3677. eor $ctr0b, $res1b, $ctr3b @ AES final block - result
  3678. pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
  3679. eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
  3680. pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
  3681. mov $output_l0, $ctr0.d[0] @ AES final block - mov low
  3682. ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
  3683. mov $output_h0, $ctr0.d[1] @ AES final block - mov high
  3684. pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
  3685. eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
  3686. eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
  3687. eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
  3688. eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
  3689. eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
  3690. .L256_dec_blocks_less_than_1: @ blocks left <= 1
  3691. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3692. mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
  3693. sub $bit_length, $bit_length, #128 @ bit_length -= 128
  3694. mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
  3695. ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
  3696. neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
  3697. and $bit_length, $bit_length, #127 @ bit_length %= 128
  3698. lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
  3699. cmp $bit_length, #64
  3700. csel $ctr32x, $rk14_l, $rk14_h, lt
  3701. csel $ctr96_b64x, $rk14_h, xzr, lt
  3702. fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
  3703. and $output_l0, $output_l0, $ctr32x
  3704. mov $ctr0.d[1], $ctr96_b64x
  3705. bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
  3706. rev $ctr32w, $rctr32w
  3707. bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
  3708. orr $output_l0, $output_l0, $end_input_ptr
  3709. and $output_h0, $output_h0, $ctr96_b64x
  3710. orr $output_h0, $output_h0, $main_end_input_ptr
  3711. and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
  3712. rev64 $res0b, $res1b @ GHASH final block
  3713. eor $res0b, $res0b, $t0.16b @ feed in partial tag
  3714. pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
  3715. mov $t0d, $res0.d[1] @ GHASH final block - mid
  3716. eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
  3717. pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
  3718. pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
  3719. eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
  3720. eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
  3721. eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
  3722. movi $mod_constant.8b, #0xc2
  3723. eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
  3724. shl $mod_constantd, $mod_constantd, #56 @ mod_constant
  3725. eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
  3726. pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
  3727. ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
  3728. eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
  3729. eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
  3730. pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
  3731. ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
  3732. eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
  3733. stp $output_l0, $output_h0, [$output_ptr]
  3734. str $ctr32w, [$counter, #12] @ store the updated counter
  3735. eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
  3736. ext $acc_lb, $acc_lb, $acc_lb, #8
  3737. rev64 $acc_lb, $acc_lb
  3738. mov x0, $len
  3739. st1 { $acc_l.16b }, [$current_tag]
  3740. ldp x21, x22, [sp, #16]
  3741. ldp x23, x24, [sp, #32]
  3742. ldp d8, d9, [sp, #48]
  3743. ldp d10, d11, [sp, #64]
  3744. ldp d12, d13, [sp, #80]
  3745. ldp d14, d15, [sp, #96]
  3746. ldp x19, x20, [sp], #112
  3747. ret
  3748. .L256_dec_ret:
  3749. mov w0, #0x0
  3750. ret
  3751. .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
  3752. ___
  3753. }
  3754. }
  # Trailer appended to the generated assembly text: an identification
  # string, an alignment directive and an #endif (the directive it pairs
  # with is emitted earlier in the file, outside this chunk).
  $code .= join("\n",
      '.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"',
      '.align 2',
      '#endif',
      '');    # the empty last element supplies the terminating newline
  3760. if ($flavour =~ /64/) { ######## 64-bit code
  # unvmov(OPERANDS)
  #
  # Turns an operand string of the form "qN#lo|hi, qM#lo|hi" into an
  # "ins vX.d[i],vY.d[j]" instruction string.  Register numbers below 8 are
  # kept, numbers 8 and above are shifted up by 8; "lo" selects lane 0 and
  # "hi" selects lane 1.  Returns a false value when OPERANDS does not have
  # the expected form.
  # NOTE(review): no call to this helper is visible in this part of the
  # file - confirm whether it is still needed.
  sub unvmov {
      my ($operands) = @_;

      my $matched = $operands =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/;
      return $matched unless $matched;

      my ($dst_q, $dst_half, $src_q, $src_half) = ($1, $2, $3, $4);

      my $dst_v    = $dst_q < 8 ? $dst_q : $dst_q + 8;
      my $src_v    = $src_q < 8 ? $src_q : $src_q + 8;
      my $dst_lane = $dst_half eq "lo" ? 0 : 1;
      my $src_lane = $src_half eq "lo" ? 0 : 1;

      return sprintf "ins v%d.d[%d],v%d.d[%d]",
                     $dst_v, $dst_lane, $src_v, $src_lane;
  }
  # 64-bit flavour: write the accumulated $code out one line at a time.
  # On each line the first "@" that is followed by whitespace (together
  # with that whitespace character) is replaced by "//".
  for my $asm_line (split("\n", $code)) {
      $asm_line =~ s|@\s|//|;    # old->new style commentary
      print $asm_line, "\n";
  }
  3771. } else { ######## 32-bit code
  # unvdup32(OPERANDS)
  #
  # Rewrites an operand string of the form "qD, qS[L]" (L in 0..3) into
  # "vdup.32 qD,dN[l]", where N = 2*S + (L >> 1) and l = L & 1.
  # Returns a false value when OPERANDS does not have the expected form.
  sub unvdup32 {
      my ($operands) = @_;

      my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/;
      return $matched unless $matched;

      my ($dst_q, $src_q, $lane) = ($1, $2, $3);

      my $src_d  = 2 * $src_q + ($lane >> 1);    # which d register holds the lane
      my $d_lane = $lane & 1;                    # lane index inside that d register

      return sprintf "vdup.32 q%d,d%d[%d]", $dst_q, $src_d, $d_lane;
  }
  # unvpmullp64(MNEMONIC, OPERANDS)
  #
  # Hand-assembles a three-register "qA, qB, qC" instruction into a 32-bit
  # word and returns it as an "INST(b0,b1,b2,b3)" string followed by a
  # "@ MNEMONIC OPERANDS" comment.  The low three bits and bit 3 of each
  # register number are placed in separate fields of the word; when
  # MNEMONIC contains a "2" the bits 0x00010001 are set as well.
  # Returns a false value when OPERANDS does not have the expected form.
  sub unvpmullp64 {
      my ($mnemonic, $operands) = @_;

      my $matched = $operands =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/;
      return $matched unless $matched;

      my ($op1, $op2, $op3) = ($1, $2, $3);

      my $word = 0xf2a00e00;
      $word |= (($op1 & 7) << 13) | (($op1 & 8) << 19);
      $word |= (($op2 & 7) << 17) | (($op2 & 8) << 4);
      $word |= (($op3 & 7) << 1)  | (($op3 & 8) << 2);
      $word |= 0x00010001 if $mnemonic =~ /2/;

      # ARMv7 instructions are always encoded little-endian, so the word
      # is emitted least-significant byte first.  The correct solution
      # would be the .inst directive, but older assemblers don't
      # implement it:-(
      my @le_bytes = map { ($word >> $_) & 0xff } (0, 8, 16, 24);

      return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                     @le_bytes, $mnemonic, $operands;
  }
  # 32-bit flavour: write the accumulated $code out one line at a time,
  # translating new-style (64-bit) syntax back to old-style (32-bit) syntax.
  for my $asm_line (split("\n", $code)) {
      $asm_line =~ s/\b[wx]([0-9]+)\b/r$1/g;               # new->old registers
      $asm_line =~ s/\bv([0-9])\.[12468]+[bsd]\b/q$1/g;    # new->old registers
      $asm_line =~ s|//\s?|@ |;                            # new->old style commentary

      # fix up remaining new-style suffixes
      $asm_line =~ s/\],#[0-9]+/]!/;

      # The rewrites below are tried in order and the chain stops at the
      # first one that matches, so at most one of them is applied per line.
      $asm_line =~ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/                  or
      $asm_line =~ s/vdup\.32\s+(.*)/unvdup32($1)/ge                           or
      $asm_line =~ s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/ge             or
      $asm_line =~ s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/ge    or
      $asm_line =~ s/^(\s+)b\./$1b/                                            or
      $asm_line =~ s/^(\s+)ret/$1bx\tlr/;

      # An indented "mov.<cond>" becomes "mov<cond>", preceded by an
      # "it <cond>" line.
      if ($asm_line =~ s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
          print " it $2\n";
      }

      print $asm_line, "\n";
  }
  3810. }
  # Close STDOUT explicitly to enforce a flush; a failure here aborts the
  # script with the system error text.
  die "error closing STDOUT: $!" unless close STDOUT;