aesv8-armx.pl
#! /usr/bin/env perl
# Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It likewise supports both 32- and 64-bit
# modes of operation. The latter is achieved by limiting the number
# of utilized registers to 16, which implies additional NEON load and
# integer instructions. This has no effect on the mighty Apple A7,
# where results are literally equal to the theoretical estimates
# based on AES instruction latencies and issue rates. On Cortex-A53,
# an in-order execution core, this costs up to 10-15%, which is
# partially compensated by implementing a dedicated code path for the
# 128-bit CBC encrypt case. On Cortex-A57 parallelizable-mode
# performance seems to be limited by the sheer amount of NEON
# instructions...
#
# April 2019
#
# Key to performance of parallelizable modes is round-instruction
# interleaving. But which factor to use? There is an optimal one for
# each combination of instruction latency and issue rate, beyond
# which increasing the interleave factor doesn't pay off. On the cons
# side are code size increase and resource waste on platforms for
# which the interleave factor is too high. In other words you want it
# to be just right. So far an interleave factor of 3x has served all
# platforms well, but for ThunderX2 the optimal interleave factor was
# measured to be 5x...
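#
# As a C-level illustration of what "interleave factor 3x" means (a
# sketch of the scheduling idea, not code from this module; aese and
# aesmc stand for the corresponding AES instructions):
#
#	for (r = 0; r < rounds - 1; r++) {
#		b0 = aesmc(aese(b0, rk[r]));	// three independent
#		b1 = aesmc(aese(b1, rk[r]));	// blocks keep the AES
#		b2 = aesmc(aese(b2, rk[r]));	// unit's pipeline busy
#	}
#
# The per-block instructions are independent, so each can issue while
# the previous block's AES instructions are still in flight, hiding
# their latency.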
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	1.32		1.17/1.29(**)	1.36/1.46
# Cortex-A57(*)	1.95		0.82/0.85	0.89/0.93
# Cortex-A72	1.33		0.85/0.88	0.92/0.96
# Denver	1.96		0.65/0.86	0.76/0.80
# Mongoose	1.33		1.23/1.20	1.30/1.20
# Kryo		1.26		0.87/0.94	1.00/1.00
# ThunderX2	5.95		1.25		1.30
#
# (*)	original 3.64/1.34/1.32 results were for r0p0 revision
#	and are still the same even for the updated module;
# (**)	numbers after slash are for 32-bit code, which is 3x-
#	interleaved;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
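# For example, a typical invocation (flavour names are defined by the
# perlasm framework, e.g. "linux64"):
#	perl aesv8-armx.pl linux64 aesv8-armx.S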
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
$prefix="aes_v8";
$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,d|0xc,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,d
#endif
.text
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly in 32-bit mnemonics, integer mostly in 64-bit. The
# goal is to maintain both 32- and 64-bit codes within a single
# module and transliterate common code to either flavour with regex
# voodoo.
#
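# For example, a shared line such as "vld1.32 {q8},[x7],#16" is kept
# (modulo minor rewriting) for 32-bit flavours, while for 64-bit ones
# arm-xlate.pl turns it into roughly "ld1 {v8.4s},[x7],#16"; an
# illustrative mapping only, see arm-xlate.pl for the authoritative
# rules.
#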
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
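# The key-schedule loops below implement the standard AES key
# expansion; a C sketch of one 128-bit step (illustrative names, not
# code from this file):
#
#	t = SubWord(RotWord(w[3])) ^ rcon;	// vtbl rotate-n-splat + aese
#	w[0] ^= t;				// aese with an all-zero round key
#	w[1] ^= w[0];				// reduces to SubBytes; ShiftRows is
#	w[2] ^= w[1];				// harmless on rotate-n-splatted lanes
#	w[3] ^= w[2];				// the vext/veor cascade below
#	rcon <<= 1;				// vshl.u8 on the .Lrcon value
#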
$code.=<<___;
.align 5
.Lrcon:
.long 0x01,0x01,0x01,0x01
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
.long 0x1b,0x1b,0x1b,0x1b
.globl ${prefix}_set_encrypt_key
.type ${prefix}_set_encrypt_key,%function
.align 5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
___
$code.=<<___;
	mov $ptr,#-1
	cmp $inp,#0
	b.eq .Lenc_key_abort
	cmp $out,#0
	b.eq .Lenc_key_abort
	mov $ptr,#-2
	cmp $bits,#128
	b.lt .Lenc_key_abort
	cmp $bits,#256
	b.gt .Lenc_key_abort
	tst $bits,#0x3f
	b.ne .Lenc_key_abort
	adr $ptr,.Lrcon
	cmp $bits,#192
	veor $zero,$zero,$zero
	vld1.8 {$in0},[$inp],#16
	mov $bits,#8 // reuse $bits
	vld1.32 {$rcon,$mask},[$ptr],#32
	b.lt .Loop128
	b.eq .L192
	b .L256
.align 4
.Loop128:
	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero
	subs $bits,$bits,#1
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key
	b.ne .Loop128
	vld1.32 {$rcon},[$ptr]
	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key
	vtbl.8 $key,{$in0},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in0},[$out],#16
	aese $key,$zero
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	veor $in0,$in0,$key
	vst1.32 {$in0},[$out]
	add $out,$out,#0x50
	mov $rounds,#10
	b .Ldone
.align 4
.L192:
	vld1.8 {$in1},[$inp],#8
	vmov.i8 $key,#8 // borrow $key
	vst1.32 {$in0},[$out],#16
	vsub.i8 $mask,$mask,$key // adjust the mask
.Loop192:
	vtbl.8 $key,{$in1},$mask
	vext.8 $tmp,$zero,$in0,#12
#ifdef __ARMEB__
	vst1.32 {$in1},[$out],#16
	sub $out,$out,#8
#else
	vst1.32 {$in1},[$out],#8
#endif
	aese $key,$zero
	subs $bits,$bits,#1
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vdup.32 $tmp,${in0}[3]
	veor $tmp,$tmp,$in1
	veor $key,$key,$rcon
	vext.8 $in1,$zero,$in1,#12
	vshl.u8 $rcon,$rcon,#1
	veor $in1,$in1,$tmp
	veor $in0,$in0,$key
	veor $in1,$in1,$key
	vst1.32 {$in0},[$out],#16
	b.ne .Loop192
	mov $rounds,#12
	add $out,$out,#0x20
	b .Ldone
.align 4
.L256:
	vld1.8 {$in1},[$inp]
	mov $bits,#7
	mov $rounds,#14
	vst1.32 {$in0},[$out],#16
.Loop256:
	vtbl.8 $key,{$in1},$mask
	vext.8 $tmp,$zero,$in0,#12
	vst1.32 {$in1},[$out],#16
	aese $key,$zero
	subs $bits,$bits,#1
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in0,$in0,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $key,$key,$rcon
	veor $in0,$in0,$tmp
	vshl.u8 $rcon,$rcon,#1
	veor $in0,$in0,$key
	vst1.32 {$in0},[$out],#16
	b.eq .Ldone
	vdup.32 $key,${in0}[3] // just splat
	vext.8 $tmp,$zero,$in1,#12
	aese $key,$zero
	veor $in1,$in1,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in1,$in1,$tmp
	vext.8 $tmp,$zero,$tmp,#12
	veor $in1,$in1,$tmp
	veor $in1,$in1,$key
	b .Loop256
.Ldone:
	str $rounds,[$out]
	mov $ptr,#0
.Lenc_key_abort:
	mov x0,$ptr // return value
	`"ldr x29,[sp],#16" if ($flavour =~ /64/)`
	ret
.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
.globl ${prefix}_set_decrypt_key
.type ${prefix}_set_decrypt_key,%function
.align 5
${prefix}_set_decrypt_key:
___
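# The tail below derives the decrypt key schedule in place from the
# just-generated encrypt schedule: round keys are reversed
# end-for-end and the inner ones are passed through InvMixColumns
# (aesimc). A C sketch (illustrative):
#
#	for (i = 0; i <= rounds; i++)		// reverse the schedule
#		drk[i] = rk[rounds - i];
#	for (i = 1; i < rounds; i++)		// all but first/last
#		drk[i] = aesimc(drk[i]);
#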
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	stmdb sp!,{r4,lr}
___
$code.=<<___;
	bl .Lenc_key
	cmp x0,#0
	b.ne .Ldec_key_abort
	sub $out,$out,#240 // restore original $out
	mov x4,#-16
	add $inp,$out,x12,lsl#4 // end of key schedule
	vld1.32 {v0.16b},[$out]
	vld1.32 {v1.16b},[$inp]
	vst1.32 {v0.16b},[$inp],x4
	vst1.32 {v1.16b},[$out],#16
.Loop_imc:
	vld1.32 {v0.16b},[$out]
	vld1.32 {v1.16b},[$inp]
	aesimc v0.16b,v0.16b
	aesimc v1.16b,v1.16b
	vst1.32 {v0.16b},[$inp],x4
	vst1.32 {v1.16b},[$out],#16
	cmp $inp,$out
	b.hi .Loop_imc
	vld1.32 {v0.16b},[$out]
	aesimc v0.16b,v0.16b
	vst1.32 {v0.16b},[$inp]
	eor x0,x0,x0 // return value
.Ldec_key_abort:
___
$code.=<<___ if ($flavour !~ /64/);
	ldmia sp!,{r4,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
___
$code.=<<___;
.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
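# In C terms the two routines above implement OpenSSL's usual
# set-key interface, essentially:
#
#	int aes_v8_set_encrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#	int aes_v8_set_decrypt_key(const unsigned char *user_key,
#	                           const int bits, AES_KEY *key);
#
# returning 0 on success, -1 for NULL arguments and -2 for an
# unsupported key length (the mov $ptr,#-1/#-2 paths above).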
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl ${prefix}_${dir}crypt
.type ${prefix}_${dir}crypt,%function
.align 5
${prefix}_${dir}crypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___;
	ldr $rounds,[$key,#240]
	vld1.32 {$rndkey0},[$key],#16
	vld1.8 {$inout},[$inp]
	sub $rounds,$rounds,#2
	vld1.32 {$rndkey1},[$key],#16
.Loop_${dir}c:
	aes$e $inout,$rndkey0
	aes$mc $inout,$inout
	vld1.32 {$rndkey0},[$key],#16
	subs $rounds,$rounds,#2
	aes$e $inout,$rndkey1
	aes$mc $inout,$inout
	vld1.32 {$rndkey1},[$key],#16
	b.gt .Loop_${dir}c
	aes$e $inout,$rndkey0
	aes$mc $inout,$inout
	vld1.32 {$rndkey0},[$key]
	aes$e $inout,$rndkey1
	veor $inout,$inout,$rndkey0
	vst1.8 {$inout},[$out]
	ret
.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
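# gen_block() above emits the two single-block entry points; in C
# terms, essentially:
#
#	void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#	void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#	                    const AES_KEY *key);
#
# each making one full pass over the key schedule per 16-byte block.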
# Performance in cycles per byte,
# processed with AES-ECB and different key sizes.
# The values before and after the optimization are shown below
# (before/after):
#
#		AES-128-ECB	AES-192-ECB	AES-256-ECB
# Cortex-A57	1.85/0.82	2.16/0.96	2.47/1.10
# Cortex-A72	1.64/0.85	1.82/0.99	2.13/1.14
#
# The optimization is implemented by loop unrolling and interleaving.
# Commonly we choose 5 as the unrolling factor; if the input data
# size is smaller than 5 blocks but not smaller than 3 blocks, we
# choose 3 as the unrolling factor instead.
# If the input data size dsize >= 5*16 bytes, take 5 blocks as one
# iteration; on every loop the remaining size lsize -= 5*16.
# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration; on
# every loop lsize -= 3*16.
# If lsize < 3*16 bytes, treat them as the tail and interleave the
# AES instructions of the (up to) two remaining blocks.
# There is one special case: if the original input data size dsize
# = 16 bytes, it is treated separately to improve performance: one
# independent code block without LR/FP load and store, just like the
# original ECB implementation.
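#
# A C-level sketch of that dispatch (illustrative only, assuming
# hypothetical process5/process3/process_tail helpers; the assembly
# below keeps all state in registers):
#
#	while (lsize >= 5*16) { process5();     lsize -= 5*16; }
#	while (lsize >= 3*16) { process3();     lsize -= 3*16; }
#	if    (lsize)         { process_tail(); }  // one or two blocks
#
# plus the separate single-block fast path taken when dsize == 16.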
{{{
my ($inp,$out,$len,$key)=map("x$_",(0..3));
my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
### q7            last round key
### q10-q15, q7   last 7 round keys
### q8-q9         preloaded round keys except the last 7 keys for big size
### q5, q6, q8-q9 preloaded round keys except the last 7 keys for the 16-byte-only case
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}
$code.=<<___;
.globl ${prefix}_ecb_encrypt
.type ${prefix}_ecb_encrypt,%function
.align 5
${prefix}_ecb_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	subs $len,$len,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne .Lecb_big_size
	vld1.8 {$dat0},[$inp]
	cmp $enc,#0 // en- or decrypting?
	ldr $rounds,[$key,#240]
	vld1.32 {q5-q6},[$key],#32 // load key schedule...
	b.eq .Lecb_small_dec
	aese $dat0,q5
	aesmc $dat0,$dat0
	vld1.32 {q8-q9},[$key],#32 // load key schedule...
	aese $dat0,q6
	aesmc $dat0,$dat0
	subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
	b.eq .Lecb_128_enc
.Lecb_round_loop:
	aese $dat0,q8
	aesmc $dat0,$dat0
	vld1.32 {q8},[$key],#16 // load key schedule...
	aese $dat0,q9
	aesmc $dat0,$dat0
	vld1.32 {q9},[$key],#16 // load key schedule...
	subs $rounds,$rounds,#2 // bias
	b.gt .Lecb_round_loop
.Lecb_128_enc:
	vld1.32 {q10-q11},[$key],#32 // load key schedule...
	aese $dat0,q8
	aesmc $dat0,$dat0
	aese $dat0,q9
	aesmc $dat0,$dat0
	vld1.32 {q12-q13},[$key],#32 // load key schedule...
	aese $dat0,q10
	aesmc $dat0,$dat0
	aese $dat0,q11
	aesmc $dat0,$dat0
	vld1.32 {q14-q15},[$key],#32 // load key schedule...
	aese $dat0,q12
	aesmc $dat0,$dat0
	aese $dat0,q13
	aesmc $dat0,$dat0
	vld1.32 {$rndlast},[$key]
	aese $dat0,q14
	aesmc $dat0,$dat0
	aese $dat0,q15
	veor $dat0,$dat0,$rndlast
	vst1.8 {$dat0},[$out]
	b .Lecb_Final_abort
.Lecb_small_dec:
	aesd $dat0,q5
	aesimc $dat0,$dat0
	vld1.32 {q8-q9},[$key],#32 // load key schedule...
	aesd $dat0,q6
	aesimc $dat0,$dat0
	subs $rounds,$rounds,#10 // bias
	b.eq .Lecb_128_dec
.Lecb_dec_round_loop:
	aesd $dat0,q8
	aesimc $dat0,$dat0
	vld1.32 {q8},[$key],#16 // load key schedule...
	aesd $dat0,q9
	aesimc $dat0,$dat0
	vld1.32 {q9},[$key],#16 // load key schedule...
	subs $rounds,$rounds,#2 // bias
	b.gt .Lecb_dec_round_loop
.Lecb_128_dec:
	vld1.32 {q10-q11},[$key],#32 // load key schedule...
	aesd $dat0,q8
	aesimc $dat0,$dat0
	aesd $dat0,q9
	aesimc $dat0,$dat0
	vld1.32 {q12-q13},[$key],#32 // load key schedule...
	aesd $dat0,q10
	aesimc $dat0,$dat0
	aesd $dat0,q11
	aesimc $dat0,$dat0
	vld1.32 {q14-q15},[$key],#32 // load key schedule...
	aesd $dat0,q12
	aesimc $dat0,$dat0
	aesd $dat0,q13
	aesimc $dat0,$dat0
	vld1.32 {$rndlast},[$key]
	aesd $dat0,q14
	aesimc $dat0,$dat0
	aesd $dat0,q15
	veor $dat0,$dat0,$rndlast
	vst1.8 {$dat0},[$out]
	b .Lecb_Final_abort
.Lecb_big_size:
___
$code.=<<___ if ($flavour =~ /64/);
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov ip,sp
	stmdb sp!,{r4-r8,lr}
	vstmdb sp!,{d8-d15} @ ABI specification says so
	ldmia ip,{r4-r5} @ load remaining args
	subs $len,$len,#16
___
$code.=<<___;
	mov $step,#16
	b.lo .Lecb_done
	cclr $step,eq
	cmp $enc,#0 // en- or decrypting?
	ldr $rounds,[$key,#240]
	and $len,$len,#-16
	vld1.8 {$dat},[$inp],$step
	vld1.32 {q8-q9},[$key] // load key schedule...
	sub $rounds,$rounds,#6
	add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
	sub $rounds,$rounds,#2
	vld1.32 {q10-q11},[$key_],#32
	vld1.32 {q12-q13},[$key_],#32
	vld1.32 {q14-q15},[$key_],#32
	vld1.32 {$rndlast},[$key_]
	add $key_,$key,#32
	mov $cnt,$rounds
	b.eq .Lecb_dec
	vld1.8 {$dat1},[$inp],#16
	subs $len,$len,#32 // bias
	add $cnt,$rounds,#2
	vorr $in1,$dat1,$dat1
	vorr $dat2,$dat1,$dat1
	vorr $dat1,$dat,$dat
	b.lo .Lecb_enc_tail
	vorr $dat1,$in1,$in1
	vld1.8 {$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	cmp $len,#32
	b.lo .Loop3x_ecb_enc
	vld1.8 {$dat3},[$inp],#16
	vld1.8 {$dat4},[$inp],#16
	sub $len,$len,#32 // bias
	mov $cnt,$rounds
.Loop5x_ecb_enc:
	aese $dat0,q8
	aesmc $dat0,$dat0
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	aese $dat3,q8
	aesmc $dat3,$dat3
	aese $dat4,q8
	aesmc $dat4,$dat4
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aese $dat0,q9
	aesmc $dat0,$dat0
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	aese $dat3,q9
	aesmc $dat3,$dat3
	aese $dat4,q9
	aesmc $dat4,$dat4
	vld1.32 {q9},[$key_],#16
	b.gt .Loop5x_ecb_enc
	aese $dat0,q8
	aesmc $dat0,$dat0
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	aese $dat3,q8
	aesmc $dat3,$dat3
	aese $dat4,q8
	aesmc $dat4,$dat4
	cmp $len,#0x40 // because .Lecb_enc_tail4x
	sub $len,$len,#0x50
	aese $dat0,q9
	aesmc $dat0,$dat0
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	aese $dat3,q9
	aesmc $dat3,$dat3
	aese $dat4,q9
	aesmc $dat4,$dat4
	csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not a typo
	mov $key_,$key
	aese $dat0,q10
	aesmc $dat0,$dat0
	aese $dat1,q10
	aesmc $dat1,$dat1
	aese $dat2,q10
	aesmc $dat2,$dat2
	aese $dat3,q10
	aesmc $dat3,$dat3
	aese $dat4,q10
	aesmc $dat4,$dat4
	add $inp,$inp,x6 // $inp is adjusted in such way that
			 // at exit from the loop $dat1-$dat4
			 // are loaded with last "words"
	add x6,$len,#0x60 // because .Lecb_enc_tail4x
	aese $dat0,q11
	aesmc $dat0,$dat0
	aese $dat1,q11
	aesmc $dat1,$dat1
	aese $dat2,q11
	aesmc $dat2,$dat2
	aese $dat3,q11
	aesmc $dat3,$dat3
	aese $dat4,q11
	aesmc $dat4,$dat4
	aese $dat0,q12
	aesmc $dat0,$dat0
	aese $dat1,q12
	aesmc $dat1,$dat1
	aese $dat2,q12
	aesmc $dat2,$dat2
	aese $dat3,q12
	aesmc $dat3,$dat3
	aese $dat4,q12
	aesmc $dat4,$dat4
	aese $dat0,q13
	aesmc $dat0,$dat0
	aese $dat1,q13
	aesmc $dat1,$dat1
	aese $dat2,q13
	aesmc $dat2,$dat2
	aese $dat3,q13
	aesmc $dat3,$dat3
	aese $dat4,q13
	aesmc $dat4,$dat4
	aese $dat0,q14
	aesmc $dat0,$dat0
	aese $dat1,q14
	aesmc $dat1,$dat1
	aese $dat2,q14
	aesmc $dat2,$dat2
	aese $dat3,q14
	aesmc $dat3,$dat3
	aese $dat4,q14
	aesmc $dat4,$dat4
	aese $dat0,q15
	vld1.8 {$in0},[$inp],#16
	aese $dat1,q15
	vld1.8 {$in1},[$inp],#16
	aese $dat2,q15
	vld1.8 {$in2},[$inp],#16
	aese $dat3,q15
	vld1.8 {$in3},[$inp],#16
	aese $dat4,q15
	vld1.8 {$in4},[$inp],#16
	cbz x6,.Lecb_enc_tail4x
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	veor $tmp0,$rndlast,$dat0
	vorr $dat0,$in0,$in0
	veor $tmp1,$rndlast,$dat1
	vorr $dat1,$in1,$in1
	veor $tmp2,$rndlast,$dat2
	vorr $dat2,$in2,$in2
	veor $tmp3,$rndlast,$dat3
	vorr $dat3,$in3,$in3
	veor $tmp4,$rndlast,$dat4
	vst1.8 {$tmp0},[$out],#16
	vorr $dat4,$in4,$in4
	vst1.8 {$tmp1},[$out],#16
	mov $cnt,$rounds
	vst1.8 {$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vst1.8 {$tmp3},[$out],#16
	vst1.8 {$tmp4},[$out],#16
	b.hs .Loop5x_ecb_enc
	add $len,$len,#0x50
	cbz $len,.Lecb_done
	add $cnt,$rounds,#2
	subs $len,$len,#0x30
	vorr $dat0,$in2,$in2
	vorr $dat1,$in3,$in3
	vorr $dat2,$in4,$in4
	b.lo .Lecb_enc_tail
	b .Loop3x_ecb_enc
.align 4
.Lecb_enc_tail4x:
	veor $tmp1,$rndlast,$dat1
	veor $tmp2,$rndlast,$dat2
	veor $tmp3,$rndlast,$dat3
	veor $tmp4,$rndlast,$dat4
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$tmp2},[$out],#16
	vst1.8 {$tmp3},[$out],#16
	vst1.8 {$tmp4},[$out],#16
	b .Lecb_done
.align 4
___
$code.=<<___;
.Loop3x_ecb_enc:
	aese $dat0,q8
	aesmc $dat0,$dat0
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aese $dat0,q9
	aesmc $dat0,$dat0
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	vld1.32 {q9},[$key_],#16
	b.gt .Loop3x_ecb_enc
	aese $dat0,q8
	aesmc $dat0,$dat0
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	subs $len,$len,#0x30
	mov.lo x6,$len // x6, $cnt, is zero at this point
	aese $dat0,q9
	aesmc $dat0,$dat0
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	add $inp,$inp,x6 // $inp is adjusted in such way that
			 // at exit from the loop $dat1-$dat2
			 // are loaded with last "words"
	mov $key_,$key
	aese $dat0,q12
	aesmc $dat0,$dat0
	aese $dat1,q12
	aesmc $dat1,$dat1
	aese $dat2,q12
	aesmc $dat2,$dat2
	vld1.8 {$in0},[$inp],#16
	aese $dat0,q13
	aesmc $dat0,$dat0
	aese $dat1,q13
	aesmc $dat1,$dat1
	aese $dat2,q13
	aesmc $dat2,$dat2
	vld1.8 {$in1},[$inp],#16
	aese $dat0,q14
	aesmc $dat0,$dat0
	aese $dat1,q14
	aesmc $dat1,$dat1
	aese $dat2,q14
	aesmc $dat2,$dat2
	vld1.8 {$in2},[$inp],#16
	aese $dat0,q15
	aese $dat1,q15
	aese $dat2,q15
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	add $cnt,$rounds,#2
	veor $tmp0,$rndlast,$dat0
	veor $tmp1,$rndlast,$dat1
	veor $dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vst1.8 {$tmp0},[$out],#16
	vorr $dat0,$in0,$in0
	vst1.8 {$tmp1},[$out],#16
	vorr $dat1,$in1,$in1
	vst1.8 {$dat2},[$out],#16
	vorr $dat2,$in2,$in2
	b.hs .Loop3x_ecb_enc
	cmn $len,#0x30
	b.eq .Lecb_done
	nop
.Lecb_enc_tail:
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	vld1.32 {q9},[$key_],#16
	b.gt .Lecb_enc_tail
	aese $dat1,q8
	aesmc $dat1,$dat1
	aese $dat2,q8
	aesmc $dat2,$dat2
	aese $dat1,q9
	aesmc $dat1,$dat1
	aese $dat2,q9
	aesmc $dat2,$dat2
	aese $dat1,q12
	aesmc $dat1,$dat1
	aese $dat2,q12
	aesmc $dat2,$dat2
	cmn $len,#0x20
	aese $dat1,q13
	aesmc $dat1,$dat1
	aese $dat2,q13
	aesmc $dat2,$dat2
	aese $dat1,q14
	aesmc $dat1,$dat1
	aese $dat2,q14
	aesmc $dat2,$dat2
	aese $dat1,q15
	aese $dat2,q15
	b.eq .Lecb_enc_one
	veor $tmp1,$rndlast,$dat1
	veor $tmp2,$rndlast,$dat2
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$tmp2},[$out],#16
	b .Lecb_done
.Lecb_enc_one:
	veor $tmp1,$rndlast,$dat2
	vst1.8 {$tmp1},[$out],#16
	b .Lecb_done
___
$code.=<<___;
.align 5
.Lecb_dec:
	vld1.8 {$dat1},[$inp],#16
	subs $len,$len,#32 // bias
	add $cnt,$rounds,#2
	vorr $in1,$dat1,$dat1
	vorr $dat2,$dat1,$dat1
	vorr $dat1,$dat,$dat
	b.lo .Lecb_dec_tail
	vorr $dat1,$in1,$in1
	vld1.8 {$dat2},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	cmp $len,#32
	b.lo .Loop3x_ecb_dec
	vld1.8 {$dat3},[$inp],#16
	vld1.8 {$dat4},[$inp],#16
	sub $len,$len,#32 // bias
	mov $cnt,$rounds
.Loop5x_ecb_dec:
	aesd $dat0,q8
	aesimc $dat0,$dat0
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	aesd $dat3,q8
	aesimc $dat3,$dat3
	aesd $dat4,q8
	aesimc $dat4,$dat4
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aesd $dat0,q9
	aesimc $dat0,$dat0
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	aesd $dat3,q9
	aesimc $dat3,$dat3
	aesd $dat4,q9
	aesimc $dat4,$dat4
	vld1.32 {q9},[$key_],#16
	b.gt .Loop5x_ecb_dec
	aesd $dat0,q8
	aesimc $dat0,$dat0
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	aesd $dat3,q8
	aesimc $dat3,$dat3
	aesd $dat4,q8
	aesimc $dat4,$dat4
	cmp $len,#0x40 // because .Lecb_tail4x
	sub $len,$len,#0x50
	aesd $dat0,q9
	aesimc $dat0,$dat0
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	aesd $dat3,q9
	aesimc $dat3,$dat3
	aesd $dat4,q9
	aesimc $dat4,$dat4
	csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not a typo
	mov $key_,$key
	aesd $dat0,q10
	aesimc $dat0,$dat0
	aesd $dat1,q10
	aesimc $dat1,$dat1
	aesd $dat2,q10
	aesimc $dat2,$dat2
	aesd $dat3,q10
	aesimc $dat3,$dat3
	aesd $dat4,q10
	aesimc $dat4,$dat4
	add $inp,$inp,x6 // $inp is adjusted in such way that
			 // at exit from the loop $dat1-$dat4
			 // are loaded with last "words"
	add x6,$len,#0x60 // because .Lecb_tail4x
	aesd $dat0,q11
	aesimc $dat0,$dat0
	aesd $dat1,q11
	aesimc $dat1,$dat1
	aesd $dat2,q11
	aesimc $dat2,$dat2
	aesd $dat3,q11
	aesimc $dat3,$dat3
	aesd $dat4,q11
	aesimc $dat4,$dat4
	aesd $dat0,q12
	aesimc $dat0,$dat0
	aesd $dat1,q12
	aesimc $dat1,$dat1
	aesd $dat2,q12
	aesimc $dat2,$dat2
	aesd $dat3,q12
	aesimc $dat3,$dat3
	aesd $dat4,q12
	aesimc $dat4,$dat4
	aesd $dat0,q13
	aesimc $dat0,$dat0
	aesd $dat1,q13
	aesimc $dat1,$dat1
	aesd $dat2,q13
	aesimc $dat2,$dat2
	aesd $dat3,q13
	aesimc $dat3,$dat3
	aesd $dat4,q13
	aesimc $dat4,$dat4
	aesd $dat0,q14
	aesimc $dat0,$dat0
	aesd $dat1,q14
	aesimc $dat1,$dat1
	aesd $dat2,q14
	aesimc $dat2,$dat2
	aesd $dat3,q14
	aesimc $dat3,$dat3
	aesd $dat4,q14
	aesimc $dat4,$dat4
	aesd $dat0,q15
	vld1.8 {$in0},[$inp],#16
	aesd $dat1,q15
	vld1.8 {$in1},[$inp],#16
	aesd $dat2,q15
	vld1.8 {$in2},[$inp],#16
	aesd $dat3,q15
	vld1.8 {$in3},[$inp],#16
	aesd $dat4,q15
	vld1.8 {$in4},[$inp],#16
	cbz x6,.Lecb_tail4x
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	veor $tmp0,$rndlast,$dat0
	vorr $dat0,$in0,$in0
	veor $tmp1,$rndlast,$dat1
	vorr $dat1,$in1,$in1
	veor $tmp2,$rndlast,$dat2
	vorr $dat2,$in2,$in2
	veor $tmp3,$rndlast,$dat3
	vorr $dat3,$in3,$in3
	veor $tmp4,$rndlast,$dat4
	vst1.8 {$tmp0},[$out],#16
	vorr $dat4,$in4,$in4
	vst1.8 {$tmp1},[$out],#16
	mov $cnt,$rounds
	vst1.8 {$tmp2},[$out],#16
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vst1.8 {$tmp3},[$out],#16
	vst1.8 {$tmp4},[$out],#16
	b.hs .Loop5x_ecb_dec
	add $len,$len,#0x50
	cbz $len,.Lecb_done
	add $cnt,$rounds,#2
	subs $len,$len,#0x30
	vorr $dat0,$in2,$in2
	vorr $dat1,$in3,$in3
	vorr $dat2,$in4,$in4
	b.lo .Lecb_dec_tail
	b .Loop3x_ecb_dec
.align 4
.Lecb_tail4x:
	veor $tmp1,$rndlast,$dat1
	veor $tmp2,$rndlast,$dat2
	veor $tmp3,$rndlast,$dat3
	veor $tmp4,$rndlast,$dat4
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$tmp2},[$out],#16
	vst1.8 {$tmp3},[$out],#16
	vst1.8 {$tmp4},[$out],#16
	b .Lecb_done
.align 4
___
$code.=<<___;
.Loop3x_ecb_dec:
	aesd $dat0,q8
	aesimc $dat0,$dat0
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aesd $dat0,q9
	aesimc $dat0,$dat0
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	vld1.32 {q9},[$key_],#16
	b.gt .Loop3x_ecb_dec
	aesd $dat0,q8
	aesimc $dat0,$dat0
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	subs $len,$len,#0x30
	mov.lo x6,$len // x6, $cnt, is zero at this point
	aesd $dat0,q9
	aesimc $dat0,$dat0
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	add $inp,$inp,x6 // $inp is adjusted in such way that
			 // at exit from the loop $dat1-$dat2
			 // are loaded with last "words"
	mov $key_,$key
	aesd $dat0,q12
	aesimc $dat0,$dat0
	aesd $dat1,q12
	aesimc $dat1,$dat1
	aesd $dat2,q12
	aesimc $dat2,$dat2
	vld1.8 {$in0},[$inp],#16
	aesd $dat0,q13
	aesimc $dat0,$dat0
	aesd $dat1,q13
	aesimc $dat1,$dat1
	aesd $dat2,q13
	aesimc $dat2,$dat2
	vld1.8 {$in1},[$inp],#16
	aesd $dat0,q14
	aesimc $dat0,$dat0
	aesd $dat1,q14
	aesimc $dat1,$dat1
	aesd $dat2,q14
	aesimc $dat2,$dat2
	vld1.8 {$in2},[$inp],#16
	aesd $dat0,q15
	aesd $dat1,q15
	aesd $dat2,q15
	vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
	add $cnt,$rounds,#2
	veor $tmp0,$rndlast,$dat0
	veor $tmp1,$rndlast,$dat1
	veor $dat2,$dat2,$rndlast
	vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
	vst1.8 {$tmp0},[$out],#16
	vorr $dat0,$in0,$in0
	vst1.8 {$tmp1},[$out],#16
	vorr $dat1,$in1,$in1
	vst1.8 {$dat2},[$out],#16
	vorr $dat2,$in2,$in2
	b.hs .Loop3x_ecb_dec
	cmn $len,#0x30
	b.eq .Lecb_done
	nop
.Lecb_dec_tail:
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	vld1.32 {q8},[$key_],#16
	subs $cnt,$cnt,#2
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	vld1.32 {q9},[$key_],#16
	b.gt .Lecb_dec_tail
	aesd $dat1,q8
	aesimc $dat1,$dat1
	aesd $dat2,q8
	aesimc $dat2,$dat2
	aesd $dat1,q9
	aesimc $dat1,$dat1
	aesd $dat2,q9
	aesimc $dat2,$dat2
	aesd $dat1,q12
	aesimc $dat1,$dat1
	aesd $dat2,q12
	aesimc $dat2,$dat2
	cmn $len,#0x20
	aesd $dat1,q13
	aesimc $dat1,$dat1
	aesd $dat2,q13
	aesimc $dat2,$dat2
	aesd $dat1,q14
	aesimc $dat1,$dat1
	aesd $dat2,q14
	aesimc $dat2,$dat2
	aesd $dat1,q15
	aesd $dat2,q15
	b.eq .Lecb_dec_one
	veor $tmp1,$rndlast,$dat1
	veor $tmp2,$rndlast,$dat2
	vst1.8 {$tmp1},[$out],#16
	vst1.8 {$tmp2},[$out],#16
	b .Lecb_done
.Lecb_dec_one:
	veor $tmp1,$rndlast,$dat2
	vst1.8 {$tmp1},[$out],#16
.Lecb_done:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia sp!,{d8-d15}
	ldmia sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr x29,[sp],#16
___
$code.=<<___ if ($flavour =~ /64/);
.Lecb_Final_abort:
	ret
___
$code.=<<___;
.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
### q8-q15 preloaded key schedule
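# In C terms, essentially:
#
#	void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#	                        size_t length, const AES_KEY *key,
#	                        unsigned char *ivec, const int enc);
#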
$code.=<<___;
.globl ${prefix}_cbc_encrypt
.type ${prefix}_cbc_encrypt,%function
.align 5
${prefix}_cbc_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov ip,sp
	stmdb sp!,{r4-r8,lr}
	vstmdb sp!,{d8-d15} @ ABI specification says so
	ldmia ip,{r4-r5} @ load remaining args
___
$code.=<<___;
	subs $len,$len,#16
	mov $step,#16
	b.lo .Lcbc_abort
	cclr $step,eq
	cmp $enc,#0 // en- or decrypting?
	ldr $rounds,[$key,#240]
	and $len,$len,#-16
	vld1.8 {$ivec},[$ivp]
	vld1.8 {$dat},[$inp],$step
	vld1.32 {q8-q9},[$key] // load key schedule...
	sub $rounds,$rounds,#6
	add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
	sub $rounds,$rounds,#2
	vld1.32 {q10-q11},[$key_],#32
	vld1.32 {q12-q13},[$key_],#32
	vld1.32 {q14-q15},[$key_],#32
	vld1.32 {$rndlast},[$key_]
	add $key_,$key,#32
	mov $cnt,$rounds
	b.eq .Lcbc_dec
	cmp $rounds,#2
	veor $dat,$dat,$ivec
	veor $rndzero_n_last,q8,$rndlast
	b.eq .Lcbc_enc128
	vld1.32 {$in0-$in1},[$key_]
	add $key_,$key,#16
	add $key4,$key,#16*4
	add $key5,$key,#16*5
	aese $dat,q8
	aesmc $dat,$dat
	add $key6,$key,#16*6
	add $key7,$key,#16*7
	b .Lenter_cbc_enc
.align 4
.Loop_cbc_enc:
	aese $dat,q8
	aesmc $dat,$dat
	vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
	aese $dat,q9
	aesmc $dat,$dat
	aese $dat,$in0
	aesmc $dat,$dat
	vld1.32 {q8},[$key4]
	cmp $rounds,#4
	aese $dat,$in1
	aesmc $dat,$dat
	vld1.32 {q9},[$key5]
	b.eq .Lcbc_enc192
	aese $dat,q8
	aesmc $dat,$dat
	vld1.32 {q8},[$key6]
	aese $dat,q9
	aesmc $dat,$dat
	vld1.32 {q9},[$key7]
	nop
.Lcbc_enc192:
	aese $dat,q8
	aesmc $dat,$dat
	subs $len,$len,#16
	aese $dat,q9
	aesmc $dat,$dat
	cclr $step,eq
	aese $dat,q10
	aesmc $dat,$dat
	aese $dat,q11
	aesmc $dat,$dat
	vld1.8 {q8},[$inp],$step
	aese $dat,q12
	aesmc $dat,$dat
	veor q8,q8,$rndzero_n_last
	aese $dat,q13
	aesmc $dat,$dat
	vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
	aese $dat,q14
	aesmc $dat,$dat
	aese $dat,q15
	veor $ivec,$dat,$rndlast
	b.hs .Loop_cbc_enc
	vst1.8 {$ivec},[$out],#16
	b .Lcbc_done
.align 5
.Lcbc_enc128:
	vld1.32 {$in0-$in1},[$key_]
	aese $dat,q8
	aesmc $dat,$dat
	b .Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese $dat,q8
	aesmc $dat,$dat
	vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese $dat,q9
	aesmc $dat,$dat
	subs $len,$len,#16
	aese $dat,$in0
	aesmc $dat,$dat
	cclr $step,eq
	aese $dat,$in1
	aesmc $dat,$dat
	aese $dat,q10
	aesmc $dat,$dat
	aese $dat,q11
	aesmc $dat,$dat
	vld1.8 {q8},[$inp],$step
	aese $dat,q12
	aesmc $dat,$dat
	aese $dat,q13
	aesmc $dat,$dat
	aese $dat,q14
	aesmc $dat,$dat
	veor q8,q8,$rndzero_n_last
	aese $dat,q15
	veor $ivec,$dat,$rndlast
	b.hs .Loop_cbc_enc128
	vst1.8 {$ivec},[$out],#16
	b .Lcbc_done
___
  1269. {
  1270. my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
  1271. my ($dat3,$in3,$tmp3); # used only in 64-bit mode
  1272. my ($dat4,$in4,$tmp4);
  1273. if ($flavour =~ /64/) {
  1274. ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
  1275. }
  1276. $code.=<<___;
  1277. .align 5
  1278. .Lcbc_dec:
  1279. vld1.8 {$dat2},[$inp],#16
  1280. subs $len,$len,#32 // bias
  1281. add $cnt,$rounds,#2
  1282. vorr $in1,$dat,$dat
  1283. vorr $dat1,$dat,$dat
  1284. vorr $in2,$dat2,$dat2
  1285. b.lo .Lcbc_dec_tail
  1286. vorr $dat1,$dat2,$dat2
  1287. vld1.8 {$dat2},[$inp],#16
  1288. vorr $in0,$dat,$dat
  1289. vorr $in1,$dat1,$dat1
  1290. vorr $in2,$dat2,$dat2
  1291. ___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_cbc_dec
	vld1.8	{$dat3},[$inp],#16
	vld1.8	{$dat4},[$inp],#16
	sub	$len,$len,#32		// bias
	mov	$cnt,$rounds
	vorr	$in3,$dat3,$dat3
	vorr	$in4,$dat4,$dat4

.Loop5x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	cmp	$len,#0x40		// because .Lcbc_tail4x
	sub	$len,$len,#0x50
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	x6,xzr,$len,gt		// borrow x6, $cnt, "gt" is not typo
	mov	$key_,$key
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat4
					// are loaded with last "words"
	add	x6,$len,#0x60		// because .Lcbc_tail4x
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4
	veor	$tmp0,$ivec,$rndlast
	aesd	$dat0,q15
	veor	$tmp1,$in0,$rndlast
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	veor	$tmp2,$in1,$rndlast
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	veor	$tmp3,$in2,$rndlast
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	veor	$tmp4,$in3,$rndlast
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	vorr	$ivec,$in4,$in4
	vld1.8	{$in4},[$inp],#16
	cbz	x6,.Lcbc_tail4x
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	vorr	$dat0,$in0,$in0
	veor	$tmp1,$tmp1,$dat1
	vorr	$dat1,$in1,$in1
	veor	$tmp2,$tmp2,$dat2
	vorr	$dat2,$in2,$in2
	veor	$tmp3,$tmp3,$dat3
	vorr	$dat3,$in3,$in3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat4,$in4,$in4
	vst1.8	{$tmp1},[$out],#16
	mov	$cnt,$rounds
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_cbc_dec

	add	$len,$len,#0x50
	cbz	$len,.Lcbc_done
	add	$cnt,$rounds,#2
	subs	$len,$len,#0x30
	vorr	$dat0,$in2,$in2
	vorr	$in0,$in2,$in2
	vorr	$dat1,$in3,$in3
	vorr	$in1,$in3,$in3
	vorr	$dat2,$in4,$in4
	vorr	$in2,$in4,$in4
	b.lo	.Lcbc_dec_tail

	b	.Loop3x_cbc_dec
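	// .Lcbc_tail4x is entered with exactly four output blocks pending:
	// $dat0 decrypted a re-read duplicate of an already-seen block
	// (courtesy of the $inp adjustment above), so only $dat1-$dat4 are
	// stored, and $tmp0 ($ivec^$rndlast) supplies the chaining value
	// for the first of them.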
.align	4
.Lcbc_tail4x:
	veor	$tmp1,$tmp0,$dat1
	veor	$tmp2,$tmp2,$dat2
	veor	$tmp3,$tmp3,$dat3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16

	b	.Lcbc_done
.align	4
___
$code.=<<___;
.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$ivec,$rndlast
	subs	$len,$len,#0x30
	veor	$tmp1,$in0,$rndlast
	mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	add	$inp,$inp,x6		// $inp is adjusted in such a way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	vorr	$ivec,$in2,$in2
	mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));

# q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));

# q_X => qX, for ldp & stp
my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));

my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
$code.=<<___ if ($flavour =~ /64/);
.globl	${prefix}_ctr32_encrypt_blocks_unroll12_eor3
.type	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
.align	5
${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-80]!
	stp	d8,d9,[sp,#16]
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
	add	x29,sp,#0

	ldr	$rounds,[$key,#240]
	ldr	$ctr,[$ivp,#12]
#ifdef __AARCH64EB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{$rndping-$rndpang},[$key]	// load key schedule...
	sub	$rounds,$rounds,#4
	cmp	$len,#2
	add	$key_,$key,$roundsx,lsl#4	// pointer to last round key
	sub	$rounds,$rounds,#2
	add	$key_,$key_,#64
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
#ifndef __AARCH64EB__
	rev	$ctr,$ctr
#endif
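	// The counter lives big-endian in lane 3 of each data block: each
	// per-block value is computed in a general register, byte-swapped
	// with rev on little-endian, and inserted with vmov.32.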
	vorr	$dat1,$dat0,$dat0
	add	$tctr1,$ctr,#1
	vorr	$dat2,$dat0,$dat0
	add	$ctr,$ctr,#2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1,$tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail_unroll
	cmp	$len,#6
	rev	$tctr2,$ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
	b.lo	.Loop3x_ctr32_unroll
	cmp	$len,#9
	vorr	$dat3,$dat0,$dat0
	add	$tctr3,$ctr,#1
	vorr	$dat4,$dat0,$dat0
	add	$tctr4,$ctr,#2
	rev	$tctr3,$tctr3
	vorr	$dat5,$dat0,$dat0
	add	$ctr,$ctr,#3
	rev	$tctr4,$tctr4
	vmov.32	${dat3}[3],$tctr3
	rev	$tctr5,$ctr
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5
	sub	$len,$len,#3
	b.lo	.Loop6x_ctr32_unroll

	// push regs to stack when 12 data chunks are interleaved
	stp	x19,x20,[sp,#-16]!
	stp	x21,x22,[sp,#-16]!
	stp	x23,x24,[sp,#-16]!
	stp	$dat8d,$dat9d,[sp,#-32]!
	stp	$dat10d,$dat11d,[sp,#-32]!

	add	$tctr6,$ctr,#1
	add	$tctr7,$ctr,#2
	add	$tctr8,$ctr,#3
	add	$tctr9,$ctr,#4
	add	$tctr10,$ctr,#5
	add	$ctr,$ctr,#6
	vorr	$dat6,$dat0,$dat0
	rev	$tctr6,$tctr6
	vorr	$dat7,$dat0,$dat0
	rev	$tctr7,$tctr7
	vorr	$dat8,$dat0,$dat0
	rev	$tctr8,$tctr8
	vorr	$dat9,$dat0,$dat0
	rev	$tctr9,$tctr9
	vorr	$dat10,$dat0,$dat0
	rev	$tctr10,$tctr10
	vorr	$dat11,$dat0,$dat0
	rev	$tctr11,$ctr

	sub	$len,$len,#6		// bias
	vmov.32	${dat6}[3],$tctr6
	vmov.32	${dat7}[3],$tctr7
	vmov.32	${dat8}[3],$tctr8
	vmov.32	${dat9}[3],$tctr9
	vmov.32	${dat10}[3],$tctr10
	vmov.32	${dat11}[3],$tctr11
	b	.Loop12x_ctr32_unroll
.align	4
.Loop12x_ctr32_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Loop12x_ctr32_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.32	{$rndpang},[$key_],#16
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	$tctr3,$ctr,#4
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	add	$tctr4,$ctr,#5
	add	$tctr5,$ctr,#6
	rev	$tctr0,$tctr0
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	add	$tctr6,$ctr,#7
	add	$tctr7,$ctr,#8
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	add	$tctr8,$ctr,#9
	add	$tctr9,$ctr,#10
	rev	$tctr3,$tctr3
	rev	$tctr4,$tctr4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	add	$tctr10,$ctr,#11
	add	$tctr11,$ctr,#12
	rev	$tctr5,$tctr5
	rev	$tctr6,$tctr6
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	rev	$tctr7,$tctr7
	rev	$tctr8,$tctr8
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	rev	$tctr9,$tctr9
	rev	$tctr10,$tctr10
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	rev	$tctr11,$tctr11
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	vld1.8	{$in0,$in1,$in2,$in3},[$inp],#64
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	aese	$dat6,$rndpang
	aesmc	$dat6,$dat6
	aese	$dat7,$rndpang
	aesmc	$dat7,$dat7
	vld1.8	{$in4,$in5,$in6,$in7},[$inp],#64
	aese	$dat8,$rndpang
	aesmc	$dat8,$dat8
	aese	$dat9,$rndpang
	aesmc	$dat9,$dat9
	aese	$dat10,$rndpang
	aesmc	$dat10,$dat10
	aese	$dat11,$rndpang
	aesmc	$dat11,$dat11
	vld1.8	{$in8,$in9,$in10,$in11},[$inp],#64
	vld1.32	{$rndpang},[$key_],#16
	mov	$key_,$key
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	aese	$dat6,$rndping
	aesmc	$dat6,$dat6
	aese	$dat7,$rndping
	aesmc	$dat7,$dat7
	aese	$dat8,$rndping
	aesmc	$dat8,$dat8
	aese	$dat9,$rndping
	aesmc	$dat9,$dat9
	aese	$dat10,$rndping
	aesmc	$dat10,$dat10
	aese	$dat11,$rndping
	aesmc	$dat11,$dat11
	vld1.32	{$rndping},[$key_],#16	// re-pre-load rndkey[0]
	aese	$dat0,$rndpang
	eor3	$in0,$in0,$rndlast,$dat0
	vorr	$dat0,$ivec,$ivec
	aese	$dat1,$rndpang
	eor3	$in1,$in1,$rndlast,$dat1
	vorr	$dat1,$ivec,$ivec
	aese	$dat2,$rndpang
	eor3	$in2,$in2,$rndlast,$dat2
	vorr	$dat2,$ivec,$ivec
	aese	$dat3,$rndpang
	eor3	$in3,$in3,$rndlast,$dat3
	vorr	$dat3,$ivec,$ivec
	aese	$dat4,$rndpang
	eor3	$in4,$in4,$rndlast,$dat4
	vorr	$dat4,$ivec,$ivec
	aese	$dat5,$rndpang
	eor3	$in5,$in5,$rndlast,$dat5
	vorr	$dat5,$ivec,$ivec
	aese	$dat6,$rndpang
	eor3	$in6,$in6,$rndlast,$dat6
	vorr	$dat6,$ivec,$ivec
	aese	$dat7,$rndpang
	eor3	$in7,$in7,$rndlast,$dat7
	vorr	$dat7,$ivec,$ivec
	aese	$dat8,$rndpang
	eor3	$in8,$in8,$rndlast,$dat8
	vorr	$dat8,$ivec,$ivec
	aese	$dat9,$rndpang
	eor3	$in9,$in9,$rndlast,$dat9
	vorr	$dat9,$ivec,$ivec
	aese	$dat10,$rndpang
	eor3	$in10,$in10,$rndlast,$dat10
	vorr	$dat10,$ivec,$ivec
	aese	$dat11,$rndpang
	eor3	$in11,$in11,$rndlast,$dat11
	vorr	$dat11,$ivec,$ivec
	vld1.32	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
	vmov.32	${dat0}[3],$tctr0
	vmov.32	${dat1}[3],$tctr1
	vmov.32	${dat2}[3],$tctr2
	vmov.32	${dat3}[3],$tctr3
	vst1.8	{$in0,$in1,$in2,$in3},[$out],#64
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5
	vmov.32	${dat6}[3],$tctr6
	vmov.32	${dat7}[3],$tctr7
	vst1.8	{$in4,$in5,$in6,$in7},[$out],#64
	vmov.32	${dat8}[3],$tctr8
	vmov.32	${dat9}[3],$tctr9
	vmov.32	${dat10}[3],$tctr10
	vmov.32	${dat11}[3],$tctr11
	vst1.8	{$in8,$in9,$in10,$in11},[$out],#64

	mov	$cnt,$rounds
	add	$ctr,$ctr,#12
	subs	$len,$len,#12
	b.hs	.Loop12x_ctr32_unroll

	// pop regs from stack when 12 data chunks are interleaved
	ldp	$dat10d,$dat11d,[sp],#32
	ldp	$dat8d,$dat9d,[sp],#32
	ldp	x23,x24,[sp],#16
	ldp	x21,x22,[sp],#16
	ldp	x19,x20,[sp],#16

	add	$len,$len,#12
	cbz	$len,.Lctr32_done_unroll
	sub	$ctr,$ctr,#12

	cmp	$len,#2
	b.ls	.Lctr32_tail_unroll

	cmp	$len,#6
	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
	b.lo	.Loop3x_ctr32_unroll

	sub	$len,$len,#3
	add	$ctr,$ctr,#3
	b.lo	.Loop6x_ctr32_unroll
.align	4
.Loop6x_ctr32_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Loop6x_ctr32_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	$tctr3,$ctr,#4
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	add	$tctr4,$ctr,#5
	add	$tctr5,$ctr,#6
	rev	$tctr0,$tctr0
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	rev	$tctr3,$tctr3
	rev	$tctr4,$tctr4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	rev	$tctr5,$tctr5
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	vld1.8	{$in0,$in1,$in2,$in3},[$inp],#64
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	aese	$dat3,$rndpang
	aesmc	$dat3,$dat3
	vld1.8	{$in4,$in5},[$inp],#32
	aese	$dat4,$rndpang
	aesmc	$dat4,$dat4
	aese	$dat5,$rndpang
	aesmc	$dat5,$dat5
	vld1.32	{$rndpang},[$key_],#16
	mov	$key_,$key
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	aese	$dat3,$rndping
	aesmc	$dat3,$dat3
	aese	$dat4,$rndping
	aesmc	$dat4,$dat4
	aese	$dat5,$rndping
	aesmc	$dat5,$dat5
	vld1.32	{$rndping},[$key_],#16	// re-pre-load rndkey[0]
	aese	$dat0,$rndpang
	eor3	$in0,$in0,$rndlast,$dat0
	aese	$dat1,$rndpang
	eor3	$in1,$in1,$rndlast,$dat1
	aese	$dat2,$rndpang
	eor3	$in2,$in2,$rndlast,$dat2
	aese	$dat3,$rndpang
	eor3	$in3,$in3,$rndlast,$dat3
	aese	$dat4,$rndpang
	eor3	$in4,$in4,$rndlast,$dat4
	aese	$dat5,$rndpang
	eor3	$in5,$in5,$rndlast,$dat5
	vld1.32	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
	vorr	$dat0,$ivec,$ivec
	vorr	$dat1,$ivec,$ivec
	vorr	$dat2,$ivec,$ivec
	vorr	$dat3,$ivec,$ivec
	vorr	$dat4,$ivec,$ivec
	vorr	$dat5,$ivec,$ivec
	vmov.32	${dat0}[3],$tctr0
	vmov.32	${dat1}[3],$tctr1
	vst1.8	{$in0,$in1,$in2,$in3},[$out],#64
	vmov.32	${dat2}[3],$tctr2
	vmov.32	${dat3}[3],$tctr3
	vst1.8	{$in4,$in5},[$out],#32
	vmov.32	${dat4}[3],$tctr4
	vmov.32	${dat5}[3],$tctr5

	cbz	$len,.Lctr32_done_unroll
	mov	$cnt,$rounds
	cmp	$len,#2
	b.ls	.Lctr32_tail_unroll

	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
	b	.Loop3x_ctr32_unroll
.align	4
.Loop3x_ctr32_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	aese	$dat2,$rndpang
	aesmc	$dat2,$dat2
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Loop3x_ctr32_unroll

	aese	$dat0,$rndping
	aesmc	$tmp0,$dat0
	aese	$dat1,$rndping
	aesmc	$tmp1,$dat1
	vld1.8	{$in0,$in1,$in2},[$inp],#48
	vorr	$dat0,$ivec,$ivec
	aese	$dat2,$rndping
	aesmc	$dat2,$dat2
	vld1.32	{$rndping},[$key_],#16
	vorr	$dat1,$ivec,$ivec
	aese	$tmp0,$rndpang
	aesmc	$tmp0,$tmp0
	aese	$tmp1,$rndpang
	aesmc	$tmp1,$tmp1
	aese	$dat2,$rndpang
	aesmc	$tmp2,$dat2
	vld1.32	{$rndpang},[$key_],#16
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
	aese	$tmp0,$rndping
	aesmc	$tmp0,$tmp0
	aese	$tmp1,$rndping
	aesmc	$tmp1,$tmp1
	add	$tctr1,$ctr,#2
	aese	$tmp2,$rndping
	aesmc	$tmp2,$tmp2
	vld1.32	{$rndping},[$key_],#16
	add	$ctr,$ctr,#3
	aese	$tmp0,$rndpang
	aesmc	$tmp0,$tmp0
	aese	$tmp1,$rndpang
	aesmc	$tmp1,$tmp1
	rev	$tctr0,$tctr0
	aese	$tmp2,$rndpang
	aesmc	$tmp2,$tmp2
	vld1.32	{$rndpang},[$key_],#16
	vmov.32	${dat0}[3],$tctr0
	mov	$key_,$key
	rev	$tctr1,$tctr1
	aese	$tmp0,$rndping
	aesmc	$tmp0,$tmp0
	aese	$tmp1,$rndping
	aesmc	$tmp1,$tmp1
	vmov.32	${dat1}[3],$tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,$rndping
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3],$tctr2
	aese	$tmp0,$rndpang
	aese	$tmp1,$rndpang
	aese	$tmp2,$rndpang
	eor3	$in0,$in0,$rndlast,$tmp0
	vld1.32	{$rndping},[$key_],#16	// re-pre-load rndkey[0]
	eor3	$in1,$in1,$rndlast,$tmp1
	mov	$cnt,$rounds
	eor3	$in2,$in2,$rndlast,$tmp2
	vld1.32	{$rndpang},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in0,$in1,$in2},[$out],#48

	cbz	$len,.Lctr32_done_unroll
.Lctr32_tail_unroll:
	cmp	$len,#1
	b.eq	.Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Lctr32_tail_2_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	vld1.32	{$rndpang},[$key_],#16
	vld1.8	{$in0,$in1},[$inp],#32
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	aese	$dat1,$rndpang
	aesmc	$dat1,$dat1
	vld1.32	{$rndpang},[$key_],#16
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat1,$rndping
	aesmc	$dat1,$dat1
	aese	$dat0,$rndpang
	aese	$dat1,$rndpang
	eor3	$in0,$in0,$rndlast,$dat0
	eor3	$in1,$in1,$rndlast,$dat1
	vst1.8	{$in0,$in1},[$out],#32
	b	.Lctr32_done_unroll
.Lctr32_tail_1_unroll:
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	vld1.32	{$rndping},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	vld1.32	{$rndpang},[$key_],#16
	b.gt	.Lctr32_tail_1_unroll

	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	vld1.32	{$rndpang},[$key_],#16
	vld1.8	{$in0},[$inp]
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	vld1.32	{$rndping},[$key_],#16
	aese	$dat0,$rndpang
	aesmc	$dat0,$dat0
	vld1.32	{$rndpang},[$key_],#16
	aese	$dat0,$rndping
	aesmc	$dat0,$dat0
	aese	$dat0,$rndpang
	eor3	$in0,$in0,$rndlast,$dat0
	vst1.8	{$in0},[$out],#16

.Lctr32_done_unroll:
	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	ret
.size	${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# used only in 64-bit mode...
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
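# The round loops below retire two rounds per iteration: q8/q9 are
# cycled through [$key_] with post-incremented loads while the last
# five round keys (q12-q15 and $rndlast) stay resident, so $cnt counts
# down by 2 and the final rounds are peeled off after each loop.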
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___ if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r10,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldr	r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr	$rounds,[$key,#240]

	ldr	$ctr, [$ivp, #12]
#ifdef __ARMEB__
	vld1.8	{$dat0},[$ivp]
#else
	vld1.32	{$dat0},[$ivp]
#endif
	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#4
	mov	$step,#16
	cmp	$len,#2
	add	$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]
	add	$key_,$key,#32
	mov	$cnt,$rounds
	cclr	$step,lo
#ifndef __ARMEB__
	rev	$ctr, $ctr
#endif
___
$code.=<<___ if ($flavour =~ /64/);
	vorr	$dat1,$dat0,$dat0
	add	$tctr1, $ctr, #1
	vorr	$dat2,$dat0,$dat0
	add	$ctr, $ctr, #2
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${dat1}[3],$tctr1
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	sub	$len,$len,#3		// bias
	vmov.32	${dat2}[3],$tctr2
___
$code.=<<___ if ($flavour !~ /64/);
	add	$tctr1, $ctr, #1
	vorr	$ivec,$dat0,$dat0
	rev	$tctr1, $tctr1
	vmov.32	${ivec}[3],$tctr1
	add	$ctr, $ctr, #2
	vorr	$dat1,$ivec,$ivec
	b.ls	.Lctr32_tail
	rev	$tctr2, $ctr
	vmov.32	${ivec}[3],$tctr2
	sub	$len,$len,#3		// bias
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#32
	b.lo	.Loop3x_ctr32

	add	w13,$ctr,#1
	add	w14,$ctr,#2
	vorr	$dat3,$dat0,$dat0
	rev	w13,w13
	vorr	$dat4,$dat0,$dat0
	rev	w14,w14
	vmov.32	${dat3}[3],w13
	sub	$len,$len,#2		// bias
	vmov.32	${dat4}[3],w14
	add	$ctr,$ctr,#2
	b	.Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_ctr32

	mov	$key_,$key
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	add	$tctr0,$ctr,#1
	add	$tctr1,$ctr,#2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	add	$tctr2,$ctr,#3
	add	w13,$ctr,#4
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	add	w14,$ctr,#5
	rev	$tctr0,$tctr0
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	rev	$tctr1,$tctr1
	rev	$tctr2,$tctr2
	aese	$dat4,q12
	aesmc	$dat4,$dat4
	rev	w13,w13
	rev	w14,w14
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q14
	aesmc	$dat4,$dat4
	vld1.8	{$in4},[$inp],#16
	aese	$dat0,q15
	veor	$in0,$in0,$rndlast
	aese	$dat1,q15
	veor	$in1,$in1,$rndlast
	aese	$dat2,q15
	veor	$in2,$in2,$rndlast
	aese	$dat3,q15
	veor	$in3,$in3,$rndlast
	aese	$dat4,q15
	veor	$in4,$in4,$rndlast
	veor	$in0,$in0,$dat0
	vorr	$dat0,$ivec,$ivec
	veor	$in1,$in1,$dat1
	vorr	$dat1,$ivec,$ivec
	veor	$in2,$in2,$dat2
	vorr	$dat2,$ivec,$ivec
	veor	$in3,$in3,$dat3
	vorr	$dat3,$ivec,$ivec
	veor	$in4,$in4,$dat4
	vorr	$dat4,$ivec,$ivec
	vst1.8	{$in0},[$out],#16
	vmov.32	${dat0}[3],$tctr0
	vst1.8	{$in1},[$out],#16
	vmov.32	${dat1}[3],$tctr1
	vst1.8	{$in2},[$out],#16
	vmov.32	${dat2}[3],$tctr2
	vst1.8	{$in3},[$out],#16
	vmov.32	${dat3}[3],w13
	vst1.8	{$in4},[$out],#16
	vmov.32	${dat4}[3],w14

	mov	$cnt,$rounds
	cbz	$len,.Lctr32_done
	add	$ctr,$ctr,#5
	subs	$len,$len,#5
	b.hs	.Loop5x_ctr32

	add	$len,$len,#5
	sub	$ctr,$ctr,#5

	cmp	$len,#2
	mov	$step,#16
	cclr	$step,lo
	b.ls	.Lctr32_tail

	sub	$len,$len,#3		// bias
	add	$ctr,$ctr,#3
___
$code.=<<___;
	b	.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_ctr32

	aese	$dat0,q8
	aesmc	$tmp0,$dat0
	aese	$dat1,q8
	aesmc	$tmp1,$dat1
	vld1.8	{$in0},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.8	{$in1},[$inp],#16
___
$code.=<<___ if ($flavour =~ /64/);
	vorr	$dat1,$ivec,$ivec
___
$code.=<<___ if ($flavour !~ /64/);
	rev	$tctr0,$tctr0
___
$code.=<<___;
	aese	$tmp0,q9
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q9
	aesmc	$tmp1,$tmp1
	vld1.8	{$in2},[$inp],#16
	mov	$key_,$key
	aese	$dat2,q9
	aesmc	$tmp2,$dat2
___
$code.=<<___ if ($flavour =~ /64/);
	vorr	$dat2,$ivec,$ivec
	add	$tctr0,$ctr,#1
___
$code.=<<___;
	aese	$tmp0,q12
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q12
	aesmc	$tmp1,$tmp1
	veor	$in0,$in0,$rndlast
	add	$tctr1,$ctr,#2
	aese	$tmp2,q12
	aesmc	$tmp2,$tmp2
	veor	$in1,$in1,$rndlast
	add	$ctr,$ctr,#3
	aese	$tmp0,q13
	aesmc	$tmp0,$tmp0
	aese	$tmp1,q13
	aesmc	$tmp1,$tmp1
	veor	$in2,$in2,$rndlast
___
$code.=<<___ if ($flavour =~ /64/);
	rev	$tctr0,$tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vmov.32	${dat0}[3], $tctr0
___
$code.=<<___ if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr0
	aese	$tmp2,q13
	aesmc	$tmp2,$tmp2
	vorr	$dat0,$ivec,$ivec
___
$code.=<<___;
	rev	$tctr1,$tctr1
	aese	$tmp0,q14
	aesmc	$tmp0,$tmp0
___
$code.=<<___ if ($flavour !~ /64/);
	vmov.32	${ivec}[3], $tctr1
	rev	$tctr2,$ctr
___
$code.=<<___;
	aese	$tmp1,q14
	aesmc	$tmp1,$tmp1
___
$code.=<<___ if ($flavour =~ /64/);
	vmov.32	${dat1}[3], $tctr1
	rev	$tctr2,$ctr
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vmov.32	${dat2}[3], $tctr2
___
$code.=<<___ if ($flavour !~ /64/);
	vorr	$dat1,$ivec,$ivec
	vmov.32	${ivec}[3], $tctr2
	aese	$tmp2,q14
	aesmc	$tmp2,$tmp2
	vorr	$dat2,$ivec,$ivec
___
$code.=<<___;
	subs	$len,$len,#3
	aese	$tmp0,q15
	aese	$tmp1,q15
	aese	$tmp2,q15
	veor	$in0,$in0,$tmp0
	vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8	{$in0},[$out],#16
	veor	$in1,$in1,$tmp1
	mov	$cnt,$rounds
	vst1.8	{$in1},[$out],#16
	veor	$in2,$in2,$tmp2
	vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$in2},[$out],#16
	b.hs	.Loop3x_ctr32

	adds	$len,$len,#3
	b.eq	.Lctr32_done
	cmp	$len,#1
	mov	$step,#16
	cclr	$step,eq

.Lctr32_tail:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.32	{q9},[$key_],#16
	b.gt	.Lctr32_tail

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	vld1.8	{$in0},[$inp],$step
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	vld1.8	{$in1},[$inp]
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	veor	$in0,$in0,$rndlast
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	veor	$in1,$in1,$rndlast
	aese	$dat0,q15
	aese	$dat1,q15
	cmp	$len,#1
	veor	$in0,$in0,$dat0
	veor	$in1,$in1,$dat1
	vst1.8	{$in0},[$out],#16
	b.eq	.Lctr32_done
	vst1.8	{$in1},[$out]

.Lctr32_done:
___
$code.=<<___ if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r10,pc}
___
$code.=<<___ if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Performance in cycles per byte.
# Measured for AES-XTS with different key sizes; each entry gives the
# value before/after the optimization:
#
#		AES-128-XTS	AES-256-XTS
# Cortex-A57	3.36/1.09	4.02/1.37
# Cortex-A72	3.03/1.02	3.28/1.33
#
# The optimization is implemented with loop unrolling and interleaving.
# The unrolling factor is normally 5; if the input is smaller than
# 5 blocks but not smaller than 3 blocks, 3 is used instead.
# If the input size dsize >= 5*16 bytes, each iteration processes 5
# blocks and the remaining size lsize is reduced by 5*16 per loop.
# If lsize < 5*16 bytes, it is treated as the tail. Note: a remainder
# of 4*16 bytes is handled specially, folded into the 5*16-byte loop
# to improve efficiency.
# There is one special case: if the original input size dsize is
# exactly 16 bytes, it is handled separately for performance as an
# independent code block without LR/FP load and store.
# Encryption processes the (length - tailcnt) bytes as described
# above, then encrypts the composite block as the second-to-last
# cipher block.
# Decryption processes the (length - tailcnt - 1) bytes as described
# above, then decrypts the second-to-last cipher block to obtain the
# last plain block (the tail), and decrypts the composite block as the
# second-to-last plain-text block.
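# Below is a minimal, hypothetical reference model (plain Perl, not used
# by the code generator) of the tweak update that the fmov/extr/and/eor
# sequences in the XTS routines implement: the 128-bit tweak, kept as
# two 64-bit halves, is multiplied by alpha in GF(2^128) modulo
# x^128+x^7+x^2+x+1, i.e. shifted left one bit with the carry folded
# back in as 0x87. Assumes a perl built with 64-bit integers.
sub _xts_next_tweak_model {
	my ($ivl,$ivh) = @_;			# low/high halves of the tweak
	my $carry = ($ivh >> 63) & 1;		# bit shifted out of the top
	$ivh = (($ivh << 1) | (($ivl >> 63) & 1)) & 0xffffffffffffffff;
	$ivl = (($ivl << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
	return ($ivl,$ivh);
}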
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($tmpin)=("v26.16b");
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7			last round key
# q10-q15, q7		last 7 round keys
# q8-q9			preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte

my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___ if ($flavour =~ /64/);
.globl	${prefix}_xts_encrypt
.type	${prefix}_xts_encrypt,%function
.align	5
${prefix}_xts_encrypt:
___
$code.=<<___ if ($flavour =~ /64/);
	AARCH64_VALID_CALL_TARGET
	cmp	$len,#16
	// If the original input size is bigger than 16 bytes, jump to big-size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_enc_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_enc_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aese	$dat0,q20
	aesmc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aese	$dat0,q21
	aesmc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$dat0,$iv0
	vst1.8	{$dat0},[$out]
	b	.Lxts_enc_final_abort

.align	4
.Lxts_enc_big_size:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	// $tailcnt stores the tail length, i.e. length%16.
	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_abort
	csel	$step,xzr,$step,eq

	// First, encrypt the iv with key2, as the first iv of XEX.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16

.Loop_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// Compute the iv for the second block:
	// $ivl - iv (low 64 bits), $ivh - iv (high 64 bits);
	// the five ivs are kept in $iv0,$iv1,$iv2,$iv3,$iv4.
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
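	// The fmov/extr/and/eor sequence above multiplies the tweak by
	// alpha in GF(2^128): shift the 128-bit value left one bit and
	// fold the carry back in as 0x87 (see the reference model before
	// this block).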
	ldr	$rounds0,[$key1,#240]		// next starting point
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key1,#32
	mov	$rounds,$rounds0

	// Encryption
.Lxts_enc:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_enc_tail
	veor	$dat,$dat,$iv0			// before encryption, xor with iv
	veor	$dat2,$dat2,$iv1

	// The iv for the third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// the third block
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for the fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	vld1.8	{$dat3},[$inp],#16
	// The iv for the fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_enc
.align	4
.Loop5x_xts_enc:
	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop5x_xts_enc

	aese	$dat0,q8
	aesmc	$dat0,$dat0
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat3,q8
	aesmc	$dat3,$dat3
	aese	$dat4,q8
	aesmc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_enc_tail4x
	aese	$dat0,q9
	aesmc	$dat0,$dat0
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat3,q9
	aesmc	$dat3,$dat3
	aese	$dat4,q9
	aesmc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1
	aese	$dat0,q10
	aesmc	$dat0,$dat0
	aese	$dat1,q10
	aesmc	$dat1,$dat1
	aese	$dat2,q10
	aesmc	$dat2,$dat2
	aese	$dat3,q10
	aesmc	$dat3,$dat3
	aese	$dat4,q10
	aesmc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_enc_tail4x
	aese	$dat0,q11
	aesmc	$dat0,$dat0
	aese	$dat1,q11
	aesmc	$dat1,$dat1
	aese	$dat2,q11
	aesmc	$dat2,$dat2
	aese	$dat3,q11
	aesmc	$dat3,$dat3
	aese	$dat4,q11
	aesmc	$dat4,$dat4
	aese	$dat0,q12
	aesmc	$dat0,$dat0
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	aese	$dat3,q12
	aesmc	$dat3,$dat3
	aese	$dat4,q12
	aesmc	$dat4,$dat4
	aese	$dat0,q13
	aesmc	$dat0,$dat0
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	aese	$dat3,q13
	aesmc	$dat3,$dat3
	aese	$dat4,q13
	aesmc	$dat4,$dat4
	aese	$dat0,q14
	aesmc	$dat0,$dat0
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	aese	$dat3,q14
	aesmc	$dat3,$dat3
	aese	$dat4,q14
	aesmc	$dat4,$dat4
	veor	$tmp0,$rndlast,$iv0
	aese	$dat0,q15
	// The iv for the first block of the next iteration
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aese	$dat1,q15
	// The iv for the second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aese	$dat2,q15
	// The iv for the third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aese	$dat3,q15
	// The iv for the fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr#31
	eor	$ivl,$tmpmx,$ivl,lsl#1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aese	$dat4,q15
	// The iv for the fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_enc_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_enc

	// If 4 blocks are left, borrow the 5-block processing path.
	cmn	$len,#0x10
	b.ne	.Loop5x_enc_after
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_enc

.Loop5x_enc_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_enc_done
	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_enc_tail

	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	$inp,$inp,#16
	veor	$tmp1,$dat1,$tmp1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32
	b	.Lxts_enc_done
  3171. .align 4
  3172. .Lxts_outer_enc_tail:
  3173. aese $dat0,q8
  3174. aesmc $dat0,$dat0
  3175. aese $dat1,q8
  3176. aesmc $dat1,$dat1
  3177. aese $dat2,q8
  3178. aesmc $dat2,$dat2
  3179. vld1.32 {q8},[$key_],#16
  3180. subs $rounds,$rounds,#2
  3181. aese $dat0,q9
  3182. aesmc $dat0,$dat0
  3183. aese $dat1,q9
  3184. aesmc $dat1,$dat1
  3185. aese $dat2,q9
  3186. aesmc $dat2,$dat2
  3187. vld1.32 {q9},[$key_],#16
  3188. b.gt .Lxts_outer_enc_tail
  3189. aese $dat0,q8
  3190. aesmc $dat0,$dat0
  3191. aese $dat1,q8
  3192. aesmc $dat1,$dat1
  3193. aese $dat2,q8
  3194. aesmc $dat2,$dat2
  3195. veor $tmp0,$iv0,$rndlast
  3196. subs $len,$len,#0x30
  3197. // The iv for first block
  3198. fmov $ivl,$ivd20
  3199. fmov $ivh,$ivd21
  3200. //mov $constnum,#0x87
  3201. extr $midnumx,$ivh,$ivh,#32
  3202. extr $ivh,$ivh,$ivl,#63
  3203. and $tmpmw,$constnum,$midnum,asr#31
  3204. eor $ivl,$tmpmx,$ivl,lsl#1
  3205. fmov $ivd00,$ivl
  3206. fmov $ivd01,$ivh
  3207. veor $tmp1,$iv1,$rndlast
  3208. csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
  3209. aese $dat0,q9
  3210. aesmc $dat0,$dat0
  3211. aese $dat1,q9
  3212. aesmc $dat1,$dat1
  3213. aese $dat2,q9
  3214. aesmc $dat2,$dat2
  3215. veor $tmp2,$iv2,$rndlast
  3216. add $xoffset,$xoffset,#0x20
  3217. add $inp,$inp,$xoffset
  3218. mov $key_,$key1
  3219. aese $dat0,q12
  3220. aesmc $dat0,$dat0
  3221. aese $dat1,q12
  3222. aesmc $dat1,$dat1
  3223. aese $dat2,q12
  3224. aesmc $dat2,$dat2
  3225. aese $dat0,q13
  3226. aesmc $dat0,$dat0
  3227. aese $dat1,q13
  3228. aesmc $dat1,$dat1
  3229. aese $dat2,q13
  3230. aesmc $dat2,$dat2
  3231. aese $dat0,q14
  3232. aesmc $dat0,$dat0
  3233. aese $dat1,q14
  3234. aesmc $dat1,$dat1
  3235. aese $dat2,q14
  3236. aesmc $dat2,$dat2
  3237. aese $dat0,q15
  3238. aese $dat1,q15
  3239. aese $dat2,q15
  3240. vld1.8 {$in2},[$inp],#16
  3241. add $rounds,$rounds0,#2
  3242. vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
  3243. veor $tmp0,$tmp0,$dat0
  3244. veor $tmp1,$tmp1,$dat1
  3245. veor $dat2,$dat2,$tmp2
  3246. vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
  3247. vst1.8 {$tmp0},[$out],#16
  3248. vst1.8 {$tmp1},[$out],#16
  3249. vst1.8 {$dat2},[$out],#16
  3250. cmn $len,#0x30
  3251. b.eq .Lxts_enc_done
.Lxts_encxor_one:
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop

.Lxts_inner_enc_tail:
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_enc_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_enc_tail_loop:
	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_enc_tail_loop

	aese	$dat1,q8
	aesmc	$dat1,$dat1
	aese	$dat2,q8
	aesmc	$dat2,$dat2
	aese	$dat1,q9
	aesmc	$dat1,$dat1
	aese	$dat2,q9
	aesmc	$dat2,$dat2
	aese	$dat1,q12
	aesmc	$dat1,$dat1
	aese	$dat2,q12
	aesmc	$dat2,$dat2
	cmn	$len,#0x20
	aese	$dat1,q13
	aesmc	$dat1,$dat1
	aese	$dat2,q13
	aesmc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aese	$dat1,q14
	aesmc	$dat1,$dat1
	aese	$dat2,q14
	aesmc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aese	$dat1,q15
	aese	$dat2,q15
	b.eq	.Lxts_enc_one
	veor	$tmp1,$tmp1,$dat1
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv1,$iv1
	vst1.8	{$tmp2},[$out],#16
	fmov	$ivl,$ivd10
	fmov	$ivh,$ivd11
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done

.Lxts_enc_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv0,$iv0
	vst1.8	{$tmp1},[$out],#16
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with ciphertext stealing.
	tst	$tailcnt,#0xf
	b.eq	.Lxts_abort
	mov	$tmpinp,$inp
	mov	$tmpoutp,$out
	sub	$out,$out,#16
.composite_enc_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_enc_loop
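	// An illustrative sketch of the swap above in Perl-style pseudocode
	// (hypothetical @outbuf/@inbuf byte arrays; $out points at the last
	// full ciphertext block, $tmpoutp at the partial-block output):
	//   for (my $i = $tailcnt - 1; $i >= 0; $i--) {
	//       $outbuf[16 + $i] = $outbuf[$i];  # ciphertext tail becomes the partial block
	//       $outbuf[$i]      = $inbuf[$i];   # steal the plaintext tail into the
	//   }                                    # composite block encrypted below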
.Lxts_enc_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0
	// Encrypt the composite block to get the second-to-last ciphertext block.
	ldr	$rounds,[$key1,#240]		// load key schedule...
	vld1.32	{$dat},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
.Loop_final_enc:
	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16
	subs	$rounds,$rounds,#2
	aese	$tmpin,$dat1
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16
	b.gt	.Loop_final_enc

	aese	$tmpin,$dat0
	aesmc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aese	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
___
}}}
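
# For reference: the scalar extr/and/eor sequences emitted above implement the
# XTS tweak update, i.e. multiplication by x in GF(2^128) modulo
# x^128 + x^7 + x^2 + x + 1. A minimal Perl sketch of the same update
# (illustrative only, assuming a 64-bit perl; the generator never calls it):
sub _xts_tweak_times_x_ref {
	my ($ivl, $ivh) = @_;		# 128-bit tweak as low/high 64-bit halves
	my $c = ($ivh >> 63) & 1;	# the bit shifted out at the top
	$ivh = (($ivh << 1) | ($ivl >> 63)) & 0xffffffffffffffff;
	$ivl = (($ivl << 1) ^ ($c ? 0x87 : 0)) & 0xffffffffffffffff;
	return ($ivl, $ivh);		# reduce by 0x87 iff the top bit fell off
}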
{{{
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

# q7		last round key
# q10-q15, q7	last 7 round keys
# q8-q9		preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9	preloaded round keys except last 7 keys for only 16 byte
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
my ($dat3,$in3,$tmp3);	# used only in 64-bit mode
my ($dat4,$in4,$tmp4);
if ($flavour =~ /64/) {
    ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
}

$code.=<<___ if ($flavour =~ /64/);
.globl	${prefix}_xts_decrypt
.type	${prefix}_xts_decrypt,%function
.align	5
${prefix}_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
___
$code.=<<___ if ($flavour =~ /64/);
	cmp	$len,#16
	// If the original input size is bigger than 16, jump to the big-size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16
.Loop_dec_small_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_small_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	vld1.8	{$dat0},[$inp]
	veor	$dat0,$iv0,$dat0

	ldr	$rounds,[$key1,#240]
	vld1.32	{q20-q21},[$key1],#32		// load key schedule...

	aesd	$dat0,q20
	aesimc	$dat0,$dat0
	vld1.32	{q8-q9},[$key1],#32		// load key schedule...
	aesd	$dat0,q21
	aesimc	$dat0,$dat0
	subs	$rounds,$rounds,#10		// bias
	b.eq	.Lxts_128_dec
.Lxts_dec_round_loop:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	vld1.32	{q8},[$key1],#16		// load key schedule...
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q9},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2		// bias
	b.gt	.Lxts_dec_round_loop
.Lxts_128_dec:
	vld1.32	{q10-q11},[$key1],#32		// load key schedule...
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	vld1.32	{q12-q13},[$key1],#32		// load key schedule...
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	vld1.32	{q14-q15},[$key1],#32		// load key schedule...
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	vld1.32	{$rndlast},[$key1]
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat0,q15
	veor	$dat0,$dat0,$rndlast
	veor	$dat0,$iv0,$dat0
	vst1.8	{$dat0},[$out]
	b	.Lxts_dec_final_abort
.Lxts_dec_big_size:
___
$code.=<<___ if ($flavour =~ /64/);
	stp	$constnumx,$tmpinp,[sp,#-64]!
	stp	$tailcnt,$midnumx,[sp,#48]
	stp	$ivd10,$ivd20,[sp,#32]
	stp	$ivd30,$ivd40,[sp,#16]

	and	$tailcnt,$len,#0xf
	and	$len,$len,#-16
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv.
	ldr	$rounds,[$key2,#240]
	vld1.32	{$dat},[$key2],#16
	vld1.8	{$iv0},[$ivp]
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key2],#16
.Loop_dec_iv_enc:
	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2],#16
	subs	$rounds,$rounds,#2
	aese	$iv0,$dat1
	aesmc	$iv0,$iv0
	vld1.32	{$dat1},[$key2],#16
	b.gt	.Loop_dec_iv_enc

	aese	$iv0,$dat
	aesmc	$iv0,$iv0
	vld1.32	{$dat},[$key2]
	aese	$iv0,$dat1
	veor	$iv0,$iv0,$dat

	// The iv for the second block:
	// $ivl - iv(low), $ivh - iv(high);
	// the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4.
	fmov	$ivl,$ivd00
	fmov	$ivh,$ivd01
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
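	// A worked example of the update above (illustrative): with
	// $ivh:$ivl = 0x8000000000000000:0x0000000000000001 the top bit is
	// set, so the mask selects 0x87 and the next tweak becomes
	// $ivh:$ivl = 0x0000000000000000:0x0000000000000085 (0x02 ^ 0x87).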
	ldr	$rounds0,[$key1,#240]		// load rounds number

	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh

	vld1.32	{q8-q9},[$key1]			// load key schedule...
	sub	$rounds0,$rounds0,#6
	add	$key_,$key1,$ivp,lsl#4		// pointer to last 7 round keys
	sub	$rounds0,$rounds0,#2
	vld1.32	{q10-q11},[$key_],#32		// load key schedule...
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh

	add	$key_,$key1,#32
	mov	$rounds,$rounds0
	b	.Lxts_dec

	// Decryption
.align	5
.Lxts_dec:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_begin
	subs	$len,$len,#16
	csel	$step,xzr,$step,eq
	vld1.8	{$dat},[$inp],#16
	b.lo	.Lxts_done
	sub	$inp,$inp,#16
.Lxts_dec_begin:
	vld1.8	{$dat},[$inp],$step
	subs	$len,$len,#32			// bias
	add	$rounds,$rounds0,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in3,$dat,$dat
	vld1.8	{$dat2},[$inp],#16
	vorr	$in2,$dat2,$dat2
	vorr	$in4,$dat2,$dat2
	b.lo	.Lxts_inner_dec_tail
	veor	$dat,$dat,$iv0			// before decrypt, xor with iv
	veor	$dat2,$dat2,$iv1

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	veor	$in2,$dat2,$iv2			// xor the third block with the third iv
	veor	$dat2,$dat2,$iv2
	cmp	$len,#32
	b.lo	.Lxts_outer_dec_tail

	vld1.8	{$dat3},[$inp],#16

	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$dat4},[$inp],#16
	veor	$dat3,$dat3,$iv3		// the fourth block
	veor	$dat4,$dat4,$iv4
	sub	$len,$len,#32			// bias
	mov	$rounds,$rounds0
	b	.Loop5x_xts_dec
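	// The loop below interleaves five independent blocks per round so that
	// the aesd/aesimc result latency of one block is hidden behind the
	// other four (a general scheduling note, not a functional requirement).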
.align	4
.Loop5x_xts_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	vld1.32	{q8},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	vld1.32	{q9},[$key_],#16		// load key schedule...
	b.gt	.Loop5x_xts_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat3,q8
	aesimc	$dat3,$dat3
	aesd	$dat4,q8
	aesimc	$dat4,$dat4
	subs	$len,$len,#0x50			// because .Lxts_dec_tail4x

	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat3,q9
	aesimc	$dat3,$dat3
	aesd	$dat4,q9
	aesimc	$dat4,$dat4
	csel	$xoffset,xzr,$len,gt		// borrow x6, w6, "gt" is not typo
	mov	$key_,$key1
	aesd	$dat0,q10
	aesimc	$dat0,$dat0
	aesd	$dat1,q10
	aesimc	$dat1,$dat1
	aesd	$dat2,q10
	aesimc	$dat2,$dat2
	aesd	$dat3,q10
	aesimc	$dat3,$dat3
	aesd	$dat4,q10
	aesimc	$dat4,$dat4
	add	$inp,$inp,$xoffset		// x0 is adjusted in such a way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with the last "words"
	add	$xoffset,$len,#0x60		// because .Lxts_dec_tail4x

	aesd	$dat0,q11
	aesimc	$dat0,$dat0
	aesd	$dat1,q11
	aesimc	$dat1,$dat1
	aesd	$dat2,q11
	aesimc	$dat2,$dat2
	aesd	$dat3,q11
	aesimc	$dat3,$dat3
	aesd	$dat4,q11
	aesimc	$dat4,$dat4

	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat3,q12
	aesimc	$dat3,$dat3
	aesd	$dat4,q12
	aesimc	$dat4,$dat4

	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat3,q13
	aesimc	$dat3,$dat3
	aesd	$dat4,q13
	aesimc	$dat4,$dat4

	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	aesd	$dat3,q14
	aesimc	$dat3,$dat3
	aesd	$dat4,q14
	aesimc	$dat4,$dat4

	veor	$tmp0,$rndlast,$iv0
	aesd	$dat0,q15
	// The iv for first block of next iteration.
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$rndlast,$iv1
	vld1.8	{$in0},[$inp],#16
	aesd	$dat1,q15
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	veor	$tmp2,$rndlast,$iv2
	vld1.8	{$in1},[$inp],#16
	aesd	$dat2,q15
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	veor	$tmp3,$rndlast,$iv3
	vld1.8	{$in2},[$inp],#16
	aesd	$dat3,q15
	// The iv for fourth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd30,$ivl
	fmov	$ivd31,$ivh
	veor	$tmp4,$rndlast,$iv4
	vld1.8	{$in3},[$inp],#16
	aesd	$dat4,q15
	// The iv for fifth block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd40,$ivl
	fmov	$ivd41,$ivh

	vld1.8	{$in4},[$inp],#16
	cbz	$xoffset,.Lxts_dec_tail4x
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	veor	$tmp0,$tmp0,$dat0
	veor	$dat0,$in0,$iv0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat1,$in1,$iv1
	veor	$tmp2,$tmp2,$dat2
	veor	$dat2,$in2,$iv2
	veor	$tmp3,$tmp3,$dat3
	veor	$dat3,$in3,$iv3
	veor	$tmp4,$tmp4,$dat4
	vst1.8	{$tmp0},[$out],#16
	veor	$dat4,$in4,$iv4
	vst1.8	{$tmp1},[$out],#16
	mov	$rounds,$rounds0
	vst1.8	{$tmp2},[$out],#16
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp3},[$out],#16
	vst1.8	{$tmp4},[$out],#16
	b.hs	.Loop5x_xts_dec
	cmn	$len,#0x10
	b.ne	.Loop5x_dec_after
	// If $len (x2) equals -0x10, 4 blocks are left.
	// After special processing, reuse the five-block path again.
	// It will use the following ivs: $iv0,$iv0,$iv1,$iv2,$iv3.
	vorr	$iv4,$iv3,$iv3
	vorr	$iv3,$iv2,$iv2
	vorr	$iv2,$iv1,$iv1
	vorr	$iv1,$iv0,$iv0
	fmov	$ivl,$ivd40
	fmov	$ivh,$ivd41
	veor	$dat0,$iv0,$in0
	veor	$dat1,$iv1,$in1
	veor	$dat2,$in2,$iv2
	veor	$dat3,$in3,$iv3
	veor	$dat4,$in4,$iv4
	b.eq	.Loop5x_xts_dec

.Loop5x_dec_after:
	add	$len,$len,#0x50
	cbz	$len,.Lxts_done
	add	$rounds,$rounds0,#2
	subs	$len,$len,#0x30
	b.lo	.Lxts_inner_dec_tail
	veor	$dat0,$iv0,$in2
	veor	$dat1,$iv1,$in3
	veor	$dat2,$in4,$iv2
	b	.Lxts_outer_dec_tail
.align	4
.Lxts_dec_tail4x:
	add	$inp,$inp,#16
	tst	$tailcnt,#0xf
	veor	$tmp1,$dat1,$tmp0
	vst1.8	{$tmp1},[$out],#16
	veor	$tmp2,$dat2,$tmp2
	vst1.8	{$tmp2},[$out],#16
	veor	$tmp3,$dat3,$tmp3
	veor	$tmp4,$dat4,$tmp4
	vst1.8	{$tmp3-$tmp4},[$out],#32

	b.eq	.Lxts_dec_abort
	vld1.8	{$dat0},[$inp],#16
	b	.Lxts_done

.align	4
.Lxts_outer_dec_tail:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_outer_dec_tail

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	veor	$tmp0,$iv0,$rndlast
	subs	$len,$len,#0x30
	// The iv for first block
	fmov	$ivl,$ivd20
	fmov	$ivh,$ivd21
	mov	$constnum,#0x87
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd00,$ivl
	fmov	$ivd01,$ivh
	veor	$tmp1,$iv1,$rndlast
	csel	$xoffset,$len,$xoffset,lo	// x6, w6, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv2,$rndlast
	// The iv for second block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd10,$ivl
	fmov	$ivd11,$ivh
	add	$xoffset,$xoffset,#0x20
	add	$inp,$inp,$xoffset		// $inp is adjusted to the last data
	mov	$key_,$key1
	// The iv for third block
	extr	$midnumx,$ivh,$ivh,#32
	extr	$ivh,$ivh,$ivl,#63
	and	$tmpmw,$constnum,$midnum,asr #31
	eor	$ivl,$tmpmx,$ivl,lsl #1
	fmov	$ivd20,$ivl
	fmov	$ivd21,$ivh
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	vld1.32	{q8},[$key_],#16		// re-pre-load rndkey[0]
	add	$rounds,$rounds0,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	vld1.32	{q9},[$key_],#16		// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16

	cmn	$len,#0x30
	add	$len,$len,#0x30
	b.eq	.Lxts_done
	sub	$len,$len,#0x30
	vorr	$in3,$in1,$in1
	vorr	$in4,$in2,$in2
	nop
.Lxts_inner_dec_tail:
	// $len == -0x10 means two blocks left.
	cmn	$len,#0x10
	veor	$dat1,$in3,$iv0
	veor	$dat2,$in4,$iv1
	b.eq	.Lxts_dec_tail_loop
	veor	$dat2,$in4,$iv0
.Lxts_dec_tail_loop:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$rounds,$rounds,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lxts_dec_tail_loop

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	veor	$tmp1,$iv0,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	veor	$tmp2,$iv1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lxts_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	vorr	$iv0,$iv2,$iv2
	vorr	$iv1,$iv3,$iv3
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	add	$len,$len,#16
	b	.Lxts_done

.Lxts_dec_one:
	veor	$tmp1,$tmp1,$dat2
	vorr	$iv0,$iv1,$iv1
	vorr	$iv1,$iv2,$iv2
	vst1.8	{$tmp1},[$out],#16
	add	$len,$len,#32
.Lxts_done:
	tst	$tailcnt,#0xf
	b.eq	.Lxts_dec_abort
	// Process the last two blocks with ciphertext stealing.
	mov	x7,x3				// $key_ = $key1
	cbnz	x2,.Lxts_dec_1st_done
	vld1.8	{$dat0},[$inp],#16

	// Decrypt the second-to-last block to get the last plaintext block.
.Lxts_dec_1st_done:
	eor	$tmpin,$dat0,$iv1
	ldr	$rounds,[$key1,#240]
	vld1.32	{$dat0},[$key1],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key1],#16
.Loop_final_2nd_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key1],#16		// load key schedule...
	b.gt	.Loop_final_2nd_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key1]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv1
	vst1.8	{$tmpin},[$out]

	mov	$tmpinp,$inp
	add	$tmpoutp,$out,#16
	// Swap the $tailcnt-byte partial ciphertext block into the buffer just
	// decrypted: this emits the final partial plaintext and rebuilds the
	// composite block that is decrypted below.
.composite_dec_loop:
	subs	$tailcnt,$tailcnt,#1
	ldrb	$l2outp,[$out,$tailcnt]
	ldrb	$loutp,[$tmpinp,$tailcnt]
	strb	$l2outp,[$tmpoutp,$tailcnt]
	strb	$loutp,[$out,$tailcnt]
	b.gt	.composite_dec_loop
.Lxts_dec_load_done:
	vld1.8	{$tmpin},[$out]
	veor	$tmpin,$tmpin,$iv0

	// Decrypt the composite block to get the second-to-last plaintext block.
	ldr	$rounds,[$key_,#240]
	vld1.32	{$dat},[$key_],#16
	sub	$rounds,$rounds,#2
	vld1.32	{$dat1},[$key_],#16
.Loop_final_dec:
	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_],#16		// load key schedule...
	subs	$rounds,$rounds,#2
	aesd	$tmpin,$dat1
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat1},[$key_],#16		// load key schedule...
	b.gt	.Loop_final_dec

	aesd	$tmpin,$dat0
	aesimc	$tmpin,$tmpin
	vld1.32	{$dat0},[$key_]
	aesd	$tmpin,$dat1
	veor	$tmpin,$tmpin,$dat0
	veor	$tmpin,$tmpin,$iv0
	vst1.8	{$tmpin},[$out]

.Lxts_dec_abort:
	ldp	$tailcnt,$midnumx,[sp,#48]
	ldp	$ivd10,$ivd20,[sp,#32]
	ldp	$ivd30,$ivd40,[sp,#16]
	ldp	$constnumx,$tmpinp,[sp],#64
.Lxts_dec_final_abort:
	ret
.size	${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
___
}
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=> 0x4e285800,	"aese"	=> 0x4e284800,
	"aesimc"=> 0x4e287800,	"aesmc"	=> 0x4e286800,
	"eor3"	=> 0xce000000, );

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
		$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
		$mnemonic,$arg;
    }
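
    # For instance (a hedged example; the generator only calls unsha3 via the
    # eor3 substitution below): unsha3("eor3","v1.16b,v2.16b,v3.16b,v4.16b")
    # computes 0xce000000|1|(2<<5)|(3<<16)|(4<<10) and returns
    # ".inst 0xce031041 //eor3 v1.16b,v2.16b,v3.16b,v4.16b".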
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo;	# old->new registers
	s/\bq_([0-9]+)\b/"q".$1/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;
	s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	# Switch preprocessor checks to aarch64 versions.
	s/__ARME([BL])__/__AARCH64E$1__/go;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=> 0xf3b00340,	"aese"	=> 0xf3b00300,
	"aesimc"=> 0xf3b003c0,	"aesmc"	=> 0xf3b00380 );

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian. The
	    # correct solution is to use the .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };
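
    # For instance (a hedged example): unaes("aese","q0,q1") computes
    # $word = 0xf3b00300|(1<<1) = 0xf3b00302 and, emitted byte-by-byte
    # little-endian, returns "INST(0x02,0x03,0xb0,0xf3)\t@ aese q0,q1".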
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
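
    # For instance (hedged examples): unvdup32("q0,q1[2]") returns
    # "vdup.32 q0,d3[0]" (lane 2 of q1 is lane 0 of d3), and
    # unvmov32("q1[3],r0") returns "vmov.32 d3[1],r0".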
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo		or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo	or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo	or
	s/^(\s+)b\./$1b/o			or
	s/^(\s+)ret/$1bx\tlr/o;

	if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
	    print "	it	$2\n";
	}

	print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!";