123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243 |
- #! /usr/bin/env perl
- # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- ###################################################################
- ### AES-128 [originally in CTR mode] ###
- ### bitsliced implementation for Intel Core 2 processors ###
- ### requires support of SSE extensions up to SSSE3 ###
- ### Author: Emilia Käsper and Peter Schwabe ###
- ### Date: 2009-03-19 ###
- ### Public domain ###
- ### ###
- ### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
- ### further information. ###
- ###################################################################
- #
- # September 2011.
- #
- # Started as transliteration to "perlasm" the original code has
- # undergone following changes:
- #
- # - code was made position-independent;
- # - rounds were folded into a loop resulting in >5x size reduction
- # from 12.5KB to 2.2KB;
- # - above was possible thanks to mixcolumns() modification that
- # allowed to feed its output back to aesenc[last], this was
- # achieved at cost of two additional inter-registers moves;
- # - some instruction reordering and interleaving;
- # - this module doesn't implement key setup subroutine, instead it
- # relies on conversion of "conventional" key schedule as returned
- # by AES_set_encrypt_key (see discussion below);
- # - first and last round keys are treated differently, which allowed
- # to skip one shiftrows(), reduce bit-sliced key schedule and
- # speed-up conversion by 22%;
- # - support for 192- and 256-bit keys was added;
- #
- # Resulting performance in CPU cycles spent to encrypt one byte out
- # of 4096-byte buffer with 128-bit key is:
- #
- # Emilia's this(*) difference
- #
- # Core 2 9.30 8.69 +7%
- # Nehalem(**) 7.63 6.88 +11%
- # Atom 17.1 16.4 +4%
- # Silvermont - 12.9
- # Goldmont - 8.85
- #
- # (*) Comparison is not completely fair, because "this" is ECB,
- # i.e. no extra processing such as counter values calculation
- # and xor-ing input as in Emilia's CTR implementation is
- # performed. However, the CTR calculations stand for not more
- # than 1% of total time, so comparison is *rather* fair.
- #
- # (**) Results were collected on Westmere, which is considered to
- # be equivalent to Nehalem for this code.
- #
- # As for key schedule conversion subroutine. Interface to OpenSSL
- # relies on per-invocation on-the-fly conversion. This naturally
- # has impact on performance, especially for short inputs. Conversion
- # time in CPU cycles and its ratio to CPU cycles spent in 8x block
- # function is:
- #
- # conversion conversion/8x block
- # Core 2 240 0.22
- # Nehalem 180 0.20
- # Atom 430 0.20
- #
- # The ratio values mean that 128-byte blocks will be processed
- # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
- # etc. Then keep in mind that input sizes not divisible by 128 are
- # *effectively* slower, especially shortest ones, e.g. consecutive
- # 144-byte blocks are processed 44% slower than one would expect,
- # 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
- # it's still faster than ["hyper-threading-safe" code path in]
- # aes-x86_64.pl on all lengths above 64 bytes...
- #
- # October 2011.
- #
- # Add decryption procedure. Performance in CPU cycles spent to decrypt
- # one byte out of 4096-byte buffer with 128-bit key is:
- #
- # Core 2 9.98
- # Nehalem 7.80
- # Atom 17.9
- # Silvermont 14.0
- # Goldmont 10.2
- #
- # November 2011.
- #
- # Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
- # suboptimal, but XTS is meant to be used with larger blocks...
- #
- # <appro@openssl.org>
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
- or die "can't call $xlate: $!";
- *STDOUT=*OUT;
- my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
- my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
- my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
- {
- my ($key,$rounds,$const)=("%rax","%r10d","%r11");
- sub Sbox {
- # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
- # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
- my @b=@_[0..7];
- my @t=@_[8..11];
- my @s=@_[12..15];
- &InBasisChange (@b);
- &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
- &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
- }
- sub InBasisChange {
- # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
- # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
- my @b=@_[0..7];
- $code.=<<___;
- pxor @b[6], @b[5]
- pxor @b[1], @b[2]
- pxor @b[0], @b[3]
- pxor @b[2], @b[6]
- pxor @b[0], @b[5]
- pxor @b[3], @b[6]
- pxor @b[7], @b[3]
- pxor @b[5], @b[7]
- pxor @b[4], @b[3]
- pxor @b[5], @b[4]
- pxor @b[1], @b[3]
- pxor @b[7], @b[2]
- pxor @b[5], @b[1]
- ___
- }
- sub OutBasisChange {
- # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
- # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
- my @b=@_[0..7];
- $code.=<<___;
- pxor @b[6], @b[0]
- pxor @b[4], @b[1]
- pxor @b[0], @b[2]
- pxor @b[6], @b[4]
- pxor @b[1], @b[6]
- pxor @b[5], @b[1]
- pxor @b[3], @b[5]
- pxor @b[7], @b[3]
- pxor @b[5], @b[7]
- pxor @b[5], @b[2]
- pxor @b[7], @b[4]
- ___
- }
- sub InvSbox {
- # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
- # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
- my @b=@_[0..7];
- my @t=@_[8..11];
- my @s=@_[12..15];
- &InvInBasisChange (@b);
- &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
- &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
- }
- sub InvInBasisChange { # OutBasisChange in reverse
- my @b=@_[5,1,2,6,3,7,0,4];
- $code.=<<___
- pxor @b[7], @b[4]
- pxor @b[5], @b[7]
- pxor @b[5], @b[2]
- pxor @b[7], @b[3]
- pxor @b[3], @b[5]
- pxor @b[5], @b[1]
- pxor @b[1], @b[6]
- pxor @b[0], @b[2]
- pxor @b[6], @b[4]
- pxor @b[6], @b[0]
- pxor @b[4], @b[1]
- ___
- }
- sub InvOutBasisChange { # InBasisChange in reverse
- my @b=@_[2,5,7,3,6,1,0,4];
- $code.=<<___;
- pxor @b[5], @b[1]
- pxor @b[7], @b[2]
- pxor @b[1], @b[3]
- pxor @b[5], @b[4]
- pxor @b[5], @b[7]
- pxor @b[4], @b[3]
- pxor @b[0], @b[5]
- pxor @b[7], @b[3]
- pxor @b[2], @b[6]
- pxor @b[1], @b[2]
- pxor @b[3], @b[6]
- pxor @b[0], @b[3]
- pxor @b[6], @b[5]
- ___
- }
- sub Mul_GF4 {
- #;*************************************************************
- #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
- #;*************************************************************
- my ($x0,$x1,$y0,$y1,$t0)=@_;
- $code.=<<___;
- movdqa $y0, $t0
- pxor $y1, $t0
- pand $x0, $t0
- pxor $x1, $x0
- pand $y0, $x1
- pand $y1, $x0
- pxor $x1, $x0
- pxor $t0, $x1
- ___
- }
- sub Mul_GF4_N { # not used, see next subroutine
- # multiply and scale by N
- my ($x0,$x1,$y0,$y1,$t0)=@_;
- $code.=<<___;
- movdqa $y0, $t0
- pxor $y1, $t0
- pand $x0, $t0
- pxor $x1, $x0
- pand $y0, $x1
- pand $y1, $x0
- pxor $x0, $x1
- pxor $t0, $x0
- ___
- }
- sub Mul_GF4_N_GF4 {
- # interleaved Mul_GF4_N and Mul_GF4
- my ($x0,$x1,$y0,$y1,$t0,
- $x2,$x3,$y2,$y3,$t1)=@_;
- $code.=<<___;
- movdqa $y0, $t0
- movdqa $y2, $t1
- pxor $y1, $t0
- pxor $y3, $t1
- pand $x0, $t0
- pand $x2, $t1
- pxor $x1, $x0
- pxor $x3, $x2
- pand $y0, $x1
- pand $y2, $x3
- pand $y1, $x0
- pand $y3, $x2
- pxor $x0, $x1
- pxor $x3, $x2
- pxor $t0, $x0
- pxor $t1, $x3
- ___
- }
- sub Mul_GF16_2 {
- my @x=@_[0..7];
- my @y=@_[8..11];
- my @t=@_[12..15];
- $code.=<<___;
- movdqa @x[0], @t[0]
- movdqa @x[1], @t[1]
- ___
- &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
- $code.=<<___;
- pxor @x[2], @t[0]
- pxor @x[3], @t[1]
- pxor @y[2], @y[0]
- pxor @y[3], @y[1]
- ___
- Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
- @x[2], @x[3], @y[2], @y[3], @t[2]);
- $code.=<<___;
- pxor @t[0], @x[0]
- pxor @t[0], @x[2]
- pxor @t[1], @x[1]
- pxor @t[1], @x[3]
- movdqa @x[4], @t[0]
- movdqa @x[5], @t[1]
- pxor @x[6], @t[0]
- pxor @x[7], @t[1]
- ___
- &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
- @x[6], @x[7], @y[2], @y[3], @t[2]);
- $code.=<<___;
- pxor @y[2], @y[0]
- pxor @y[3], @y[1]
- ___
- &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
- $code.=<<___;
- pxor @t[0], @x[4]
- pxor @t[0], @x[6]
- pxor @t[1], @x[5]
- pxor @t[1], @x[7]
- ___
- }
- sub Inv_GF256 {
- #;********************************************************************
- #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
- #;********************************************************************
- my @x=@_[0..7];
- my @t=@_[8..11];
- my @s=@_[12..15];
- # direct optimizations from hardware
- $code.=<<___;
- movdqa @x[4], @t[3]
- movdqa @x[5], @t[2]
- movdqa @x[1], @t[1]
- movdqa @x[7], @s[1]
- movdqa @x[0], @s[0]
- pxor @x[6], @t[3]
- pxor @x[7], @t[2]
- pxor @x[3], @t[1]
- movdqa @t[3], @s[2]
- pxor @x[6], @s[1]
- movdqa @t[2], @t[0]
- pxor @x[2], @s[0]
- movdqa @t[3], @s[3]
- por @t[1], @t[2]
- por @s[0], @t[3]
- pxor @t[0], @s[3]
- pand @s[0], @s[2]
- pxor @t[1], @s[0]
- pand @t[1], @t[0]
- pand @s[0], @s[3]
- movdqa @x[3], @s[0]
- pxor @x[2], @s[0]
- pand @s[0], @s[1]
- pxor @s[1], @t[3]
- pxor @s[1], @t[2]
- movdqa @x[4], @s[1]
- movdqa @x[1], @s[0]
- pxor @x[5], @s[1]
- pxor @x[0], @s[0]
- movdqa @s[1], @t[1]
- pand @s[0], @s[1]
- por @s[0], @t[1]
- pxor @s[1], @t[0]
- pxor @s[3], @t[3]
- pxor @s[2], @t[2]
- pxor @s[3], @t[1]
- movdqa @x[7], @s[0]
- pxor @s[2], @t[0]
- movdqa @x[6], @s[1]
- pxor @s[2], @t[1]
- movdqa @x[5], @s[2]
- pand @x[3], @s[0]
- movdqa @x[4], @s[3]
- pand @x[2], @s[1]
- pand @x[1], @s[2]
- por @x[0], @s[3]
- pxor @s[0], @t[3]
- pxor @s[1], @t[2]
- pxor @s[2], @t[1]
- pxor @s[3], @t[0]
- #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
- # new smaller inversion
- movdqa @t[3], @s[0]
- pand @t[1], @t[3]
- pxor @t[2], @s[0]
- movdqa @t[0], @s[2]
- movdqa @s[0], @s[3]
- pxor @t[3], @s[2]
- pand @s[2], @s[3]
- movdqa @t[1], @s[1]
- pxor @t[2], @s[3]
- pxor @t[0], @s[1]
- pxor @t[2], @t[3]
- pand @t[3], @s[1]
- movdqa @s[2], @t[2]
- pxor @t[0], @s[1]
- pxor @s[1], @t[2]
- pxor @s[1], @t[1]
- pand @t[0], @t[2]
- pxor @t[2], @s[2]
- pxor @t[2], @t[1]
- pand @s[3], @s[2]
- pxor @s[0], @s[2]
- ___
- # output in s3, s2, s1, t1
- # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
- # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
- &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
- ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
- }
- # AES linear components
- sub ShiftRows {
- my @x=@_[0..7];
- my $mask=pop;
- $code.=<<___;
- pxor 0x00($key),@x[0]
- pxor 0x10($key),@x[1]
- pxor 0x20($key),@x[2]
- pxor 0x30($key),@x[3]
- pshufb $mask,@x[0]
- pshufb $mask,@x[1]
- pxor 0x40($key),@x[4]
- pxor 0x50($key),@x[5]
- pshufb $mask,@x[2]
- pshufb $mask,@x[3]
- pxor 0x60($key),@x[6]
- pxor 0x70($key),@x[7]
- pshufb $mask,@x[4]
- pshufb $mask,@x[5]
- pshufb $mask,@x[6]
- pshufb $mask,@x[7]
- lea 0x80($key),$key
- ___
- }
- sub MixColumns {
- # modified to emit output in order suitable for feeding back to aesenc[last]
- my @x=@_[0..7];
- my @t=@_[8..15];
- my $inv=@_[16]; # optional
- $code.=<<___;
- pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
- pshufd \$0x93, @x[1], @t[1]
- pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
- pshufd \$0x93, @x[2], @t[2]
- pxor @t[1], @x[1]
- pshufd \$0x93, @x[3], @t[3]
- pxor @t[2], @x[2]
- pshufd \$0x93, @x[4], @t[4]
- pxor @t[3], @x[3]
- pshufd \$0x93, @x[5], @t[5]
- pxor @t[4], @x[4]
- pshufd \$0x93, @x[6], @t[6]
- pxor @t[5], @x[5]
- pshufd \$0x93, @x[7], @t[7]
- pxor @t[6], @x[6]
- pxor @t[7], @x[7]
- pxor @x[0], @t[1]
- pxor @x[7], @t[0]
- pxor @x[7], @t[1]
- pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
- pxor @x[1], @t[2]
- pshufd \$0x4E, @x[1], @x[1]
- pxor @x[4], @t[5]
- pxor @t[0], @x[0]
- pxor @x[5], @t[6]
- pxor @t[1], @x[1]
- pxor @x[3], @t[4]
- pshufd \$0x4E, @x[4], @t[0]
- pxor @x[6], @t[7]
- pshufd \$0x4E, @x[5], @t[1]
- pxor @x[2], @t[3]
- pshufd \$0x4E, @x[3], @x[4]
- pxor @x[7], @t[3]
- pshufd \$0x4E, @x[7], @x[5]
- pxor @x[7], @t[4]
- pshufd \$0x4E, @x[6], @x[3]
- pxor @t[4], @t[0]
- pshufd \$0x4E, @x[2], @x[6]
- pxor @t[5], @t[1]
- ___
- $code.=<<___ if (!$inv);
- pxor @t[3], @x[4]
- pxor @t[7], @x[5]
- pxor @t[6], @x[3]
- movdqa @t[0], @x[2]
- pxor @t[2], @x[6]
- movdqa @t[1], @x[7]
- ___
- $code.=<<___ if ($inv);
- pxor @x[4], @t[3]
- pxor @t[7], @x[5]
- pxor @x[3], @t[6]
- movdqa @t[0], @x[3]
- pxor @t[2], @x[6]
- movdqa @t[6], @x[2]
- movdqa @t[1], @x[7]
- movdqa @x[6], @x[4]
- movdqa @t[3], @x[6]
- ___
- }
- sub InvMixColumns_orig {
- my @x=@_[0..7];
- my @t=@_[8..15];
- $code.=<<___;
- # multiplication by 0x0e
- pshufd \$0x93, @x[7], @t[7]
- movdqa @x[2], @t[2]
- pxor @x[5], @x[7] # 7 5
- pxor @x[5], @x[2] # 2 5
- pshufd \$0x93, @x[0], @t[0]
- movdqa @x[5], @t[5]
- pxor @x[0], @x[5] # 5 0 [1]
- pxor @x[1], @x[0] # 0 1
- pshufd \$0x93, @x[1], @t[1]
- pxor @x[2], @x[1] # 1 25
- pxor @x[6], @x[0] # 01 6 [2]
- pxor @x[3], @x[1] # 125 3 [4]
- pshufd \$0x93, @x[3], @t[3]
- pxor @x[0], @x[2] # 25 016 [3]
- pxor @x[7], @x[3] # 3 75
- pxor @x[6], @x[7] # 75 6 [0]
- pshufd \$0x93, @x[6], @t[6]
- movdqa @x[4], @t[4]
- pxor @x[4], @x[6] # 6 4
- pxor @x[3], @x[4] # 4 375 [6]
- pxor @x[7], @x[3] # 375 756=36
- pxor @t[5], @x[6] # 64 5 [7]
- pxor @t[2], @x[3] # 36 2
- pxor @t[4], @x[3] # 362 4 [5]
- pshufd \$0x93, @t[5], @t[5]
- ___
- my @y = @x[7,5,0,2,1,3,4,6];
- $code.=<<___;
- # multiplication by 0x0b
- pxor @y[0], @y[1]
- pxor @t[0], @y[0]
- pxor @t[1], @y[1]
- pshufd \$0x93, @t[2], @t[2]
- pxor @t[5], @y[0]
- pxor @t[6], @y[1]
- pxor @t[7], @y[0]
- pshufd \$0x93, @t[4], @t[4]
- pxor @t[6], @t[7] # clobber t[7]
- pxor @y[0], @y[1]
- pxor @t[0], @y[3]
- pshufd \$0x93, @t[0], @t[0]
- pxor @t[1], @y[2]
- pxor @t[1], @y[4]
- pxor @t[2], @y[2]
- pshufd \$0x93, @t[1], @t[1]
- pxor @t[2], @y[3]
- pxor @t[2], @y[5]
- pxor @t[7], @y[2]
- pshufd \$0x93, @t[2], @t[2]
- pxor @t[3], @y[3]
- pxor @t[3], @y[6]
- pxor @t[3], @y[4]
- pshufd \$0x93, @t[3], @t[3]
- pxor @t[4], @y[7]
- pxor @t[4], @y[5]
- pxor @t[7], @y[7]
- pxor @t[5], @y[3]
- pxor @t[4], @y[4]
- pxor @t[5], @t[7] # clobber t[7] even more
- pxor @t[7], @y[5]
- pshufd \$0x93, @t[4], @t[4]
- pxor @t[7], @y[6]
- pxor @t[7], @y[4]
- pxor @t[5], @t[7]
- pshufd \$0x93, @t[5], @t[5]
- pxor @t[6], @t[7] # restore t[7]
- # multiplication by 0x0d
- pxor @y[7], @y[4]
- pxor @t[4], @y[7]
- pshufd \$0x93, @t[6], @t[6]
- pxor @t[0], @y[2]
- pxor @t[5], @y[7]
- pxor @t[2], @y[2]
- pshufd \$0x93, @t[7], @t[7]
- pxor @y[1], @y[3]
- pxor @t[1], @y[1]
- pxor @t[0], @y[0]
- pxor @t[0], @y[3]
- pxor @t[5], @y[1]
- pxor @t[5], @y[0]
- pxor @t[7], @y[1]
- pshufd \$0x93, @t[0], @t[0]
- pxor @t[6], @y[0]
- pxor @y[1], @y[3]
- pxor @t[1], @y[4]
- pshufd \$0x93, @t[1], @t[1]
- pxor @t[7], @y[7]
- pxor @t[2], @y[4]
- pxor @t[2], @y[5]
- pshufd \$0x93, @t[2], @t[2]
- pxor @t[6], @y[2]
- pxor @t[3], @t[6] # clobber t[6]
- pxor @y[7], @y[4]
- pxor @t[6], @y[3]
- pxor @t[6], @y[6]
- pxor @t[5], @y[5]
- pxor @t[4], @y[6]
- pshufd \$0x93, @t[4], @t[4]
- pxor @t[6], @y[5]
- pxor @t[7], @y[6]
- pxor @t[3], @t[6] # restore t[6]
- pshufd \$0x93, @t[5], @t[5]
- pshufd \$0x93, @t[6], @t[6]
- pshufd \$0x93, @t[7], @t[7]
- pshufd \$0x93, @t[3], @t[3]
- # multiplication by 0x09
- pxor @y[1], @y[4]
- pxor @y[1], @t[1] # t[1]=y[1]
- pxor @t[5], @t[0] # clobber t[0]
- pxor @t[5], @t[1]
- pxor @t[0], @y[3]
- pxor @y[0], @t[0] # t[0]=y[0]
- pxor @t[6], @t[1]
- pxor @t[7], @t[6] # clobber t[6]
- pxor @t[1], @y[4]
- pxor @t[4], @y[7]
- pxor @y[4], @t[4] # t[4]=y[4]
- pxor @t[3], @y[6]
- pxor @y[3], @t[3] # t[3]=y[3]
- pxor @t[2], @y[5]
- pxor @y[2], @t[2] # t[2]=y[2]
- pxor @t[7], @t[3]
- pxor @y[5], @t[5] # t[5]=y[5]
- pxor @t[6], @t[2]
- pxor @t[6], @t[5]
- pxor @y[6], @t[6] # t[6]=y[6]
- pxor @y[7], @t[7] # t[7]=y[7]
- movdqa @t[0],@XMM[0]
- movdqa @t[1],@XMM[1]
- movdqa @t[2],@XMM[2]
- movdqa @t[3],@XMM[3]
- movdqa @t[4],@XMM[4]
- movdqa @t[5],@XMM[5]
- movdqa @t[6],@XMM[6]
- movdqa @t[7],@XMM[7]
- ___
- }
- sub InvMixColumns {
- my @x=@_[0..7];
- my @t=@_[8..15];
- # Thanks to Jussi Kivilinna for providing pointer to
- #
- # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
- # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
- # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
- # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
- $code.=<<___;
- # multiplication by 0x05-0x00-0x04-0x00
- pshufd \$0x4E, @x[0], @t[0]
- pshufd \$0x4E, @x[6], @t[6]
- pxor @x[0], @t[0]
- pshufd \$0x4E, @x[7], @t[7]
- pxor @x[6], @t[6]
- pshufd \$0x4E, @x[1], @t[1]
- pxor @x[7], @t[7]
- pshufd \$0x4E, @x[2], @t[2]
- pxor @x[1], @t[1]
- pshufd \$0x4E, @x[3], @t[3]
- pxor @x[2], @t[2]
- pxor @t[6], @x[0]
- pxor @t[6], @x[1]
- pshufd \$0x4E, @x[4], @t[4]
- pxor @x[3], @t[3]
- pxor @t[0], @x[2]
- pxor @t[1], @x[3]
- pshufd \$0x4E, @x[5], @t[5]
- pxor @x[4], @t[4]
- pxor @t[7], @x[1]
- pxor @t[2], @x[4]
- pxor @x[5], @t[5]
- pxor @t[7], @x[2]
- pxor @t[6], @x[3]
- pxor @t[6], @x[4]
- pxor @t[3], @x[5]
- pxor @t[4], @x[6]
- pxor @t[7], @x[4]
- pxor @t[7], @x[5]
- pxor @t[5], @x[7]
- ___
- &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
- }
- sub aesenc { # not used
- my @b=@_[0..7];
- my @t=@_[8..15];
- $code.=<<___;
- movdqa 0x30($const),@t[0] # .LSR
- ___
- &ShiftRows (@b,@t[0]);
- &Sbox (@b,@t);
- &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
- }
- sub aesenclast { # not used
- my @b=@_[0..7];
- my @t=@_[8..15];
- $code.=<<___;
- movdqa 0x40($const),@t[0] # .LSRM0
- ___
- &ShiftRows (@b,@t[0]);
- &Sbox (@b,@t);
- $code.=<<___
- pxor 0x00($key),@b[0]
- pxor 0x10($key),@b[1]
- pxor 0x20($key),@b[4]
- pxor 0x30($key),@b[6]
- pxor 0x40($key),@b[3]
- pxor 0x50($key),@b[7]
- pxor 0x60($key),@b[2]
- pxor 0x70($key),@b[5]
- ___
- }
- sub swapmove {
- my ($a,$b,$n,$mask,$t)=@_;
- $code.=<<___;
- movdqa $b,$t
- psrlq \$$n,$b
- pxor $a,$b
- pand $mask,$b
- pxor $b,$a
- psllq \$$n,$b
- pxor $t,$b
- ___
- }
- sub swapmove2x {
- my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
- $code.=<<___;
- movdqa $b0,$t0
- psrlq \$$n,$b0
- movdqa $b1,$t1
- psrlq \$$n,$b1
- pxor $a0,$b0
- pxor $a1,$b1
- pand $mask,$b0
- pand $mask,$b1
- pxor $b0,$a0
- psllq \$$n,$b0
- pxor $b1,$a1
- psllq \$$n,$b1
- pxor $t0,$b0
- pxor $t1,$b1
- ___
- }
- sub bitslice {
- my @x=reverse(@_[0..7]);
- my ($t0,$t1,$t2,$t3)=@_[8..11];
- $code.=<<___;
- movdqa 0x00($const),$t0 # .LBS0
- movdqa 0x10($const),$t1 # .LBS1
- ___
- &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
- &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
- $code.=<<___;
- movdqa 0x20($const),$t0 # .LBS2
- ___
- &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
- &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
- &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
- &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
- }
- $code.=<<___;
- .text
- .extern asm_AES_encrypt
- .extern asm_AES_decrypt
- .type _bsaes_encrypt8,\@abi-omnipotent
- .align 64
- _bsaes_encrypt8:
- .cfi_startproc
- lea .LBS0(%rip), $const # constants table
- movdqa ($key), @XMM[9] # round 0 key
- lea 0x10($key), $key
- movdqa 0x50($const), @XMM[8] # .LM0SR
- pxor @XMM[9], @XMM[0] # xor with round0 key
- pxor @XMM[9], @XMM[1]
- pxor @XMM[9], @XMM[2]
- pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[0]
- pshufb @XMM[8], @XMM[1]
- pxor @XMM[9], @XMM[4]
- pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[2]
- pshufb @XMM[8], @XMM[3]
- pxor @XMM[9], @XMM[6]
- pxor @XMM[9], @XMM[7]
- pshufb @XMM[8], @XMM[4]
- pshufb @XMM[8], @XMM[5]
- pshufb @XMM[8], @XMM[6]
- pshufb @XMM[8], @XMM[7]
- _bsaes_encrypt8_bitslice:
- ___
- &bitslice (@XMM[0..7, 8..11]);
- $code.=<<___;
- dec $rounds
- jmp .Lenc_sbox
- .align 16
- .Lenc_loop:
- ___
- &ShiftRows (@XMM[0..7, 8]);
- $code.=".Lenc_sbox:\n";
- &Sbox (@XMM[0..7, 8..15]);
- $code.=<<___;
- dec $rounds
- jl .Lenc_done
- ___
- &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
- $code.=<<___;
- movdqa 0x30($const), @XMM[8] # .LSR
- jnz .Lenc_loop
- movdqa 0x40($const), @XMM[8] # .LSRM0
- jmp .Lenc_loop
- .align 16
- .Lenc_done:
- ___
- # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
- &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
- $code.=<<___;
- movdqa ($key), @XMM[8] # last round key
- pxor @XMM[8], @XMM[4]
- pxor @XMM[8], @XMM[6]
- pxor @XMM[8], @XMM[3]
- pxor @XMM[8], @XMM[7]
- pxor @XMM[8], @XMM[2]
- pxor @XMM[8], @XMM[5]
- pxor @XMM[8], @XMM[0]
- pxor @XMM[8], @XMM[1]
- ret
- .cfi_endproc
- .size _bsaes_encrypt8,.-_bsaes_encrypt8
- .type _bsaes_decrypt8,\@abi-omnipotent
- .align 64
- _bsaes_decrypt8:
- .cfi_startproc
- lea .LBS0(%rip), $const # constants table
- movdqa ($key), @XMM[9] # round 0 key
- lea 0x10($key), $key
- movdqa -0x30($const), @XMM[8] # .LM0ISR
- pxor @XMM[9], @XMM[0] # xor with round0 key
- pxor @XMM[9], @XMM[1]
- pxor @XMM[9], @XMM[2]
- pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[0]
- pshufb @XMM[8], @XMM[1]
- pxor @XMM[9], @XMM[4]
- pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[2]
- pshufb @XMM[8], @XMM[3]
- pxor @XMM[9], @XMM[6]
- pxor @XMM[9], @XMM[7]
- pshufb @XMM[8], @XMM[4]
- pshufb @XMM[8], @XMM[5]
- pshufb @XMM[8], @XMM[6]
- pshufb @XMM[8], @XMM[7]
- ___
- &bitslice (@XMM[0..7, 8..11]);
- $code.=<<___;
- dec $rounds
- jmp .Ldec_sbox
- .align 16
- .Ldec_loop:
- ___
- &ShiftRows (@XMM[0..7, 8]);
- $code.=".Ldec_sbox:\n";
- &InvSbox (@XMM[0..7, 8..15]);
- $code.=<<___;
- dec $rounds
- jl .Ldec_done
- ___
- &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
- $code.=<<___;
- movdqa -0x10($const), @XMM[8] # .LISR
- jnz .Ldec_loop
- movdqa -0x20($const), @XMM[8] # .LISRM0
- jmp .Ldec_loop
- .align 16
- .Ldec_done:
- ___
- &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
- $code.=<<___;
- movdqa ($key), @XMM[8] # last round key
- pxor @XMM[8], @XMM[6]
- pxor @XMM[8], @XMM[4]
- pxor @XMM[8], @XMM[2]
- pxor @XMM[8], @XMM[7]
- pxor @XMM[8], @XMM[3]
- pxor @XMM[8], @XMM[5]
- pxor @XMM[8], @XMM[0]
- pxor @XMM[8], @XMM[1]
- ret
- .cfi_endproc
- .size _bsaes_decrypt8,.-_bsaes_decrypt8
- ___
- }
- {
- my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
- sub bitslice_key {
- my @x=reverse(@_[0..7]);
- my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
- &swapmove (@x[0,1],1,$bs0,$t2,$t3);
- $code.=<<___;
- #&swapmove(@x[2,3],1,$t0,$t2,$t3);
- movdqa @x[0], @x[2]
- movdqa @x[1], @x[3]
- ___
- #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
- &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
- $code.=<<___;
- #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
- movdqa @x[0], @x[4]
- movdqa @x[2], @x[6]
- movdqa @x[1], @x[5]
- movdqa @x[3], @x[7]
- ___
- &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
- &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
- }
- $code.=<<___;
- .type _bsaes_key_convert,\@abi-omnipotent
- .align 16
- _bsaes_key_convert:
- .cfi_startproc
- lea .Lmasks(%rip), $const
- movdqu ($inp), %xmm7 # load round 0 key
- lea 0x10($inp), $inp
- movdqa 0x00($const), %xmm0 # 0x01...
- movdqa 0x10($const), %xmm1 # 0x02...
- movdqa 0x20($const), %xmm2 # 0x04...
- movdqa 0x30($const), %xmm3 # 0x08...
- movdqa 0x40($const), %xmm4 # .LM0
- pcmpeqd %xmm5, %xmm5 # .LNOT
- movdqu ($inp), %xmm6 # load round 1 key
- movdqa %xmm7, ($out) # save round 0 key
- lea 0x10($out), $out
- dec $rounds
- jmp .Lkey_loop
- .align 16
- .Lkey_loop:
- pshufb %xmm4, %xmm6 # .LM0
- movdqa %xmm0, %xmm8
- movdqa %xmm1, %xmm9
- pand %xmm6, %xmm8
- pand %xmm6, %xmm9
- movdqa %xmm2, %xmm10
- pcmpeqb %xmm0, %xmm8
- psllq \$4, %xmm0 # 0x10...
- movdqa %xmm3, %xmm11
- pcmpeqb %xmm1, %xmm9
- psllq \$4, %xmm1 # 0x20...
- pand %xmm6, %xmm10
- pand %xmm6, %xmm11
- movdqa %xmm0, %xmm12
- pcmpeqb %xmm2, %xmm10
- psllq \$4, %xmm2 # 0x40...
- movdqa %xmm1, %xmm13
- pcmpeqb %xmm3, %xmm11
- psllq \$4, %xmm3 # 0x80...
- movdqa %xmm2, %xmm14
- movdqa %xmm3, %xmm15
- pxor %xmm5, %xmm8 # "pnot"
- pxor %xmm5, %xmm9
- pand %xmm6, %xmm12
- pand %xmm6, %xmm13
- movdqa %xmm8, 0x00($out) # write bit-sliced round key
- pcmpeqb %xmm0, %xmm12
- psrlq \$4, %xmm0 # 0x01...
- movdqa %xmm9, 0x10($out)
- pcmpeqb %xmm1, %xmm13
- psrlq \$4, %xmm1 # 0x02...
- lea 0x10($inp), $inp
- pand %xmm6, %xmm14
- pand %xmm6, %xmm15
- movdqa %xmm10, 0x20($out)
- pcmpeqb %xmm2, %xmm14
- psrlq \$4, %xmm2 # 0x04...
- movdqa %xmm11, 0x30($out)
- pcmpeqb %xmm3, %xmm15
- psrlq \$4, %xmm3 # 0x08...
- movdqu ($inp), %xmm6 # load next round key
- pxor %xmm5, %xmm13 # "pnot"
- pxor %xmm5, %xmm14
- movdqa %xmm12, 0x40($out)
- movdqa %xmm13, 0x50($out)
- movdqa %xmm14, 0x60($out)
- movdqa %xmm15, 0x70($out)
- lea 0x80($out),$out
- dec $rounds
- jnz .Lkey_loop
- movdqa 0x50($const), %xmm7 # .L63
- #movdqa %xmm6, ($out) # don't save last round key
- ret
- .cfi_endproc
- .size _bsaes_key_convert,.-_bsaes_key_convert
- ___
- }
- if (0 && !$win64) { # following four functions are unsupported interface
- # used for benchmarking...
- $code.=<<___;
- .globl bsaes_enc_key_convert
- .type bsaes_enc_key_convert,\@function,2
- .align 16
- bsaes_enc_key_convert:
- mov 240($inp),%r10d # pass rounds
- mov $inp,%rcx # pass key
- mov $out,%rax # pass key schedule
- call _bsaes_key_convert
- pxor %xmm6,%xmm7 # fix up last round key
- movdqa %xmm7,(%rax) # save last round key
- ret
- .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
- .globl bsaes_encrypt_128
- .type bsaes_encrypt_128,\@function,4
- .align 16
- bsaes_encrypt_128:
- .Lenc128_loop:
- movdqu 0x00($inp), @XMM[0] # load input
- movdqu 0x10($inp), @XMM[1]
- movdqu 0x20($inp), @XMM[2]
- movdqu 0x30($inp), @XMM[3]
- movdqu 0x40($inp), @XMM[4]
- movdqu 0x50($inp), @XMM[5]
- movdqu 0x60($inp), @XMM[6]
- movdqu 0x70($inp), @XMM[7]
- mov $key, %rax # pass the $key
- lea 0x80($inp), $inp
- mov \$10,%r10d
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[2], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- sub \$0x80,$len
- ja .Lenc128_loop
- ret
- .size bsaes_encrypt_128,.-bsaes_encrypt_128
- .globl bsaes_dec_key_convert
- .type bsaes_dec_key_convert,\@function,2
- .align 16
- bsaes_dec_key_convert:
- mov 240($inp),%r10d # pass rounds
- mov $inp,%rcx # pass key
- mov $out,%rax # pass key schedule
- call _bsaes_key_convert
- pxor ($out),%xmm7 # fix up round 0 key
- movdqa %xmm6,(%rax) # save last round key
- movdqa %xmm7,($out)
- ret
- .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
- .globl bsaes_decrypt_128
- .type bsaes_decrypt_128,\@function,4
- .align 16
- bsaes_decrypt_128:
- .Ldec128_loop:
- movdqu 0x00($inp), @XMM[0] # load input
- movdqu 0x10($inp), @XMM[1]
- movdqu 0x20($inp), @XMM[2]
- movdqu 0x30($inp), @XMM[3]
- movdqu 0x40($inp), @XMM[4]
- movdqu 0x50($inp), @XMM[5]
- movdqu 0x60($inp), @XMM[6]
- movdqu 0x70($inp), @XMM[7]
- mov $key, %rax # pass the $key
- lea 0x80($inp), $inp
- mov \$10,%r10d
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- sub \$0x80,$len
- ja .Ldec128_loop
- ret
- .size bsaes_decrypt_128,.-bsaes_decrypt_128
- ___
- }
- {
- ######################################################################
- #
- # OpenSSL interface
- #
- my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
- : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
- my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
- if ($ecb) {
- $code.=<<___;
- .globl bsaes_ecb_encrypt_blocks
- .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
- .align 16
- bsaes_ecb_encrypt_blocks:
- .cfi_startproc
- mov %rsp, %rax
- .Lecb_enc_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp),%rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lecb_enc_body:
- ___
- $code.=<<___;
- mov %rsp,%rbp # backup %rsp
- .cfi_def_cfa_register %rbp
- mov 240($arg4),%eax # rounds
- mov $arg1,$inp # backup arguments
- mov $arg2,$out
- mov $arg3,$len
- mov $arg4,$key
- cmp \$8,$arg3
- jb .Lecb_enc_short
- mov %eax,%ebx # backup rounds
- shl \$7,%rax # 128 bytes per inner round key
- sub \$`128-32`,%rax # size of bit-sliced key schedule
- sub %rax,%rsp
- mov %rsp,%rax # pass key schedule
- mov $key,%rcx # pass key
- mov %ebx,%r10d # pass rounds
- call _bsaes_key_convert
- pxor %xmm6,%xmm7 # fix up last round key
- movdqa %xmm7,(%rax) # save last round key
- sub \$8,$len
- .Lecb_enc_loop:
- movdqu 0x00($inp), @XMM[0] # load input
- movdqu 0x10($inp), @XMM[1]
- movdqu 0x20($inp), @XMM[2]
- movdqu 0x30($inp), @XMM[3]
- movdqu 0x40($inp), @XMM[4]
- movdqu 0x50($inp), @XMM[5]
- mov %rsp, %rax # pass key schedule
- movdqu 0x60($inp), @XMM[6]
- mov %ebx,%r10d # pass rounds
- movdqu 0x70($inp), @XMM[7]
- lea 0x80($inp), $inp
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[2], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- sub \$8,$len
- jnc .Lecb_enc_loop
- add \$8,$len
- jz .Lecb_enc_done
- movdqu 0x00($inp), @XMM[0] # load input
- mov %rsp, %rax # pass key schedule
- mov %ebx,%r10d # pass rounds
- cmp \$2,$len
- jb .Lecb_enc_one
- movdqu 0x10($inp), @XMM[1]
- je .Lecb_enc_two
- movdqu 0x20($inp), @XMM[2]
- cmp \$4,$len
- jb .Lecb_enc_three
- movdqu 0x30($inp), @XMM[3]
- je .Lecb_enc_four
- movdqu 0x40($inp), @XMM[4]
- cmp \$6,$len
- jb .Lecb_enc_five
- movdqu 0x50($inp), @XMM[5]
- je .Lecb_enc_six
- movdqu 0x60($inp), @XMM[6]
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[2], 0x60($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_six:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_five:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_four:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_three:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_two:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_one:
- call _bsaes_encrypt8
- movdqu @XMM[0], 0x00($out) # write output
- jmp .Lecb_enc_done
- .align 16
- .Lecb_enc_short:
- lea ($inp), $arg1
- lea ($out), $arg2
- lea ($key), $arg3
- call asm_AES_encrypt
- lea 16($inp), $inp
- lea 16($out), $out
- dec $len
- jnz .Lecb_enc_short
- .Lecb_enc_done:
- lea (%rsp),%rax
- pxor %xmm0, %xmm0
- .Lecb_enc_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- jb .Lecb_enc_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lecb_enc_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lecb_enc_epilogue:
- ret
- .cfi_endproc
- .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
- .globl bsaes_ecb_decrypt_blocks
- .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
- .align 16
- bsaes_ecb_decrypt_blocks:
- .cfi_startproc
- mov %rsp, %rax
- .Lecb_dec_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp),%rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lecb_dec_body:
- ___
- $code.=<<___;
- mov %rsp,%rbp # backup %rsp
- .cfi_def_cfa_register %rbp
- mov 240($arg4),%eax # rounds
- mov $arg1,$inp # backup arguments
- mov $arg2,$out
- mov $arg3,$len
- mov $arg4,$key
- cmp \$8,$arg3
- jb .Lecb_dec_short
- mov %eax,%ebx # backup rounds
- shl \$7,%rax # 128 bytes per inner round key
- sub \$`128-32`,%rax # size of bit-sliced key schedule
- sub %rax,%rsp
- mov %rsp,%rax # pass key schedule
- mov $key,%rcx # pass key
- mov %ebx,%r10d # pass rounds
- call _bsaes_key_convert
- pxor (%rsp),%xmm7 # fix up 0 round key
- movdqa %xmm6,(%rax) # save last round key
- movdqa %xmm7,(%rsp)
- sub \$8,$len
- .Lecb_dec_loop:
- movdqu 0x00($inp), @XMM[0] # load input
- movdqu 0x10($inp), @XMM[1]
- movdqu 0x20($inp), @XMM[2]
- movdqu 0x30($inp), @XMM[3]
- movdqu 0x40($inp), @XMM[4]
- movdqu 0x50($inp), @XMM[5]
- mov %rsp, %rax # pass key schedule
- movdqu 0x60($inp), @XMM[6]
- mov %ebx,%r10d # pass rounds
- movdqu 0x70($inp), @XMM[7]
- lea 0x80($inp), $inp
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- sub \$8,$len
- jnc .Lecb_dec_loop
- add \$8,$len
- jz .Lecb_dec_done
- movdqu 0x00($inp), @XMM[0] # load input
- mov %rsp, %rax # pass key schedule
- mov %ebx,%r10d # pass rounds
- cmp \$2,$len
- jb .Lecb_dec_one
- movdqu 0x10($inp), @XMM[1]
- je .Lecb_dec_two
- movdqu 0x20($inp), @XMM[2]
- cmp \$4,$len
- jb .Lecb_dec_three
- movdqu 0x30($inp), @XMM[3]
- je .Lecb_dec_four
- movdqu 0x40($inp), @XMM[4]
- cmp \$6,$len
- jb .Lecb_dec_five
- movdqu 0x50($inp), @XMM[5]
- je .Lecb_dec_six
- movdqu 0x60($inp), @XMM[6]
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_six:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_five:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_four:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_three:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_two:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_one:
- call _bsaes_decrypt8
- movdqu @XMM[0], 0x00($out) # write output
- jmp .Lecb_dec_done
- .align 16
- .Lecb_dec_short:
- lea ($inp), $arg1
- lea ($out), $arg2
- lea ($key), $arg3
- call asm_AES_decrypt
- lea 16($inp), $inp
- lea 16($out), $out
- dec $len
- jnz .Lecb_dec_short
- .Lecb_dec_done:
- lea (%rsp),%rax
- pxor %xmm0, %xmm0
- .Lecb_dec_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- jb .Lecb_dec_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lecb_dec_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lecb_dec_epilogue:
- ret
- .cfi_endproc
- .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
- ___
- }
- $code.=<<___;
- .extern asm_AES_cbc_encrypt
- .globl bsaes_cbc_encrypt
- .type bsaes_cbc_encrypt,\@abi-omnipotent
- .align 16
- bsaes_cbc_encrypt:
- .cfi_startproc
- endbranch
- ___
- $code.=<<___ if ($win64);
- mov 48(%rsp),$arg6 # pull direction flag
- ___
- $code.=<<___;
- cmp \$0,$arg6
- jne asm_AES_cbc_encrypt
- cmp \$128,$arg3
- jb asm_AES_cbc_encrypt
- mov %rsp, %rax
- .Lcbc_dec_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp), %rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- mov 0xa0(%rsp),$arg5 # pull ivp
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lcbc_dec_body:
- ___
- $code.=<<___;
- mov %rsp, %rbp # backup %rsp
- .cfi_def_cfa_register %rbp
- mov 240($arg4), %eax # rounds
- mov $arg1, $inp # backup arguments
- mov $arg2, $out
- mov $arg3, $len
- mov $arg4, $key
- mov $arg5, %rbx
- shr \$4, $len # bytes to blocks
- mov %eax, %edx # rounds
- shl \$7, %rax # 128 bytes per inner round key
- sub \$`128-32`, %rax # size of bit-sliced key schedule
- sub %rax, %rsp
- mov %rsp, %rax # pass key schedule
- mov $key, %rcx # pass key
- mov %edx, %r10d # pass rounds
- call _bsaes_key_convert
- pxor (%rsp),%xmm7 # fix up 0 round key
- movdqa %xmm6,(%rax) # save last round key
- movdqa %xmm7,(%rsp)
- movdqu (%rbx), @XMM[15] # load IV
- sub \$8,$len
- .Lcbc_dec_loop:
- movdqu 0x00($inp), @XMM[0] # load input
- movdqu 0x10($inp), @XMM[1]
- movdqu 0x20($inp), @XMM[2]
- movdqu 0x30($inp), @XMM[3]
- movdqu 0x40($inp), @XMM[4]
- movdqu 0x50($inp), @XMM[5]
- mov %rsp, %rax # pass key schedule
- movdqu 0x60($inp), @XMM[6]
- mov %edx,%r10d # pass rounds
- movdqu 0x70($inp), @XMM[7]
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[9], @XMM[6]
- movdqu 0x30($inp), @XMM[11]
- pxor @XMM[10], @XMM[4]
- movdqu 0x40($inp), @XMM[12]
- pxor @XMM[11], @XMM[2]
- movdqu 0x50($inp), @XMM[13]
- pxor @XMM[12], @XMM[7]
- movdqu 0x60($inp), @XMM[14]
- pxor @XMM[13], @XMM[3]
- movdqu 0x70($inp), @XMM[15] # IV
- pxor @XMM[14], @XMM[5]
- movdqu @XMM[0], 0x00($out) # write output
- lea 0x80($inp), $inp
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- sub \$8,$len
- jnc .Lcbc_dec_loop
- add \$8,$len
- jz .Lcbc_dec_done
- movdqu 0x00($inp), @XMM[0] # load input
- mov %rsp, %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- cmp \$2,$len
- jb .Lcbc_dec_one
- movdqu 0x10($inp), @XMM[1]
- je .Lcbc_dec_two
- movdqu 0x20($inp), @XMM[2]
- cmp \$4,$len
- jb .Lcbc_dec_three
- movdqu 0x30($inp), @XMM[3]
- je .Lcbc_dec_four
- movdqu 0x40($inp), @XMM[4]
- cmp \$6,$len
- jb .Lcbc_dec_five
- movdqu 0x50($inp), @XMM[5]
- je .Lcbc_dec_six
- movdqu 0x60($inp), @XMM[6]
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[9], @XMM[6]
- movdqu 0x30($inp), @XMM[11]
- pxor @XMM[10], @XMM[4]
- movdqu 0x40($inp), @XMM[12]
- pxor @XMM[11], @XMM[2]
- movdqu 0x50($inp), @XMM[13]
- pxor @XMM[12], @XMM[7]
- movdqu 0x60($inp), @XMM[15] # IV
- pxor @XMM[13], @XMM[3]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_six:
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[9], @XMM[6]
- movdqu 0x30($inp), @XMM[11]
- pxor @XMM[10], @XMM[4]
- movdqu 0x40($inp), @XMM[12]
- pxor @XMM[11], @XMM[2]
- movdqu 0x50($inp), @XMM[15] # IV
- pxor @XMM[12], @XMM[7]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_five:
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[9], @XMM[6]
- movdqu 0x30($inp), @XMM[11]
- pxor @XMM[10], @XMM[4]
- movdqu 0x40($inp), @XMM[15] # IV
- pxor @XMM[11], @XMM[2]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_four:
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[9], @XMM[6]
- movdqu 0x30($inp), @XMM[15] # IV
- pxor @XMM[10], @XMM[4]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_three:
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[8], @XMM[1]
- movdqu 0x20($inp), @XMM[15] # IV
- pxor @XMM[9], @XMM[6]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_two:
- movdqa @XMM[15], 0x20(%rbp) # put aside IV
- call _bsaes_decrypt8
- pxor 0x20(%rbp), @XMM[0] # ^= IV
- movdqu 0x00($inp), @XMM[8] # re-load input
- movdqu 0x10($inp), @XMM[15] # IV
- pxor @XMM[8], @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- jmp .Lcbc_dec_done
- .align 16
- .Lcbc_dec_one:
- lea ($inp), $arg1
- lea 0x20(%rbp), $arg2 # buffer output
- lea ($key), $arg3
- call asm_AES_decrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[15] # ^= IV
- movdqu @XMM[15], ($out) # write output
- movdqa @XMM[0], @XMM[15] # IV
- .Lcbc_dec_done:
- movdqu @XMM[15], (%rbx) # return IV
- lea (%rsp), %rax
- pxor %xmm0, %xmm0
- .Lcbc_dec_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- ja .Lcbc_dec_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lcbc_dec_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lcbc_dec_epilogue:
- ret
- .cfi_endproc
- .size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
- .globl bsaes_ctr32_encrypt_blocks
- .type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
- .align 16
- bsaes_ctr32_encrypt_blocks:
- .cfi_startproc
- endbranch
- mov %rsp, %rax
- .Lctr_enc_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp), %rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- mov 0xa0(%rsp),$arg5 # pull ivp
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lctr_enc_body:
- ___
- $code.=<<___;
- mov %rsp, %rbp # backup %rsp
- .cfi_def_cfa_register %rbp
- movdqu ($arg5), %xmm0 # load counter
- mov 240($arg4), %eax # rounds
- mov $arg1, $inp # backup arguments
- mov $arg2, $out
- mov $arg3, $len
- mov $arg4, $key
- movdqa %xmm0, 0x20(%rbp) # copy counter
- cmp \$8, $arg3
- jb .Lctr_enc_short
- mov %eax, %ebx # rounds
- shl \$7, %rax # 128 bytes per inner round key
- sub \$`128-32`, %rax # size of bit-sliced key schedule
- sub %rax, %rsp
- mov %rsp, %rax # pass key schedule
- mov $key, %rcx # pass key
- mov %ebx, %r10d # pass rounds
- call _bsaes_key_convert
- pxor %xmm6,%xmm7 # fix up last round key
- movdqa %xmm7,(%rax) # save last round key
- movdqa (%rsp), @XMM[9] # load round0 key
- lea .LADD1(%rip), %r11
- movdqa 0x20(%rbp), @XMM[0] # counter copy
- movdqa -0x20(%r11), @XMM[8] # .LSWPUP
- pshufb @XMM[8], @XMM[9] # byte swap upper part
- pshufb @XMM[8], @XMM[0]
- movdqa @XMM[9], (%rsp) # save adjusted round0 key
- jmp .Lctr_enc_loop
- .align 16
- .Lctr_enc_loop:
- movdqa @XMM[0], 0x20(%rbp) # save counter
- movdqa @XMM[0], @XMM[1] # prepare 8 counter values
- movdqa @XMM[0], @XMM[2]
- paddd 0x00(%r11), @XMM[1] # .LADD1
- movdqa @XMM[0], @XMM[3]
- paddd 0x10(%r11), @XMM[2] # .LADD2
- movdqa @XMM[0], @XMM[4]
- paddd 0x20(%r11), @XMM[3] # .LADD3
- movdqa @XMM[0], @XMM[5]
- paddd 0x30(%r11), @XMM[4] # .LADD4
- movdqa @XMM[0], @XMM[6]
- paddd 0x40(%r11), @XMM[5] # .LADD5
- movdqa @XMM[0], @XMM[7]
- paddd 0x50(%r11), @XMM[6] # .LADD6
- paddd 0x60(%r11), @XMM[7] # .LADD7
- # Borrow prologue from _bsaes_encrypt8 to use the opportunity
- # to flip byte order in 32-bit counter
- movdqa (%rsp), @XMM[9] # round 0 key
- lea 0x10(%rsp), %rax # pass key schedule
- movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
- pxor @XMM[9], @XMM[0] # xor with round0 key
- pxor @XMM[9], @XMM[1]
- pxor @XMM[9], @XMM[2]
- pxor @XMM[9], @XMM[3]
- pshufb @XMM[8], @XMM[0]
- pshufb @XMM[8], @XMM[1]
- pxor @XMM[9], @XMM[4]
- pxor @XMM[9], @XMM[5]
- pshufb @XMM[8], @XMM[2]
- pshufb @XMM[8], @XMM[3]
- pxor @XMM[9], @XMM[6]
- pxor @XMM[9], @XMM[7]
- pshufb @XMM[8], @XMM[4]
- pshufb @XMM[8], @XMM[5]
- pshufb @XMM[8], @XMM[6]
- pshufb @XMM[8], @XMM[7]
- lea .LBS0(%rip), %r11 # constants table
- mov %ebx,%r10d # pass rounds
- call _bsaes_encrypt8_bitslice
- sub \$8,$len
- jc .Lctr_enc_loop_done
- movdqu 0x00($inp), @XMM[8] # load input
- movdqu 0x10($inp), @XMM[9]
- movdqu 0x20($inp), @XMM[10]
- movdqu 0x30($inp), @XMM[11]
- movdqu 0x40($inp), @XMM[12]
- movdqu 0x50($inp), @XMM[13]
- movdqu 0x60($inp), @XMM[14]
- movdqu 0x70($inp), @XMM[15]
- lea 0x80($inp),$inp
- pxor @XMM[0], @XMM[8]
- movdqa 0x20(%rbp), @XMM[0] # load counter
- pxor @XMM[9], @XMM[1]
- movdqu @XMM[8], 0x00($out) # write output
- pxor @XMM[10], @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor @XMM[11], @XMM[6]
- movdqu @XMM[4], 0x20($out)
- pxor @XMM[12], @XMM[3]
- movdqu @XMM[6], 0x30($out)
- pxor @XMM[13], @XMM[7]
- movdqu @XMM[3], 0x40($out)
- pxor @XMM[14], @XMM[2]
- movdqu @XMM[7], 0x50($out)
- pxor @XMM[15], @XMM[5]
- movdqu @XMM[2], 0x60($out)
- lea .LADD1(%rip), %r11
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- paddd 0x70(%r11), @XMM[0] # .LADD8
- jnz .Lctr_enc_loop
- jmp .Lctr_enc_done
- .align 16
- .Lctr_enc_loop_done:
- add \$8, $len
- movdqu 0x00($inp), @XMM[8] # load input
- pxor @XMM[8], @XMM[0]
- movdqu @XMM[0], 0x00($out) # write output
- cmp \$2,$len
- jb .Lctr_enc_done
- movdqu 0x10($inp), @XMM[9]
- pxor @XMM[9], @XMM[1]
- movdqu @XMM[1], 0x10($out)
- je .Lctr_enc_done
- movdqu 0x20($inp), @XMM[10]
- pxor @XMM[10], @XMM[4]
- movdqu @XMM[4], 0x20($out)
- cmp \$4,$len
- jb .Lctr_enc_done
- movdqu 0x30($inp), @XMM[11]
- pxor @XMM[11], @XMM[6]
- movdqu @XMM[6], 0x30($out)
- je .Lctr_enc_done
- movdqu 0x40($inp), @XMM[12]
- pxor @XMM[12], @XMM[3]
- movdqu @XMM[3], 0x40($out)
- cmp \$6,$len
- jb .Lctr_enc_done
- movdqu 0x50($inp), @XMM[13]
- pxor @XMM[13], @XMM[7]
- movdqu @XMM[7], 0x50($out)
- je .Lctr_enc_done
- movdqu 0x60($inp), @XMM[14]
- pxor @XMM[14], @XMM[2]
- movdqu @XMM[2], 0x60($out)
- jmp .Lctr_enc_done
- .align 16
- .Lctr_enc_short:
- lea 0x20(%rbp), $arg1
- lea 0x30(%rbp), $arg2
- lea ($key), $arg3
- call asm_AES_encrypt
- movdqu ($inp), @XMM[1]
- lea 16($inp), $inp
- mov 0x2c(%rbp), %eax # load 32-bit counter
- bswap %eax
- pxor 0x30(%rbp), @XMM[1]
- inc %eax # increment
- movdqu @XMM[1], ($out)
- bswap %eax
- lea 16($out), $out
- mov %eax, 0x2c(%rsp) # save 32-bit counter
- dec $len
- jnz .Lctr_enc_short
- .Lctr_enc_done:
- lea (%rsp), %rax
- pxor %xmm0, %xmm0
- .Lctr_enc_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- ja .Lctr_enc_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lctr_enc_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lctr_enc_epilogue:
- ret
- .cfi_endproc
- .size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
- ___
- ######################################################################
- # void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
- # const AES_KEY *key1, const AES_KEY *key2,
- # const unsigned char iv[16]);
- #
- my ($twmask,$twres,$twtmp)=@XMM[13..15];
- $arg6=~s/d$//;
- $code.=<<___;
- .globl bsaes_xts_encrypt
- .type bsaes_xts_encrypt,\@abi-omnipotent
- .align 16
- bsaes_xts_encrypt:
- .cfi_startproc
- mov %rsp, %rax
- .Lxts_enc_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp), %rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- mov 0xa0(%rsp),$arg5 # pull key2
- mov 0xa8(%rsp),$arg6 # pull ivp
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lxts_enc_body:
- ___
- $code.=<<___;
- mov %rsp, %rbp # backup %rsp
- .cfi_def_cfa_register %rbp
- mov $arg1, $inp # backup arguments
- mov $arg2, $out
- mov $arg3, $len
- mov $arg4, $key
- lea ($arg6), $arg1
- lea 0x20(%rbp), $arg2
- lea ($arg5), $arg3
- call asm_AES_encrypt # generate initial tweak
- mov 240($key), %eax # rounds
- mov $len, %rbx # backup $len
- mov %eax, %edx # rounds
- shl \$7, %rax # 128 bytes per inner round key
- sub \$`128-32`, %rax # size of bit-sliced key schedule
- sub %rax, %rsp
- mov %rsp, %rax # pass key schedule
- mov $key, %rcx # pass key
- mov %edx, %r10d # pass rounds
- call _bsaes_key_convert
- pxor %xmm6, %xmm7 # fix up last round key
- movdqa %xmm7, (%rax) # save last round key
- and \$-16, $len
- sub \$0x80, %rsp # place for tweak[8]
- movdqa 0x20(%rbp), @XMM[7] # initial tweak
- pxor $twtmp, $twtmp
- movdqa .Lxts_magic(%rip), $twmask
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- sub \$0x80, $len
- jc .Lxts_enc_short
- jmp .Lxts_enc_loop
- .align 16
- .Lxts_enc_loop:
- ___
- for ($i=0;$i<7;$i++) {
- $code.=<<___;
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- movdqa @XMM[7], @XMM[$i]
- movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- ___
- $code.=<<___ if ($i>=1);
- movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
- ___
- $code.=<<___ if ($i>=2);
- pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
- ___
- }
- $code.=<<___;
- movdqu 0x60($inp), @XMM[8+6]
- pxor @XMM[8+5], @XMM[5]
- movdqu 0x70($inp), @XMM[8+7]
- lea 0x80($inp), $inp
- movdqa @XMM[7], 0x70(%rsp)
- pxor @XMM[8+6], @XMM[6]
- lea 0x80(%rsp), %rax # pass key schedule
- pxor @XMM[8+7], @XMM[7]
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[6]
- movdqu @XMM[4], 0x20($out)
- pxor 0x40(%rsp), @XMM[3]
- movdqu @XMM[6], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[3], 0x40($out)
- pxor 0x60(%rsp), @XMM[2]
- movdqu @XMM[7], 0x50($out)
- pxor 0x70(%rsp), @XMM[5]
- movdqu @XMM[2], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
- pxor $twtmp, $twtmp
- movdqa .Lxts_magic(%rip), $twmask
- pcmpgtd @XMM[7], $twtmp
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- sub \$0x80,$len
- jnc .Lxts_enc_loop
- .Lxts_enc_short:
- add \$0x80, $len
- jz .Lxts_enc_done
- ___
- for ($i=0;$i<7;$i++) {
- $code.=<<___;
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- movdqa @XMM[7], @XMM[$i]
- movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- ___
- $code.=<<___ if ($i>=1);
- movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
- cmp \$`0x10*$i`,$len
- je .Lxts_enc_$i
- ___
- $code.=<<___ if ($i>=2);
- pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
- ___
- }
- $code.=<<___;
- movdqu 0x60($inp), @XMM[8+6]
- pxor @XMM[8+5], @XMM[5]
- movdqa @XMM[7], 0x70(%rsp)
- lea 0x70($inp), $inp
- pxor @XMM[8+6], @XMM[6]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[6]
- movdqu @XMM[4], 0x20($out)
- pxor 0x40(%rsp), @XMM[3]
- movdqu @XMM[6], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[3], 0x40($out)
- pxor 0x60(%rsp), @XMM[2]
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[2], 0x60($out)
- lea 0x70($out), $out
- movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_6:
- pxor @XMM[8+4], @XMM[4]
- lea 0x60($inp), $inp
- pxor @XMM[8+5], @XMM[5]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[6]
- movdqu @XMM[4], 0x20($out)
- pxor 0x40(%rsp), @XMM[3]
- movdqu @XMM[6], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[3], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- lea 0x60($out), $out
- movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_5:
- pxor @XMM[8+3], @XMM[3]
- lea 0x50($inp), $inp
- pxor @XMM[8+4], @XMM[4]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[6]
- movdqu @XMM[4], 0x20($out)
- pxor 0x40(%rsp), @XMM[3]
- movdqu @XMM[6], 0x30($out)
- movdqu @XMM[3], 0x40($out)
- lea 0x50($out), $out
- movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_4:
- pxor @XMM[8+2], @XMM[2]
- lea 0x40($inp), $inp
- pxor @XMM[8+3], @XMM[3]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[6]
- movdqu @XMM[4], 0x20($out)
- movdqu @XMM[6], 0x30($out)
- lea 0x40($out), $out
- movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_3:
- pxor @XMM[8+1], @XMM[1]
- lea 0x30($inp), $inp
- pxor @XMM[8+2], @XMM[2]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[4]
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[4], 0x20($out)
- lea 0x30($out), $out
- movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_2:
- pxor @XMM[8+0], @XMM[0]
- lea 0x20($inp), $inp
- pxor @XMM[8+1], @XMM[1]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_encrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- lea 0x20($out), $out
- movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_enc_done
- .align 16
- .Lxts_enc_1:
- pxor @XMM[0], @XMM[8]
- lea 0x10($inp), $inp
- movdqa @XMM[8], 0x20(%rbp)
- lea 0x20(%rbp), $arg1
- lea 0x20(%rbp), $arg2
- lea ($key), $arg3
- call asm_AES_encrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
- #pxor @XMM[8], @XMM[0]
- #lea 0x80(%rsp), %rax # pass key schedule
- #mov %edx, %r10d # pass rounds
- #call _bsaes_encrypt8
- #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- movdqu @XMM[0], 0x00($out) # write output
- lea 0x10($out), $out
- movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
- .Lxts_enc_done:
- and \$15, %ebx
- jz .Lxts_enc_ret
- mov $out, %rdx
- .Lxts_enc_steal:
- movzb ($inp), %eax
- movzb -16(%rdx), %ecx
- lea 1($inp), $inp
- mov %al, -16(%rdx)
- mov %cl, 0(%rdx)
- lea 1(%rdx), %rdx
- sub \$1,%ebx
- jnz .Lxts_enc_steal
- movdqu -16($out), @XMM[0]
- lea 0x20(%rbp), $arg1
- pxor @XMM[7], @XMM[0]
- lea 0x20(%rbp), $arg2
- movdqa @XMM[0], 0x20(%rbp)
- lea ($key), $arg3
- call asm_AES_encrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[7]
- movdqu @XMM[7], -16($out)
- .Lxts_enc_ret:
- lea (%rsp), %rax
- pxor %xmm0, %xmm0
- .Lxts_enc_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- ja .Lxts_enc_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lxts_enc_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lxts_enc_epilogue:
- ret
- .cfi_endproc
- .size bsaes_xts_encrypt,.-bsaes_xts_encrypt
- .globl bsaes_xts_decrypt
- .type bsaes_xts_decrypt,\@abi-omnipotent
- .align 16
- bsaes_xts_decrypt:
- .cfi_startproc
- mov %rsp, %rax
- .Lxts_dec_prologue:
- push %rbp
- .cfi_push %rbp
- push %rbx
- .cfi_push %rbx
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- lea -0x48(%rsp), %rsp
- .cfi_adjust_cfa_offset 0x48
- ___
- $code.=<<___ if ($win64);
- mov 0xa0(%rsp),$arg5 # pull key2
- mov 0xa8(%rsp),$arg6 # pull ivp
- lea -0xa0(%rsp), %rsp
- movaps %xmm6, 0x40(%rsp)
- movaps %xmm7, 0x50(%rsp)
- movaps %xmm8, 0x60(%rsp)
- movaps %xmm9, 0x70(%rsp)
- movaps %xmm10, 0x80(%rsp)
- movaps %xmm11, 0x90(%rsp)
- movaps %xmm12, 0xa0(%rsp)
- movaps %xmm13, 0xb0(%rsp)
- movaps %xmm14, 0xc0(%rsp)
- movaps %xmm15, 0xd0(%rsp)
- .Lxts_dec_body:
- ___
- $code.=<<___;
- mov %rsp, %rbp # backup %rsp
- mov $arg1, $inp # backup arguments
- mov $arg2, $out
- mov $arg3, $len
- mov $arg4, $key
- lea ($arg6), $arg1
- lea 0x20(%rbp), $arg2
- lea ($arg5), $arg3
- call asm_AES_encrypt # generate initial tweak
- mov 240($key), %eax # rounds
- mov $len, %rbx # backup $len
- mov %eax, %edx # rounds
- shl \$7, %rax # 128 bytes per inner round key
- sub \$`128-32`, %rax # size of bit-sliced key schedule
- sub %rax, %rsp
- mov %rsp, %rax # pass key schedule
- mov $key, %rcx # pass key
- mov %edx, %r10d # pass rounds
- call _bsaes_key_convert
- pxor (%rsp), %xmm7 # fix up round 0 key
- movdqa %xmm6, (%rax) # save last round key
- movdqa %xmm7, (%rsp)
- xor %eax, %eax # if ($len%16) len-=16;
- and \$-16, $len
- test \$15, %ebx
- setnz %al
- shl \$4, %rax
- sub %rax, $len
- sub \$0x80, %rsp # place for tweak[8]
- movdqa 0x20(%rbp), @XMM[7] # initial tweak
- pxor $twtmp, $twtmp
- movdqa .Lxts_magic(%rip), $twmask
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- sub \$0x80, $len
- jc .Lxts_dec_short
- jmp .Lxts_dec_loop
- .align 16
- .Lxts_dec_loop:
- ___
- for ($i=0;$i<7;$i++) {
- $code.=<<___;
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- movdqa @XMM[7], @XMM[$i]
- movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- ___
- $code.=<<___ if ($i>=1);
- movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
- ___
- $code.=<<___ if ($i>=2);
- pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
- ___
- }
- $code.=<<___;
- movdqu 0x60($inp), @XMM[8+6]
- pxor @XMM[8+5], @XMM[5]
- movdqu 0x70($inp), @XMM[8+7]
- lea 0x80($inp), $inp
- movdqa @XMM[7], 0x70(%rsp)
- pxor @XMM[8+6], @XMM[6]
- lea 0x80(%rsp), %rax # pass key schedule
- pxor @XMM[8+7], @XMM[7]
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[4]
- movdqu @XMM[6], 0x20($out)
- pxor 0x40(%rsp), @XMM[2]
- movdqu @XMM[4], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[2], 0x40($out)
- pxor 0x60(%rsp), @XMM[3]
- movdqu @XMM[7], 0x50($out)
- pxor 0x70(%rsp), @XMM[5]
- movdqu @XMM[3], 0x60($out)
- movdqu @XMM[5], 0x70($out)
- lea 0x80($out), $out
- movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
- pxor $twtmp, $twtmp
- movdqa .Lxts_magic(%rip), $twmask
- pcmpgtd @XMM[7], $twtmp
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- sub \$0x80,$len
- jnc .Lxts_dec_loop
- .Lxts_dec_short:
- add \$0x80, $len
- jz .Lxts_dec_done
- ___
- for ($i=0;$i<7;$i++) {
- $code.=<<___;
- pshufd \$0x13, $twtmp, $twres
- pxor $twtmp, $twtmp
- movdqa @XMM[7], @XMM[$i]
- movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- pcmpgtd @XMM[7], $twtmp # broadcast upper bits
- pxor $twres, @XMM[7]
- ___
- $code.=<<___ if ($i>=1);
- movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
- cmp \$`0x10*$i`,$len
- je .Lxts_dec_$i
- ___
- $code.=<<___ if ($i>=2);
- pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
- ___
- }
- $code.=<<___;
- movdqu 0x60($inp), @XMM[8+6]
- pxor @XMM[8+5], @XMM[5]
- movdqa @XMM[7], 0x70(%rsp)
- lea 0x70($inp), $inp
- pxor @XMM[8+6], @XMM[6]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[4]
- movdqu @XMM[6], 0x20($out)
- pxor 0x40(%rsp), @XMM[2]
- movdqu @XMM[4], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[2], 0x40($out)
- pxor 0x60(%rsp), @XMM[3]
- movdqu @XMM[7], 0x50($out)
- movdqu @XMM[3], 0x60($out)
- lea 0x70($out), $out
- movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_6:
- pxor @XMM[8+4], @XMM[4]
- lea 0x60($inp), $inp
- pxor @XMM[8+5], @XMM[5]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[4]
- movdqu @XMM[6], 0x20($out)
- pxor 0x40(%rsp), @XMM[2]
- movdqu @XMM[4], 0x30($out)
- pxor 0x50(%rsp), @XMM[7]
- movdqu @XMM[2], 0x40($out)
- movdqu @XMM[7], 0x50($out)
- lea 0x60($out), $out
- movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_5:
- pxor @XMM[8+3], @XMM[3]
- lea 0x50($inp), $inp
- pxor @XMM[8+4], @XMM[4]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[4]
- movdqu @XMM[6], 0x20($out)
- pxor 0x40(%rsp), @XMM[2]
- movdqu @XMM[4], 0x30($out)
- movdqu @XMM[2], 0x40($out)
- lea 0x50($out), $out
- movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_4:
- pxor @XMM[8+2], @XMM[2]
- lea 0x40($inp), $inp
- pxor @XMM[8+3], @XMM[3]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- pxor 0x30(%rsp), @XMM[4]
- movdqu @XMM[6], 0x20($out)
- movdqu @XMM[4], 0x30($out)
- lea 0x40($out), $out
- movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_3:
- pxor @XMM[8+1], @XMM[1]
- lea 0x30($inp), $inp
- pxor @XMM[8+2], @XMM[2]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- pxor 0x20(%rsp), @XMM[6]
- movdqu @XMM[1], 0x10($out)
- movdqu @XMM[6], 0x20($out)
- lea 0x30($out), $out
- movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_2:
- pxor @XMM[8+0], @XMM[0]
- lea 0x20($inp), $inp
- pxor @XMM[8+1], @XMM[1]
- lea 0x80(%rsp), %rax # pass key schedule
- mov %edx, %r10d # pass rounds
- call _bsaes_decrypt8
- pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- pxor 0x10(%rsp), @XMM[1]
- movdqu @XMM[0], 0x00($out) # write output
- movdqu @XMM[1], 0x10($out)
- lea 0x20($out), $out
- movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
- jmp .Lxts_dec_done
- .align 16
- .Lxts_dec_1:
- pxor @XMM[0], @XMM[8]
- lea 0x10($inp), $inp
- movdqa @XMM[8], 0x20(%rbp)
- lea 0x20(%rbp), $arg1
- lea 0x20(%rbp), $arg2
- lea ($key), $arg3
- call asm_AES_decrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
- #pxor @XMM[8], @XMM[0]
- #lea 0x80(%rsp), %rax # pass key schedule
- #mov %edx, %r10d # pass rounds
- #call _bsaes_decrypt8
- #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
- movdqu @XMM[0], 0x00($out) # write output
- lea 0x10($out), $out
- movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
- .Lxts_dec_done:
- and \$15, %ebx
- jz .Lxts_dec_ret
- pxor $twtmp, $twtmp
- movdqa .Lxts_magic(%rip), $twmask
- pcmpgtd @XMM[7], $twtmp
- pshufd \$0x13, $twtmp, $twres
- movdqa @XMM[7], @XMM[6]
- paddq @XMM[7], @XMM[7] # psllq 1,$tweak
- pand $twmask, $twres # isolate carry and residue
- movdqu ($inp), @XMM[0]
- pxor $twres, @XMM[7]
- lea 0x20(%rbp), $arg1
- pxor @XMM[7], @XMM[0]
- lea 0x20(%rbp), $arg2
- movdqa @XMM[0], 0x20(%rbp)
- lea ($key), $arg3
- call asm_AES_decrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[7]
- mov $out, %rdx
- movdqu @XMM[7], ($out)
- .Lxts_dec_steal:
- movzb 16($inp), %eax
- movzb (%rdx), %ecx
- lea 1($inp), $inp
- mov %al, (%rdx)
- mov %cl, 16(%rdx)
- lea 1(%rdx), %rdx
- sub \$1,%ebx
- jnz .Lxts_dec_steal
- movdqu ($out), @XMM[0]
- lea 0x20(%rbp), $arg1
- pxor @XMM[6], @XMM[0]
- lea 0x20(%rbp), $arg2
- movdqa @XMM[0], 0x20(%rbp)
- lea ($key), $arg3
- call asm_AES_decrypt # doesn't touch %xmm
- pxor 0x20(%rbp), @XMM[6]
- movdqu @XMM[6], ($out)
- .Lxts_dec_ret:
- lea (%rsp), %rax
- pxor %xmm0, %xmm0
- .Lxts_dec_bzero: # wipe key schedule [if any]
- movdqa %xmm0, 0x00(%rax)
- movdqa %xmm0, 0x10(%rax)
- lea 0x20(%rax), %rax
- cmp %rax, %rbp
- ja .Lxts_dec_bzero
- lea 0x78(%rbp),%rax
- .cfi_def_cfa %rax,8
- ___
- $code.=<<___ if ($win64);
- movaps 0x40(%rbp), %xmm6
- movaps 0x50(%rbp), %xmm7
- movaps 0x60(%rbp), %xmm8
- movaps 0x70(%rbp), %xmm9
- movaps 0x80(%rbp), %xmm10
- movaps 0x90(%rbp), %xmm11
- movaps 0xa0(%rbp), %xmm12
- movaps 0xb0(%rbp), %xmm13
- movaps 0xc0(%rbp), %xmm14
- movaps 0xd0(%rbp), %xmm15
- lea 0xa0(%rax), %rax
- .Lxts_dec_tail:
- ___
- $code.=<<___;
- mov -48(%rax), %r15
- .cfi_restore %r15
- mov -40(%rax), %r14
- .cfi_restore %r14
- mov -32(%rax), %r13
- .cfi_restore %r13
- mov -24(%rax), %r12
- .cfi_restore %r12
- mov -16(%rax), %rbx
- .cfi_restore %rbx
- mov -8(%rax), %rbp
- .cfi_restore %rbp
- lea (%rax), %rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lxts_dec_epilogue:
- ret
- .cfi_endproc
- .size bsaes_xts_decrypt,.-bsaes_xts_decrypt
- ___
- }
- $code.=<<___;
- .type _bsaes_const,\@object
- .align 64
- _bsaes_const:
- .LM0ISR: # InvShiftRows constants
- .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
- .LISRM0:
- .quad 0x01040b0e0205080f, 0x0306090c00070a0d
- .LISR:
- .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
- .LBS0: # bit-slice constants
- .quad 0x5555555555555555, 0x5555555555555555
- .LBS1:
- .quad 0x3333333333333333, 0x3333333333333333
- .LBS2:
- .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
- .LSR: # shiftrows constants
- .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
- .LSRM0:
- .quad 0x0304090e00050a0f, 0x01060b0c0207080d
- .LM0SR:
- .quad 0x0a0e02060f03070b, 0x0004080c05090d01
- .LSWPUP: # byte-swap upper dword
- .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
- .LSWPUPM0SR:
- .quad 0x0a0d02060c03070b, 0x0004080f05090e01
- .LADD1: # counter increment constants
- .quad 0x0000000000000000, 0x0000000100000000
- .LADD2:
- .quad 0x0000000000000000, 0x0000000200000000
- .LADD3:
- .quad 0x0000000000000000, 0x0000000300000000
- .LADD4:
- .quad 0x0000000000000000, 0x0000000400000000
- .LADD5:
- .quad 0x0000000000000000, 0x0000000500000000
- .LADD6:
- .quad 0x0000000000000000, 0x0000000600000000
- .LADD7:
- .quad 0x0000000000000000, 0x0000000700000000
- .LADD8:
- .quad 0x0000000000000000, 0x0000000800000000
- .Lxts_magic:
- .long 0x87,0,1,0
- .Lmasks:
- .quad 0x0101010101010101, 0x0101010101010101
- .quad 0x0202020202020202, 0x0202020202020202
- .quad 0x0404040404040404, 0x0404040404040404
- .quad 0x0808080808080808, 0x0808080808080808
- .LM0:
- .quad 0x02060a0e03070b0f, 0x0004080c0105090d
- .L63:
- .quad 0x6363636363636363, 0x6363636363636363
- .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
- .align 64
- .size _bsaes_const,.-_bsaes_const
- ___
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
- # CONTEXT *context,DISPATCHER_CONTEXT *disp)
- if ($win64) {
- $rec="%rcx";
- $frame="%rdx";
- $context="%r8";
- $disp="%r9";
- $code.=<<___;
- .extern __imp_RtlVirtualUnwind
- .type se_handler,\@abi-omnipotent
- .align 16
- se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<=prologue label
- jbe .Lin_prologue
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lin_prologue
- mov 8(%r11),%r10d # HandlerData[2]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=tail label
- jae .Lin_tail
- mov 160($context),%rax # pull context->Rbp
- lea 0x40(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # &context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
- lea 0xa0+0x78(%rax),%rax # adjust stack pointer
- .Lin_tail:
- mov -48(%rax),%rbp
- mov -40(%rax),%rbx
- mov -32(%rax),%r12
- mov -24(%rax),%r13
- mov -16(%rax),%r14
- mov -8(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
- .Lin_prologue:
- mov %rax,152($context) # restore context->Rsp
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$`1232/8`,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
- .size se_handler,.-se_handler
- .section .pdata
- .align 4
- ___
- $code.=<<___ if ($ecb);
- .rva .Lecb_enc_prologue
- .rva .Lecb_enc_epilogue
- .rva .Lecb_enc_info
- .rva .Lecb_dec_prologue
- .rva .Lecb_dec_epilogue
- .rva .Lecb_dec_info
- ___
- $code.=<<___;
- .rva .Lcbc_dec_prologue
- .rva .Lcbc_dec_epilogue
- .rva .Lcbc_dec_info
- .rva .Lctr_enc_prologue
- .rva .Lctr_enc_epilogue
- .rva .Lctr_enc_info
- .rva .Lxts_enc_prologue
- .rva .Lxts_enc_epilogue
- .rva .Lxts_enc_info
- .rva .Lxts_dec_prologue
- .rva .Lxts_dec_epilogue
- .rva .Lxts_dec_info
- .section .xdata
- .align 8
- ___
- $code.=<<___ if ($ecb);
- .Lecb_enc_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
- .rva .Lecb_enc_tail
- .long 0
- .Lecb_dec_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
- .rva .Lecb_dec_tail
- .long 0
- ___
- $code.=<<___;
- .Lcbc_dec_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
- .rva .Lcbc_dec_tail
- .long 0
- .Lctr_enc_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
- .rva .Lctr_enc_tail
- .long 0
- .Lxts_enc_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
- .rva .Lxts_enc_tail
- .long 0
- .Lxts_dec_info:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
- .rva .Lxts_dec_tail
- .long 0
- ___
- }
- $code =~ s/\`([^\`]*)\`/eval($1)/gem;
- print $code;
- close STDOUT or die "error closing STDOUT: $!";
|