#! /usr/bin/env perl
# Copyright 2011-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

###################################################################
### AES-128 [originally in CTR mode] ###
### bitsliced implementation for Intel Core 2 processors ###
### requires support of SSE extensions up to SSSE3 ###
### Author: Emilia Käsper and Peter Schwabe ###
### Date: 2009-03-19 ###
### Public domain ###
### ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for ###
### further information. ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was made possible by a mixcolumns() modification that
#   allows its output to be fed back to aesenc[last]; this was achieved
#   at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key schedule
#   and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#			Emilia's	this(*)		difference
#
# Core 2		9.30		8.69		+7%
# Nehalem(**)		7.63		6.88		+11%
# Atom			17.1		16.4		+4%
# Silvermont		-		12.9
# Goldmont		-		8.85
#
# (*)	The comparison is not entirely fair, because "this" is ECB, i.e.
#	none of the extra processing of Emilia's CTR implementation, such
#	as counter value calculation and xor-ing of the input, is
#	performed. However, the CTR calculations account for no more than
#	1% of total time, so the comparison is *rather* fair.
#
# (**)	Results were collected on Westmere, which is considered to be
#	equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally has an
# impact on performance, especially for short inputs. Conversion time in
# CPU cycles and its ratio to the CPU cycles spent in the 8x block
# function is:
#
#			conversion	conversion/8x block
# Core 2		240		0.22
# Nehalem		180		0.20
# Atom			430		0.20
#
# The ratio values mean that 128-byte blocks will be processed 16-18%
# slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%, etc. Also keep
# in mind that input sizes not divisible by 128 are *effectively* slower,
# especially the shortest ones, e.g. consecutive 144-byte blocks are
# processed 44% slower than one would expect, 272 - 29%, 400 - 22%, etc.
# Yet, despite all these "shortcomings" it's still faster than
# ["hyper-threading-safe" code path in] aes-x86_64.pl on all lengths
# above 64 bytes...
#
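# To put the ratios in perspective (illustrative arithmetic only): with a
# conversion/8x-block ratio r, an input of n 128-byte chunks spends roughly
# r/(n+r) of its time in conversion, e.g. for r=0.22 (Core 2):
#
#	n=1 (128 bytes)		0.22/1.22	~18% slower
#	n=2 (256 bytes)		0.22/2.22	~10% slower
#	n=3 (384 bytes)		0.22/3.22	~7%  slower
#
# while r=0.20 gives the lower ends of the ranges quoted above.
#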
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2		9.98
# Nehalem		7.80
# Atom			17.9
# Silvermont		14.0
# Goldmont		10.2
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#						<appro@openssl.org>

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)
my $ecb=0;	# suppress unreferenced ECB subroutines, spare some space...
  112. {
  113. my ($key,$rounds,$const)=("%rax","%r10d","%r11");
  114. sub Sbox {
  115. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  116. # output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
  117. my @b=@_[0..7];
  118. my @t=@_[8..11];
  119. my @s=@_[12..15];
  120. &InBasisChange (@b);
  121. &Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
  122. &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
  123. }
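# In the bit-sliced representation used by this module each of the eight
# @b/@x registers holds one bit position (b0 = least significant) of all
# 128 state bytes, i.e. 8 blocks x 16 bytes, so one pass through the
# boolean formulas of Sbox/InvSbox evaluates 128 S-boxes in parallel.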
  124. sub InBasisChange {
  125. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  126. # output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
  127. my @b=@_[0..7];
  128. $code.=<<___;
  129. pxor @b[6], @b[5]
  130. pxor @b[1], @b[2]
  131. pxor @b[0], @b[3]
  132. pxor @b[2], @b[6]
  133. pxor @b[0], @b[5]
  134. pxor @b[3], @b[6]
  135. pxor @b[7], @b[3]
  136. pxor @b[5], @b[7]
  137. pxor @b[4], @b[3]
  138. pxor @b[5], @b[4]
  139. pxor @b[1], @b[3]
  140. pxor @b[7], @b[2]
  141. pxor @b[5], @b[1]
  142. ___
  143. }
  144. sub OutBasisChange {
  145. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  146. # output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
  147. my @b=@_[0..7];
  148. $code.=<<___;
  149. pxor @b[6], @b[0]
  150. pxor @b[4], @b[1]
  151. pxor @b[0], @b[2]
  152. pxor @b[6], @b[4]
  153. pxor @b[1], @b[6]
  154. pxor @b[5], @b[1]
  155. pxor @b[3], @b[5]
  156. pxor @b[7], @b[3]
  157. pxor @b[5], @b[7]
  158. pxor @b[5], @b[2]
  159. pxor @b[7], @b[4]
  160. ___
  161. }
  162. sub InvSbox {
  163. # input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
  164. # output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
  165. my @b=@_[0..7];
  166. my @t=@_[8..11];
  167. my @s=@_[12..15];
  168. &InvInBasisChange (@b);
  169. &Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
  170. &InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
  171. }
  172. sub InvInBasisChange { # OutBasisChange in reverse
  173. my @b=@_[5,1,2,6,3,7,0,4];
  174. $code.=<<___
  175. pxor @b[7], @b[4]
  176. pxor @b[5], @b[7]
  177. pxor @b[5], @b[2]
  178. pxor @b[7], @b[3]
  179. pxor @b[3], @b[5]
  180. pxor @b[5], @b[1]
  181. pxor @b[1], @b[6]
  182. pxor @b[0], @b[2]
  183. pxor @b[6], @b[4]
  184. pxor @b[6], @b[0]
  185. pxor @b[4], @b[1]
  186. ___
  187. }
  188. sub InvOutBasisChange { # InBasisChange in reverse
  189. my @b=@_[2,5,7,3,6,1,0,4];
  190. $code.=<<___;
  191. pxor @b[5], @b[1]
  192. pxor @b[7], @b[2]
  193. pxor @b[1], @b[3]
  194. pxor @b[5], @b[4]
  195. pxor @b[5], @b[7]
  196. pxor @b[4], @b[3]
  197. pxor @b[0], @b[5]
  198. pxor @b[7], @b[3]
  199. pxor @b[2], @b[6]
  200. pxor @b[1], @b[2]
  201. pxor @b[3], @b[6]
  202. pxor @b[0], @b[3]
  203. pxor @b[6], @b[5]
  204. ___
  205. }
  206. sub Mul_GF4 {
  207. #;*************************************************************
  208. #;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
  209. #;*************************************************************
  210. my ($x0,$x1,$y0,$y1,$t0)=@_;
  211. $code.=<<___;
  212. movdqa $y0, $t0
  213. pxor $y1, $t0
  214. pand $x0, $t0
  215. pxor $x1, $x0
  216. pand $y0, $x1
  217. pand $y1, $x0
  218. pxor $x1, $x0
  219. pxor $t0, $x1
  220. ___
  221. }
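# In terms of the input bits, Mul_GF4 computes (Karatsuba-style, 3 ANDs):
#	x0' = ((x0^x1) & y1) ^ (x1 & y0)
#	x1' = (x1 & y0) ^ (x0 & (y0^y1))
# where x0, x1 on the right-hand side denote the original input values.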
  222. sub Mul_GF4_N { # not used, see next subroutine
  223. # multiply and scale by N
  224. my ($x0,$x1,$y0,$y1,$t0)=@_;
  225. $code.=<<___;
  226. movdqa $y0, $t0
  227. pxor $y1, $t0
  228. pand $x0, $t0
  229. pxor $x1, $x0
  230. pand $y0, $x1
  231. pand $y1, $x0
  232. pxor $x0, $x1
  233. pxor $t0, $x0
  234. ___
  235. }
  236. sub Mul_GF4_N_GF4 {
  237. # interleaved Mul_GF4_N and Mul_GF4
  238. my ($x0,$x1,$y0,$y1,$t0,
  239. $x2,$x3,$y2,$y3,$t1)=@_;
  240. $code.=<<___;
  241. movdqa $y0, $t0
  242. movdqa $y2, $t1
  243. pxor $y1, $t0
  244. pxor $y3, $t1
  245. pand $x0, $t0
  246. pand $x2, $t1
  247. pxor $x1, $x0
  248. pxor $x3, $x2
  249. pand $y0, $x1
  250. pand $y2, $x3
  251. pand $y1, $x0
  252. pand $y3, $x2
  253. pxor $x0, $x1
  254. pxor $x3, $x2
  255. pxor $t0, $x0
  256. pxor $t1, $x3
  257. ___
  258. }
  259. sub Mul_GF16_2 {
  260. my @x=@_[0..7];
  261. my @y=@_[8..11];
  262. my @t=@_[12..15];
  263. $code.=<<___;
  264. movdqa @x[0], @t[0]
  265. movdqa @x[1], @t[1]
  266. ___
  267. &Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
  268. $code.=<<___;
  269. pxor @x[2], @t[0]
  270. pxor @x[3], @t[1]
  271. pxor @y[2], @y[0]
  272. pxor @y[3], @y[1]
  273. ___
  274. Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  275. @x[2], @x[3], @y[2], @y[3], @t[2]);
  276. $code.=<<___;
  277. pxor @t[0], @x[0]
  278. pxor @t[0], @x[2]
  279. pxor @t[1], @x[1]
  280. pxor @t[1], @x[3]
  281. movdqa @x[4], @t[0]
  282. movdqa @x[5], @t[1]
  283. pxor @x[6], @t[0]
  284. pxor @x[7], @t[1]
  285. ___
  286. &Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
  287. @x[6], @x[7], @y[2], @y[3], @t[2]);
  288. $code.=<<___;
  289. pxor @y[2], @y[0]
  290. pxor @y[3], @y[1]
  291. ___
  292. &Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
  293. $code.=<<___;
  294. pxor @t[0], @x[4]
  295. pxor @t[0], @x[6]
  296. pxor @t[1], @x[5]
  297. pxor @t[1], @x[7]
  298. ___
  299. }
  300. sub Inv_GF256 {
  301. #;********************************************************************
  302. #;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
  303. #;********************************************************************
  304. my @x=@_[0..7];
  305. my @t=@_[8..11];
  306. my @s=@_[12..15];
  307. # direct optimizations from hardware
  308. $code.=<<___;
  309. movdqa @x[4], @t[3]
  310. movdqa @x[5], @t[2]
  311. movdqa @x[1], @t[1]
  312. movdqa @x[7], @s[1]
  313. movdqa @x[0], @s[0]
  314. pxor @x[6], @t[3]
  315. pxor @x[7], @t[2]
  316. pxor @x[3], @t[1]
  317. movdqa @t[3], @s[2]
  318. pxor @x[6], @s[1]
  319. movdqa @t[2], @t[0]
  320. pxor @x[2], @s[0]
  321. movdqa @t[3], @s[3]
  322. por @t[1], @t[2]
  323. por @s[0], @t[3]
  324. pxor @t[0], @s[3]
  325. pand @s[0], @s[2]
  326. pxor @t[1], @s[0]
  327. pand @t[1], @t[0]
  328. pand @s[0], @s[3]
  329. movdqa @x[3], @s[0]
  330. pxor @x[2], @s[0]
  331. pand @s[0], @s[1]
  332. pxor @s[1], @t[3]
  333. pxor @s[1], @t[2]
  334. movdqa @x[4], @s[1]
  335. movdqa @x[1], @s[0]
  336. pxor @x[5], @s[1]
  337. pxor @x[0], @s[0]
  338. movdqa @s[1], @t[1]
  339. pand @s[0], @s[1]
  340. por @s[0], @t[1]
  341. pxor @s[1], @t[0]
  342. pxor @s[3], @t[3]
  343. pxor @s[2], @t[2]
  344. pxor @s[3], @t[1]
  345. movdqa @x[7], @s[0]
  346. pxor @s[2], @t[0]
  347. movdqa @x[6], @s[1]
  348. pxor @s[2], @t[1]
  349. movdqa @x[5], @s[2]
  350. pand @x[3], @s[0]
  351. movdqa @x[4], @s[3]
  352. pand @x[2], @s[1]
  353. pand @x[1], @s[2]
  354. por @x[0], @s[3]
  355. pxor @s[0], @t[3]
  356. pxor @s[1], @t[2]
  357. pxor @s[2], @t[1]
  358. pxor @s[3], @t[0]
  359. #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
  360. # new smaller inversion
  361. movdqa @t[3], @s[0]
  362. pand @t[1], @t[3]
  363. pxor @t[2], @s[0]
  364. movdqa @t[0], @s[2]
  365. movdqa @s[0], @s[3]
  366. pxor @t[3], @s[2]
  367. pand @s[2], @s[3]
  368. movdqa @t[1], @s[1]
  369. pxor @t[2], @s[3]
  370. pxor @t[0], @s[1]
  371. pxor @t[2], @t[3]
  372. pand @t[3], @s[1]
  373. movdqa @s[2], @t[2]
  374. pxor @t[0], @s[1]
  375. pxor @s[1], @t[2]
  376. pxor @s[1], @t[1]
  377. pand @t[0], @t[2]
  378. pxor @t[2], @s[2]
  379. pxor @t[2], @t[1]
  380. pand @s[3], @s[2]
  381. pxor @s[0], @s[2]
  382. ___
  383. # output in s3, s2, s1, t1
  384. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
  385. # Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
  386. &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
  387. ### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
  388. }
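# Inv_GF256, wrapped in the basis changes above, implements the
# multiplicative inversion at the heart of the AES S-box in a tower-field
# basis; the S-box affine constant 0x63 is not added here but is folded
# into the round keys by _bsaes_key_convert further down.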
  389. # AES linear components
  390. sub ShiftRows {
  391. my @x=@_[0..7];
  392. my $mask=pop;
  393. $code.=<<___;
  394. pxor 0x00($key),@x[0]
  395. pxor 0x10($key),@x[1]
  396. pxor 0x20($key),@x[2]
  397. pxor 0x30($key),@x[3]
  398. pshufb $mask,@x[0]
  399. pshufb $mask,@x[1]
  400. pxor 0x40($key),@x[4]
  401. pxor 0x50($key),@x[5]
  402. pshufb $mask,@x[2]
  403. pshufb $mask,@x[3]
  404. pxor 0x60($key),@x[6]
  405. pxor 0x70($key),@x[7]
  406. pshufb $mask,@x[4]
  407. pshufb $mask,@x[5]
  408. pshufb $mask,@x[6]
  409. pshufb $mask,@x[7]
  410. lea 0x80($key),$key
  411. ___
  412. }
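# Note that ShiftRows as generated here is fused with AddRoundKey: the
# pxor-s mix in the next 0x80 bytes of bit-sliced round key at $key (which
# is then advanced), while the row rotation itself is a plain byte
# permutation in the bit-sliced domain and therefore costs one pshufb per
# register.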
  413. sub MixColumns {
  414. # modified to emit output in order suitable for feeding back to aesenc[last]
  415. my @x=@_[0..7];
  416. my @t=@_[8..15];
  417. my $inv=@_[16]; # optional
  418. $code.=<<___;
  419. pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
  420. pshufd \$0x93, @x[1], @t[1]
  421. pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
  422. pshufd \$0x93, @x[2], @t[2]
  423. pxor @t[1], @x[1]
  424. pshufd \$0x93, @x[3], @t[3]
  425. pxor @t[2], @x[2]
  426. pshufd \$0x93, @x[4], @t[4]
  427. pxor @t[3], @x[3]
  428. pshufd \$0x93, @x[5], @t[5]
  429. pxor @t[4], @x[4]
  430. pshufd \$0x93, @x[6], @t[6]
  431. pxor @t[5], @x[5]
  432. pshufd \$0x93, @x[7], @t[7]
  433. pxor @t[6], @x[6]
  434. pxor @t[7], @x[7]
  435. pxor @x[0], @t[1]
  436. pxor @x[7], @t[0]
  437. pxor @x[7], @t[1]
  438. pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
  439. pxor @x[1], @t[2]
  440. pshufd \$0x4E, @x[1], @x[1]
  441. pxor @x[4], @t[5]
  442. pxor @t[0], @x[0]
  443. pxor @x[5], @t[6]
  444. pxor @t[1], @x[1]
  445. pxor @x[3], @t[4]
  446. pshufd \$0x4E, @x[4], @t[0]
  447. pxor @x[6], @t[7]
  448. pshufd \$0x4E, @x[5], @t[1]
  449. pxor @x[2], @t[3]
  450. pshufd \$0x4E, @x[3], @x[4]
  451. pxor @x[7], @t[3]
  452. pshufd \$0x4E, @x[7], @x[5]
  453. pxor @x[7], @t[4]
  454. pshufd \$0x4E, @x[6], @x[3]
  455. pxor @t[4], @t[0]
  456. pshufd \$0x4E, @x[2], @x[6]
  457. pxor @t[5], @t[1]
  458. ___
  459. $code.=<<___ if (!$inv);
  460. pxor @t[3], @x[4]
  461. pxor @t[7], @x[5]
  462. pxor @t[6], @x[3]
  463. movdqa @t[0], @x[2]
  464. pxor @t[2], @x[6]
  465. movdqa @t[1], @x[7]
  466. ___
  467. $code.=<<___ if ($inv);
  468. pxor @x[4], @t[3]
  469. pxor @t[7], @x[5]
  470. pxor @x[3], @t[6]
  471. movdqa @t[0], @x[3]
  472. pxor @t[2], @x[6]
  473. movdqa @t[6], @x[2]
  474. movdqa @t[1], @x[7]
  475. movdqa @x[6], @x[4]
  476. movdqa @t[3], @x[6]
  477. ___
  478. }
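# MixColumns in the bit-sliced domain needs only XORs and dword rotations
# (pshufd with 0x93 rotates each slice by one 32-bit word, 0x4E by two);
# multiplication by 0x02 amounts to shifting slice indices, with slice 7
# additionally folded into slices 0, 1, 3 and 4 per the 0x11b reduction
# polynomial - hence the pxor of @x[7] into @t[0], @t[1], @t[3] and @t[4]
# above.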
  479. sub InvMixColumns_orig {
  480. my @x=@_[0..7];
  481. my @t=@_[8..15];
  482. $code.=<<___;
  483. # multiplication by 0x0e
  484. pshufd \$0x93, @x[7], @t[7]
  485. movdqa @x[2], @t[2]
  486. pxor @x[5], @x[7] # 7 5
  487. pxor @x[5], @x[2] # 2 5
  488. pshufd \$0x93, @x[0], @t[0]
  489. movdqa @x[5], @t[5]
  490. pxor @x[0], @x[5] # 5 0 [1]
  491. pxor @x[1], @x[0] # 0 1
  492. pshufd \$0x93, @x[1], @t[1]
  493. pxor @x[2], @x[1] # 1 25
  494. pxor @x[6], @x[0] # 01 6 [2]
  495. pxor @x[3], @x[1] # 125 3 [4]
  496. pshufd \$0x93, @x[3], @t[3]
  497. pxor @x[0], @x[2] # 25 016 [3]
  498. pxor @x[7], @x[3] # 3 75
  499. pxor @x[6], @x[7] # 75 6 [0]
  500. pshufd \$0x93, @x[6], @t[6]
  501. movdqa @x[4], @t[4]
  502. pxor @x[4], @x[6] # 6 4
  503. pxor @x[3], @x[4] # 4 375 [6]
  504. pxor @x[7], @x[3] # 375 756=36
  505. pxor @t[5], @x[6] # 64 5 [7]
  506. pxor @t[2], @x[3] # 36 2
  507. pxor @t[4], @x[3] # 362 4 [5]
  508. pshufd \$0x93, @t[5], @t[5]
  509. ___
  510. my @y = @x[7,5,0,2,1,3,4,6];
  511. $code.=<<___;
  512. # multiplication by 0x0b
  513. pxor @y[0], @y[1]
  514. pxor @t[0], @y[0]
  515. pxor @t[1], @y[1]
  516. pshufd \$0x93, @t[2], @t[2]
  517. pxor @t[5], @y[0]
  518. pxor @t[6], @y[1]
  519. pxor @t[7], @y[0]
  520. pshufd \$0x93, @t[4], @t[4]
  521. pxor @t[6], @t[7] # clobber t[7]
  522. pxor @y[0], @y[1]
  523. pxor @t[0], @y[3]
  524. pshufd \$0x93, @t[0], @t[0]
  525. pxor @t[1], @y[2]
  526. pxor @t[1], @y[4]
  527. pxor @t[2], @y[2]
  528. pshufd \$0x93, @t[1], @t[1]
  529. pxor @t[2], @y[3]
  530. pxor @t[2], @y[5]
  531. pxor @t[7], @y[2]
  532. pshufd \$0x93, @t[2], @t[2]
  533. pxor @t[3], @y[3]
  534. pxor @t[3], @y[6]
  535. pxor @t[3], @y[4]
  536. pshufd \$0x93, @t[3], @t[3]
  537. pxor @t[4], @y[7]
  538. pxor @t[4], @y[5]
  539. pxor @t[7], @y[7]
  540. pxor @t[5], @y[3]
  541. pxor @t[4], @y[4]
  542. pxor @t[5], @t[7] # clobber t[7] even more
  543. pxor @t[7], @y[5]
  544. pshufd \$0x93, @t[4], @t[4]
  545. pxor @t[7], @y[6]
  546. pxor @t[7], @y[4]
  547. pxor @t[5], @t[7]
  548. pshufd \$0x93, @t[5], @t[5]
  549. pxor @t[6], @t[7] # restore t[7]
  550. # multiplication by 0x0d
  551. pxor @y[7], @y[4]
  552. pxor @t[4], @y[7]
  553. pshufd \$0x93, @t[6], @t[6]
  554. pxor @t[0], @y[2]
  555. pxor @t[5], @y[7]
  556. pxor @t[2], @y[2]
  557. pshufd \$0x93, @t[7], @t[7]
  558. pxor @y[1], @y[3]
  559. pxor @t[1], @y[1]
  560. pxor @t[0], @y[0]
  561. pxor @t[0], @y[3]
  562. pxor @t[5], @y[1]
  563. pxor @t[5], @y[0]
  564. pxor @t[7], @y[1]
  565. pshufd \$0x93, @t[0], @t[0]
  566. pxor @t[6], @y[0]
  567. pxor @y[1], @y[3]
  568. pxor @t[1], @y[4]
  569. pshufd \$0x93, @t[1], @t[1]
  570. pxor @t[7], @y[7]
  571. pxor @t[2], @y[4]
  572. pxor @t[2], @y[5]
  573. pshufd \$0x93, @t[2], @t[2]
  574. pxor @t[6], @y[2]
  575. pxor @t[3], @t[6] # clobber t[6]
  576. pxor @y[7], @y[4]
  577. pxor @t[6], @y[3]
  578. pxor @t[6], @y[6]
  579. pxor @t[5], @y[5]
  580. pxor @t[4], @y[6]
  581. pshufd \$0x93, @t[4], @t[4]
  582. pxor @t[6], @y[5]
  583. pxor @t[7], @y[6]
  584. pxor @t[3], @t[6] # restore t[6]
  585. pshufd \$0x93, @t[5], @t[5]
  586. pshufd \$0x93, @t[6], @t[6]
  587. pshufd \$0x93, @t[7], @t[7]
  588. pshufd \$0x93, @t[3], @t[3]
  589. # multiplication by 0x09
  590. pxor @y[1], @y[4]
  591. pxor @y[1], @t[1] # t[1]=y[1]
  592. pxor @t[5], @t[0] # clobber t[0]
  593. pxor @t[5], @t[1]
  594. pxor @t[0], @y[3]
  595. pxor @y[0], @t[0] # t[0]=y[0]
  596. pxor @t[6], @t[1]
  597. pxor @t[7], @t[6] # clobber t[6]
  598. pxor @t[1], @y[4]
  599. pxor @t[4], @y[7]
  600. pxor @y[4], @t[4] # t[4]=y[4]
  601. pxor @t[3], @y[6]
  602. pxor @y[3], @t[3] # t[3]=y[3]
  603. pxor @t[2], @y[5]
  604. pxor @y[2], @t[2] # t[2]=y[2]
  605. pxor @t[7], @t[3]
  606. pxor @y[5], @t[5] # t[5]=y[5]
  607. pxor @t[6], @t[2]
  608. pxor @t[6], @t[5]
  609. pxor @y[6], @t[6] # t[6]=y[6]
  610. pxor @y[7], @t[7] # t[7]=y[7]
  611. movdqa @t[0],@XMM[0]
  612. movdqa @t[1],@XMM[1]
  613. movdqa @t[2],@XMM[2]
  614. movdqa @t[3],@XMM[3]
  615. movdqa @t[4],@XMM[4]
  616. movdqa @t[5],@XMM[5]
  617. movdqa @t[6],@XMM[6]
  618. movdqa @t[7],@XMM[7]
  619. ___
  620. }
  621. sub InvMixColumns {
  622. my @x=@_[0..7];
  623. my @t=@_[8..15];
  624. # Thanks to Jussi Kivilinna for providing pointer to
  625. #
  626. # | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 |
  627. # | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
  628. # | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 |
  629. # | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 |
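# (sanity check of one row of the product in GF(2^8) mod 0x11b:
#  02*05 ^ 01*04 = 0x0a ^ 0x04 = 0x0e and 03*05 ^ 01*04 = 0x0f ^ 0x04 = 0x0b,
#  i.e. the 0e and 0b entries of the top row on the left-hand side)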
  630. $code.=<<___;
  631. # multiplication by 0x05-0x00-0x04-0x00
  632. pshufd \$0x4E, @x[0], @t[0]
  633. pshufd \$0x4E, @x[6], @t[6]
  634. pxor @x[0], @t[0]
  635. pshufd \$0x4E, @x[7], @t[7]
  636. pxor @x[6], @t[6]
  637. pshufd \$0x4E, @x[1], @t[1]
  638. pxor @x[7], @t[7]
  639. pshufd \$0x4E, @x[2], @t[2]
  640. pxor @x[1], @t[1]
  641. pshufd \$0x4E, @x[3], @t[3]
  642. pxor @x[2], @t[2]
  643. pxor @t[6], @x[0]
  644. pxor @t[6], @x[1]
  645. pshufd \$0x4E, @x[4], @t[4]
  646. pxor @x[3], @t[3]
  647. pxor @t[0], @x[2]
  648. pxor @t[1], @x[3]
  649. pshufd \$0x4E, @x[5], @t[5]
  650. pxor @x[4], @t[4]
  651. pxor @t[7], @x[1]
  652. pxor @t[2], @x[4]
  653. pxor @x[5], @t[5]
  654. pxor @t[7], @x[2]
  655. pxor @t[6], @x[3]
  656. pxor @t[6], @x[4]
  657. pxor @t[3], @x[5]
  658. pxor @t[4], @x[6]
  659. pxor @t[7], @x[4]
  660. pxor @t[7], @x[5]
  661. pxor @t[5], @x[7]
  662. ___
  663. &MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
  664. }
  665. sub aesenc { # not used
  666. my @b=@_[0..7];
  667. my @t=@_[8..15];
  668. $code.=<<___;
  669. movdqa 0x30($const),@t[0] # .LSR
  670. ___
  671. &ShiftRows (@b,@t[0]);
  672. &Sbox (@b,@t);
  673. &MixColumns (@b[0,1,4,6,3,7,2,5],@t);
  674. }
  675. sub aesenclast { # not used
  676. my @b=@_[0..7];
  677. my @t=@_[8..15];
  678. $code.=<<___;
  679. movdqa 0x40($const),@t[0] # .LSRM0
  680. ___
  681. &ShiftRows (@b,@t[0]);
  682. &Sbox (@b,@t);
  683. $code.=<<___
  684. pxor 0x00($key),@b[0]
  685. pxor 0x10($key),@b[1]
  686. pxor 0x20($key),@b[4]
  687. pxor 0x30($key),@b[6]
  688. pxor 0x40($key),@b[3]
  689. pxor 0x50($key),@b[7]
  690. pxor 0x60($key),@b[2]
  691. pxor 0x70($key),@b[5]
  692. ___
  693. }
  694. sub swapmove {
  695. my ($a,$b,$n,$mask,$t)=@_;
  696. $code.=<<___;
  697. movdqa $b,$t
  698. psrlq \$$n,$b
  699. pxor $a,$b
  700. pand $mask,$b
  701. pxor $b,$a
  702. psllq \$$n,$b
  703. pxor $t,$b
  704. ___
  705. }
  706. sub swapmove2x {
  707. my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
  708. $code.=<<___;
  709. movdqa $b0,$t0
  710. psrlq \$$n,$b0
  711. movdqa $b1,$t1
  712. psrlq \$$n,$b1
  713. pxor $a0,$b0
  714. pxor $a1,$b1
  715. pand $mask,$b0
  716. pand $mask,$b1
  717. pxor $b0,$a0
  718. psllq \$$n,$b0
  719. pxor $b1,$a1
  720. psllq \$$n,$b1
  721. pxor $t0,$b0
  722. pxor $t1,$b1
  723. ___
  724. }
  725. sub bitslice {
  726. my @x=reverse(@_[0..7]);
  727. my ($t0,$t1,$t2,$t3)=@_[8..11];
  728. $code.=<<___;
  729. movdqa 0x00($const),$t0 # .LBS0
  730. movdqa 0x10($const),$t1 # .LBS1
  731. ___
  732. &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
  733. &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  734. $code.=<<___;
  735. movdqa 0x20($const),$t0 # .LBS2
  736. ___
  737. &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
  738. &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  739. &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
  740. &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
  741. }
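# swapmove is the classic "delta swap": it exchanges the bits of $a selected
# by $mask with the bits of $b selected by $mask<<$n. Applied by bitslice
# with shifts of 1, 2 and 4 and the .LBS0/.LBS1/.LBS2 masks, it transposes
# the 8x8 bit matrices formed by corresponding bytes of the eight registers,
# converting eight AES blocks between byte order and the bit-sliced
# representation.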
  742. $code.=<<___;
  743. .text
  744. .extern asm_AES_encrypt
  745. .extern asm_AES_decrypt
  746. .type _bsaes_encrypt8,\@abi-omnipotent
  747. .align 64
  748. _bsaes_encrypt8:
  749. .cfi_startproc
  750. lea .LBS0(%rip), $const # constants table
  751. movdqa ($key), @XMM[9] # round 0 key
  752. lea 0x10($key), $key
  753. movdqa 0x50($const), @XMM[8] # .LM0SR
  754. pxor @XMM[9], @XMM[0] # xor with round0 key
  755. pxor @XMM[9], @XMM[1]
  756. pxor @XMM[9], @XMM[2]
  757. pxor @XMM[9], @XMM[3]
  758. pshufb @XMM[8], @XMM[0]
  759. pshufb @XMM[8], @XMM[1]
  760. pxor @XMM[9], @XMM[4]
  761. pxor @XMM[9], @XMM[5]
  762. pshufb @XMM[8], @XMM[2]
  763. pshufb @XMM[8], @XMM[3]
  764. pxor @XMM[9], @XMM[6]
  765. pxor @XMM[9], @XMM[7]
  766. pshufb @XMM[8], @XMM[4]
  767. pshufb @XMM[8], @XMM[5]
  768. pshufb @XMM[8], @XMM[6]
  769. pshufb @XMM[8], @XMM[7]
  770. _bsaes_encrypt8_bitslice:
  771. ___
  772. &bitslice (@XMM[0..7, 8..11]);
  773. $code.=<<___;
  774. dec $rounds
  775. jmp .Lenc_sbox
  776. .align 16
  777. .Lenc_loop:
  778. ___
  779. &ShiftRows (@XMM[0..7, 8]);
  780. $code.=".Lenc_sbox:\n";
  781. &Sbox (@XMM[0..7, 8..15]);
  782. $code.=<<___;
  783. dec $rounds
  784. jl .Lenc_done
  785. ___
  786. &MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
  787. $code.=<<___;
  788. movdqa 0x30($const), @XMM[8] # .LSR
  789. jnz .Lenc_loop
  790. movdqa 0x40($const), @XMM[8] # .LSRM0
  791. jmp .Lenc_loop
  792. .align 16
  793. .Lenc_done:
  794. ___
  795. # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
  796. &bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
  797. $code.=<<___;
  798. movdqa ($key), @XMM[8] # last round key
  799. pxor @XMM[8], @XMM[4]
  800. pxor @XMM[8], @XMM[6]
  801. pxor @XMM[8], @XMM[3]
  802. pxor @XMM[8], @XMM[7]
  803. pxor @XMM[8], @XMM[2]
  804. pxor @XMM[8], @XMM[5]
  805. pxor @XMM[8], @XMM[0]
  806. pxor @XMM[8], @XMM[1]
  807. ret
  808. .cfi_endproc
  809. .size _bsaes_encrypt8,.-_bsaes_encrypt8
  810. .type _bsaes_decrypt8,\@abi-omnipotent
  811. .align 64
  812. _bsaes_decrypt8:
  813. .cfi_startproc
  814. lea .LBS0(%rip), $const # constants table
  815. movdqa ($key), @XMM[9] # round 0 key
  816. lea 0x10($key), $key
  817. movdqa -0x30($const), @XMM[8] # .LM0ISR
  818. pxor @XMM[9], @XMM[0] # xor with round0 key
  819. pxor @XMM[9], @XMM[1]
  820. pxor @XMM[9], @XMM[2]
  821. pxor @XMM[9], @XMM[3]
  822. pshufb @XMM[8], @XMM[0]
  823. pshufb @XMM[8], @XMM[1]
  824. pxor @XMM[9], @XMM[4]
  825. pxor @XMM[9], @XMM[5]
  826. pshufb @XMM[8], @XMM[2]
  827. pshufb @XMM[8], @XMM[3]
  828. pxor @XMM[9], @XMM[6]
  829. pxor @XMM[9], @XMM[7]
  830. pshufb @XMM[8], @XMM[4]
  831. pshufb @XMM[8], @XMM[5]
  832. pshufb @XMM[8], @XMM[6]
  833. pshufb @XMM[8], @XMM[7]
  834. ___
  835. &bitslice (@XMM[0..7, 8..11]);
  836. $code.=<<___;
  837. dec $rounds
  838. jmp .Ldec_sbox
  839. .align 16
  840. .Ldec_loop:
  841. ___
  842. &ShiftRows (@XMM[0..7, 8]);
  843. $code.=".Ldec_sbox:\n";
  844. &InvSbox (@XMM[0..7, 8..15]);
  845. $code.=<<___;
  846. dec $rounds
  847. jl .Ldec_done
  848. ___
  849. &InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
  850. $code.=<<___;
  851. movdqa -0x10($const), @XMM[8] # .LISR
  852. jnz .Ldec_loop
  853. movdqa -0x20($const), @XMM[8] # .LISRM0
  854. jmp .Ldec_loop
  855. .align 16
  856. .Ldec_done:
  857. ___
  858. &bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
  859. $code.=<<___;
  860. movdqa ($key), @XMM[8] # last round key
  861. pxor @XMM[8], @XMM[6]
  862. pxor @XMM[8], @XMM[4]
  863. pxor @XMM[8], @XMM[2]
  864. pxor @XMM[8], @XMM[7]
  865. pxor @XMM[8], @XMM[3]
  866. pxor @XMM[8], @XMM[5]
  867. pxor @XMM[8], @XMM[0]
  868. pxor @XMM[8], @XMM[1]
  869. ret
  870. .cfi_endproc
  871. .size _bsaes_decrypt8,.-_bsaes_decrypt8
  872. ___
  873. }
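# _bsaes_encrypt8/_bsaes_decrypt8 return the eight processed blocks in the
# permuted register order dictated by the S-box output wiring: callers
# store @XMM[0],[1],[4],[6],[3],[7],[2],[5] after encryption and
# @XMM[0],[1],[6],[4],[2],[7],[3],[5] after decryption (see the loops
# further down).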
  874. {
  875. my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
  876. sub bitslice_key {
  877. my @x=reverse(@_[0..7]);
  878. my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
  879. &swapmove (@x[0,1],1,$bs0,$t2,$t3);
  880. $code.=<<___;
  881. #&swapmove(@x[2,3],1,$t0,$t2,$t3);
  882. movdqa @x[0], @x[2]
  883. movdqa @x[1], @x[3]
  884. ___
  885. #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
  886. &swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
  887. $code.=<<___;
  888. #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
  889. movdqa @x[0], @x[4]
  890. movdqa @x[2], @x[6]
  891. movdqa @x[1], @x[5]
  892. movdqa @x[3], @x[7]
  893. ___
  894. &swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
  895. &swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
  896. }
  897. $code.=<<___;
  898. .type _bsaes_key_convert,\@abi-omnipotent
  899. .align 16
  900. _bsaes_key_convert:
  901. .cfi_startproc
  902. lea .Lmasks(%rip), $const
  903. movdqu ($inp), %xmm7 # load round 0 key
  904. lea 0x10($inp), $inp
  905. movdqa 0x00($const), %xmm0 # 0x01...
  906. movdqa 0x10($const), %xmm1 # 0x02...
  907. movdqa 0x20($const), %xmm2 # 0x04...
  908. movdqa 0x30($const), %xmm3 # 0x08...
  909. movdqa 0x40($const), %xmm4 # .LM0
  910. pcmpeqd %xmm5, %xmm5 # .LNOT
  911. movdqu ($inp), %xmm6 # load round 1 key
  912. movdqa %xmm7, ($out) # save round 0 key
  913. lea 0x10($out), $out
  914. dec $rounds
  915. jmp .Lkey_loop
  916. .align 16
  917. .Lkey_loop:
  918. pshufb %xmm4, %xmm6 # .LM0
  919. movdqa %xmm0, %xmm8
  920. movdqa %xmm1, %xmm9
  921. pand %xmm6, %xmm8
  922. pand %xmm6, %xmm9
  923. movdqa %xmm2, %xmm10
  924. pcmpeqb %xmm0, %xmm8
  925. psllq \$4, %xmm0 # 0x10...
  926. movdqa %xmm3, %xmm11
  927. pcmpeqb %xmm1, %xmm9
  928. psllq \$4, %xmm1 # 0x20...
  929. pand %xmm6, %xmm10
  930. pand %xmm6, %xmm11
  931. movdqa %xmm0, %xmm12
  932. pcmpeqb %xmm2, %xmm10
  933. psllq \$4, %xmm2 # 0x40...
  934. movdqa %xmm1, %xmm13
  935. pcmpeqb %xmm3, %xmm11
  936. psllq \$4, %xmm3 # 0x80...
  937. movdqa %xmm2, %xmm14
  938. movdqa %xmm3, %xmm15
  939. pxor %xmm5, %xmm8 # "pnot"
  940. pxor %xmm5, %xmm9
  941. pand %xmm6, %xmm12
  942. pand %xmm6, %xmm13
  943. movdqa %xmm8, 0x00($out) # write bit-sliced round key
  944. pcmpeqb %xmm0, %xmm12
  945. psrlq \$4, %xmm0 # 0x01...
  946. movdqa %xmm9, 0x10($out)
  947. pcmpeqb %xmm1, %xmm13
  948. psrlq \$4, %xmm1 # 0x02...
  949. lea 0x10($inp), $inp
  950. pand %xmm6, %xmm14
  951. pand %xmm6, %xmm15
  952. movdqa %xmm10, 0x20($out)
  953. pcmpeqb %xmm2, %xmm14
  954. psrlq \$4, %xmm2 # 0x04...
  955. movdqa %xmm11, 0x30($out)
  956. pcmpeqb %xmm3, %xmm15
  957. psrlq \$4, %xmm3 # 0x08...
  958. movdqu ($inp), %xmm6 # load next round key
  959. pxor %xmm5, %xmm13 # "pnot"
  960. pxor %xmm5, %xmm14
  961. movdqa %xmm12, 0x40($out)
  962. movdqa %xmm13, 0x50($out)
  963. movdqa %xmm14, 0x60($out)
  964. movdqa %xmm15, 0x70($out)
  965. lea 0x80($out),$out
  966. dec $rounds
  967. jnz .Lkey_loop
  968. movdqa 0x50($const), %xmm7 # .L63
  969. #movdqa %xmm6, ($out) # don't save last round key
  970. ret
  971. .cfi_endproc
  972. .size _bsaes_key_convert,.-_bsaes_key_convert
  973. ___
  974. }
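# _bsaes_key_convert expands each inner round key into bit-sliced form,
# 0x80 bytes per round: pcmpeqb against the 0x01,0x02,...,0x80 masks turns
# bit b of every key byte into a full 0x00/0xff byte in slice b.
# Complementing slices 0, 1, 5 and 6 (the "pnot" steps) is the same as
# xor-ing every key byte with 0x63, i.e. the S-box affine constant is
# absorbed into the key schedule instead of being applied in the S-box
# circuit itself.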
  975. if (0 && !$win64) { # following four functions are unsupported interface
  976. # used for benchmarking...
  977. $code.=<<___;
  978. .globl bsaes_enc_key_convert
  979. .type bsaes_enc_key_convert,\@function,2
  980. .align 16
  981. bsaes_enc_key_convert:
  982. mov 240($inp),%r10d # pass rounds
  983. mov $inp,%rcx # pass key
  984. mov $out,%rax # pass key schedule
  985. call _bsaes_key_convert
  986. pxor %xmm6,%xmm7 # fix up last round key
  987. movdqa %xmm7,(%rax) # save last round key
  988. ret
  989. .size bsaes_enc_key_convert,.-bsaes_enc_key_convert
  990. .globl bsaes_encrypt_128
  991. .type bsaes_encrypt_128,\@function,4
  992. .align 16
  993. bsaes_encrypt_128:
  994. .Lenc128_loop:
  995. movdqu 0x00($inp), @XMM[0] # load input
  996. movdqu 0x10($inp), @XMM[1]
  997. movdqu 0x20($inp), @XMM[2]
  998. movdqu 0x30($inp), @XMM[3]
  999. movdqu 0x40($inp), @XMM[4]
  1000. movdqu 0x50($inp), @XMM[5]
  1001. movdqu 0x60($inp), @XMM[6]
  1002. movdqu 0x70($inp), @XMM[7]
  1003. mov $key, %rax # pass the $key
  1004. lea 0x80($inp), $inp
  1005. mov \$10,%r10d
  1006. call _bsaes_encrypt8
  1007. movdqu @XMM[0], 0x00($out) # write output
  1008. movdqu @XMM[1], 0x10($out)
  1009. movdqu @XMM[4], 0x20($out)
  1010. movdqu @XMM[6], 0x30($out)
  1011. movdqu @XMM[3], 0x40($out)
  1012. movdqu @XMM[7], 0x50($out)
  1013. movdqu @XMM[2], 0x60($out)
  1014. movdqu @XMM[5], 0x70($out)
  1015. lea 0x80($out), $out
  1016. sub \$0x80,$len
  1017. ja .Lenc128_loop
  1018. ret
  1019. .size bsaes_encrypt_128,.-bsaes_encrypt_128
  1020. .globl bsaes_dec_key_convert
  1021. .type bsaes_dec_key_convert,\@function,2
  1022. .align 16
  1023. bsaes_dec_key_convert:
  1024. mov 240($inp),%r10d # pass rounds
  1025. mov $inp,%rcx # pass key
  1026. mov $out,%rax # pass key schedule
  1027. call _bsaes_key_convert
  1028. pxor ($out),%xmm7 # fix up round 0 key
  1029. movdqa %xmm6,(%rax) # save last round key
  1030. movdqa %xmm7,($out)
  1031. ret
  1032. .size bsaes_dec_key_convert,.-bsaes_dec_key_convert
  1033. .globl bsaes_decrypt_128
  1034. .type bsaes_decrypt_128,\@function,4
  1035. .align 16
  1036. bsaes_decrypt_128:
  1037. .Ldec128_loop:
  1038. movdqu 0x00($inp), @XMM[0] # load input
  1039. movdqu 0x10($inp), @XMM[1]
  1040. movdqu 0x20($inp), @XMM[2]
  1041. movdqu 0x30($inp), @XMM[3]
  1042. movdqu 0x40($inp), @XMM[4]
  1043. movdqu 0x50($inp), @XMM[5]
  1044. movdqu 0x60($inp), @XMM[6]
  1045. movdqu 0x70($inp), @XMM[7]
  1046. mov $key, %rax # pass the $key
  1047. lea 0x80($inp), $inp
  1048. mov \$10,%r10d
  1049. call _bsaes_decrypt8
  1050. movdqu @XMM[0], 0x00($out) # write output
  1051. movdqu @XMM[1], 0x10($out)
  1052. movdqu @XMM[6], 0x20($out)
  1053. movdqu @XMM[4], 0x30($out)
  1054. movdqu @XMM[2], 0x40($out)
  1055. movdqu @XMM[7], 0x50($out)
  1056. movdqu @XMM[3], 0x60($out)
  1057. movdqu @XMM[5], 0x70($out)
  1058. lea 0x80($out), $out
  1059. sub \$0x80,$len
  1060. ja .Ldec128_loop
  1061. ret
  1062. .size bsaes_decrypt_128,.-bsaes_decrypt_128
  1063. ___
  1064. }
  1065. {
  1066. ######################################################################
  1067. #
  1068. # OpenSSL interface
  1069. #
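# For reference (assumed, not emitted into the assembly): the CBC entry
# point below is expected to correspond to a C prototype along the lines of
#
#   void ossl_bsaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                               size_t length, const AES_KEY *key,
#                               unsigned char ivec[16], int enc);
#
# with $arg1..$arg6 mapping those parameters onto the SysV or Win64 calling
# convention respectively.
#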
  1070. my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
  1071. : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  1072. my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
  1073. if ($ecb) {
  1074. $code.=<<___;
  1075. .globl bsaes_ecb_encrypt_blocks
  1076. .type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
  1077. .align 16
  1078. bsaes_ecb_encrypt_blocks:
  1079. .cfi_startproc
  1080. mov %rsp, %rax
  1081. .Lecb_enc_prologue:
  1082. push %rbp
  1083. .cfi_push %rbp
  1084. push %rbx
  1085. .cfi_push %rbx
  1086. push %r12
  1087. .cfi_push %r12
  1088. push %r13
  1089. .cfi_push %r13
  1090. push %r14
  1091. .cfi_push %r14
  1092. push %r15
  1093. .cfi_push %r15
  1094. lea -0x48(%rsp),%rsp
  1095. .cfi_adjust_cfa_offset 0x48
  1096. ___
  1097. $code.=<<___ if ($win64);
  1098. lea -0xa0(%rsp), %rsp
  1099. movaps %xmm6, 0x40(%rsp)
  1100. movaps %xmm7, 0x50(%rsp)
  1101. movaps %xmm8, 0x60(%rsp)
  1102. movaps %xmm9, 0x70(%rsp)
  1103. movaps %xmm10, 0x80(%rsp)
  1104. movaps %xmm11, 0x90(%rsp)
  1105. movaps %xmm12, 0xa0(%rsp)
  1106. movaps %xmm13, 0xb0(%rsp)
  1107. movaps %xmm14, 0xc0(%rsp)
  1108. movaps %xmm15, 0xd0(%rsp)
  1109. .Lecb_enc_body:
  1110. ___
  1111. $code.=<<___;
  1112. mov %rsp,%rbp # backup %rsp
  1113. .cfi_def_cfa_register %rbp
  1114. mov 240($arg4),%eax # rounds
  1115. mov $arg1,$inp # backup arguments
  1116. mov $arg2,$out
  1117. mov $arg3,$len
  1118. mov $arg4,$key
  1119. cmp \$8,$arg3
  1120. jb .Lecb_enc_short
  1121. mov %eax,%ebx # backup rounds
  1122. shl \$7,%rax # 128 bytes per inner round key
  1123. sub \$`128-32`,%rax # size of bit-sliced key schedule
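# (key schedule layout produced by _bsaes_key_convert: 16 bytes of round-0
#  key, 0x80 bytes per inner round, plus 16 bytes for the last round key
#  saved a few lines below, i.e. 128*rounds-(128-32) bytes in total)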
  1124. sub %rax,%rsp
  1125. mov %rsp,%rax # pass key schedule
  1126. mov $key,%rcx # pass key
  1127. mov %ebx,%r10d # pass rounds
  1128. call _bsaes_key_convert
  1129. pxor %xmm6,%xmm7 # fix up last round key
  1130. movdqa %xmm7,(%rax) # save last round key
  1131. sub \$8,$len
  1132. .Lecb_enc_loop:
  1133. movdqu 0x00($inp), @XMM[0] # load input
  1134. movdqu 0x10($inp), @XMM[1]
  1135. movdqu 0x20($inp), @XMM[2]
  1136. movdqu 0x30($inp), @XMM[3]
  1137. movdqu 0x40($inp), @XMM[4]
  1138. movdqu 0x50($inp), @XMM[5]
  1139. mov %rsp, %rax # pass key schedule
  1140. movdqu 0x60($inp), @XMM[6]
  1141. mov %ebx,%r10d # pass rounds
  1142. movdqu 0x70($inp), @XMM[7]
  1143. lea 0x80($inp), $inp
  1144. call _bsaes_encrypt8
  1145. movdqu @XMM[0], 0x00($out) # write output
  1146. movdqu @XMM[1], 0x10($out)
  1147. movdqu @XMM[4], 0x20($out)
  1148. movdqu @XMM[6], 0x30($out)
  1149. movdqu @XMM[3], 0x40($out)
  1150. movdqu @XMM[7], 0x50($out)
  1151. movdqu @XMM[2], 0x60($out)
  1152. movdqu @XMM[5], 0x70($out)
  1153. lea 0x80($out), $out
  1154. sub \$8,$len
  1155. jnc .Lecb_enc_loop
  1156. add \$8,$len
  1157. jz .Lecb_enc_done
  1158. movdqu 0x00($inp), @XMM[0] # load input
  1159. mov %rsp, %rax # pass key schedule
  1160. mov %ebx,%r10d # pass rounds
  1161. cmp \$2,$len
  1162. jb .Lecb_enc_one
  1163. movdqu 0x10($inp), @XMM[1]
  1164. je .Lecb_enc_two
  1165. movdqu 0x20($inp), @XMM[2]
  1166. cmp \$4,$len
  1167. jb .Lecb_enc_three
  1168. movdqu 0x30($inp), @XMM[3]
  1169. je .Lecb_enc_four
  1170. movdqu 0x40($inp), @XMM[4]
  1171. cmp \$6,$len
  1172. jb .Lecb_enc_five
  1173. movdqu 0x50($inp), @XMM[5]
  1174. je .Lecb_enc_six
  1175. movdqu 0x60($inp), @XMM[6]
  1176. call _bsaes_encrypt8
  1177. movdqu @XMM[0], 0x00($out) # write output
  1178. movdqu @XMM[1], 0x10($out)
  1179. movdqu @XMM[4], 0x20($out)
  1180. movdqu @XMM[6], 0x30($out)
  1181. movdqu @XMM[3], 0x40($out)
  1182. movdqu @XMM[7], 0x50($out)
  1183. movdqu @XMM[2], 0x60($out)
  1184. jmp .Lecb_enc_done
  1185. .align 16
  1186. .Lecb_enc_six:
  1187. call _bsaes_encrypt8
  1188. movdqu @XMM[0], 0x00($out) # write output
  1189. movdqu @XMM[1], 0x10($out)
  1190. movdqu @XMM[4], 0x20($out)
  1191. movdqu @XMM[6], 0x30($out)
  1192. movdqu @XMM[3], 0x40($out)
  1193. movdqu @XMM[7], 0x50($out)
  1194. jmp .Lecb_enc_done
  1195. .align 16
  1196. .Lecb_enc_five:
  1197. call _bsaes_encrypt8
  1198. movdqu @XMM[0], 0x00($out) # write output
  1199. movdqu @XMM[1], 0x10($out)
  1200. movdqu @XMM[4], 0x20($out)
  1201. movdqu @XMM[6], 0x30($out)
  1202. movdqu @XMM[3], 0x40($out)
  1203. jmp .Lecb_enc_done
  1204. .align 16
  1205. .Lecb_enc_four:
  1206. call _bsaes_encrypt8
  1207. movdqu @XMM[0], 0x00($out) # write output
  1208. movdqu @XMM[1], 0x10($out)
  1209. movdqu @XMM[4], 0x20($out)
  1210. movdqu @XMM[6], 0x30($out)
  1211. jmp .Lecb_enc_done
  1212. .align 16
  1213. .Lecb_enc_three:
  1214. call _bsaes_encrypt8
  1215. movdqu @XMM[0], 0x00($out) # write output
  1216. movdqu @XMM[1], 0x10($out)
  1217. movdqu @XMM[4], 0x20($out)
  1218. jmp .Lecb_enc_done
  1219. .align 16
  1220. .Lecb_enc_two:
  1221. call _bsaes_encrypt8
  1222. movdqu @XMM[0], 0x00($out) # write output
  1223. movdqu @XMM[1], 0x10($out)
  1224. jmp .Lecb_enc_done
  1225. .align 16
  1226. .Lecb_enc_one:
  1227. call _bsaes_encrypt8
  1228. movdqu @XMM[0], 0x00($out) # write output
  1229. jmp .Lecb_enc_done
  1230. .align 16
  1231. .Lecb_enc_short:
  1232. lea ($inp), $arg1
  1233. lea ($out), $arg2
  1234. lea ($key), $arg3
  1235. call asm_AES_encrypt
  1236. lea 16($inp), $inp
  1237. lea 16($out), $out
  1238. dec $len
  1239. jnz .Lecb_enc_short
  1240. .Lecb_enc_done:
  1241. lea (%rsp),%rax
  1242. pxor %xmm0, %xmm0
  1243. .Lecb_enc_bzero: # wipe key schedule [if any]
  1244. movdqa %xmm0, 0x00(%rax)
  1245. movdqa %xmm0, 0x10(%rax)
  1246. lea 0x20(%rax), %rax
  1247. cmp %rax, %rbp
  1248. jb .Lecb_enc_bzero
  1249. lea 0x78(%rbp),%rax
  1250. .cfi_def_cfa %rax,8
  1251. ___
  1252. $code.=<<___ if ($win64);
  1253. movaps 0x40(%rbp), %xmm6
  1254. movaps 0x50(%rbp), %xmm7
  1255. movaps 0x60(%rbp), %xmm8
  1256. movaps 0x70(%rbp), %xmm9
  1257. movaps 0x80(%rbp), %xmm10
  1258. movaps 0x90(%rbp), %xmm11
  1259. movaps 0xa0(%rbp), %xmm12
  1260. movaps 0xb0(%rbp), %xmm13
  1261. movaps 0xc0(%rbp), %xmm14
  1262. movaps 0xd0(%rbp), %xmm15
  1263. lea 0xa0(%rax), %rax
  1264. .Lecb_enc_tail:
  1265. ___
  1266. $code.=<<___;
  1267. mov -48(%rax), %r15
  1268. .cfi_restore %r15
  1269. mov -40(%rax), %r14
  1270. .cfi_restore %r14
  1271. mov -32(%rax), %r13
  1272. .cfi_restore %r13
  1273. mov -24(%rax), %r12
  1274. .cfi_restore %r12
  1275. mov -16(%rax), %rbx
  1276. .cfi_restore %rbx
  1277. mov -8(%rax), %rbp
  1278. .cfi_restore %rbp
  1279. lea (%rax), %rsp # restore %rsp
  1280. .cfi_def_cfa_register %rsp
  1281. .Lecb_enc_epilogue:
  1282. ret
  1283. .cfi_endproc
  1284. .size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
  1285. .globl bsaes_ecb_decrypt_blocks
  1286. .type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
  1287. .align 16
  1288. bsaes_ecb_decrypt_blocks:
  1289. .cfi_startproc
  1290. mov %rsp, %rax
  1291. .Lecb_dec_prologue:
  1292. push %rbp
  1293. .cfi_push %rbp
  1294. push %rbx
  1295. .cfi_push %rbx
  1296. push %r12
  1297. .cfi_push %r12
  1298. push %r13
  1299. .cfi_push %r13
  1300. push %r14
  1301. .cfi_push %r14
  1302. push %r15
  1303. .cfi_push %r15
  1304. lea -0x48(%rsp),%rsp
  1305. .cfi_adjust_cfa_offset 0x48
  1306. ___
  1307. $code.=<<___ if ($win64);
  1308. lea -0xa0(%rsp), %rsp
  1309. movaps %xmm6, 0x40(%rsp)
  1310. movaps %xmm7, 0x50(%rsp)
  1311. movaps %xmm8, 0x60(%rsp)
  1312. movaps %xmm9, 0x70(%rsp)
  1313. movaps %xmm10, 0x80(%rsp)
  1314. movaps %xmm11, 0x90(%rsp)
  1315. movaps %xmm12, 0xa0(%rsp)
  1316. movaps %xmm13, 0xb0(%rsp)
  1317. movaps %xmm14, 0xc0(%rsp)
  1318. movaps %xmm15, 0xd0(%rsp)
  1319. .Lecb_dec_body:
  1320. ___
  1321. $code.=<<___;
  1322. mov %rsp,%rbp # backup %rsp
  1323. .cfi_def_cfa_register %rbp
  1324. mov 240($arg4),%eax # rounds
  1325. mov $arg1,$inp # backup arguments
  1326. mov $arg2,$out
  1327. mov $arg3,$len
  1328. mov $arg4,$key
  1329. cmp \$8,$arg3
  1330. jb .Lecb_dec_short
  1331. mov %eax,%ebx # backup rounds
  1332. shl \$7,%rax # 128 bytes per inner round key
  1333. sub \$`128-32`,%rax # size of bit-sliced key schedule
  1334. sub %rax,%rsp
  1335. mov %rsp,%rax # pass key schedule
  1336. mov $key,%rcx # pass key
  1337. mov %ebx,%r10d # pass rounds
  1338. call _bsaes_key_convert
  1339. pxor (%rsp),%xmm7 # fix up 0 round key
  1340. movdqa %xmm6,(%rax) # save last round key
  1341. movdqa %xmm7,(%rsp)
  1342. sub \$8,$len
  1343. .Lecb_dec_loop:
  1344. movdqu 0x00($inp), @XMM[0] # load input
  1345. movdqu 0x10($inp), @XMM[1]
  1346. movdqu 0x20($inp), @XMM[2]
  1347. movdqu 0x30($inp), @XMM[3]
  1348. movdqu 0x40($inp), @XMM[4]
  1349. movdqu 0x50($inp), @XMM[5]
  1350. mov %rsp, %rax # pass key schedule
  1351. movdqu 0x60($inp), @XMM[6]
  1352. mov %ebx,%r10d # pass rounds
  1353. movdqu 0x70($inp), @XMM[7]
  1354. lea 0x80($inp), $inp
  1355. call _bsaes_decrypt8
  1356. movdqu @XMM[0], 0x00($out) # write output
  1357. movdqu @XMM[1], 0x10($out)
  1358. movdqu @XMM[6], 0x20($out)
  1359. movdqu @XMM[4], 0x30($out)
  1360. movdqu @XMM[2], 0x40($out)
  1361. movdqu @XMM[7], 0x50($out)
  1362. movdqu @XMM[3], 0x60($out)
  1363. movdqu @XMM[5], 0x70($out)
  1364. lea 0x80($out), $out
  1365. sub \$8,$len
  1366. jnc .Lecb_dec_loop
  1367. add \$8,$len
  1368. jz .Lecb_dec_done
  1369. movdqu 0x00($inp), @XMM[0] # load input
  1370. mov %rsp, %rax # pass key schedule
  1371. mov %ebx,%r10d # pass rounds
  1372. cmp \$2,$len
  1373. jb .Lecb_dec_one
  1374. movdqu 0x10($inp), @XMM[1]
  1375. je .Lecb_dec_two
  1376. movdqu 0x20($inp), @XMM[2]
  1377. cmp \$4,$len
  1378. jb .Lecb_dec_three
  1379. movdqu 0x30($inp), @XMM[3]
  1380. je .Lecb_dec_four
  1381. movdqu 0x40($inp), @XMM[4]
  1382. cmp \$6,$len
  1383. jb .Lecb_dec_five
  1384. movdqu 0x50($inp), @XMM[5]
  1385. je .Lecb_dec_six
  1386. movdqu 0x60($inp), @XMM[6]
  1387. call _bsaes_decrypt8
  1388. movdqu @XMM[0], 0x00($out) # write output
  1389. movdqu @XMM[1], 0x10($out)
  1390. movdqu @XMM[6], 0x20($out)
  1391. movdqu @XMM[4], 0x30($out)
  1392. movdqu @XMM[2], 0x40($out)
  1393. movdqu @XMM[7], 0x50($out)
  1394. movdqu @XMM[3], 0x60($out)
  1395. jmp .Lecb_dec_done
  1396. .align 16
  1397. .Lecb_dec_six:
  1398. call _bsaes_decrypt8
  1399. movdqu @XMM[0], 0x00($out) # write output
  1400. movdqu @XMM[1], 0x10($out)
  1401. movdqu @XMM[6], 0x20($out)
  1402. movdqu @XMM[4], 0x30($out)
  1403. movdqu @XMM[2], 0x40($out)
  1404. movdqu @XMM[7], 0x50($out)
  1405. jmp .Lecb_dec_done
  1406. .align 16
  1407. .Lecb_dec_five:
  1408. call _bsaes_decrypt8
  1409. movdqu @XMM[0], 0x00($out) # write output
  1410. movdqu @XMM[1], 0x10($out)
  1411. movdqu @XMM[6], 0x20($out)
  1412. movdqu @XMM[4], 0x30($out)
  1413. movdqu @XMM[2], 0x40($out)
  1414. jmp .Lecb_dec_done
  1415. .align 16
  1416. .Lecb_dec_four:
  1417. call _bsaes_decrypt8
  1418. movdqu @XMM[0], 0x00($out) # write output
  1419. movdqu @XMM[1], 0x10($out)
  1420. movdqu @XMM[6], 0x20($out)
  1421. movdqu @XMM[4], 0x30($out)
  1422. jmp .Lecb_dec_done
  1423. .align 16
  1424. .Lecb_dec_three:
  1425. call _bsaes_decrypt8
  1426. movdqu @XMM[0], 0x00($out) # write output
  1427. movdqu @XMM[1], 0x10($out)
  1428. movdqu @XMM[6], 0x20($out)
  1429. jmp .Lecb_dec_done
  1430. .align 16
  1431. .Lecb_dec_two:
  1432. call _bsaes_decrypt8
  1433. movdqu @XMM[0], 0x00($out) # write output
  1434. movdqu @XMM[1], 0x10($out)
  1435. jmp .Lecb_dec_done
  1436. .align 16
  1437. .Lecb_dec_one:
  1438. call _bsaes_decrypt8
  1439. movdqu @XMM[0], 0x00($out) # write output
  1440. jmp .Lecb_dec_done
  1441. .align 16
  1442. .Lecb_dec_short:
  1443. lea ($inp), $arg1
  1444. lea ($out), $arg2
  1445. lea ($key), $arg3
  1446. call asm_AES_decrypt
  1447. lea 16($inp), $inp
  1448. lea 16($out), $out
  1449. dec $len
  1450. jnz .Lecb_dec_short
  1451. .Lecb_dec_done:
  1452. lea (%rsp),%rax
  1453. pxor %xmm0, %xmm0
  1454. .Lecb_dec_bzero: # wipe key schedule [if any]
  1455. movdqa %xmm0, 0x00(%rax)
  1456. movdqa %xmm0, 0x10(%rax)
  1457. lea 0x20(%rax), %rax
  1458. cmp %rax, %rbp
1459. ja .Lecb_dec_bzero
  1460. lea 0x78(%rbp),%rax
  1461. .cfi_def_cfa %rax,8
  1462. ___
  1463. $code.=<<___ if ($win64);
  1464. movaps 0x40(%rbp), %xmm6
  1465. movaps 0x50(%rbp), %xmm7
  1466. movaps 0x60(%rbp), %xmm8
  1467. movaps 0x70(%rbp), %xmm9
  1468. movaps 0x80(%rbp), %xmm10
  1469. movaps 0x90(%rbp), %xmm11
  1470. movaps 0xa0(%rbp), %xmm12
  1471. movaps 0xb0(%rbp), %xmm13
  1472. movaps 0xc0(%rbp), %xmm14
  1473. movaps 0xd0(%rbp), %xmm15
  1474. lea 0xa0(%rax), %rax
  1475. .Lecb_dec_tail:
  1476. ___
  1477. $code.=<<___;
  1478. mov -48(%rax), %r15
  1479. .cfi_restore %r15
  1480. mov -40(%rax), %r14
  1481. .cfi_restore %r14
  1482. mov -32(%rax), %r13
  1483. .cfi_restore %r13
  1484. mov -24(%rax), %r12
  1485. .cfi_restore %r12
  1486. mov -16(%rax), %rbx
  1487. .cfi_restore %rbx
  1488. mov -8(%rax), %rbp
  1489. .cfi_restore %rbp
  1490. lea (%rax), %rsp # restore %rsp
  1491. .cfi_def_cfa_register %rsp
  1492. .Lecb_dec_epilogue:
  1493. ret
  1494. .cfi_endproc
  1495. .size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
  1496. ___
  1497. }
  1498. $code.=<<___;
  1499. .extern asm_AES_cbc_encrypt
  1500. .globl ossl_bsaes_cbc_encrypt
  1501. .type ossl_bsaes_cbc_encrypt,\@abi-omnipotent
  1502. .align 16
  1503. ossl_bsaes_cbc_encrypt:
  1504. .cfi_startproc
  1505. endbranch
  1506. ___
  1507. $code.=<<___ if ($win64);
  1508. mov 48(%rsp),$arg6 # pull direction flag
  1509. ___
  1510. $code.=<<___;
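# Only CBC decryption of at least 0x80 bytes (8 blocks) takes the bit-sliced
# path below; encryption ($arg6!=0) and shorter inputs are handed straight
# to asm_AES_cbc_encrypt.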
  1511. cmp \$0,$arg6
  1512. jne asm_AES_cbc_encrypt
  1513. cmp \$128,$arg3
  1514. jb asm_AES_cbc_encrypt
  1515. mov %rsp, %rax
  1516. .Lcbc_dec_prologue:
  1517. push %rbp
  1518. .cfi_push %rbp
  1519. push %rbx
  1520. .cfi_push %rbx
  1521. push %r12
  1522. .cfi_push %r12
  1523. push %r13
  1524. .cfi_push %r13
  1525. push %r14
  1526. .cfi_push %r14
  1527. push %r15
  1528. .cfi_push %r15
  1529. lea -0x48(%rsp), %rsp
  1530. .cfi_adjust_cfa_offset 0x48
  1531. ___
  1532. $code.=<<___ if ($win64);
  1533. mov 0xa0(%rsp),$arg5 # pull ivp
  1534. lea -0xa0(%rsp), %rsp
  1535. movaps %xmm6, 0x40(%rsp)
  1536. movaps %xmm7, 0x50(%rsp)
  1537. movaps %xmm8, 0x60(%rsp)
  1538. movaps %xmm9, 0x70(%rsp)
  1539. movaps %xmm10, 0x80(%rsp)
  1540. movaps %xmm11, 0x90(%rsp)
  1541. movaps %xmm12, 0xa0(%rsp)
  1542. movaps %xmm13, 0xb0(%rsp)
  1543. movaps %xmm14, 0xc0(%rsp)
  1544. movaps %xmm15, 0xd0(%rsp)
  1545. .Lcbc_dec_body:
  1546. ___
  1547. $code.=<<___;
  1548. mov %rsp, %rbp # backup %rsp
  1549. .cfi_def_cfa_register %rbp
  1550. mov 240($arg4), %eax # rounds
  1551. mov $arg1, $inp # backup arguments
  1552. mov $arg2, $out
  1553. mov $arg3, $len
  1554. mov $arg4, $key
  1555. mov $arg5, %rbx
  1556. shr \$4, $len # bytes to blocks
  1557. mov %eax, %edx # rounds
  1558. shl \$7, %rax # 128 bytes per inner round key
  1559. sub \$`128-32`, %rax # size of bit-sliced key schedule
  1560. sub %rax, %rsp
  1561. mov %rsp, %rax # pass key schedule
  1562. mov $key, %rcx # pass key
  1563. mov %edx, %r10d # pass rounds
  1564. call _bsaes_key_convert
1565. pxor (%rsp),%xmm7 # fix up round 0 key
  1566. movdqa %xmm6,(%rax) # save last round key
  1567. movdqa %xmm7,(%rsp)
  1568. movdqu (%rbx), @XMM[15] # load IV
  1569. sub \$8,$len
  1570. .Lcbc_dec_loop:
  1571. movdqu 0x00($inp), @XMM[0] # load input
  1572. movdqu 0x10($inp), @XMM[1]
  1573. movdqu 0x20($inp), @XMM[2]
  1574. movdqu 0x30($inp), @XMM[3]
  1575. movdqu 0x40($inp), @XMM[4]
  1576. movdqu 0x50($inp), @XMM[5]
  1577. mov %rsp, %rax # pass key schedule
  1578. movdqu 0x60($inp), @XMM[6]
  1579. mov %edx,%r10d # pass rounds
  1580. movdqu 0x70($inp), @XMM[7]
  1581. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1582. call _bsaes_decrypt8
  1583. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1584. movdqu 0x00($inp), @XMM[8] # re-load input
  1585. movdqu 0x10($inp), @XMM[9]
  1586. pxor @XMM[8], @XMM[1]
  1587. movdqu 0x20($inp), @XMM[10]
  1588. pxor @XMM[9], @XMM[6]
  1589. movdqu 0x30($inp), @XMM[11]
  1590. pxor @XMM[10], @XMM[4]
  1591. movdqu 0x40($inp), @XMM[12]
  1592. pxor @XMM[11], @XMM[2]
  1593. movdqu 0x50($inp), @XMM[13]
  1594. pxor @XMM[12], @XMM[7]
  1595. movdqu 0x60($inp), @XMM[14]
  1596. pxor @XMM[13], @XMM[3]
  1597. movdqu 0x70($inp), @XMM[15] # IV
  1598. pxor @XMM[14], @XMM[5]
  1599. movdqu @XMM[0], 0x00($out) # write output
  1600. lea 0x80($inp), $inp
  1601. movdqu @XMM[1], 0x10($out)
  1602. movdqu @XMM[6], 0x20($out)
  1603. movdqu @XMM[4], 0x30($out)
  1604. movdqu @XMM[2], 0x40($out)
  1605. movdqu @XMM[7], 0x50($out)
  1606. movdqu @XMM[3], 0x60($out)
  1607. movdqu @XMM[5], 0x70($out)
  1608. lea 0x80($out), $out
  1609. sub \$8,$len
  1610. jnc .Lcbc_dec_loop
  1611. add \$8,$len
  1612. jz .Lcbc_dec_done
  1613. movdqu 0x00($inp), @XMM[0] # load input
  1614. mov %rsp, %rax # pass key schedule
  1615. mov %edx, %r10d # pass rounds
  1616. cmp \$2,$len
  1617. jb .Lcbc_dec_one
  1618. movdqu 0x10($inp), @XMM[1]
  1619. je .Lcbc_dec_two
  1620. movdqu 0x20($inp), @XMM[2]
  1621. cmp \$4,$len
  1622. jb .Lcbc_dec_three
  1623. movdqu 0x30($inp), @XMM[3]
  1624. je .Lcbc_dec_four
  1625. movdqu 0x40($inp), @XMM[4]
  1626. cmp \$6,$len
  1627. jb .Lcbc_dec_five
  1628. movdqu 0x50($inp), @XMM[5]
  1629. je .Lcbc_dec_six
  1630. movdqu 0x60($inp), @XMM[6]
  1631. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1632. call _bsaes_decrypt8
  1633. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1634. movdqu 0x00($inp), @XMM[8] # re-load input
  1635. movdqu 0x10($inp), @XMM[9]
  1636. pxor @XMM[8], @XMM[1]
  1637. movdqu 0x20($inp), @XMM[10]
  1638. pxor @XMM[9], @XMM[6]
  1639. movdqu 0x30($inp), @XMM[11]
  1640. pxor @XMM[10], @XMM[4]
  1641. movdqu 0x40($inp), @XMM[12]
  1642. pxor @XMM[11], @XMM[2]
  1643. movdqu 0x50($inp), @XMM[13]
  1644. pxor @XMM[12], @XMM[7]
  1645. movdqu 0x60($inp), @XMM[15] # IV
  1646. pxor @XMM[13], @XMM[3]
  1647. movdqu @XMM[0], 0x00($out) # write output
  1648. movdqu @XMM[1], 0x10($out)
  1649. movdqu @XMM[6], 0x20($out)
  1650. movdqu @XMM[4], 0x30($out)
  1651. movdqu @XMM[2], 0x40($out)
  1652. movdqu @XMM[7], 0x50($out)
  1653. movdqu @XMM[3], 0x60($out)
  1654. jmp .Lcbc_dec_done
  1655. .align 16
  1656. .Lcbc_dec_six:
  1657. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1658. call _bsaes_decrypt8
  1659. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1660. movdqu 0x00($inp), @XMM[8] # re-load input
  1661. movdqu 0x10($inp), @XMM[9]
  1662. pxor @XMM[8], @XMM[1]
  1663. movdqu 0x20($inp), @XMM[10]
  1664. pxor @XMM[9], @XMM[6]
  1665. movdqu 0x30($inp), @XMM[11]
  1666. pxor @XMM[10], @XMM[4]
  1667. movdqu 0x40($inp), @XMM[12]
  1668. pxor @XMM[11], @XMM[2]
  1669. movdqu 0x50($inp), @XMM[15] # IV
  1670. pxor @XMM[12], @XMM[7]
  1671. movdqu @XMM[0], 0x00($out) # write output
  1672. movdqu @XMM[1], 0x10($out)
  1673. movdqu @XMM[6], 0x20($out)
  1674. movdqu @XMM[4], 0x30($out)
  1675. movdqu @XMM[2], 0x40($out)
  1676. movdqu @XMM[7], 0x50($out)
  1677. jmp .Lcbc_dec_done
  1678. .align 16
  1679. .Lcbc_dec_five:
  1680. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1681. call _bsaes_decrypt8
  1682. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1683. movdqu 0x00($inp), @XMM[8] # re-load input
  1684. movdqu 0x10($inp), @XMM[9]
  1685. pxor @XMM[8], @XMM[1]
  1686. movdqu 0x20($inp), @XMM[10]
  1687. pxor @XMM[9], @XMM[6]
  1688. movdqu 0x30($inp), @XMM[11]
  1689. pxor @XMM[10], @XMM[4]
  1690. movdqu 0x40($inp), @XMM[15] # IV
  1691. pxor @XMM[11], @XMM[2]
  1692. movdqu @XMM[0], 0x00($out) # write output
  1693. movdqu @XMM[1], 0x10($out)
  1694. movdqu @XMM[6], 0x20($out)
  1695. movdqu @XMM[4], 0x30($out)
  1696. movdqu @XMM[2], 0x40($out)
  1697. jmp .Lcbc_dec_done
  1698. .align 16
  1699. .Lcbc_dec_four:
  1700. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1701. call _bsaes_decrypt8
  1702. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1703. movdqu 0x00($inp), @XMM[8] # re-load input
  1704. movdqu 0x10($inp), @XMM[9]
  1705. pxor @XMM[8], @XMM[1]
  1706. movdqu 0x20($inp), @XMM[10]
  1707. pxor @XMM[9], @XMM[6]
  1708. movdqu 0x30($inp), @XMM[15] # IV
  1709. pxor @XMM[10], @XMM[4]
  1710. movdqu @XMM[0], 0x00($out) # write output
  1711. movdqu @XMM[1], 0x10($out)
  1712. movdqu @XMM[6], 0x20($out)
  1713. movdqu @XMM[4], 0x30($out)
  1714. jmp .Lcbc_dec_done
  1715. .align 16
  1716. .Lcbc_dec_three:
  1717. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1718. call _bsaes_decrypt8
  1719. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1720. movdqu 0x00($inp), @XMM[8] # re-load input
  1721. movdqu 0x10($inp), @XMM[9]
  1722. pxor @XMM[8], @XMM[1]
  1723. movdqu 0x20($inp), @XMM[15] # IV
  1724. pxor @XMM[9], @XMM[6]
  1725. movdqu @XMM[0], 0x00($out) # write output
  1726. movdqu @XMM[1], 0x10($out)
  1727. movdqu @XMM[6], 0x20($out)
  1728. jmp .Lcbc_dec_done
  1729. .align 16
  1730. .Lcbc_dec_two:
  1731. movdqa @XMM[15], 0x20(%rbp) # put aside IV
  1732. call _bsaes_decrypt8
  1733. pxor 0x20(%rbp), @XMM[0] # ^= IV
  1734. movdqu 0x00($inp), @XMM[8] # re-load input
  1735. movdqu 0x10($inp), @XMM[15] # IV
  1736. pxor @XMM[8], @XMM[1]
  1737. movdqu @XMM[0], 0x00($out) # write output
  1738. movdqu @XMM[1], 0x10($out)
  1739. jmp .Lcbc_dec_done
  1740. .align 16
  1741. .Lcbc_dec_one:
  1742. lea ($inp), $arg1
  1743. lea 0x20(%rbp), $arg2 # buffer output
  1744. lea ($key), $arg3
  1745. call asm_AES_decrypt # doesn't touch %xmm
  1746. pxor 0x20(%rbp), @XMM[15] # ^= IV
  1747. movdqu @XMM[15], ($out) # write output
  1748. movdqa @XMM[0], @XMM[15] # IV
  1749. .Lcbc_dec_done:
  1750. movdqu @XMM[15], (%rbx) # return IV
  1751. lea (%rsp), %rax
  1752. pxor %xmm0, %xmm0
  1753. .Lcbc_dec_bzero: # wipe key schedule [if any]
  1754. movdqa %xmm0, 0x00(%rax)
  1755. movdqa %xmm0, 0x10(%rax)
  1756. lea 0x20(%rax), %rax
  1757. cmp %rax, %rbp
  1758. ja .Lcbc_dec_bzero
  1759. lea 0x78(%rbp),%rax
  1760. .cfi_def_cfa %rax,8
  1761. ___
  1762. $code.=<<___ if ($win64);
  1763. movaps 0x40(%rbp), %xmm6
  1764. movaps 0x50(%rbp), %xmm7
  1765. movaps 0x60(%rbp), %xmm8
  1766. movaps 0x70(%rbp), %xmm9
  1767. movaps 0x80(%rbp), %xmm10
  1768. movaps 0x90(%rbp), %xmm11
  1769. movaps 0xa0(%rbp), %xmm12
  1770. movaps 0xb0(%rbp), %xmm13
  1771. movaps 0xc0(%rbp), %xmm14
  1772. movaps 0xd0(%rbp), %xmm15
  1773. lea 0xa0(%rax), %rax
  1774. .Lcbc_dec_tail:
  1775. ___
  1776. $code.=<<___;
  1777. mov -48(%rax), %r15
  1778. .cfi_restore %r15
  1779. mov -40(%rax), %r14
  1780. .cfi_restore %r14
  1781. mov -32(%rax), %r13
  1782. .cfi_restore %r13
  1783. mov -24(%rax), %r12
  1784. .cfi_restore %r12
  1785. mov -16(%rax), %rbx
  1786. .cfi_restore %rbx
  1787. mov -8(%rax), %rbp
  1788. .cfi_restore %rbp
  1789. lea (%rax), %rsp # restore %rsp
  1790. .cfi_def_cfa_register %rsp
  1791. .Lcbc_dec_epilogue:
  1792. ret
  1793. .cfi_endproc
  1794. .size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt
  1795. .globl ossl_bsaes_ctr32_encrypt_blocks
  1796. .type ossl_bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
  1797. .align 16
  1798. ossl_bsaes_ctr32_encrypt_blocks:
  1799. .cfi_startproc
  1800. endbranch
  1801. mov %rsp, %rax
  1802. .Lctr_enc_prologue:
  1803. push %rbp
  1804. .cfi_push %rbp
  1805. push %rbx
  1806. .cfi_push %rbx
  1807. push %r12
  1808. .cfi_push %r12
  1809. push %r13
  1810. .cfi_push %r13
  1811. push %r14
  1812. .cfi_push %r14
  1813. push %r15
  1814. .cfi_push %r15
  1815. lea -0x48(%rsp), %rsp
  1816. .cfi_adjust_cfa_offset 0x48
  1817. ___
  1818. $code.=<<___ if ($win64);
  1819. mov 0xa0(%rsp),$arg5 # pull ivp
  1820. lea -0xa0(%rsp), %rsp
  1821. movaps %xmm6, 0x40(%rsp)
  1822. movaps %xmm7, 0x50(%rsp)
  1823. movaps %xmm8, 0x60(%rsp)
  1824. movaps %xmm9, 0x70(%rsp)
  1825. movaps %xmm10, 0x80(%rsp)
  1826. movaps %xmm11, 0x90(%rsp)
  1827. movaps %xmm12, 0xa0(%rsp)
  1828. movaps %xmm13, 0xb0(%rsp)
  1829. movaps %xmm14, 0xc0(%rsp)
  1830. movaps %xmm15, 0xd0(%rsp)
  1831. .Lctr_enc_body:
  1832. ___
  1833. $code.=<<___;
  1834. mov %rsp, %rbp # backup %rsp
  1835. .cfi_def_cfa_register %rbp
  1836. movdqu ($arg5), %xmm0 # load counter
  1837. mov 240($arg4), %eax # rounds
  1838. mov $arg1, $inp # backup arguments
  1839. mov $arg2, $out
  1840. mov $arg3, $len
  1841. mov $arg4, $key
  1842. movdqa %xmm0, 0x20(%rbp) # copy counter
  1843. cmp \$8, $arg3
  1844. jb .Lctr_enc_short
  1845. mov %eax, %ebx # rounds
  1846. shl \$7, %rax # 128 bytes per inner round key
  1847. sub \$`128-32`, %rax # size of bit-sliced key schedule
  1848. sub %rax, %rsp
  1849. mov %rsp, %rax # pass key schedule
  1850. mov $key, %rcx # pass key
  1851. mov %ebx, %r10d # pass rounds
  1852. call _bsaes_key_convert
  1853. pxor %xmm6,%xmm7 # fix up last round key
  1854. movdqa %xmm7,(%rax) # save last round key
  1855. movdqa (%rsp), @XMM[9] # load round0 key
  1856. lea .LADD1(%rip), %r11
  1857. movdqa 0x20(%rbp), @XMM[0] # counter copy
  1858. movdqa -0x20(%r11), @XMM[8] # .LSWPUP
  1859. pshufb @XMM[8], @XMM[9] # byte swap upper part
  1860. pshufb @XMM[8], @XMM[0]
  1861. movdqa @XMM[9], (%rsp) # save adjusted round0 key
  1862. jmp .Lctr_enc_loop
  1863. .align 16
  1864. .Lctr_enc_loop:
  1865. movdqa @XMM[0], 0x20(%rbp) # save counter
  1866. movdqa @XMM[0], @XMM[1] # prepare 8 counter values
  1867. movdqa @XMM[0], @XMM[2]
  1868. paddd 0x00(%r11), @XMM[1] # .LADD1
  1869. movdqa @XMM[0], @XMM[3]
  1870. paddd 0x10(%r11), @XMM[2] # .LADD2
  1871. movdqa @XMM[0], @XMM[4]
  1872. paddd 0x20(%r11), @XMM[3] # .LADD3
  1873. movdqa @XMM[0], @XMM[5]
  1874. paddd 0x30(%r11), @XMM[4] # .LADD4
  1875. movdqa @XMM[0], @XMM[6]
  1876. paddd 0x40(%r11), @XMM[5] # .LADD5
  1877. movdqa @XMM[0], @XMM[7]
  1878. paddd 0x50(%r11), @XMM[6] # .LADD6
  1879. paddd 0x60(%r11), @XMM[7] # .LADD7
  1880. # Borrow prologue from _bsaes_encrypt8 to use the opportunity
  1881. # to flip byte order in 32-bit counter
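# (The counters in @XMM[0-7] and the round-0 key were byte-swapped with
# .LSWPUP above, so the single pshufb with .LSWPUPM0SR below undoes that
# swap while applying the usual .LM0SR input permutation of _bsaes_encrypt8.)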
  1882. movdqa (%rsp), @XMM[9] # round 0 key
  1883. lea 0x10(%rsp), %rax # pass key schedule
  1884. movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
  1885. pxor @XMM[9], @XMM[0] # xor with round0 key
  1886. pxor @XMM[9], @XMM[1]
  1887. pxor @XMM[9], @XMM[2]
  1888. pxor @XMM[9], @XMM[3]
  1889. pshufb @XMM[8], @XMM[0]
  1890. pshufb @XMM[8], @XMM[1]
  1891. pxor @XMM[9], @XMM[4]
  1892. pxor @XMM[9], @XMM[5]
  1893. pshufb @XMM[8], @XMM[2]
  1894. pshufb @XMM[8], @XMM[3]
  1895. pxor @XMM[9], @XMM[6]
  1896. pxor @XMM[9], @XMM[7]
  1897. pshufb @XMM[8], @XMM[4]
  1898. pshufb @XMM[8], @XMM[5]
  1899. pshufb @XMM[8], @XMM[6]
  1900. pshufb @XMM[8], @XMM[7]
  1901. lea .LBS0(%rip), %r11 # constants table
  1902. mov %ebx,%r10d # pass rounds
  1903. call _bsaes_encrypt8_bitslice
  1904. sub \$8,$len
  1905. jc .Lctr_enc_loop_done
  1906. movdqu 0x00($inp), @XMM[8] # load input
  1907. movdqu 0x10($inp), @XMM[9]
  1908. movdqu 0x20($inp), @XMM[10]
  1909. movdqu 0x30($inp), @XMM[11]
  1910. movdqu 0x40($inp), @XMM[12]
  1911. movdqu 0x50($inp), @XMM[13]
  1912. movdqu 0x60($inp), @XMM[14]
  1913. movdqu 0x70($inp), @XMM[15]
  1914. lea 0x80($inp),$inp
  1915. pxor @XMM[0], @XMM[8]
  1916. movdqa 0x20(%rbp), @XMM[0] # load counter
  1917. pxor @XMM[9], @XMM[1]
  1918. movdqu @XMM[8], 0x00($out) # write output
  1919. pxor @XMM[10], @XMM[4]
  1920. movdqu @XMM[1], 0x10($out)
  1921. pxor @XMM[11], @XMM[6]
  1922. movdqu @XMM[4], 0x20($out)
  1923. pxor @XMM[12], @XMM[3]
  1924. movdqu @XMM[6], 0x30($out)
  1925. pxor @XMM[13], @XMM[7]
  1926. movdqu @XMM[3], 0x40($out)
  1927. pxor @XMM[14], @XMM[2]
  1928. movdqu @XMM[7], 0x50($out)
  1929. pxor @XMM[15], @XMM[5]
  1930. movdqu @XMM[2], 0x60($out)
  1931. lea .LADD1(%rip), %r11
  1932. movdqu @XMM[5], 0x70($out)
  1933. lea 0x80($out), $out
  1934. paddd 0x70(%r11), @XMM[0] # .LADD8
  1935. jnz .Lctr_enc_loop
  1936. jmp .Lctr_enc_done
  1937. .align 16
  1938. .Lctr_enc_loop_done:
  1939. add \$8, $len
  1940. movdqu 0x00($inp), @XMM[8] # load input
  1941. pxor @XMM[8], @XMM[0]
  1942. movdqu @XMM[0], 0x00($out) # write output
  1943. cmp \$2,$len
  1944. jb .Lctr_enc_done
  1945. movdqu 0x10($inp), @XMM[9]
  1946. pxor @XMM[9], @XMM[1]
  1947. movdqu @XMM[1], 0x10($out)
  1948. je .Lctr_enc_done
  1949. movdqu 0x20($inp), @XMM[10]
  1950. pxor @XMM[10], @XMM[4]
  1951. movdqu @XMM[4], 0x20($out)
  1952. cmp \$4,$len
  1953. jb .Lctr_enc_done
  1954. movdqu 0x30($inp), @XMM[11]
  1955. pxor @XMM[11], @XMM[6]
  1956. movdqu @XMM[6], 0x30($out)
  1957. je .Lctr_enc_done
  1958. movdqu 0x40($inp), @XMM[12]
  1959. pxor @XMM[12], @XMM[3]
  1960. movdqu @XMM[3], 0x40($out)
  1961. cmp \$6,$len
  1962. jb .Lctr_enc_done
  1963. movdqu 0x50($inp), @XMM[13]
  1964. pxor @XMM[13], @XMM[7]
  1965. movdqu @XMM[7], 0x50($out)
  1966. je .Lctr_enc_done
  1967. movdqu 0x60($inp), @XMM[14]
  1968. pxor @XMM[14], @XMM[2]
  1969. movdqu @XMM[2], 0x60($out)
  1970. jmp .Lctr_enc_done
  1971. .align 16
  1972. .Lctr_enc_short:
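# Short-input path (<8 blocks): encrypt the counter block at 0x20(%rbp) with
# the regular AES code one block at a time, XOR the result into the input and
# step the big-endian 32-bit counter word at offset 0x2c.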
  1973. lea 0x20(%rbp), $arg1
  1974. lea 0x30(%rbp), $arg2
  1975. lea ($key), $arg3
  1976. call asm_AES_encrypt
  1977. movdqu ($inp), @XMM[1]
  1978. lea 16($inp), $inp
  1979. mov 0x2c(%rbp), %eax # load 32-bit counter
  1980. bswap %eax
  1981. pxor 0x30(%rbp), @XMM[1]
  1982. inc %eax # increment
  1983. movdqu @XMM[1], ($out)
  1984. bswap %eax
  1985. lea 16($out), $out
  1986. mov %eax, 0x2c(%rsp) # save 32-bit counter
  1987. dec $len
  1988. jnz .Lctr_enc_short
  1989. .Lctr_enc_done:
  1990. lea (%rsp), %rax
  1991. pxor %xmm0, %xmm0
  1992. .Lctr_enc_bzero: # wipe key schedule [if any]
  1993. movdqa %xmm0, 0x00(%rax)
  1994. movdqa %xmm0, 0x10(%rax)
  1995. lea 0x20(%rax), %rax
  1996. cmp %rax, %rbp
  1997. ja .Lctr_enc_bzero
  1998. lea 0x78(%rbp),%rax
  1999. .cfi_def_cfa %rax,8
  2000. ___
  2001. $code.=<<___ if ($win64);
  2002. movaps 0x40(%rbp), %xmm6
  2003. movaps 0x50(%rbp), %xmm7
  2004. movaps 0x60(%rbp), %xmm8
  2005. movaps 0x70(%rbp), %xmm9
  2006. movaps 0x80(%rbp), %xmm10
  2007. movaps 0x90(%rbp), %xmm11
  2008. movaps 0xa0(%rbp), %xmm12
  2009. movaps 0xb0(%rbp), %xmm13
  2010. movaps 0xc0(%rbp), %xmm14
  2011. movaps 0xd0(%rbp), %xmm15
  2012. lea 0xa0(%rax), %rax
  2013. .Lctr_enc_tail:
  2014. ___
  2015. $code.=<<___;
  2016. mov -48(%rax), %r15
  2017. .cfi_restore %r15
  2018. mov -40(%rax), %r14
  2019. .cfi_restore %r14
  2020. mov -32(%rax), %r13
  2021. .cfi_restore %r13
  2022. mov -24(%rax), %r12
  2023. .cfi_restore %r12
  2024. mov -16(%rax), %rbx
  2025. .cfi_restore %rbx
  2026. mov -8(%rax), %rbp
  2027. .cfi_restore %rbp
  2028. lea (%rax), %rsp # restore %rsp
  2029. .cfi_def_cfa_register %rsp
  2030. .Lctr_enc_epilogue:
  2031. ret
  2032. .cfi_endproc
  2033. .size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks
  2034. ___
  2035. ######################################################################
2036. # void ossl_bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
  2037. # const AES_KEY *key1, const AES_KEY *key2,
  2038. # const unsigned char iv[16]);
  2039. #
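# Note: key2 ($arg5) is used only to encrypt the IV into the initial tweak
# (see the asm_AES_encrypt call below); key1 drives the bulk bit-sliced code.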
  2040. my ($twmask,$twres,$twtmp)=@XMM[13..15];
  2041. $arg6=~s/d$//;
  2042. $code.=<<___;
  2043. .globl ossl_bsaes_xts_encrypt
  2044. .type ossl_bsaes_xts_encrypt,\@abi-omnipotent
  2045. .align 16
  2046. ossl_bsaes_xts_encrypt:
  2047. .cfi_startproc
  2048. mov %rsp, %rax
  2049. .Lxts_enc_prologue:
  2050. push %rbp
  2051. .cfi_push %rbp
  2052. push %rbx
  2053. .cfi_push %rbx
  2054. push %r12
  2055. .cfi_push %r12
  2056. push %r13
  2057. .cfi_push %r13
  2058. push %r14
  2059. .cfi_push %r14
  2060. push %r15
  2061. .cfi_push %r15
  2062. lea -0x48(%rsp), %rsp
  2063. .cfi_adjust_cfa_offset 0x48
  2064. ___
  2065. $code.=<<___ if ($win64);
  2066. mov 0xa0(%rsp),$arg5 # pull key2
  2067. mov 0xa8(%rsp),$arg6 # pull ivp
  2068. lea -0xa0(%rsp), %rsp
  2069. movaps %xmm6, 0x40(%rsp)
  2070. movaps %xmm7, 0x50(%rsp)
  2071. movaps %xmm8, 0x60(%rsp)
  2072. movaps %xmm9, 0x70(%rsp)
  2073. movaps %xmm10, 0x80(%rsp)
  2074. movaps %xmm11, 0x90(%rsp)
  2075. movaps %xmm12, 0xa0(%rsp)
  2076. movaps %xmm13, 0xb0(%rsp)
  2077. movaps %xmm14, 0xc0(%rsp)
  2078. movaps %xmm15, 0xd0(%rsp)
  2079. .Lxts_enc_body:
  2080. ___
  2081. $code.=<<___;
  2082. mov %rsp, %rbp # backup %rsp
  2083. .cfi_def_cfa_register %rbp
  2084. mov $arg1, $inp # backup arguments
  2085. mov $arg2, $out
  2086. mov $arg3, $len
  2087. mov $arg4, $key
  2088. lea ($arg6), $arg1
  2089. lea 0x20(%rbp), $arg2
  2090. lea ($arg5), $arg3
  2091. call asm_AES_encrypt # generate initial tweak
  2092. mov 240($key), %eax # rounds
  2093. mov $len, %rbx # backup $len
  2094. mov %eax, %edx # rounds
  2095. shl \$7, %rax # 128 bytes per inner round key
  2096. sub \$`128-32`, %rax # size of bit-sliced key schedule
  2097. sub %rax, %rsp
  2098. mov %rsp, %rax # pass key schedule
  2099. mov $key, %rcx # pass key
  2100. mov %edx, %r10d # pass rounds
  2101. call _bsaes_key_convert
  2102. pxor %xmm6, %xmm7 # fix up last round key
  2103. movdqa %xmm7, (%rax) # save last round key
  2104. and \$-16, $len
  2105. sub \$0x80, %rsp # place for tweak[8]
  2106. movdqa 0x20(%rbp), @XMM[7] # initial tweak
  2107. pxor $twtmp, $twtmp
  2108. movdqa .Lxts_magic(%rip), $twmask
  2109. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2110. sub \$0x80, $len
  2111. jc .Lxts_enc_short
  2112. jmp .Lxts_enc_loop
  2113. .align 16
  2114. .Lxts_enc_loop:
  2115. ___
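# The unrolled code below multiplies the tweak by x in GF(2^128) once per
# block: pcmpgtd broadcasts the top bit of each 64-bit half, pshufd routes
# those carries, paddq doubles both halves, and pand/pxor with .Lxts_magic
# fold the carries back in (reduction by x^128+x^7+x^2+x+1).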
  2116. for ($i=0;$i<7;$i++) {
  2117. $code.=<<___;
  2118. pshufd \$0x13, $twtmp, $twres
  2119. pxor $twtmp, $twtmp
  2120. movdqa @XMM[7], @XMM[$i]
  2121. movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
  2122. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2123. pand $twmask, $twres # isolate carry and residue
  2124. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2125. pxor $twres, @XMM[7]
  2126. ___
  2127. $code.=<<___ if ($i>=1);
  2128. movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
  2129. ___
  2130. $code.=<<___ if ($i>=2);
  2131. pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
  2132. ___
  2133. }
  2134. $code.=<<___;
  2135. movdqu 0x60($inp), @XMM[8+6]
  2136. pxor @XMM[8+5], @XMM[5]
  2137. movdqu 0x70($inp), @XMM[8+7]
  2138. lea 0x80($inp), $inp
  2139. movdqa @XMM[7], 0x70(%rsp)
  2140. pxor @XMM[8+6], @XMM[6]
  2141. lea 0x80(%rsp), %rax # pass key schedule
  2142. pxor @XMM[8+7], @XMM[7]
  2143. mov %edx, %r10d # pass rounds
  2144. call _bsaes_encrypt8
  2145. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2146. pxor 0x10(%rsp), @XMM[1]
  2147. movdqu @XMM[0], 0x00($out) # write output
  2148. pxor 0x20(%rsp), @XMM[4]
  2149. movdqu @XMM[1], 0x10($out)
  2150. pxor 0x30(%rsp), @XMM[6]
  2151. movdqu @XMM[4], 0x20($out)
  2152. pxor 0x40(%rsp), @XMM[3]
  2153. movdqu @XMM[6], 0x30($out)
  2154. pxor 0x50(%rsp), @XMM[7]
  2155. movdqu @XMM[3], 0x40($out)
  2156. pxor 0x60(%rsp), @XMM[2]
  2157. movdqu @XMM[7], 0x50($out)
  2158. pxor 0x70(%rsp), @XMM[5]
  2159. movdqu @XMM[2], 0x60($out)
  2160. movdqu @XMM[5], 0x70($out)
  2161. lea 0x80($out), $out
  2162. movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
  2163. pxor $twtmp, $twtmp
  2164. movdqa .Lxts_magic(%rip), $twmask
  2165. pcmpgtd @XMM[7], $twtmp
  2166. pshufd \$0x13, $twtmp, $twres
  2167. pxor $twtmp, $twtmp
  2168. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2169. pand $twmask, $twres # isolate carry and residue
  2170. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2171. pxor $twres, @XMM[7]
  2172. sub \$0x80,$len
  2173. jnc .Lxts_enc_loop
  2174. .Lxts_enc_short:
  2175. add \$0x80, $len
  2176. jz .Lxts_enc_done
  2177. ___
  2178. for ($i=0;$i<7;$i++) {
  2179. $code.=<<___;
  2180. pshufd \$0x13, $twtmp, $twres
  2181. pxor $twtmp, $twtmp
  2182. movdqa @XMM[7], @XMM[$i]
  2183. movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
  2184. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2185. pand $twmask, $twres # isolate carry and residue
  2186. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2187. pxor $twres, @XMM[7]
  2188. ___
  2189. $code.=<<___ if ($i>=1);
  2190. movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
  2191. cmp \$`0x10*$i`,$len
  2192. je .Lxts_enc_$i
  2193. ___
  2194. $code.=<<___ if ($i>=2);
  2195. pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
  2196. ___
  2197. }
  2198. $code.=<<___;
  2199. movdqu 0x60($inp), @XMM[8+6]
  2200. pxor @XMM[8+5], @XMM[5]
  2201. movdqa @XMM[7], 0x70(%rsp)
  2202. lea 0x70($inp), $inp
  2203. pxor @XMM[8+6], @XMM[6]
  2204. lea 0x80(%rsp), %rax # pass key schedule
  2205. mov %edx, %r10d # pass rounds
  2206. call _bsaes_encrypt8
  2207. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2208. pxor 0x10(%rsp), @XMM[1]
  2209. movdqu @XMM[0], 0x00($out) # write output
  2210. pxor 0x20(%rsp), @XMM[4]
  2211. movdqu @XMM[1], 0x10($out)
  2212. pxor 0x30(%rsp), @XMM[6]
  2213. movdqu @XMM[4], 0x20($out)
  2214. pxor 0x40(%rsp), @XMM[3]
  2215. movdqu @XMM[6], 0x30($out)
  2216. pxor 0x50(%rsp), @XMM[7]
  2217. movdqu @XMM[3], 0x40($out)
  2218. pxor 0x60(%rsp), @XMM[2]
  2219. movdqu @XMM[7], 0x50($out)
  2220. movdqu @XMM[2], 0x60($out)
  2221. lea 0x70($out), $out
  2222. movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
  2223. jmp .Lxts_enc_done
  2224. .align 16
  2225. .Lxts_enc_6:
  2226. pxor @XMM[8+4], @XMM[4]
  2227. lea 0x60($inp), $inp
  2228. pxor @XMM[8+5], @XMM[5]
  2229. lea 0x80(%rsp), %rax # pass key schedule
  2230. mov %edx, %r10d # pass rounds
  2231. call _bsaes_encrypt8
  2232. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2233. pxor 0x10(%rsp), @XMM[1]
  2234. movdqu @XMM[0], 0x00($out) # write output
  2235. pxor 0x20(%rsp), @XMM[4]
  2236. movdqu @XMM[1], 0x10($out)
  2237. pxor 0x30(%rsp), @XMM[6]
  2238. movdqu @XMM[4], 0x20($out)
  2239. pxor 0x40(%rsp), @XMM[3]
  2240. movdqu @XMM[6], 0x30($out)
  2241. pxor 0x50(%rsp), @XMM[7]
  2242. movdqu @XMM[3], 0x40($out)
  2243. movdqu @XMM[7], 0x50($out)
  2244. lea 0x60($out), $out
  2245. movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
  2246. jmp .Lxts_enc_done
  2247. .align 16
  2248. .Lxts_enc_5:
  2249. pxor @XMM[8+3], @XMM[3]
  2250. lea 0x50($inp), $inp
  2251. pxor @XMM[8+4], @XMM[4]
  2252. lea 0x80(%rsp), %rax # pass key schedule
  2253. mov %edx, %r10d # pass rounds
  2254. call _bsaes_encrypt8
  2255. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2256. pxor 0x10(%rsp), @XMM[1]
  2257. movdqu @XMM[0], 0x00($out) # write output
  2258. pxor 0x20(%rsp), @XMM[4]
  2259. movdqu @XMM[1], 0x10($out)
  2260. pxor 0x30(%rsp), @XMM[6]
  2261. movdqu @XMM[4], 0x20($out)
  2262. pxor 0x40(%rsp), @XMM[3]
  2263. movdqu @XMM[6], 0x30($out)
  2264. movdqu @XMM[3], 0x40($out)
  2265. lea 0x50($out), $out
  2266. movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
  2267. jmp .Lxts_enc_done
  2268. .align 16
  2269. .Lxts_enc_4:
  2270. pxor @XMM[8+2], @XMM[2]
  2271. lea 0x40($inp), $inp
  2272. pxor @XMM[8+3], @XMM[3]
  2273. lea 0x80(%rsp), %rax # pass key schedule
  2274. mov %edx, %r10d # pass rounds
  2275. call _bsaes_encrypt8
  2276. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2277. pxor 0x10(%rsp), @XMM[1]
  2278. movdqu @XMM[0], 0x00($out) # write output
  2279. pxor 0x20(%rsp), @XMM[4]
  2280. movdqu @XMM[1], 0x10($out)
  2281. pxor 0x30(%rsp), @XMM[6]
  2282. movdqu @XMM[4], 0x20($out)
  2283. movdqu @XMM[6], 0x30($out)
  2284. lea 0x40($out), $out
  2285. movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
  2286. jmp .Lxts_enc_done
  2287. .align 16
  2288. .Lxts_enc_3:
  2289. pxor @XMM[8+1], @XMM[1]
  2290. lea 0x30($inp), $inp
  2291. pxor @XMM[8+2], @XMM[2]
  2292. lea 0x80(%rsp), %rax # pass key schedule
  2293. mov %edx, %r10d # pass rounds
  2294. call _bsaes_encrypt8
  2295. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2296. pxor 0x10(%rsp), @XMM[1]
  2297. movdqu @XMM[0], 0x00($out) # write output
  2298. pxor 0x20(%rsp), @XMM[4]
  2299. movdqu @XMM[1], 0x10($out)
  2300. movdqu @XMM[4], 0x20($out)
  2301. lea 0x30($out), $out
  2302. movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
  2303. jmp .Lxts_enc_done
  2304. .align 16
  2305. .Lxts_enc_2:
  2306. pxor @XMM[8+0], @XMM[0]
  2307. lea 0x20($inp), $inp
  2308. pxor @XMM[8+1], @XMM[1]
  2309. lea 0x80(%rsp), %rax # pass key schedule
  2310. mov %edx, %r10d # pass rounds
  2311. call _bsaes_encrypt8
  2312. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2313. pxor 0x10(%rsp), @XMM[1]
  2314. movdqu @XMM[0], 0x00($out) # write output
  2315. movdqu @XMM[1], 0x10($out)
  2316. lea 0x20($out), $out
  2317. movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
  2318. jmp .Lxts_enc_done
  2319. .align 16
  2320. .Lxts_enc_1:
  2321. pxor @XMM[0], @XMM[8]
  2322. lea 0x10($inp), $inp
  2323. movdqa @XMM[8], 0x20(%rbp)
  2324. lea 0x20(%rbp), $arg1
  2325. lea 0x20(%rbp), $arg2
  2326. lea ($key), $arg3
  2327. call asm_AES_encrypt # doesn't touch %xmm
  2328. pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
  2329. #pxor @XMM[8], @XMM[0]
  2330. #lea 0x80(%rsp), %rax # pass key schedule
  2331. #mov %edx, %r10d # pass rounds
  2332. #call _bsaes_encrypt8
  2333. #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2334. movdqu @XMM[0], 0x00($out) # write output
  2335. lea 0x10($out), $out
  2336. movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
  2337. .Lxts_enc_done:
  2338. and \$15, %ebx
  2339. jz .Lxts_enc_ret
  2340. mov $out, %rdx
  2341. .Lxts_enc_steal:
  2342. movzb ($inp), %eax
  2343. movzb -16(%rdx), %ecx
  2344. lea 1($inp), $inp
  2345. mov %al, -16(%rdx)
  2346. mov %cl, 0(%rdx)
  2347. lea 1(%rdx), %rdx
  2348. sub \$1,%ebx
  2349. jnz .Lxts_enc_steal
  2350. movdqu -16($out), @XMM[0]
  2351. lea 0x20(%rbp), $arg1
  2352. pxor @XMM[7], @XMM[0]
  2353. lea 0x20(%rbp), $arg2
  2354. movdqa @XMM[0], 0x20(%rbp)
  2355. lea ($key), $arg3
  2356. call asm_AES_encrypt # doesn't touch %xmm
  2357. pxor 0x20(%rbp), @XMM[7]
  2358. movdqu @XMM[7], -16($out)
  2359. .Lxts_enc_ret:
  2360. lea (%rsp), %rax
  2361. pxor %xmm0, %xmm0
  2362. .Lxts_enc_bzero: # wipe key schedule [if any]
  2363. movdqa %xmm0, 0x00(%rax)
  2364. movdqa %xmm0, 0x10(%rax)
  2365. lea 0x20(%rax), %rax
  2366. cmp %rax, %rbp
  2367. ja .Lxts_enc_bzero
  2368. lea 0x78(%rbp),%rax
  2369. .cfi_def_cfa %rax,8
  2370. ___
  2371. $code.=<<___ if ($win64);
  2372. movaps 0x40(%rbp), %xmm6
  2373. movaps 0x50(%rbp), %xmm7
  2374. movaps 0x60(%rbp), %xmm8
  2375. movaps 0x70(%rbp), %xmm9
  2376. movaps 0x80(%rbp), %xmm10
  2377. movaps 0x90(%rbp), %xmm11
  2378. movaps 0xa0(%rbp), %xmm12
  2379. movaps 0xb0(%rbp), %xmm13
  2380. movaps 0xc0(%rbp), %xmm14
  2381. movaps 0xd0(%rbp), %xmm15
  2382. lea 0xa0(%rax), %rax
  2383. .Lxts_enc_tail:
  2384. ___
  2385. $code.=<<___;
  2386. mov -48(%rax), %r15
  2387. .cfi_restore %r15
  2388. mov -40(%rax), %r14
  2389. .cfi_restore %r14
  2390. mov -32(%rax), %r13
  2391. .cfi_restore %r13
  2392. mov -24(%rax), %r12
  2393. .cfi_restore %r12
  2394. mov -16(%rax), %rbx
  2395. .cfi_restore %rbx
  2396. mov -8(%rax), %rbp
  2397. .cfi_restore %rbp
  2398. lea (%rax), %rsp # restore %rsp
  2399. .cfi_def_cfa_register %rsp
  2400. .Lxts_enc_epilogue:
  2401. ret
  2402. .cfi_endproc
  2403. .size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt
  2404. .globl ossl_bsaes_xts_decrypt
  2405. .type ossl_bsaes_xts_decrypt,\@abi-omnipotent
  2406. .align 16
  2407. ossl_bsaes_xts_decrypt:
  2408. .cfi_startproc
  2409. mov %rsp, %rax
  2410. .Lxts_dec_prologue:
  2411. push %rbp
  2412. .cfi_push %rbp
  2413. push %rbx
  2414. .cfi_push %rbx
  2415. push %r12
  2416. .cfi_push %r12
  2417. push %r13
  2418. .cfi_push %r13
  2419. push %r14
  2420. .cfi_push %r14
  2421. push %r15
  2422. .cfi_push %r15
  2423. lea -0x48(%rsp), %rsp
  2424. .cfi_adjust_cfa_offset 0x48
  2425. ___
  2426. $code.=<<___ if ($win64);
  2427. mov 0xa0(%rsp),$arg5 # pull key2
  2428. mov 0xa8(%rsp),$arg6 # pull ivp
  2429. lea -0xa0(%rsp), %rsp
  2430. movaps %xmm6, 0x40(%rsp)
  2431. movaps %xmm7, 0x50(%rsp)
  2432. movaps %xmm8, 0x60(%rsp)
  2433. movaps %xmm9, 0x70(%rsp)
  2434. movaps %xmm10, 0x80(%rsp)
  2435. movaps %xmm11, 0x90(%rsp)
  2436. movaps %xmm12, 0xa0(%rsp)
  2437. movaps %xmm13, 0xb0(%rsp)
  2438. movaps %xmm14, 0xc0(%rsp)
  2439. movaps %xmm15, 0xd0(%rsp)
  2440. .Lxts_dec_body:
  2441. ___
  2442. $code.=<<___;
2443. mov %rsp, %rbp # backup %rsp
.cfi_def_cfa_register %rbp
  2444. mov $arg1, $inp # backup arguments
  2445. mov $arg2, $out
  2446. mov $arg3, $len
  2447. mov $arg4, $key
  2448. lea ($arg6), $arg1
  2449. lea 0x20(%rbp), $arg2
  2450. lea ($arg5), $arg3
  2451. call asm_AES_encrypt # generate initial tweak
  2452. mov 240($key), %eax # rounds
  2453. mov $len, %rbx # backup $len
  2454. mov %eax, %edx # rounds
  2455. shl \$7, %rax # 128 bytes per inner round key
  2456. sub \$`128-32`, %rax # size of bit-sliced key schedule
  2457. sub %rax, %rsp
  2458. mov %rsp, %rax # pass key schedule
  2459. mov $key, %rcx # pass key
  2460. mov %edx, %r10d # pass rounds
  2461. call _bsaes_key_convert
  2462. pxor (%rsp), %xmm7 # fix up round 0 key
  2463. movdqa %xmm6, (%rax) # save last round key
  2464. movdqa %xmm7, (%rsp)
  2465. xor %eax, %eax # if ($len%16) len-=16;
  2466. and \$-16, $len
  2467. test \$15, %ebx
  2468. setnz %al
  2469. shl \$4, %rax
  2470. sub %rax, $len
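# If the caller's length is not a multiple of 16, hold back one full block
# here; it is handled by the ciphertext-stealing code at .Lxts_dec_done.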
  2471. sub \$0x80, %rsp # place for tweak[8]
  2472. movdqa 0x20(%rbp), @XMM[7] # initial tweak
  2473. pxor $twtmp, $twtmp
  2474. movdqa .Lxts_magic(%rip), $twmask
  2475. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2476. sub \$0x80, $len
  2477. jc .Lxts_dec_short
  2478. jmp .Lxts_dec_loop
  2479. .align 16
  2480. .Lxts_dec_loop:
  2481. ___
  2482. for ($i=0;$i<7;$i++) {
  2483. $code.=<<___;
  2484. pshufd \$0x13, $twtmp, $twres
  2485. pxor $twtmp, $twtmp
  2486. movdqa @XMM[7], @XMM[$i]
  2487. movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
  2488. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2489. pand $twmask, $twres # isolate carry and residue
  2490. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2491. pxor $twres, @XMM[7]
  2492. ___
  2493. $code.=<<___ if ($i>=1);
  2494. movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
  2495. ___
  2496. $code.=<<___ if ($i>=2);
  2497. pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
  2498. ___
  2499. }
  2500. $code.=<<___;
  2501. movdqu 0x60($inp), @XMM[8+6]
  2502. pxor @XMM[8+5], @XMM[5]
  2503. movdqu 0x70($inp), @XMM[8+7]
  2504. lea 0x80($inp), $inp
  2505. movdqa @XMM[7], 0x70(%rsp)
  2506. pxor @XMM[8+6], @XMM[6]
  2507. lea 0x80(%rsp), %rax # pass key schedule
  2508. pxor @XMM[8+7], @XMM[7]
  2509. mov %edx, %r10d # pass rounds
  2510. call _bsaes_decrypt8
  2511. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2512. pxor 0x10(%rsp), @XMM[1]
  2513. movdqu @XMM[0], 0x00($out) # write output
  2514. pxor 0x20(%rsp), @XMM[6]
  2515. movdqu @XMM[1], 0x10($out)
  2516. pxor 0x30(%rsp), @XMM[4]
  2517. movdqu @XMM[6], 0x20($out)
  2518. pxor 0x40(%rsp), @XMM[2]
  2519. movdqu @XMM[4], 0x30($out)
  2520. pxor 0x50(%rsp), @XMM[7]
  2521. movdqu @XMM[2], 0x40($out)
  2522. pxor 0x60(%rsp), @XMM[3]
  2523. movdqu @XMM[7], 0x50($out)
  2524. pxor 0x70(%rsp), @XMM[5]
  2525. movdqu @XMM[3], 0x60($out)
  2526. movdqu @XMM[5], 0x70($out)
  2527. lea 0x80($out), $out
  2528. movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
  2529. pxor $twtmp, $twtmp
  2530. movdqa .Lxts_magic(%rip), $twmask
  2531. pcmpgtd @XMM[7], $twtmp
  2532. pshufd \$0x13, $twtmp, $twres
  2533. pxor $twtmp, $twtmp
  2534. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2535. pand $twmask, $twres # isolate carry and residue
  2536. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2537. pxor $twres, @XMM[7]
  2538. sub \$0x80,$len
  2539. jnc .Lxts_dec_loop
  2540. .Lxts_dec_short:
  2541. add \$0x80, $len
  2542. jz .Lxts_dec_done
  2543. ___
  2544. for ($i=0;$i<7;$i++) {
  2545. $code.=<<___;
  2546. pshufd \$0x13, $twtmp, $twres
  2547. pxor $twtmp, $twtmp
  2548. movdqa @XMM[7], @XMM[$i]
  2549. movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
  2550. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2551. pand $twmask, $twres # isolate carry and residue
  2552. pcmpgtd @XMM[7], $twtmp # broadcast upper bits
  2553. pxor $twres, @XMM[7]
  2554. ___
  2555. $code.=<<___ if ($i>=1);
  2556. movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
  2557. cmp \$`0x10*$i`,$len
  2558. je .Lxts_dec_$i
  2559. ___
  2560. $code.=<<___ if ($i>=2);
  2561. pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
  2562. ___
  2563. }
  2564. $code.=<<___;
  2565. movdqu 0x60($inp), @XMM[8+6]
  2566. pxor @XMM[8+5], @XMM[5]
  2567. movdqa @XMM[7], 0x70(%rsp)
  2568. lea 0x70($inp), $inp
  2569. pxor @XMM[8+6], @XMM[6]
  2570. lea 0x80(%rsp), %rax # pass key schedule
  2571. mov %edx, %r10d # pass rounds
  2572. call _bsaes_decrypt8
  2573. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2574. pxor 0x10(%rsp), @XMM[1]
  2575. movdqu @XMM[0], 0x00($out) # write output
  2576. pxor 0x20(%rsp), @XMM[6]
  2577. movdqu @XMM[1], 0x10($out)
  2578. pxor 0x30(%rsp), @XMM[4]
  2579. movdqu @XMM[6], 0x20($out)
  2580. pxor 0x40(%rsp), @XMM[2]
  2581. movdqu @XMM[4], 0x30($out)
  2582. pxor 0x50(%rsp), @XMM[7]
  2583. movdqu @XMM[2], 0x40($out)
  2584. pxor 0x60(%rsp), @XMM[3]
  2585. movdqu @XMM[7], 0x50($out)
  2586. movdqu @XMM[3], 0x60($out)
  2587. lea 0x70($out), $out
  2588. movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
  2589. jmp .Lxts_dec_done
  2590. .align 16
  2591. .Lxts_dec_6:
  2592. pxor @XMM[8+4], @XMM[4]
  2593. lea 0x60($inp), $inp
  2594. pxor @XMM[8+5], @XMM[5]
  2595. lea 0x80(%rsp), %rax # pass key schedule
  2596. mov %edx, %r10d # pass rounds
  2597. call _bsaes_decrypt8
  2598. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2599. pxor 0x10(%rsp), @XMM[1]
  2600. movdqu @XMM[0], 0x00($out) # write output
  2601. pxor 0x20(%rsp), @XMM[6]
  2602. movdqu @XMM[1], 0x10($out)
  2603. pxor 0x30(%rsp), @XMM[4]
  2604. movdqu @XMM[6], 0x20($out)
  2605. pxor 0x40(%rsp), @XMM[2]
  2606. movdqu @XMM[4], 0x30($out)
  2607. pxor 0x50(%rsp), @XMM[7]
  2608. movdqu @XMM[2], 0x40($out)
  2609. movdqu @XMM[7], 0x50($out)
  2610. lea 0x60($out), $out
  2611. movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
  2612. jmp .Lxts_dec_done
  2613. .align 16
  2614. .Lxts_dec_5:
  2615. pxor @XMM[8+3], @XMM[3]
  2616. lea 0x50($inp), $inp
  2617. pxor @XMM[8+4], @XMM[4]
  2618. lea 0x80(%rsp), %rax # pass key schedule
  2619. mov %edx, %r10d # pass rounds
  2620. call _bsaes_decrypt8
  2621. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2622. pxor 0x10(%rsp), @XMM[1]
  2623. movdqu @XMM[0], 0x00($out) # write output
  2624. pxor 0x20(%rsp), @XMM[6]
  2625. movdqu @XMM[1], 0x10($out)
  2626. pxor 0x30(%rsp), @XMM[4]
  2627. movdqu @XMM[6], 0x20($out)
  2628. pxor 0x40(%rsp), @XMM[2]
  2629. movdqu @XMM[4], 0x30($out)
  2630. movdqu @XMM[2], 0x40($out)
  2631. lea 0x50($out), $out
  2632. movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
  2633. jmp .Lxts_dec_done
  2634. .align 16
  2635. .Lxts_dec_4:
  2636. pxor @XMM[8+2], @XMM[2]
  2637. lea 0x40($inp), $inp
  2638. pxor @XMM[8+3], @XMM[3]
  2639. lea 0x80(%rsp), %rax # pass key schedule
  2640. mov %edx, %r10d # pass rounds
  2641. call _bsaes_decrypt8
  2642. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2643. pxor 0x10(%rsp), @XMM[1]
  2644. movdqu @XMM[0], 0x00($out) # write output
  2645. pxor 0x20(%rsp), @XMM[6]
  2646. movdqu @XMM[1], 0x10($out)
  2647. pxor 0x30(%rsp), @XMM[4]
  2648. movdqu @XMM[6], 0x20($out)
  2649. movdqu @XMM[4], 0x30($out)
  2650. lea 0x40($out), $out
  2651. movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
  2652. jmp .Lxts_dec_done
  2653. .align 16
  2654. .Lxts_dec_3:
  2655. pxor @XMM[8+1], @XMM[1]
  2656. lea 0x30($inp), $inp
  2657. pxor @XMM[8+2], @XMM[2]
  2658. lea 0x80(%rsp), %rax # pass key schedule
  2659. mov %edx, %r10d # pass rounds
  2660. call _bsaes_decrypt8
  2661. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2662. pxor 0x10(%rsp), @XMM[1]
  2663. movdqu @XMM[0], 0x00($out) # write output
  2664. pxor 0x20(%rsp), @XMM[6]
  2665. movdqu @XMM[1], 0x10($out)
  2666. movdqu @XMM[6], 0x20($out)
  2667. lea 0x30($out), $out
  2668. movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
  2669. jmp .Lxts_dec_done
  2670. .align 16
  2671. .Lxts_dec_2:
  2672. pxor @XMM[8+0], @XMM[0]
  2673. lea 0x20($inp), $inp
  2674. pxor @XMM[8+1], @XMM[1]
  2675. lea 0x80(%rsp), %rax # pass key schedule
  2676. mov %edx, %r10d # pass rounds
  2677. call _bsaes_decrypt8
  2678. pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2679. pxor 0x10(%rsp), @XMM[1]
  2680. movdqu @XMM[0], 0x00($out) # write output
  2681. movdqu @XMM[1], 0x10($out)
  2682. lea 0x20($out), $out
  2683. movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
  2684. jmp .Lxts_dec_done
  2685. .align 16
  2686. .Lxts_dec_1:
  2687. pxor @XMM[0], @XMM[8]
  2688. lea 0x10($inp), $inp
  2689. movdqa @XMM[8], 0x20(%rbp)
  2690. lea 0x20(%rbp), $arg1
  2691. lea 0x20(%rbp), $arg2
  2692. lea ($key), $arg3
  2693. call asm_AES_decrypt # doesn't touch %xmm
  2694. pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
  2695. #pxor @XMM[8], @XMM[0]
  2696. #lea 0x80(%rsp), %rax # pass key schedule
  2697. #mov %edx, %r10d # pass rounds
  2698. #call _bsaes_decrypt8
  2699. #pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
  2700. movdqu @XMM[0], 0x00($out) # write output
  2701. lea 0x10($out), $out
  2702. movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
  2703. .Lxts_dec_done:
  2704. and \$15, %ebx
  2705. jz .Lxts_dec_ret
  2706. pxor $twtmp, $twtmp
  2707. movdqa .Lxts_magic(%rip), $twmask
  2708. pcmpgtd @XMM[7], $twtmp
  2709. pshufd \$0x13, $twtmp, $twres
  2710. movdqa @XMM[7], @XMM[6]
  2711. paddq @XMM[7], @XMM[7] # psllq 1,$tweak
  2712. pand $twmask, $twres # isolate carry and residue
  2713. movdqu ($inp), @XMM[0]
  2714. pxor $twres, @XMM[7]
  2715. lea 0x20(%rbp), $arg1
  2716. pxor @XMM[7], @XMM[0]
  2717. lea 0x20(%rbp), $arg2
  2718. movdqa @XMM[0], 0x20(%rbp)
  2719. lea ($key), $arg3
  2720. call asm_AES_decrypt # doesn't touch %xmm
  2721. pxor 0x20(%rbp), @XMM[7]
  2722. mov $out, %rdx
  2723. movdqu @XMM[7], ($out)
  2724. .Lxts_dec_steal:
  2725. movzb 16($inp), %eax
  2726. movzb (%rdx), %ecx
  2727. lea 1($inp), $inp
  2728. mov %al, (%rdx)
  2729. mov %cl, 16(%rdx)
  2730. lea 1(%rdx), %rdx
  2731. sub \$1,%ebx
  2732. jnz .Lxts_dec_steal
  2733. movdqu ($out), @XMM[0]
  2734. lea 0x20(%rbp), $arg1
  2735. pxor @XMM[6], @XMM[0]
  2736. lea 0x20(%rbp), $arg2
  2737. movdqa @XMM[0], 0x20(%rbp)
  2738. lea ($key), $arg3
  2739. call asm_AES_decrypt # doesn't touch %xmm
  2740. pxor 0x20(%rbp), @XMM[6]
  2741. movdqu @XMM[6], ($out)
  2742. .Lxts_dec_ret:
  2743. lea (%rsp), %rax
  2744. pxor %xmm0, %xmm0
  2745. .Lxts_dec_bzero: # wipe key schedule [if any]
  2746. movdqa %xmm0, 0x00(%rax)
  2747. movdqa %xmm0, 0x10(%rax)
  2748. lea 0x20(%rax), %rax
  2749. cmp %rax, %rbp
  2750. ja .Lxts_dec_bzero
  2751. lea 0x78(%rbp),%rax
  2752. .cfi_def_cfa %rax,8
  2753. ___
  2754. $code.=<<___ if ($win64);
  2755. movaps 0x40(%rbp), %xmm6
  2756. movaps 0x50(%rbp), %xmm7
  2757. movaps 0x60(%rbp), %xmm8
  2758. movaps 0x70(%rbp), %xmm9
  2759. movaps 0x80(%rbp), %xmm10
  2760. movaps 0x90(%rbp), %xmm11
  2761. movaps 0xa0(%rbp), %xmm12
  2762. movaps 0xb0(%rbp), %xmm13
  2763. movaps 0xc0(%rbp), %xmm14
  2764. movaps 0xd0(%rbp), %xmm15
  2765. lea 0xa0(%rax), %rax
  2766. .Lxts_dec_tail:
  2767. ___
  2768. $code.=<<___;
  2769. mov -48(%rax), %r15
  2770. .cfi_restore %r15
  2771. mov -40(%rax), %r14
  2772. .cfi_restore %r14
  2773. mov -32(%rax), %r13
  2774. .cfi_restore %r13
  2775. mov -24(%rax), %r12
  2776. .cfi_restore %r12
  2777. mov -16(%rax), %rbx
  2778. .cfi_restore %rbx
  2779. mov -8(%rax), %rbp
  2780. .cfi_restore %rbp
  2781. lea (%rax), %rsp # restore %rsp
  2782. .cfi_def_cfa_register %rsp
  2783. .Lxts_dec_epilogue:
  2784. ret
  2785. .cfi_endproc
  2786. .size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
  2787. ___
  2788. }
  2789. $code.=<<___;
  2790. .type _bsaes_const,\@object
  2791. .align 64
  2792. _bsaes_const:
  2793. .LM0ISR: # InvShiftRows constants
  2794. .quad 0x0a0e0206070b0f03, 0x0004080c0d010509
  2795. .LISRM0:
  2796. .quad 0x01040b0e0205080f, 0x0306090c00070a0d
  2797. .LISR:
  2798. .quad 0x0504070602010003, 0x0f0e0d0c080b0a09
  2799. .LBS0: # bit-slice constants
  2800. .quad 0x5555555555555555, 0x5555555555555555
  2801. .LBS1:
  2802. .quad 0x3333333333333333, 0x3333333333333333
  2803. .LBS2:
  2804. .quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
  2805. .LSR: # shiftrows constants
  2806. .quad 0x0504070600030201, 0x0f0e0d0c0a09080b
  2807. .LSRM0:
  2808. .quad 0x0304090e00050a0f, 0x01060b0c0207080d
  2809. .LM0SR:
  2810. .quad 0x0a0e02060f03070b, 0x0004080c05090d01
  2811. .LSWPUP: # byte-swap upper dword
  2812. .quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
  2813. .LSWPUPM0SR:
  2814. .quad 0x0a0d02060c03070b, 0x0004080f05090e01
  2815. .LADD1: # counter increment constants
  2816. .quad 0x0000000000000000, 0x0000000100000000
  2817. .LADD2:
  2818. .quad 0x0000000000000000, 0x0000000200000000
  2819. .LADD3:
  2820. .quad 0x0000000000000000, 0x0000000300000000
  2821. .LADD4:
  2822. .quad 0x0000000000000000, 0x0000000400000000
  2823. .LADD5:
  2824. .quad 0x0000000000000000, 0x0000000500000000
  2825. .LADD6:
  2826. .quad 0x0000000000000000, 0x0000000600000000
  2827. .LADD7:
  2828. .quad 0x0000000000000000, 0x0000000700000000
  2829. .LADD8:
  2830. .quad 0x0000000000000000, 0x0000000800000000
  2831. .Lxts_magic:
  2832. .long 0x87,0,1,0
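# {0x87,0,1,0}: 0x87 reduces the bit shifted out of x^127 (polynomial
# x^128+x^7+x^2+x+1) and the lone 1 carries the low half's top bit into the
# high half when the tweak is doubled.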
  2833. .Lmasks:
  2834. .quad 0x0101010101010101, 0x0101010101010101
  2835. .quad 0x0202020202020202, 0x0202020202020202
  2836. .quad 0x0404040404040404, 0x0404040404040404
  2837. .quad 0x0808080808080808, 0x0808080808080808
  2838. .LM0:
  2839. .quad 0x02060a0e03070b0f, 0x0004080c0105090d
  2840. .L63:
  2841. .quad 0x6363636363636363, 0x6363636363636363
  2842. .asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
  2843. .align 64
  2844. .size _bsaes_const,.-_bsaes_const
  2845. ___
  2846. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  2847. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
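# se_handler determines which part of the function faulted from HandlerData[]
# (the body/epilogue/tail labels registered below), restores %xmm6-%xmm15 and
# the pushed GPRs from the frame, and then defers to RtlVirtualUnwind.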
  2848. if ($win64) {
  2849. $rec="%rcx";
  2850. $frame="%rdx";
  2851. $context="%r8";
  2852. $disp="%r9";
  2853. $code.=<<___;
  2854. .extern __imp_RtlVirtualUnwind
  2855. .type se_handler,\@abi-omnipotent
  2856. .align 16
  2857. se_handler:
  2858. push %rsi
  2859. push %rdi
  2860. push %rbx
  2861. push %rbp
  2862. push %r12
  2863. push %r13
  2864. push %r14
  2865. push %r15
  2866. pushfq
  2867. sub \$64,%rsp
  2868. mov 120($context),%rax # pull context->Rax
  2869. mov 248($context),%rbx # pull context->Rip
  2870. mov 8($disp),%rsi # disp->ImageBase
  2871. mov 56($disp),%r11 # disp->HandlerData
  2872. mov 0(%r11),%r10d # HandlerData[0]
  2873. lea (%rsi,%r10),%r10 # prologue label
  2874. cmp %r10,%rbx # context->Rip<=prologue label
  2875. jbe .Lin_prologue
  2876. mov 4(%r11),%r10d # HandlerData[1]
  2877. lea (%rsi,%r10),%r10 # epilogue label
  2878. cmp %r10,%rbx # context->Rip>=epilogue label
  2879. jae .Lin_prologue
  2880. mov 8(%r11),%r10d # HandlerData[2]
  2881. lea (%rsi,%r10),%r10 # epilogue label
  2882. cmp %r10,%rbx # context->Rip>=tail label
  2883. jae .Lin_tail
  2884. mov 160($context),%rax # pull context->Rbp
  2885. lea 0x40(%rax),%rsi # %xmm save area
  2886. lea 512($context),%rdi # &context.Xmm6
  2887. mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
  2888. .long 0xa548f3fc # cld; rep movsq
  2889. lea 0xa0+0x78(%rax),%rax # adjust stack pointer
  2890. .Lin_tail:
  2891. mov -48(%rax),%rbp
  2892. mov -40(%rax),%rbx
  2893. mov -32(%rax),%r12
  2894. mov -24(%rax),%r13
  2895. mov -16(%rax),%r14
  2896. mov -8(%rax),%r15
  2897. mov %rbx,144($context) # restore context->Rbx
  2898. mov %rbp,160($context) # restore context->Rbp
  2899. mov %r12,216($context) # restore context->R12
  2900. mov %r13,224($context) # restore context->R13
  2901. mov %r14,232($context) # restore context->R14
  2902. mov %r15,240($context) # restore context->R15
  2903. .Lin_prologue:
  2904. mov %rax,152($context) # restore context->Rsp
  2905. mov 40($disp),%rdi # disp->ContextRecord
  2906. mov $context,%rsi # context
  2907. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  2908. .long 0xa548f3fc # cld; rep movsq
  2909. mov $disp,%rsi
  2910. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  2911. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  2912. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  2913. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  2914. mov 40(%rsi),%r10 # disp->ContextRecord
  2915. lea 56(%rsi),%r11 # &disp->HandlerData
  2916. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  2917. mov %r10,32(%rsp) # arg5
  2918. mov %r11,40(%rsp) # arg6
  2919. mov %r12,48(%rsp) # arg7
  2920. mov %rcx,56(%rsp) # arg8, (NULL)
  2921. call *__imp_RtlVirtualUnwind(%rip)
  2922. mov \$1,%eax # ExceptionContinueSearch
  2923. add \$64,%rsp
  2924. popfq
  2925. pop %r15
  2926. pop %r14
  2927. pop %r13
  2928. pop %r12
  2929. pop %rbp
  2930. pop %rbx
  2931. pop %rdi
  2932. pop %rsi
  2933. ret
  2934. .size se_handler,.-se_handler
  2935. .section .pdata
  2936. .align 4
  2937. ___
  2938. $code.=<<___ if ($ecb);
  2939. .rva .Lecb_enc_prologue
  2940. .rva .Lecb_enc_epilogue
  2941. .rva .Lecb_enc_info
  2942. .rva .Lecb_dec_prologue
  2943. .rva .Lecb_dec_epilogue
  2944. .rva .Lecb_dec_info
  2945. ___
  2946. $code.=<<___;
  2947. .rva .Lcbc_dec_prologue
  2948. .rva .Lcbc_dec_epilogue
  2949. .rva .Lcbc_dec_info
  2950. .rva .Lctr_enc_prologue
  2951. .rva .Lctr_enc_epilogue
  2952. .rva .Lctr_enc_info
  2953. .rva .Lxts_enc_prologue
  2954. .rva .Lxts_enc_epilogue
  2955. .rva .Lxts_enc_info
  2956. .rva .Lxts_dec_prologue
  2957. .rva .Lxts_dec_epilogue
  2958. .rva .Lxts_dec_info
  2959. .section .xdata
  2960. .align 8
  2961. ___
  2962. $code.=<<___ if ($ecb);
  2963. .Lecb_enc_info:
  2964. .byte 9,0,0,0
  2965. .rva se_handler
  2966. .rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
  2967. .rva .Lecb_enc_tail
  2968. .long 0
  2969. .Lecb_dec_info:
  2970. .byte 9,0,0,0
  2971. .rva se_handler
  2972. .rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
  2973. .rva .Lecb_dec_tail
  2974. .long 0
  2975. ___
  2976. $code.=<<___;
  2977. .Lcbc_dec_info:
  2978. .byte 9,0,0,0
  2979. .rva se_handler
  2980. .rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
  2981. .rva .Lcbc_dec_tail
  2982. .long 0
  2983. .Lctr_enc_info:
  2984. .byte 9,0,0,0
  2985. .rva se_handler
  2986. .rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
  2987. .rva .Lctr_enc_tail
  2988. .long 0
  2989. .Lxts_enc_info:
  2990. .byte 9,0,0,0
  2991. .rva se_handler
  2992. .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
  2993. .rva .Lxts_enc_tail
  2994. .long 0
  2995. .Lxts_dec_info:
  2996. .byte 9,0,0,0
  2997. .rva se_handler
  2998. .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
  2999. .rva .Lxts_dec_tail
  3000. .long 0
  3001. ___
  3002. }
  3003. $code =~ s/\`([^\`]*)\`/eval($1)/gem;
  3004. print $code;
  3005. close STDOUT or die "error closing STDOUT: $!";