bsaes-x86_64.pl

#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop, resulting in a >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which made it
#   possible to skip one shiftrows(), reduce the bit-sliced key schedule
#   and speed up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#                Emilia's     this(*)     difference
#
# Core 2         9.30         8.69        +7%
# Nehalem(**)    7.63         6.88        +11%
# Atom           17.1         16.4        +4%
# Silvermont     -            12.9
#
# (*)  Comparison is not completely fair, because "this" is ECB,
#      i.e. no extra processing such as counter value calculation
#      and xor-ing of input as in Emilia's CTR implementation is
#      performed. However, the CTR calculations stand for not more
#      than 1% of total time, so the comparison is *rather* fair.
#
# (**) Results were collected on Westmere, which is considered to
#      be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially for short inputs.
# Conversion time in CPU cycles and its ratio to CPU cycles spent in
# the 8x block function is:
#
#                conversion   conversion/8x block
# Core 2         240          0.22
# Nehalem        180          0.20
# Atom           430          0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272-byte ones - 29%, 400-byte ones - 22%, etc. Yet, despite all these
# "shortcomings" it's still faster than the ["hyper-threading-safe"
# code path in] aes-x86_64.pl on all lengths above 64 bytes...
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of a 4096-byte buffer with a 128-bit key is:
#
# Core 2         9.98
# Nehalem        7.80
# Atom           17.9
# Silvermont     14.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
#                                          <appro@openssl.org>
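# The boilerplate below locates the x86_64-xlate.pl "perlasm" translator
# (either next to this file or under ../../perlasm/) and pipes all
# generated code through it, so that the same source can emit ELF/AT&T,
# Win64 masm/nasm or mingw64 flavours of assembly.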
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
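# The bit-sliced S-box below follows the Käsper-Schwabe construction:
# a linear change of basis (InBasisChange), a shared GF(2^8) inversion
# built from GF(2^4)/GF(2^2) arithmetic (Inv_GF256 and the Mul_* helpers),
# and a change of basis back (OutBasisChange), applied to eight 128-bit
# registers that each hold one bit of every byte of eight AES blocks.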
sub Sbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InBasisChange (@b);
&Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
&OutBasisChange (@b[7,1,4,2,6,5,0,3]);
}
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
pxor @b[1], @b[2]
pxor @b[0], @b[3]
pxor @b[2], @b[6]
pxor @b[0], @b[5]
pxor @b[3], @b[6]
pxor @b[7], @b[3]
pxor @b[5], @b[7]
pxor @b[4], @b[3]
pxor @b[5], @b[4]
pxor @b[1], @b[3]
pxor @b[7], @b[2]
pxor @b[5], @b[1]
___
}
sub OutBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[0]
pxor @b[4], @b[1]
pxor @b[0], @b[2]
pxor @b[6], @b[4]
pxor @b[1], @b[6]
pxor @b[5], @b[1]
pxor @b[3], @b[5]
pxor @b[7], @b[3]
pxor @b[5], @b[7]
pxor @b[5], @b[2]
pxor @b[7], @b[4]
___
}
sub InvSbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InvInBasisChange (@b);
&Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
&InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
}
sub InvInBasisChange { # OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
pxor @b[7], @b[4]
pxor @b[5], @b[7]
pxor @b[5], @b[2]
pxor @b[7], @b[3]
pxor @b[3], @b[5]
pxor @b[5], @b[1]
pxor @b[1], @b[6]
pxor @b[0], @b[2]
pxor @b[6], @b[4]
pxor @b[6], @b[0]
pxor @b[4], @b[1]
___
}
sub InvOutBasisChange { # InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
pxor @b[5], @b[1]
pxor @b[7], @b[2]
pxor @b[1], @b[3]
pxor @b[5], @b[4]
pxor @b[5], @b[7]
pxor @b[4], @b[3]
pxor @b[0], @b[5]
pxor @b[7], @b[3]
pxor @b[2], @b[6]
pxor @b[1], @b[2]
pxor @b[3], @b[6]
pxor @b[0], @b[3]
pxor @b[6], @b[5]
___
}
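# Mul_GF4 multiplies two GF(2^2) elements (two bit-slice registers per
# operand) using one temporary; Mul_GF4_N additionally scales the product,
# and Mul_GF4_N_GF4 interleaves one of each to hide instruction latency.
# Mul_GF16_2 combines them into two GF(2^4) multiplications sharing one
# operand, the building block of the GF(2^8) inversion in Inv_GF256.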
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
movdqa $y0, $t0
pxor $y1, $t0
pand $x0, $t0
pxor $x1, $x0
pand $y0, $x1
pand $y1, $x0
pxor $x1, $x0
pxor $t0, $x1
___
}
sub Mul_GF4_N { # not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
movdqa $y0, $t0
pxor $y1, $t0
pand $x0, $t0
pxor $x1, $x0
pand $y0, $x1
pand $y1, $x0
pxor $x0, $x1
pxor $t0, $x0
___
}
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
$x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
movdqa $y0, $t0
movdqa $y2, $t1
pxor $y1, $t0
pxor $y3, $t1
pand $x0, $t0
pand $x2, $t1
pxor $x1, $x0
pxor $x3, $x2
pand $y0, $x1
pand $y2, $x3
pand $y1, $x0
pand $y3, $x2
pxor $x0, $x1
pxor $x3, $x2
pxor $t0, $x0
pxor $t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
movdqa @x[0], @t[0]
movdqa @x[1], @t[1]
___
&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
pxor @x[2], @t[0]
pxor @x[3], @t[1]
pxor @y[2], @y[0]
pxor @y[3], @y[1]
___
Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
pxor @t[0], @x[0]
pxor @t[0], @x[2]
pxor @t[1], @x[1]
pxor @t[1], @x[3]
movdqa @x[4], @t[0]
movdqa @x[5], @t[1]
pxor @x[6], @t[0]
pxor @x[7], @t[1]
___
&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
pxor @y[2], @y[0]
pxor @y[3], @y[1]
___
&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
pxor @t[0], @x[4]
pxor @t[0], @x[6]
pxor @t[1], @x[5]
pxor @t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
movdqa @x[4], @t[3]
movdqa @x[5], @t[2]
movdqa @x[1], @t[1]
movdqa @x[7], @s[1]
movdqa @x[0], @s[0]
pxor @x[6], @t[3]
pxor @x[7], @t[2]
pxor @x[3], @t[1]
movdqa @t[3], @s[2]
pxor @x[6], @s[1]
movdqa @t[2], @t[0]
pxor @x[2], @s[0]
movdqa @t[3], @s[3]
por @t[1], @t[2]
por @s[0], @t[3]
pxor @t[0], @s[3]
pand @s[0], @s[2]
pxor @t[1], @s[0]
pand @t[1], @t[0]
pand @s[0], @s[3]
movdqa @x[3], @s[0]
pxor @x[2], @s[0]
pand @s[0], @s[1]
pxor @s[1], @t[3]
pxor @s[1], @t[2]
movdqa @x[4], @s[1]
movdqa @x[1], @s[0]
pxor @x[5], @s[1]
pxor @x[0], @s[0]
movdqa @s[1], @t[1]
pand @s[0], @s[1]
por @s[0], @t[1]
pxor @s[1], @t[0]
pxor @s[3], @t[3]
pxor @s[2], @t[2]
pxor @s[3], @t[1]
movdqa @x[7], @s[0]
pxor @s[2], @t[0]
movdqa @x[6], @s[1]
pxor @s[2], @t[1]
movdqa @x[5], @s[2]
pand @x[3], @s[0]
movdqa @x[4], @s[3]
pand @x[2], @s[1]
pand @x[1], @s[2]
por @x[0], @s[3]
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
# new smaller inversion
movdqa @t[3], @s[0]
pand @t[1], @t[3]
pxor @t[2], @s[0]
movdqa @t[0], @s[2]
movdqa @s[0], @s[3]
pxor @t[3], @s[2]
pand @s[2], @s[3]
movdqa @t[1], @s[1]
pxor @t[2], @s[3]
pxor @t[0], @s[1]
pxor @t[2], @t[3]
pand @t[3], @s[1]
movdqa @s[2], @t[2]
pxor @t[0], @s[1]
pxor @s[1], @t[2]
pxor @s[1], @t[1]
pand @t[0], @t[2]
pxor @t[2], @s[2]
pxor @t[2], @t[1]
pand @s[3], @s[2]
pxor @s[0], @s[2]
___
# output in s3, s2, s1, t1
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
# AES linear components
sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
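# Despite the name, ShiftRows also performs AddRoundKey: the pxor block
# below mixes in the current 128-byte bit-sliced round key at ($key),
# the pshufb-s implement the bit-sliced ShiftRows permutation, and $key
# is advanced by 0x80 so the caller simply loops over rounds.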
$code.=<<___;
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
pxor 0x20($key),@x[2]
pxor 0x30($key),@x[3]
pshufb $mask,@x[0]
pshufb $mask,@x[1]
pxor 0x40($key),@x[4]
pxor 0x50($key),@x[5]
pshufb $mask,@x[2]
pshufb $mask,@x[3]
pxor 0x60($key),@x[6]
pxor 0x70($key),@x[7]
pshufb $mask,@x[4]
pshufb $mask,@x[5]
pshufb $mask,@x[6]
pshufb $mask,@x[7]
lea 0x80($key),$key
___
}
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16]; # optional
$code.=<<___;
pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
pshufd \$0x93, @x[1], @t[1]
pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
pshufd \$0x93, @x[2], @t[2]
pxor @t[1], @x[1]
pshufd \$0x93, @x[3], @t[3]
pxor @t[2], @x[2]
pshufd \$0x93, @x[4], @t[4]
pxor @t[3], @x[3]
pshufd \$0x93, @x[5], @t[5]
pxor @t[4], @x[4]
pshufd \$0x93, @x[6], @t[6]
pxor @t[5], @x[5]
pshufd \$0x93, @x[7], @t[7]
pxor @t[6], @x[6]
pxor @t[7], @x[7]
pxor @x[0], @t[1]
pxor @x[7], @t[0]
pxor @x[7], @t[1]
pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
pxor @x[1], @t[2]
pshufd \$0x4E, @x[1], @x[1]
pxor @x[4], @t[5]
pxor @t[0], @x[0]
pxor @x[5], @t[6]
pxor @t[1], @x[1]
pxor @x[3], @t[4]
pshufd \$0x4E, @x[4], @t[0]
pxor @x[6], @t[7]
pshufd \$0x4E, @x[5], @t[1]
pxor @x[2], @t[3]
pshufd \$0x4E, @x[3], @x[4]
pxor @x[7], @t[3]
pshufd \$0x4E, @x[7], @x[5]
pxor @x[7], @t[4]
pshufd \$0x4E, @x[6], @x[3]
pxor @t[4], @t[0]
pshufd \$0x4E, @x[2], @x[6]
pxor @t[5], @t[1]
___
$code.=<<___ if (!$inv);
pxor @t[3], @x[4]
pxor @t[7], @x[5]
pxor @t[6], @x[3]
movdqa @t[0], @x[2]
pxor @t[2], @x[6]
movdqa @t[1], @x[7]
___
$code.=<<___ if ($inv);
pxor @x[4], @t[3]
pxor @t[7], @x[5]
pxor @x[3], @t[6]
movdqa @t[0], @x[3]
pxor @t[2], @x[6]
movdqa @t[6], @x[2]
movdqa @t[1], @x[7]
movdqa @x[6], @x[4]
movdqa @t[3], @x[6]
___
}
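# InvMixColumns_orig below is the straightforward 0x0e/0x0b/0x0d/0x09
# implementation and appears to be kept for reference only; the
# InvMixColumns that is actually called factors the inverse matrix
# through MixColumns instead (see the comment in that subroutine).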
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
# multiplication by 0x0e
pshufd \$0x93, @x[7], @t[7]
movdqa @x[2], @t[2]
pxor @x[5], @x[7] # 7 5
pxor @x[5], @x[2] # 2 5
pshufd \$0x93, @x[0], @t[0]
movdqa @x[5], @t[5]
pxor @x[0], @x[5] # 5 0 [1]
pxor @x[1], @x[0] # 0 1
pshufd \$0x93, @x[1], @t[1]
pxor @x[2], @x[1] # 1 25
pxor @x[6], @x[0] # 01 6 [2]
pxor @x[3], @x[1] # 125 3 [4]
pshufd \$0x93, @x[3], @t[3]
pxor @x[0], @x[2] # 25 016 [3]
pxor @x[7], @x[3] # 3 75
pxor @x[6], @x[7] # 75 6 [0]
pshufd \$0x93, @x[6], @t[6]
movdqa @x[4], @t[4]
pxor @x[4], @x[6] # 6 4
pxor @x[3], @x[4] # 4 375 [6]
pxor @x[7], @x[3] # 375 756=36
pxor @t[5], @x[6] # 64 5 [7]
pxor @t[2], @x[3] # 36 2
pxor @t[4], @x[3] # 362 4 [5]
pshufd \$0x93, @t[5], @t[5]
___
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
# multiplication by 0x0b
pxor @y[0], @y[1]
pxor @t[0], @y[0]
pxor @t[1], @y[1]
pshufd \$0x93, @t[2], @t[2]
pxor @t[5], @y[0]
pxor @t[6], @y[1]
pxor @t[7], @y[0]
pshufd \$0x93, @t[4], @t[4]
pxor @t[6], @t[7] # clobber t[7]
pxor @y[0], @y[1]
pxor @t[0], @y[3]
pshufd \$0x93, @t[0], @t[0]
pxor @t[1], @y[2]
pxor @t[1], @y[4]
pxor @t[2], @y[2]
pshufd \$0x93, @t[1], @t[1]
pxor @t[2], @y[3]
pxor @t[2], @y[5]
pxor @t[7], @y[2]
pshufd \$0x93, @t[2], @t[2]
pxor @t[3], @y[3]
pxor @t[3], @y[6]
pxor @t[3], @y[4]
pshufd \$0x93, @t[3], @t[3]
pxor @t[4], @y[7]
pxor @t[4], @y[5]
pxor @t[7], @y[7]
pxor @t[5], @y[3]
pxor @t[4], @y[4]
pxor @t[5], @t[7] # clobber t[7] even more
pxor @t[7], @y[5]
pshufd \$0x93, @t[4], @t[4]
pxor @t[7], @y[6]
pxor @t[7], @y[4]
pxor @t[5], @t[7]
pshufd \$0x93, @t[5], @t[5]
pxor @t[6], @t[7] # restore t[7]
# multiplication by 0x0d
pxor @y[7], @y[4]
pxor @t[4], @y[7]
pshufd \$0x93, @t[6], @t[6]
pxor @t[0], @y[2]
pxor @t[5], @y[7]
pxor @t[2], @y[2]
pshufd \$0x93, @t[7], @t[7]
pxor @y[1], @y[3]
pxor @t[1], @y[1]
pxor @t[0], @y[0]
pxor @t[0], @y[3]
pxor @t[5], @y[1]
pxor @t[5], @y[0]
pxor @t[7], @y[1]
pshufd \$0x93, @t[0], @t[0]
pxor @t[6], @y[0]
pxor @y[1], @y[3]
pxor @t[1], @y[4]
pshufd \$0x93, @t[1], @t[1]
pxor @t[7], @y[7]
pxor @t[2], @y[4]
pxor @t[2], @y[5]
pshufd \$0x93, @t[2], @t[2]
pxor @t[6], @y[2]
pxor @t[3], @t[6] # clobber t[6]
pxor @y[7], @y[4]
pxor @t[6], @y[3]
pxor @t[6], @y[6]
pxor @t[5], @y[5]
pxor @t[4], @y[6]
pshufd \$0x93, @t[4], @t[4]
pxor @t[6], @y[5]
pxor @t[7], @y[6]
pxor @t[3], @t[6] # restore t[6]
pshufd \$0x93, @t[5], @t[5]
pshufd \$0x93, @t[6], @t[6]
pshufd \$0x93, @t[7], @t[7]
pshufd \$0x93, @t[3], @t[3]
# multiplication by 0x09
pxor @y[1], @y[4]
pxor @y[1], @t[1] # t[1]=y[1]
pxor @t[5], @t[0] # clobber t[0]
pxor @t[5], @t[1]
pxor @t[0], @y[3]
pxor @y[0], @t[0] # t[0]=y[0]
pxor @t[6], @t[1]
pxor @t[7], @t[6] # clobber t[6]
pxor @t[1], @y[4]
pxor @t[4], @y[7]
pxor @y[4], @t[4] # t[4]=y[4]
pxor @t[3], @y[6]
pxor @y[3], @t[3] # t[3]=y[3]
pxor @t[2], @y[5]
pxor @y[2], @t[2] # t[2]=y[2]
pxor @t[7], @t[3]
pxor @y[5], @t[5] # t[5]=y[5]
pxor @t[6], @t[2]
pxor @t[6], @t[5]
pxor @y[6], @t[6] # t[6]=y[6]
pxor @y[7], @t[7] # t[7]=y[7]
movdqa @t[0],@XMM[0]
movdqa @t[1],@XMM[1]
movdqa @t[2],@XMM[2]
movdqa @t[3],@XMM[3]
movdqa @t[4],@XMM[4]
movdqa @t[5],@XMM[5]
movdqa @t[6],@XMM[6]
movdqa @t[7],@XMM[7]
___
}
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];
# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
$code.=<<___;
# multiplication by 0x05-0x00-0x04-0x00
pshufd \$0x4E, @x[0], @t[0]
pshufd \$0x4E, @x[6], @t[6]
pxor @x[0], @t[0]
pshufd \$0x4E, @x[7], @t[7]
pxor @x[6], @t[6]
pshufd \$0x4E, @x[1], @t[1]
pxor @x[7], @t[7]
pshufd \$0x4E, @x[2], @t[2]
pxor @x[1], @t[1]
pshufd \$0x4E, @x[3], @t[3]
pxor @x[2], @t[2]
pxor @t[6], @x[0]
pxor @t[6], @x[1]
pshufd \$0x4E, @x[4], @t[4]
pxor @x[3], @t[3]
pxor @t[0], @x[2]
pxor @t[1], @x[3]
pshufd \$0x4E, @x[5], @t[5]
pxor @x[4], @t[4]
pxor @t[7], @x[1]
pxor @t[2], @x[4]
pxor @x[5], @t[5]
pxor @t[7], @x[2]
pxor @t[6], @x[3]
pxor @t[6], @x[4]
pxor @t[3], @x[5]
pxor @t[4], @x[6]
pxor @t[7], @x[4]
pxor @t[7], @x[5]
pxor @t[5], @x[7]
___
&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
}
sub aesenc { # not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
movdqa 0x30($const),@t[0] # .LSR
___
&ShiftRows (@b,@t[0]);
&Sbox (@b,@t);
&MixColumns (@b[0,1,4,6,3,7,2,5],@t);
}
sub aesenclast { # not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
movdqa 0x40($const),@t[0] # .LSRM0
___
&ShiftRows (@b,@t[0]);
&Sbox (@b,@t);
$code.=<<___
pxor 0x00($key),@b[0]
pxor 0x10($key),@b[1]
pxor 0x20($key),@b[4]
pxor 0x30($key),@b[6]
pxor 0x40($key),@b[3]
pxor 0x50($key),@b[7]
pxor 0x60($key),@b[2]
pxor 0x70($key),@b[5]
___
}
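# swapmove exchanges the bit groups selected by $mask between $a and $b
# after shifting $b right by $n (t=b; b=(b>>n)^a; b&=mask; a^=b; b=(b<<n)^t);
# swapmove2x does the same for two register pairs at once. With the masks
# 0x55.., 0x33.., 0x0f.. and shifts 1, 2, 4 this is the bit-matrix
# transpose used by bitslice() below.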
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
movdqa $b,$t
psrlq \$$n,$b
pxor $a,$b
pand $mask,$b
pxor $b,$a
psllq \$$n,$b
pxor $t,$b
___
}
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
movdqa $b0,$t0
psrlq \$$n,$b0
movdqa $b1,$t1
psrlq \$$n,$b1
pxor $a0,$b0
pxor $a1,$b1
pand $mask,$b0
pand $mask,$b1
pxor $b0,$a0
psllq \$$n,$b0
pxor $b1,$a1
psllq \$$n,$b1
pxor $t0,$b0
pxor $t1,$b1
___
}
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
movdqa 0x00($const),$t0 # .LBS0
movdqa 0x10($const),$t1 # .LBS1
___
&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
movdqa 0x20($const),$t0 # .LBS2
___
&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
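# _bsaes_encrypt8/_bsaes_decrypt8 below process eight blocks at a time:
# input blocks in @XMM[0..7], pointer to the bit-sliced key schedule in
# $key (%rax), number of rounds in $rounds (%r10d). Output comes back in
# the permuted register order noted before the final bitslice() calls.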
$code.=<<___;
.text
.extern asm_AES_encrypt
.extern asm_AES_decrypt
.type _bsaes_encrypt8,\@abi-omnipotent
.align 64
_bsaes_encrypt8:
lea .LBS0(%rip), $const # constants table
movdqa ($key), @XMM[9] # round 0 key
lea 0x10($key), $key
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[3]
pshufb @XMM[8], @XMM[0]
pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[5]
pshufb @XMM[8], @XMM[2]
pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[7]
pshufb @XMM[8], @XMM[4]
pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
dec $rounds
jmp .Lenc_sbox
.align 16
.Lenc_loop:
___
&ShiftRows (@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
&Sbox (@XMM[0..7, 8..15]);
$code.=<<___;
dec $rounds
jl .Lenc_done
___
&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
movdqa 0x30($const), @XMM[8] # .LSR
jnz .Lenc_loop
movdqa 0x40($const), @XMM[8] # .LSRM0
jmp .Lenc_loop
.align 16
.Lenc_done:
___
# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
movdqa ($key), @XMM[8] # last round key
pxor @XMM[8], @XMM[4]
pxor @XMM[8], @XMM[6]
pxor @XMM[8], @XMM[3]
pxor @XMM[8], @XMM[7]
pxor @XMM[8], @XMM[2]
pxor @XMM[8], @XMM[5]
pxor @XMM[8], @XMM[0]
pxor @XMM[8], @XMM[1]
ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_decrypt8,\@abi-omnipotent
.align 64
_bsaes_decrypt8:
lea .LBS0(%rip), $const # constants table
movdqa ($key), @XMM[9] # round 0 key
lea 0x10($key), $key
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[3]
pshufb @XMM[8], @XMM[0]
pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[5]
pshufb @XMM[8], @XMM[2]
pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[7]
pshufb @XMM[8], @XMM[4]
pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
dec $rounds
jmp .Ldec_sbox
.align 16
.Ldec_loop:
___
&ShiftRows (@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
&InvSbox (@XMM[0..7, 8..15]);
$code.=<<___;
dec $rounds
jl .Ldec_done
___
&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
movdqa -0x10($const), @XMM[8] # .LISR
jnz .Ldec_loop
movdqa -0x20($const), @XMM[8] # .LISRM0
jmp .Ldec_loop
.align 16
.Ldec_done:
___
&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
movdqa ($key), @XMM[8] # last round key
pxor @XMM[8], @XMM[6]
pxor @XMM[8], @XMM[4]
pxor @XMM[8], @XMM[2]
pxor @XMM[8], @XMM[7]
pxor @XMM[8], @XMM[3]
pxor @XMM[8], @XMM[5]
pxor @XMM[8], @XMM[0]
pxor @XMM[8], @XMM[1]
ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
&swapmove (@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
#&swapmove(@x[2,3],1,$t0,$t2,$t3);
movdqa @x[0], @x[2]
movdqa @x[1], @x[3]
___
#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
movdqa @x[0], @x[4]
movdqa @x[2], @x[6]
movdqa @x[1], @x[5]
movdqa @x[3], @x[7]
___
&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
}
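# _bsaes_key_convert turns a conventional AES key schedule (as produced
# by AES_set_encrypt_key), passed in $inp, into the bit-sliced schedule
# at $out: the round-0 key is stored verbatim, every inner round key is
# expanded to 8x16 bytes via the pcmpeqb-against-bit-mask trick below,
# and the last round key is returned in %xmm6 (with the .L63 constant in
# %xmm7) for the caller to fix up; see the ECB/CBC glue further down.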
$code.=<<___;
.type _bsaes_key_convert,\@abi-omnipotent
.align 16
_bsaes_key_convert:
lea .Lmasks(%rip), $const
movdqu ($inp), %xmm7 # load round 0 key
lea 0x10($inp), $inp
movdqa 0x00($const), %xmm0 # 0x01...
movdqa 0x10($const), %xmm1 # 0x02...
movdqa 0x20($const), %xmm2 # 0x04...
movdqa 0x30($const), %xmm3 # 0x08...
movdqa 0x40($const), %xmm4 # .LM0
pcmpeqd %xmm5, %xmm5 # .LNOT
movdqu ($inp), %xmm6 # load round 1 key
movdqa %xmm7, ($out) # save round 0 key
lea 0x10($out), $out
dec $rounds
jmp .Lkey_loop
.align 16
.Lkey_loop:
pshufb %xmm4, %xmm6 # .LM0
movdqa %xmm0, %xmm8
movdqa %xmm1, %xmm9
pand %xmm6, %xmm8
pand %xmm6, %xmm9
movdqa %xmm2, %xmm10
pcmpeqb %xmm0, %xmm8
psllq \$4, %xmm0 # 0x10...
movdqa %xmm3, %xmm11
pcmpeqb %xmm1, %xmm9
psllq \$4, %xmm1 # 0x20...
pand %xmm6, %xmm10
pand %xmm6, %xmm11
movdqa %xmm0, %xmm12
pcmpeqb %xmm2, %xmm10
psllq \$4, %xmm2 # 0x40...
movdqa %xmm1, %xmm13
pcmpeqb %xmm3, %xmm11
psllq \$4, %xmm3 # 0x80...
movdqa %xmm2, %xmm14
movdqa %xmm3, %xmm15
pxor %xmm5, %xmm8 # "pnot"
pxor %xmm5, %xmm9
pand %xmm6, %xmm12
pand %xmm6, %xmm13
movdqa %xmm8, 0x00($out) # write bit-sliced round key
pcmpeqb %xmm0, %xmm12
psrlq \$4, %xmm0 # 0x01...
movdqa %xmm9, 0x10($out)
pcmpeqb %xmm1, %xmm13
psrlq \$4, %xmm1 # 0x02...
lea 0x10($inp), $inp
pand %xmm6, %xmm14
pand %xmm6, %xmm15
movdqa %xmm10, 0x20($out)
pcmpeqb %xmm2, %xmm14
psrlq \$4, %xmm2 # 0x04...
movdqa %xmm11, 0x30($out)
pcmpeqb %xmm3, %xmm15
psrlq \$4, %xmm3 # 0x08...
movdqu ($inp), %xmm6 # load next round key
pxor %xmm5, %xmm13 # "pnot"
pxor %xmm5, %xmm14
movdqa %xmm12, 0x40($out)
movdqa %xmm13, 0x50($out)
movdqa %xmm14, 0x60($out)
movdqa %xmm15, 0x70($out)
lea 0x80($out),$out
dec $rounds
jnz .Lkey_loop
movdqa 0x50($const), %xmm7 # .L63
#movdqa %xmm6, ($out) # don't save last round key
ret
.size _bsaes_key_convert,.-_bsaes_key_convert
___
}
if (0 && !$win64) { # following four functions are unsupported interface
# used for benchmarking...
$code.=<<___;
.globl bsaes_enc_key_convert
.type bsaes_enc_key_convert,\@function,2
.align 16
bsaes_enc_key_convert:
mov 240($inp),%r10d # pass rounds
mov $inp,%rcx # pass key
mov $out,%rax # pass key schedule
call _bsaes_key_convert
pxor %xmm6,%xmm7 # fix up last round key
movdqa %xmm7,(%rax) # save last round key
ret
.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
.globl bsaes_encrypt_128
.type bsaes_encrypt_128,\@function,4
.align 16
bsaes_encrypt_128:
.Lenc128_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
movdqu 0x60($inp), @XMM[6]
movdqu 0x70($inp), @XMM[7]
mov $key, %rax # pass the $key
lea 0x80($inp), $inp
mov \$10,%r10d
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$0x80,$len
ja .Lenc128_loop
ret
.size bsaes_encrypt_128,.-bsaes_encrypt_128
.globl bsaes_dec_key_convert
.type bsaes_dec_key_convert,\@function,2
.align 16
bsaes_dec_key_convert:
mov 240($inp),%r10d # pass rounds
mov $inp,%rcx # pass key
mov $out,%rax # pass key schedule
call _bsaes_key_convert
pxor ($out),%xmm7 # fix up round 0 key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,($out)
ret
.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
.globl bsaes_decrypt_128
.type bsaes_decrypt_128,\@function,4
.align 16
bsaes_decrypt_128:
.Ldec128_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
movdqu 0x60($inp), @XMM[6]
movdqu 0x70($inp), @XMM[7]
mov $key, %rax # pass the $key
lea 0x80($inp), $inp
mov \$10,%r10d
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$0x80,$len
ja .Ldec128_loop
ret
.size bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
: ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
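# Arguments arrive in the ABI registers above (Win64 or SysV) and are
# copied into the callee-saved %r12-%r15 right after each prologue, so
# they survive the calls into _bsaes_key_convert, _bsaes_encrypt8/
# _bsaes_decrypt8 and the asm_AES_* fallbacks.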
if ($ecb) {
$code.=<<___;
.globl bsaes_ecb_encrypt_blocks
.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_encrypt_blocks:
mov %rsp, %rax
.Lecb_enc_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
mov $arg3,$len
mov $arg4,$key
cmp \$8,$arg3
jb .Lecb_enc_short
mov %eax,%ebx # backup rounds
shl \$7,%rax # 128 bytes per inner round key
sub \$`128-32`,%rax # size of bit-sliced key schedule
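# rounds*128 - (128-32) bytes: 16 bytes for the round-0 key, 128 bytes
# per inner round key, plus 16 bytes for the last round key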
sub %rax,%rsp
mov %rsp,%rax # pass key schedule
mov $key,%rcx # pass key
mov %ebx,%r10d # pass rounds
call _bsaes_key_convert
pxor %xmm6,%xmm7 # fix up last round key
movdqa %xmm7,(%rax) # save last round key
sub \$8,$len
.Lecb_enc_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %ebx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
lea 0x80($inp), $inp
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$8,$len
jnc .Lecb_enc_loop
add \$8,$len
jz .Lecb_enc_done
movdqu 0x00($inp), @XMM[0] # load input
mov %rsp, %rax # pass key schedule
mov %ebx,%r10d # pass rounds
cmp \$2,$len
jb .Lecb_enc_one
movdqu 0x10($inp), @XMM[1]
je .Lecb_enc_two
movdqu 0x20($inp), @XMM[2]
cmp \$4,$len
jb .Lecb_enc_three
movdqu 0x30($inp), @XMM[3]
je .Lecb_enc_four
movdqu 0x40($inp), @XMM[4]
cmp \$6,$len
jb .Lecb_enc_five
movdqu 0x50($inp), @XMM[5]
je .Lecb_enc_six
movdqu 0x60($inp), @XMM[6]
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_six:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_five:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_four:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_three:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_two:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_one:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
jmp .Lecb_enc_done
.align 16
.Lecb_enc_short:
lea ($inp), $arg1
lea ($out), $arg2
lea ($key), $arg3
call asm_AES_encrypt
lea 16($inp), $inp
lea 16($out), $out
dec $len
jnz .Lecb_enc_short
.Lecb_enc_done:
lea (%rsp),%rax
pxor %xmm0, %xmm0
.Lecb_enc_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
jb .Lecb_enc_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lecb_enc_epilogue:
ret
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
.globl bsaes_ecb_decrypt_blocks
.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_decrypt_blocks:
mov %rsp, %rax
.Lecb_dec_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
mov $arg3,$len
mov $arg4,$key
cmp \$8,$arg3
jb .Lecb_dec_short
mov %eax,%ebx # backup rounds
shl \$7,%rax # 128 bytes per inner round key
sub \$`128-32`,%rax # size of bit-sliced key schedule
sub %rax,%rsp
mov %rsp,%rax # pass key schedule
mov $key,%rcx # pass key
mov %ebx,%r10d # pass rounds
call _bsaes_key_convert
pxor (%rsp),%xmm7 # fix up 0 round key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,(%rsp)
sub \$8,$len
.Lecb_dec_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %ebx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
lea 0x80($inp), $inp
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$8,$len
jnc .Lecb_dec_loop
add \$8,$len
jz .Lecb_dec_done
movdqu 0x00($inp), @XMM[0] # load input
mov %rsp, %rax # pass key schedule
mov %ebx,%r10d # pass rounds
cmp \$2,$len
jb .Lecb_dec_one
movdqu 0x10($inp), @XMM[1]
je .Lecb_dec_two
movdqu 0x20($inp), @XMM[2]
cmp \$4,$len
jb .Lecb_dec_three
movdqu 0x30($inp), @XMM[3]
je .Lecb_dec_four
movdqu 0x40($inp), @XMM[4]
cmp \$6,$len
jb .Lecb_dec_five
movdqu 0x50($inp), @XMM[5]
je .Lecb_dec_six
movdqu 0x60($inp), @XMM[6]
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_six:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_five:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_four:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_three:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_two:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_one:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
jmp .Lecb_dec_done
.align 16
.Lecb_dec_short:
lea ($inp), $arg1
lea ($out), $arg2
lea ($key), $arg3
call asm_AES_decrypt
lea 16($inp), $inp
lea 16($out), $out
dec $len
jnz .Lecb_dec_short
.Lecb_dec_done:
lea (%rsp),%rax
pxor %xmm0, %xmm0
.Lecb_dec_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
jb .Lecb_dec_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lecb_dec_epilogue:
ret
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern asm_AES_cbc_encrypt
.globl bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,\@abi-omnipotent
.align 16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
mov 48(%rsp),$arg6 # pull direction flag
___
$code.=<<___;
cmp \$0,$arg6
jne asm_AES_cbc_encrypt
cmp \$128,$arg3
jb asm_AES_cbc_encrypt
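# CBC encryption is inherently serial and short inputs don't amortize
# the key schedule conversion, so both cases are handed off to
# asm_AES_cbc_encrypt above; only bulk CBC decryption (128 bytes or
# more) is handled by the bit-sliced code below.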
mov %rsp, %rax
.Lcbc_dec_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull ivp
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
mov 240($arg4), %eax # rounds
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
mov $arg4, $key
mov $arg5, %rbx
shr \$4, $len # bytes to blocks
mov %eax, %edx # rounds
shl \$7, %rax # 128 bytes per inner round key
sub \$`128-32`, %rax # size of bit-sliced key schedule
sub %rax, %rsp
mov %rsp, %rax # pass key schedule
mov $key, %rcx # pass key
mov %edx, %r10d # pass rounds
call _bsaes_key_convert
pxor (%rsp),%xmm7 # fix up 0 round key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,(%rsp)
movdqu (%rbx), @XMM[15] # load IV
sub \$8,$len
.Lcbc_dec_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %edx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[11]
pxor @XMM[10], @XMM[4]
movdqu 0x40($inp), @XMM[12]
pxor @XMM[11], @XMM[2]
movdqu 0x50($inp), @XMM[13]
pxor @XMM[12], @XMM[7]
movdqu 0x60($inp), @XMM[14]
pxor @XMM[13], @XMM[3]
movdqu 0x70($inp), @XMM[15] # IV
pxor @XMM[14], @XMM[5]
  1537. movdqu @XMM[0], 0x00($out) # write output
  1538. lea 0x80($inp), $inp
  1539. movdqu @XMM[1], 0x10($out)
  1540. movdqu @XMM[6], 0x20($out)
  1541. movdqu @XMM[4], 0x30($out)
  1542. movdqu @XMM[2], 0x40($out)
  1543. movdqu @XMM[7], 0x50($out)
  1544. movdqu @XMM[3], 0x60($out)
  1545. movdqu @XMM[5], 0x70($out)
  1546. lea 0x80($out), $out
  1547. sub \$8,$len
  1548. jnc .Lcbc_dec_loop
  1549. add \$8,$len
  1550. jz .Lcbc_dec_done
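	# 1..7 blocks left over: load however many remain and dispatch on the
	# count below; each tail still runs the full 8-wide _bsaes_decrypt8.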
	movdqu 0x00($inp), @XMM[0] # load input
	mov %rsp, %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	cmp \$2,$len
	jb .Lcbc_dec_one
	movdqu 0x10($inp), @XMM[1]
	je .Lcbc_dec_two
	movdqu 0x20($inp), @XMM[2]
	cmp \$4,$len
	jb .Lcbc_dec_three
	movdqu 0x30($inp), @XMM[3]
	je .Lcbc_dec_four
	movdqu 0x40($inp), @XMM[4]
	cmp \$6,$len
	jb .Lcbc_dec_five
	movdqu 0x50($inp), @XMM[5]
	je .Lcbc_dec_six
	movdqu 0x60($inp), @XMM[6]
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[8], @XMM[1]
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[9], @XMM[6]
	movdqu 0x30($inp), @XMM[11]
	pxor @XMM[10], @XMM[4]
	movdqu 0x40($inp), @XMM[12]
	pxor @XMM[11], @XMM[2]
	movdqu 0x50($inp), @XMM[13]
	pxor @XMM[12], @XMM[7]
	movdqu 0x60($inp), @XMM[15] # IV
	pxor @XMM[13], @XMM[3]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	movdqu @XMM[2], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	movdqu @XMM[3], 0x60($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_six:
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[8], @XMM[1]
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[9], @XMM[6]
	movdqu 0x30($inp), @XMM[11]
	pxor @XMM[10], @XMM[4]
	movdqu 0x40($inp), @XMM[12]
	pxor @XMM[11], @XMM[2]
	movdqu 0x50($inp), @XMM[15] # IV
	pxor @XMM[12], @XMM[7]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	movdqu @XMM[2], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_five:
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[8], @XMM[1]
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[9], @XMM[6]
	movdqu 0x30($inp), @XMM[11]
	pxor @XMM[10], @XMM[4]
	movdqu 0x40($inp), @XMM[15] # IV
	pxor @XMM[11], @XMM[2]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	movdqu @XMM[2], 0x40($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_four:
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[8], @XMM[1]
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[9], @XMM[6]
	movdqu 0x30($inp), @XMM[15] # IV
	pxor @XMM[10], @XMM[4]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_three:
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[8], @XMM[1]
	movdqu 0x20($inp), @XMM[15] # IV
	pxor @XMM[9], @XMM[6]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_two:
	movdqa @XMM[15], 0x20(%rbp) # put aside IV
	call _bsaes_decrypt8
	pxor 0x20(%rbp), @XMM[0] # ^= IV
	movdqu 0x00($inp), @XMM[8] # re-load input
	movdqu 0x10($inp), @XMM[15] # IV
	pxor @XMM[8], @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_one:
	lea ($inp), $arg1
	lea 0x20(%rbp), $arg2 # buffer output
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[15] # ^= IV
	movdqu @XMM[15], ($out) # write output
	movdqa @XMM[0], @XMM[15] # IV
.Lcbc_dec_done:
	movdqu @XMM[15], (%rbx) # return IV
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lcbc_dec_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lcbc_dec_bzero
	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lcbc_dec_epilogue:
	ret
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.globl bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ctr32_encrypt_blocks:
	mov %rsp, %rax
.Lctr_enc_prologue:
	push %rbp
	push %rbx
	push %r12
	push %r13
	push %r14
	push %r15
	lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov 0xa0(%rsp),$arg5 # pull ivp
	lea -0xa0(%rsp), %rsp
	movaps %xmm6, 0x40(%rsp)
	movaps %xmm7, 0x50(%rsp)
	movaps %xmm8, 0x60(%rsp)
	movaps %xmm9, 0x70(%rsp)
	movaps %xmm10, 0x80(%rsp)
	movaps %xmm11, 0x90(%rsp)
	movaps %xmm12, 0xa0(%rsp)
	movaps %xmm13, 0xb0(%rsp)
	movaps %xmm14, 0xc0(%rsp)
	movaps %xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov %rsp, %rbp # backup %rsp
	movdqu ($arg5), %xmm0 # load counter
	mov 240($arg4), %eax # rounds
	mov $arg1, $inp # backup arguments
	mov $arg2, $out
	mov $arg3, $len
	mov $arg4, $key
	movdqa %xmm0, 0x20(%rbp) # copy counter
	cmp \$8, $arg3
	jb .Lctr_enc_short
	mov %eax, %ebx # rounds
	shl \$7, %rax # 128 bytes per inner round key
	sub \$`128-32`, %rax # size of bit-sliced key schedule
	sub %rax, %rsp
	mov %rsp, %rax # pass key schedule
	mov $key, %rcx # pass key
	mov %ebx, %r10d # pass rounds
	call _bsaes_key_convert
	pxor %xmm6,%xmm7 # fix up last round key
	movdqa %xmm7,(%rax) # save last round key
	movdqa (%rsp), @XMM[9] # load round0 key
	lea .LADD1(%rip), %r11
	movdqa 0x20(%rbp), @XMM[0] # counter copy
	movdqa -0x20(%r11), @XMM[8] # .LSWPUP
	pshufb @XMM[8], @XMM[9] # byte swap upper part
	pshufb @XMM[8], @XMM[0]
	movdqa @XMM[9], (%rsp) # save adjusted round0 key
	jmp .Lctr_enc_loop
.align 16
.Lctr_enc_loop:
	movdqa @XMM[0], 0x20(%rbp) # save counter
	movdqa @XMM[0], @XMM[1] # prepare 8 counter values
	movdqa @XMM[0], @XMM[2]
	paddd 0x00(%r11), @XMM[1] # .LADD1
	movdqa @XMM[0], @XMM[3]
	paddd 0x10(%r11), @XMM[2] # .LADD2
	movdqa @XMM[0], @XMM[4]
	paddd 0x20(%r11), @XMM[3] # .LADD3
	movdqa @XMM[0], @XMM[5]
	paddd 0x30(%r11), @XMM[4] # .LADD4
	movdqa @XMM[0], @XMM[6]
	paddd 0x40(%r11), @XMM[5] # .LADD5
	movdqa @XMM[0], @XMM[7]
	paddd 0x50(%r11), @XMM[6] # .LADD6
	paddd 0x60(%r11), @XMM[7] # .LADD7
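	# @XMM[0..7] now hold counter+0 .. counter+7, still in the byte-swapped
	# order established with .LSWPUP above.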
	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa (%rsp), @XMM[9] # round 0 key
	lea 0x10(%rsp), %rax # pass key schedule
	movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
	pxor @XMM[9], @XMM[0] # xor with round0 key
	pxor @XMM[9], @XMM[1]
	pxor @XMM[9], @XMM[2]
	pxor @XMM[9], @XMM[3]
	pshufb @XMM[8], @XMM[0]
	pshufb @XMM[8], @XMM[1]
	pxor @XMM[9], @XMM[4]
	pxor @XMM[9], @XMM[5]
	pshufb @XMM[8], @XMM[2]
	pshufb @XMM[8], @XMM[3]
	pxor @XMM[9], @XMM[6]
	pxor @XMM[9], @XMM[7]
	pshufb @XMM[8], @XMM[4]
	pshufb @XMM[8], @XMM[5]
	pshufb @XMM[8], @XMM[6]
	pshufb @XMM[8], @XMM[7]
	lea .LBS0(%rip), %r11 # constants table
	mov %ebx,%r10d # pass rounds
	call _bsaes_encrypt8_bitslice
	sub \$8,$len
	jc .Lctr_enc_loop_done
	movdqu 0x00($inp), @XMM[8] # load input
	movdqu 0x10($inp), @XMM[9]
	movdqu 0x20($inp), @XMM[10]
	movdqu 0x30($inp), @XMM[11]
	movdqu 0x40($inp), @XMM[12]
	movdqu 0x50($inp), @XMM[13]
	movdqu 0x60($inp), @XMM[14]
	movdqu 0x70($inp), @XMM[15]
	lea 0x80($inp),$inp
	pxor @XMM[0], @XMM[8]
	movdqa 0x20(%rbp), @XMM[0] # load counter
	pxor @XMM[9], @XMM[1]
	movdqu @XMM[8], 0x00($out) # write output
	pxor @XMM[10], @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor @XMM[11], @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor @XMM[12], @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor @XMM[13], @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor @XMM[14], @XMM[2]
	movdqu @XMM[7], 0x50($out)
	pxor @XMM[15], @XMM[5]
	movdqu @XMM[2], 0x60($out)
	lea .LADD1(%rip), %r11
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out
	paddd 0x70(%r11), @XMM[0] # .LADD8
	jnz .Lctr_enc_loop
	jmp .Lctr_enc_done
.align 16
.Lctr_enc_loop_done:
	add \$8, $len
	movdqu 0x00($inp), @XMM[8] # load input
	pxor @XMM[8], @XMM[0]
	movdqu @XMM[0], 0x00($out) # write output
	cmp \$2,$len
	jb .Lctr_enc_done
	movdqu 0x10($inp), @XMM[9]
	pxor @XMM[9], @XMM[1]
	movdqu @XMM[1], 0x10($out)
	je .Lctr_enc_done
	movdqu 0x20($inp), @XMM[10]
	pxor @XMM[10], @XMM[4]
	movdqu @XMM[4], 0x20($out)
	cmp \$4,$len
	jb .Lctr_enc_done
	movdqu 0x30($inp), @XMM[11]
	pxor @XMM[11], @XMM[6]
	movdqu @XMM[6], 0x30($out)
	je .Lctr_enc_done
	movdqu 0x40($inp), @XMM[12]
	pxor @XMM[12], @XMM[3]
	movdqu @XMM[3], 0x40($out)
	cmp \$6,$len
	jb .Lctr_enc_done
	movdqu 0x50($inp), @XMM[13]
	pxor @XMM[13], @XMM[7]
	movdqu @XMM[7], 0x50($out)
	je .Lctr_enc_done
	movdqu 0x60($inp), @XMM[14]
	pxor @XMM[14], @XMM[2]
	movdqu @XMM[2], 0x60($out)
	jmp .Lctr_enc_done
.align 16
.Lctr_enc_short:
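	# fewer than 8 blocks total: encrypt the counter block at 0x20(%rbp)
	# into 0x30(%rbp) with the table-based AES and XOR one input block per
	# iteration, bumping the big-endian 32-bit counter word by hand.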
	lea 0x20(%rbp), $arg1
	lea 0x30(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_encrypt
	movdqu ($inp), @XMM[1]
	lea 16($inp), $inp
	mov 0x2c(%rbp), %eax # load 32-bit counter
	bswap %eax
	pxor 0x30(%rbp), @XMM[1]
	inc %eax # increment
	movdqu @XMM[1], ($out)
	bswap %eax
	lea 16($out), $out
	mov %eax, 0x2c(%rsp) # save 32-bit counter
	dec $len
	jnz .Lctr_enc_short
.Lctr_enc_done:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lctr_enc_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lctr_enc_bzero
	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lctr_enc_epilogue:
	ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
#	const AES_KEY *key1, const AES_KEY *key2,
#	const unsigned char iv[16]);
#
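# A minimal usage sketch (illustrative only; the key-setup calls are the
# generic OpenSSL AES_set_*_key helpers, not part of this file). Note that
# decryption takes a decryption schedule for key1 but still an encryption
# schedule for key2, because the tweak is always generated with
# asm_AES_encrypt:
#
#	AES_KEY key1, key2;
#	AES_set_encrypt_key(k1, 128, &key1);
#	AES_set_encrypt_key(k2, 128, &key2);
#	bsaes_xts_encrypt(in, out, len, &key1, &key2, iv);
#
#	AES_set_decrypt_key(k1, 128, &key1);
#	bsaes_xts_decrypt(in, out, len, &key1, &key2, iv);
#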
my ($twmask,$twres,$twtmp)=@XMM[13..15];
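# $arg6 is the ivp pointer here, so drop the "d" suffix of its 32-bit alias
# and use the full 64-bit register name below.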
$arg6=~s/d$//;
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,\@abi-omnipotent
.align 16
bsaes_xts_encrypt:
	mov %rsp, %rax
.Lxts_enc_prologue:
	push %rbp
	push %rbx
	push %r12
	push %r13
	push %r14
	push %r15
	lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov 0xa0(%rsp),$arg5 # pull key2
	mov 0xa8(%rsp),$arg6 # pull ivp
	lea -0xa0(%rsp), %rsp
	movaps %xmm6, 0x40(%rsp)
	movaps %xmm7, 0x50(%rsp)
	movaps %xmm8, 0x60(%rsp)
	movaps %xmm9, 0x70(%rsp)
	movaps %xmm10, 0x80(%rsp)
	movaps %xmm11, 0x90(%rsp)
	movaps %xmm12, 0xa0(%rsp)
	movaps %xmm13, 0xb0(%rsp)
	movaps %xmm14, 0xc0(%rsp)
	movaps %xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
	mov %rsp, %rbp # backup %rsp
	mov $arg1, $inp # backup arguments
	mov $arg2, $out
	mov $arg3, $len
	mov $arg4, $key
	lea ($arg6), $arg1
	lea 0x20(%rbp), $arg2
	lea ($arg5), $arg3
	call asm_AES_encrypt # generate initial tweak
	mov 240($key), %eax # rounds
	mov $len, %rbx # backup $len
	mov %eax, %edx # rounds
	shl \$7, %rax # 128 bytes per inner round key
	sub \$`128-32`, %rax # size of bit-sliced key schedule
	sub %rax, %rsp
	mov %rsp, %rax # pass key schedule
	mov $key, %rcx # pass key
	mov %edx, %r10d # pass rounds
	call _bsaes_key_convert
	pxor %xmm6, %xmm7 # fix up last round key
	movdqa %xmm7, (%rax) # save last round key
	and \$-16, $len
	sub \$0x80, %rsp # place for tweak[8]
	movdqa 0x20(%rbp), @XMM[7] # initial tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	sub \$0x80, $len
	jc .Lxts_enc_short
	jmp .Lxts_enc_loop
.align 16
.Lxts_enc_loop:
___
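# The unrolled block generated below doubles the tweak in GF(2^128) once per
# slot: pcmpgtd broadcasts the sign bit of each dword, pshufd/pand shape that
# into the carry into bit 64 plus the 0x87 reduction term (.Lxts_magic),
# paddq shifts both 64-bit halves left by one, and pxor folds the carries
# back in, i.e. tweak = (tweak << 1) ^ (msb ? 0x87 : 0).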
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqu 0x70($inp), @XMM[8+7]
	lea 0x80($inp), $inp
	movdqa @XMM[7], 0x70(%rsp)
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	pxor @XMM[8+7], @XMM[7]
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor 0x60(%rsp), @XMM[2]
	movdqu @XMM[7], 0x50($out)
	pxor 0x70(%rsp), @XMM[5]
	movdqu @XMM[2], 0x60($out)
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out
	movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
	sub \$0x80,$len
	jnc .Lxts_enc_loop
.Lxts_enc_short:
	add \$0x80, $len
	jz .Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp \$`0x10*$i`,$len
	je .Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqa @XMM[7], 0x70(%rsp)
	lea 0x70($inp), $inp
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	pxor 0x60(%rsp), @XMM[2]
	movdqu @XMM[7], 0x50($out)
	movdqu @XMM[2], 0x60($out)
	lea 0x70($out), $out
	movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_6:
	pxor @XMM[8+4], @XMM[4]
	lea 0x60($inp), $inp
	pxor @XMM[8+5], @XMM[5]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[3], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	lea 0x60($out), $out
	movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_5:
	pxor @XMM[8+3], @XMM[3]
	lea 0x50($inp), $inp
	pxor @XMM[8+4], @XMM[4]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	pxor 0x40(%rsp), @XMM[3]
	movdqu @XMM[6], 0x30($out)
	movdqu @XMM[3], 0x40($out)
	lea 0x50($out), $out
	movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_4:
	pxor @XMM[8+2], @XMM[2]
	lea 0x40($inp), $inp
	pxor @XMM[8+3], @XMM[3]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[6]
	movdqu @XMM[4], 0x20($out)
	movdqu @XMM[6], 0x30($out)
	lea 0x40($out), $out
	movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_3:
	pxor @XMM[8+1], @XMM[1]
	lea 0x30($inp), $inp
	pxor @XMM[8+2], @XMM[2]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[4]
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[4], 0x20($out)
	lea 0x30($out), $out
	movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_2:
	pxor @XMM[8+0], @XMM[0]
	lea 0x20($inp), $inp
	pxor @XMM[8+1], @XMM[1]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_encrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	lea 0x20($out), $out
	movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_enc_done
.align 16
.Lxts_enc_1:
	pxor @XMM[0], @XMM[8]
	lea 0x10($inp), $inp
	movdqa @XMM[8], 0x20(%rbp)
	lea 0x20(%rbp), $arg1
	lea 0x20(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_encrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
	#pxor @XMM[8], @XMM[0]
	#lea 0x80(%rsp), %rax # pass key schedule
	#mov %edx, %r10d # pass rounds
	#call _bsaes_encrypt8
	#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	movdqu @XMM[0], 0x00($out) # write output
	lea 0x10($out), $out
	movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
.Lxts_enc_done:
	and \$15, %ebx
	jz .Lxts_enc_ret
	mov $out, %rdx
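	# ciphertext stealing: swap the trailing partial plaintext with the
	# tail of the last full ciphertext block, then re-encrypt that block
	# with the next tweak (@XMM[7]) below.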
.Lxts_enc_steal:
	movzb ($inp), %eax
	movzb -16(%rdx), %ecx
	lea 1($inp), $inp
	mov %al, -16(%rdx)
	mov %cl, 0(%rdx)
	lea 1(%rdx), %rdx
	sub \$1,%ebx
	jnz .Lxts_enc_steal
	movdqu -16($out), @XMM[0]
	lea 0x20(%rbp), $arg1
	pxor @XMM[7], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_encrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[7]
	movdqu @XMM[7], -16($out)
.Lxts_enc_ret:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lxts_enc_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lxts_enc_bzero
	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lxts_enc_epilogue:
	ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,\@abi-omnipotent
.align 16
bsaes_xts_decrypt:
	mov %rsp, %rax
.Lxts_dec_prologue:
	push %rbp
	push %rbx
	push %r12
	push %r13
	push %r14
	push %r15
	lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov 0xa0(%rsp),$arg5 # pull key2
	mov 0xa8(%rsp),$arg6 # pull ivp
	lea -0xa0(%rsp), %rsp
	movaps %xmm6, 0x40(%rsp)
	movaps %xmm7, 0x50(%rsp)
	movaps %xmm8, 0x60(%rsp)
	movaps %xmm9, 0x70(%rsp)
	movaps %xmm10, 0x80(%rsp)
	movaps %xmm11, 0x90(%rsp)
	movaps %xmm12, 0xa0(%rsp)
	movaps %xmm13, 0xb0(%rsp)
	movaps %xmm14, 0xc0(%rsp)
	movaps %xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
	mov %rsp, %rbp # backup %rsp
	mov $arg1, $inp # backup arguments
	mov $arg2, $out
	mov $arg3, $len
	mov $arg4, $key
	lea ($arg6), $arg1
	lea 0x20(%rbp), $arg2
	lea ($arg5), $arg3
	call asm_AES_encrypt # generate initial tweak
	mov 240($key), %eax # rounds
	mov $len, %rbx # backup $len
	mov %eax, %edx # rounds
	shl \$7, %rax # 128 bytes per inner round key
	sub \$`128-32`, %rax # size of bit-sliced key schedule
	sub %rax, %rsp
	mov %rsp, %rax # pass key schedule
	mov $key, %rcx # pass key
	mov %edx, %r10d # pass rounds
	call _bsaes_key_convert
	pxor (%rsp), %xmm7 # fix up round 0 key
	movdqa %xmm6, (%rax) # save last round key
	movdqa %xmm7, (%rsp)
	xor %eax, %eax # if ($len%16) len-=16;
	and \$-16, $len
	test \$15, %ebx
	setnz %al
	shl \$4, %rax
	sub %rax, $len
	sub \$0x80, %rsp # place for tweak[8]
	movdqa 0x20(%rbp), @XMM[7] # initial tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	sub \$0x80, $len
	jc .Lxts_dec_short
	jmp .Lxts_dec_loop
.align 16
.Lxts_dec_loop:
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqu 0x70($inp), @XMM[8+7]
	lea 0x80($inp), $inp
	movdqa @XMM[7], 0x70(%rsp)
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	pxor @XMM[8+7], @XMM[7]
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	pxor 0x60(%rsp), @XMM[3]
	movdqu @XMM[7], 0x50($out)
	pxor 0x70(%rsp), @XMM[5]
	movdqu @XMM[3], 0x60($out)
	movdqu @XMM[5], 0x70($out)
	lea 0x80($out), $out
	movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
	sub \$0x80,$len
	jnc .Lxts_dec_loop
.Lxts_dec_short:
	add \$0x80, $len
	jz .Lxts_dec_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
	pshufd \$0x13, $twtmp, $twres
	pxor $twtmp, $twtmp
	movdqa @XMM[7], @XMM[$i]
	movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	pcmpgtd @XMM[7], $twtmp # broadcast upper bits
	pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
	movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
	cmp \$`0x10*$i`,$len
	je .Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
	pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
	movdqu 0x60($inp), @XMM[8+6]
	pxor @XMM[8+5], @XMM[5]
	movdqa @XMM[7], 0x70(%rsp)
	lea 0x70($inp), $inp
	pxor @XMM[8+6], @XMM[6]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	pxor 0x60(%rsp), @XMM[3]
	movdqu @XMM[7], 0x50($out)
	movdqu @XMM[3], 0x60($out)
	lea 0x70($out), $out
	movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_6:
	pxor @XMM[8+4], @XMM[4]
	lea 0x60($inp), $inp
	pxor @XMM[8+5], @XMM[5]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	pxor 0x50(%rsp), @XMM[7]
	movdqu @XMM[2], 0x40($out)
	movdqu @XMM[7], 0x50($out)
	lea 0x60($out), $out
	movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_5:
	pxor @XMM[8+3], @XMM[3]
	lea 0x50($inp), $inp
	pxor @XMM[8+4], @XMM[4]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	pxor 0x40(%rsp), @XMM[2]
	movdqu @XMM[4], 0x30($out)
	movdqu @XMM[2], 0x40($out)
	lea 0x50($out), $out
	movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_4:
	pxor @XMM[8+2], @XMM[2]
	lea 0x40($inp), $inp
	pxor @XMM[8+3], @XMM[3]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	pxor 0x30(%rsp), @XMM[4]
	movdqu @XMM[6], 0x20($out)
	movdqu @XMM[4], 0x30($out)
	lea 0x40($out), $out
	movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_3:
	pxor @XMM[8+1], @XMM[1]
	lea 0x30($inp), $inp
	pxor @XMM[8+2], @XMM[2]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	pxor 0x20(%rsp), @XMM[6]
	movdqu @XMM[1], 0x10($out)
	movdqu @XMM[6], 0x20($out)
	lea 0x30($out), $out
	movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_2:
	pxor @XMM[8+0], @XMM[0]
	lea 0x20($inp), $inp
	pxor @XMM[8+1], @XMM[1]
	lea 0x80(%rsp), %rax # pass key schedule
	mov %edx, %r10d # pass rounds
	call _bsaes_decrypt8
	pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	pxor 0x10(%rsp), @XMM[1]
	movdqu @XMM[0], 0x00($out) # write output
	movdqu @XMM[1], 0x10($out)
	lea 0x20($out), $out
	movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
	jmp .Lxts_dec_done
.align 16
.Lxts_dec_1:
	pxor @XMM[0], @XMM[8]
	lea 0x10($inp), $inp
	movdqa @XMM[8], 0x20(%rbp)
	lea 0x20(%rbp), $arg1
	lea 0x20(%rbp), $arg2
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
	#pxor @XMM[8], @XMM[0]
	#lea 0x80(%rsp), %rax # pass key schedule
	#mov %edx, %r10d # pass rounds
	#call _bsaes_decrypt8
	#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
	movdqu @XMM[0], 0x00($out) # write output
	lea 0x10($out), $out
	movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
.Lxts_dec_done:
	and \$15, %ebx
	jz .Lxts_dec_ret
	pxor $twtmp, $twtmp
	movdqa .Lxts_magic(%rip), $twmask
	pcmpgtd @XMM[7], $twtmp
	pshufd \$0x13, $twtmp, $twres
	movdqa @XMM[7], @XMM[6]
	paddq @XMM[7], @XMM[7] # psllq 1,$tweak
	pand $twmask, $twres # isolate carry and residue
	movdqu ($inp), @XMM[0]
	pxor $twres, @XMM[7]
	lea 0x20(%rbp), $arg1
	pxor @XMM[7], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[7]
	mov $out, %rdx
	movdqu @XMM[7], ($out)
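	# decrypt-side ciphertext stealing: the block just written holds the
	# would-be last plaintext; swap in the stolen ciphertext bytes, then
	# decrypt it once more with the saved previous tweak (@XMM[6]).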
.Lxts_dec_steal:
	movzb 16($inp), %eax
	movzb (%rdx), %ecx
	lea 1($inp), $inp
	mov %al, (%rdx)
	mov %cl, 16(%rdx)
	lea 1(%rdx), %rdx
	sub \$1,%ebx
	jnz .Lxts_dec_steal
	movdqu ($out), @XMM[0]
	lea 0x20(%rbp), $arg1
	pxor @XMM[6], @XMM[0]
	lea 0x20(%rbp), $arg2
	movdqa @XMM[0], 0x20(%rbp)
	lea ($key), $arg3
	call asm_AES_decrypt # doesn't touch %xmm
	pxor 0x20(%rbp), @XMM[6]
	movdqu @XMM[6], ($out)
.Lxts_dec_ret:
	lea (%rsp), %rax
	pxor %xmm0, %xmm0
.Lxts_dec_bzero: # wipe key schedule [if any]
	movdqa %xmm0, 0x00(%rax)
	movdqa %xmm0, 0x10(%rax)
	lea 0x20(%rax), %rax
	cmp %rax, %rbp
	ja .Lxts_dec_bzero
	lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
	movaps 0x40(%rbp), %xmm6
	movaps 0x50(%rbp), %xmm7
	movaps 0x60(%rbp), %xmm8
	movaps 0x70(%rbp), %xmm9
	movaps 0x80(%rbp), %xmm10
	movaps 0x90(%rbp), %xmm11
	movaps 0xa0(%rbp), %xmm12
	movaps 0xb0(%rbp), %xmm13
	movaps 0xc0(%rbp), %xmm14
	movaps 0xd0(%rbp), %xmm15
	lea 0xa0(%rbp), %rsp
___
$code.=<<___;
	mov 0x48(%rsp), %r15
	mov 0x50(%rsp), %r14
	mov 0x58(%rsp), %r13
	mov 0x60(%rsp), %r12
	mov 0x68(%rsp), %rbx
	mov 0x70(%rsp), %rax
	lea 0x78(%rsp), %rsp
	mov %rax, %rbp
.Lxts_dec_epilogue:
	ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type _bsaes_const,\@object
.align 64
_bsaes_const:
.LM0ISR: # InvShiftRows constants
	.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
	.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
	.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0: # bit-slice constants
	.quad 0x5555555555555555, 0x5555555555555555
.LBS1:
	.quad 0x3333333333333333, 0x3333333333333333
.LBS2:
	.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR: # shiftrows constants
	.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
	.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
	.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP: # byte-swap upper dword
	.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad 0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1: # counter increment constants
	.quad 0x0000000000000000, 0x0000000100000000
.LADD2:
	.quad 0x0000000000000000, 0x0000000200000000
.LADD3:
	.quad 0x0000000000000000, 0x0000000300000000
.LADD4:
	.quad 0x0000000000000000, 0x0000000400000000
.LADD5:
	.quad 0x0000000000000000, 0x0000000500000000
.LADD6:
	.quad 0x0000000000000000, 0x0000000600000000
.LADD7:
	.quad 0x0000000000000000, 0x0000000700000000
.LADD8:
	.quad 0x0000000000000000, 0x0000000800000000
.Lxts_magic:
	.long 0x87,0,1,0
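	# 0x87 = x^7+x^2+x+1, the reduction term applied when doubling the XTS tweak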
.Lmasks:
	.quad 0x0101010101010101, 0x0101010101010101
	.quad 0x0202020202020202, 0x0202020202020202
	.quad 0x0404040404040404, 0x0404040404040404
	.quad 0x0808080808080808, 0x0808080808080808
.LM0:
	.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.L63:
	.quad 0x6363636363636363, 0x6363636363636363
	.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align 64
.size _bsaes_const,.-_bsaes_const
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#	CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
	push %rsi
	push %rdi
	push %rbx
	push %rbp
	push %r12
	push %r13
	push %r14
	push %r15
	pushfq
	sub \$64,%rsp
	mov 120($context),%rax # pull context->Rax
	mov 248($context),%rbx # pull context->Rip
	mov 8($disp),%rsi # disp->ImageBase
	mov 56($disp),%r11 # disp->HandlerData
	mov 0(%r11),%r10d # HandlerData[0]
	lea (%rsi,%r10),%r10 # prologue label
	cmp %r10,%rbx # context->Rip<prologue label
	jb .Lin_prologue
	mov 152($context),%rax # pull context->Rsp
	mov 4(%r11),%r10d # HandlerData[1]
	lea (%rsi,%r10),%r10 # epilogue label
	cmp %r10,%rbx # context->Rip>=epilogue label
	jae .Lin_prologue
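	# RIP is between the _body and _epilogue labels, so the frame is fully
	# set up: recover the saved %xmm and general-purpose registers from it.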
	mov 160($context),%rax # pull context->Rbp
	lea 0x40(%rax),%rsi # %xmm save area
	lea 512($context),%rdi # &context.Xmm6
	mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
	.long 0xa548f3fc # cld; rep movsq
	lea 0xa0(%rax),%rax # adjust stack pointer
	mov 0x70(%rax),%rbp
	mov 0x68(%rax),%rbx
	mov 0x60(%rax),%r12
	mov 0x58(%rax),%r13
	mov 0x50(%rax),%r14
	mov 0x48(%rax),%r15
	lea 0x78(%rax),%rax # adjust stack pointer
	mov %rbx,144($context) # restore context->Rbx
	mov %rbp,160($context) # restore context->Rbp
	mov %r12,216($context) # restore context->R12
	mov %r13,224($context) # restore context->R13
	mov %r14,232($context) # restore context->R14
	mov %r15,240($context) # restore context->R15
.Lin_prologue:
	mov %rax,152($context) # restore context->Rsp
	mov 40($disp),%rdi # disp->ContextRecord
	mov $context,%rsi # context
	mov \$`1232/8`,%ecx # sizeof(CONTEXT)
	.long 0xa548f3fc # cld; rep movsq
	mov $disp,%rsi
	xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
	mov 8(%rsi),%rdx # arg2, disp->ImageBase
	mov 0(%rsi),%r8 # arg3, disp->ControlPc
	mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
	mov 40(%rsi),%r10 # disp->ContextRecord
	lea 56(%rsi),%r11 # &disp->HandlerData
	lea 24(%rsi),%r12 # &disp->EstablisherFrame
	mov %r10,32(%rsp) # arg5
	mov %r11,40(%rsp) # arg6
	mov %r12,48(%rsp) # arg7
	mov %rcx,56(%rsp) # arg8, (NULL)
	call *__imp_RtlVirtualUnwind(%rip)
	mov \$1,%eax # ExceptionContinueSearch
	add \$64,%rsp
	popfq
	pop %r15
	pop %r14
	pop %r13
	pop %r12
	pop %rbp
	pop %rbx
	pop %rdi
	pop %rsi
	ret
.size se_handler,.-se_handler
.section .pdata
.align 4
___
$code.=<<___ if ($ecb);
	.rva .Lecb_enc_prologue
	.rva .Lecb_enc_epilogue
	.rva .Lecb_enc_info
	.rva .Lecb_dec_prologue
	.rva .Lecb_dec_epilogue
	.rva .Lecb_dec_info
___
$code.=<<___;
	.rva .Lcbc_dec_prologue
	.rva .Lcbc_dec_epilogue
	.rva .Lcbc_dec_info
	.rva .Lctr_enc_prologue
	.rva .Lctr_enc_epilogue
	.rva .Lctr_enc_info
	.rva .Lxts_enc_prologue
	.rva .Lxts_enc_epilogue
	.rva .Lxts_enc_info
	.rva .Lxts_dec_prologue
	.rva .Lxts_dec_epilogue
	.rva .Lxts_dec_info
.section .xdata
.align 8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
.Lecb_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
.Lctr_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
.Lxts_enc_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.Lxts_dec_info:
	.byte 9,0,0,0
	.rva se_handler
	.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
}
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;