#!/usr/bin/env perl

###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allowed its output to be fed back to aesenc[last]; this was
#   achieved at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as returned
#   by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allowed
#   skipping one shiftrows(), reducing the bit-sliced key schedule and
#   speeding up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of a 4096-byte buffer with a 128-bit key is:
#
#              Emilia's   this(*)    difference
#
# Core 2       9.30       8.69       +7%
# Nehalem(**)  7.63       6.98       +9%
# Atom         17.1       17.4       -2%(***)
#
# (*)   Comparison is not completely fair, because "this" is ECB,
#       i.e. no extra processing such as counter value calculation
#       and xor-ing of input as in Emilia's CTR implementation is
#       performed. However, the CTR calculations account for no more
#       than 1% of total time, so the comparison is *rather* fair.
#
# (**)  Results were collected on Westmere, which is considered to
#       be equivalent to Nehalem for this code.
#
# (***) Slowdown on Atom is rather strange per se, because the original
#       implementation has a number of 9+-byte instructions, which
#       are bad for the Atom front-end, and which I eliminated completely.
#       In an attempt to address the deterioration, sbox() was tested in
#       the FP SIMD "domain" (movaps instead of movdqa, xorps instead of
#       pxor, etc.). While it resulted in a nominal 4% improvement on
#       Atom, it hurt Westmere by more than a 2x factor.
#
# As for the key schedule conversion subroutine: the interface to OpenSSL
# relies on per-invocation on-the-fly conversion. This naturally has an
# impact on performance, especially for short inputs. Conversion time in
# CPU cycles and its ratio to CPU cycles spent in the 8x block function is:
#
#              conversion  conversion/8x block
# Core 2       410         0.37
# Nehalem      310         0.35
# Atom         570         0.26
#
# The ratio values mean that 128-byte blocks will be processed
# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially the shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272-byte - 29%, 400-byte - 22%, etc. Yet, despite all these
# "shortcomings" it's still faster than the ["hyper-threading-safe"
# code path in] aes-x86_64.pl on all lengths above 64 bytes...
#
#                                       <appro@openssl.org>

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open STDOUT,"| $^X $xlate $flavour $output";

my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14));	# best on Atom, +10% over (0..15)

{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
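
# sbox() below is the heart of the bit-sliced construction: it applies the
# AES S-box to all eight blocks at once using only boolean SSE instructions,
# i.e. without any table lookups (which also makes it constant-time).  It
# first changes basis to a tower-field representation (InBasisChange),
# inverts in GF(2^8) via GF(2^4)/GF(2^2) arithmetic (Inv_GF256) and changes
# basis back (OutBasisChange).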

sub sbox {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
	&InBasisChange	(@b);
	&Inv_GF256	(@b[6,5,0,3,7,1,4,2],@t,@s);
	&OutBasisChange	(@b[7,1,4,2,6,5,0,3]);
}

sub InBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[5]
	pxor	@b[1], @b[2]
	pxor	@b[0], @b[5]
	pxor	@b[2], @b[6]
	pxor	@b[0], @b[3]
	pxor	@b[3], @b[6]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[4], @b[3]
	pxor	@b[5], @b[4]
	pxor	@b[1], @b[3]
	pxor	@b[7], @b[2]
	pxor	@b[5], @b[1]
___
}

sub OutBasisChange {
# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
	pxor	@b[6], @b[0]
	pxor	@b[4], @b[1]
	pxor	@b[0], @b[2]
	pxor	@b[6], @b[4]
	pxor	@b[1], @b[6]
	pxor	@b[5], @b[1]
	pxor	@b[3], @b[5]
	pxor	@b[7], @b[3]
	pxor	@b[5], @b[7]
	pxor	@b[5], @b[2]
	pxor	@b[7], @b[4]
___
}

sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8)       *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x1, $x0
	pxor	$t0, $x1
___
}

sub Mul_GF4_N {				# not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	pxor	$y1, $t0
	pand	$x0, $t0
	pxor	$x1, $x0
	pand	$y0, $x1
	pand	$y1, $x0
	pxor	$x0, $x1
	pxor	$t0, $x0
___
}

sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
    $x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
	movdqa	$y0, $t0
	movdqa	$y2, $t1
	pxor	$y1, $t0
	pxor	$y3, $t1
	pand	$x0, $t0
	pand	$x2, $t1
	pxor	$x1, $x0
	pxor	$x3, $x2
	pand	$y0, $x1
	pand	$y2, $x3
	pand	$y1, $x0
	pand	$y3, $x2
	pxor	$x0, $x1
	pxor	$x3, $x2
	pxor	$t0, $x0
	pxor	$t1, $x3
___
}

sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
	movdqa	@x[0], @t[0]
	movdqa	@x[1], @t[1]
___
	&Mul_GF4	(@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
	pxor	@x[2], @t[0]
	pxor	@x[3], @t[1]
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@t[0], @x[0]
	pxor	@t[0], @x[2]
	pxor	@t[1], @x[1]
	pxor	@t[1], @x[3]
	movdqa	@x[4], @t[0]
	movdqa	@x[5], @t[1]
	pxor	@x[6], @t[0]
	pxor	@x[7], @t[1]
___
	&Mul_GF4_N_GF4	(@t[0], @t[1], @y[0], @y[1], @t[3],
			 @x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
	pxor	@y[2], @y[0]
	pxor	@y[3], @y[1]
___
	&Mul_GF4	(@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
	pxor	@t[0], @x[4]
	pxor	@t[0], @x[6]
	pxor	@t[1], @x[5]
	pxor	@t[1], @x[7]
___
}
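
# Inv_GF256() below computes the multiplicative inverse in GF(2^8) for all
# eight bit-sliced values at once by working in the composite fields
# GF(((2^2)^2)^2), using the Mul_GF4/Mul_GF16_2 helpers above; "direct
# optimizations from hardware" refers to the instruction sequence mirroring
# a gate-count-optimized hardware S-box circuit.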

sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
	movdqa	@x[4], @t[3]
	movdqa	@x[5], @t[2]
	movdqa	@x[1], @t[1]
	movdqa	@x[7], @s[1]
	movdqa	@x[0], @s[0]
	pxor	@x[6], @t[3]
	pxor	@x[7], @t[2]
	pxor	@x[3], @t[1]
	movdqa	@t[3], @s[2]
	pxor	@x[6], @s[1]
	movdqa	@t[2], @t[0]
	pxor	@x[2], @s[0]
	movdqa	@t[3], @s[3]
	por	@t[1], @t[2]
	por	@s[0], @t[3]
	pxor	@t[0], @s[3]
	pand	@s[0], @s[2]
	pxor	@t[1], @s[0]
	pand	@t[1], @t[0]
	pand	@s[0], @s[3]
	movdqa	@x[3], @s[0]
	pxor	@x[2], @s[0]
	pand	@s[0], @s[1]
	pxor	@s[1], @t[3]
	pxor	@s[1], @t[2]
	movdqa	@x[4], @s[1]
	movdqa	@x[1], @s[0]
	pxor	@x[5], @s[1]
	pxor	@x[0], @s[0]
	movdqa	@s[1], @t[1]
	pand	@s[0], @s[1]
	por	@s[0], @t[1]
	pxor	@s[1], @t[0]
	pxor	@s[3], @t[3]
	pxor	@s[2], @t[2]
	pxor	@s[3], @t[1]
	movdqa	@x[7], @s[0]
	pxor	@s[2], @t[0]
	movdqa	@x[6], @s[1]
	pxor	@s[2], @t[1]
	movdqa	@x[5], @s[2]
	pand	@x[3], @s[0]
	movdqa	@x[4], @s[3]
	pand	@x[2], @s[1]
	pand	@x[1], @s[2]
	por	@x[0], @s[3]
	pxor	@s[0], @t[3]
	pxor	@s[1], @t[2]
	pxor	@s[2], @t[1]
	pxor	@s[3], @t[0]

	#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3

	# new smaller inversion
	movdqa	@t[3], @s[0]
	pand	@t[1], @t[3]
	pxor	@t[2], @s[0]
	movdqa	@t[0], @s[2]
	movdqa	@s[0], @s[3]
	pxor	@t[3], @s[2]
	pand	@s[2], @s[3]
	movdqa	@t[1], @s[1]
	pxor	@t[2], @s[3]
	pxor	@t[0], @s[1]
	pxor	@t[2], @t[3]
	pand	@t[3], @s[1]
	movdqa	@s[2], @t[2]
	pxor	@t[0], @s[1]
	pxor	@s[1], @t[2]
	pxor	@s[1], @t[1]
	pand	@t[0], @t[2]
	pxor	@t[2], @s[2]
	pxor	@t[2], @t[1]
	pand	@s[3], @s[2]
	pxor	@s[0], @s[2]
___
# output in s3, s2, s1, t1
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);

### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}

# AES linear components
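#
# In the bit-sliced domain the two linear AES steps become trivial SSE
# operations: ShiftRows is a fixed byte permutation of every 128-bit
# register (one pshufb with .LSR or .LSRM0), and MixColumns reduces to
# 32-bit rotations (pshufd) and XORs.  shiftrows() below also folds in the
# AddRoundKey xor with the current bit-sliced round key and advances $key
# to the next one.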

sub shiftrows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
	pxor	0x00($key),@x[0]
	pxor	0x10($key),@x[1]
	pshufb	$mask,@x[0]
	pxor	0x20($key),@x[2]
	pshufb	$mask,@x[1]
	pxor	0x30($key),@x[3]
	pshufb	$mask,@x[2]
	pxor	0x40($key),@x[4]
	pshufb	$mask,@x[3]
	pxor	0x50($key),@x[5]
	pshufb	$mask,@x[4]
	pxor	0x60($key),@x[6]
	pshufb	$mask,@x[5]
	pxor	0x70($key),@x[7]
	pshufb	$mask,@x[6]
	lea	0x80($key),$key
	pshufb	$mask,@x[7]
___
}

sub mixcolumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	pshufd	\$0x93, @x[0], @t[0]	# x0 <<< 32
	pshufd	\$0x93, @x[1], @t[1]
	pxor	@t[0], @x[0]		# x0 ^ (x0 <<< 32)
	pshufd	\$0x93, @x[2], @t[2]
	pxor	@t[1], @x[1]
	pshufd	\$0x93, @x[3], @t[3]
	pxor	@t[2], @x[2]
	pshufd	\$0x93, @x[4], @t[4]
	pxor	@t[3], @x[3]
	pshufd	\$0x93, @x[5], @t[5]
	pxor	@t[4], @x[4]
	pshufd	\$0x93, @x[6], @t[6]
	pxor	@t[5], @x[5]
	pshufd	\$0x93, @x[7], @t[7]
	pxor	@t[6], @x[6]
	pxor	@t[7], @x[7]
	pxor	@x[0], @t[1]
	pxor	@x[7], @t[0]
	pxor	@x[7], @t[1]
	pshufd	\$0x4E, @x[0], @x[0]	# (x0 ^ (x0 <<< 32)) <<< 64
	pxor	@x[1], @t[2]
	pshufd	\$0x4E, @x[1], @x[1]
	pxor	@x[4], @t[5]
	pxor	@t[0], @x[0]
	pxor	@x[5], @t[6]
	pxor	@t[1], @x[1]
	pxor	@x[3], @t[4]
	pshufd	\$0x4E, @x[4], @t[0]
	pxor	@x[6], @t[7]
	pshufd	\$0x4E, @x[5], @t[1]
	pxor	@x[2], @t[3]
	pshufd	\$0x4E, @x[3], @x[4]
	pxor	@x[7], @t[3]
	pshufd	\$0x4E, @x[7], @x[5]
	pxor	@x[7], @t[4]
	pshufd	\$0x4E, @x[6], @x[3]
	pxor	@t[4], @t[0]
	pshufd	\$0x4E, @x[2], @x[6]
	pxor	@t[5], @t[1]
	pxor	@t[3], @x[4]
	pxor	@t[7], @x[5]
	pxor	@t[6], @x[3]
	movdqa	@t[0], @x[2]
	pxor	@t[2], @x[6]
	movdqa	@t[1], @x[7]
___
}

sub aesenc {				# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x30($const),@t[0]	# .LSR
___
	&shiftrows	(@b,@t[0]);
	&sbox		(@b,@t);
	&mixcolumns	(@b[0,1,4,6,3,7,2,5],@t);
}

sub aesenclast {			# not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
	movdqa	0x40($const),@t[0]	# .LSRM0
___
	&shiftrows	(@b,@t[0]);
	&sbox		(@b,@t);
$code.=<<___;
	pxor	0x00($key),@b[0]
	pxor	0x10($key),@b[1]
	pxor	0x20($key),@b[4]
	pxor	0x30($key),@b[6]
	pxor	0x40($key),@b[3]
	pxor	0x50($key),@b[7]
	pxor	0x60($key),@b[2]
	pxor	0x70($key),@b[5]
___
}
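
# swapmove() below is the classic "delta swap": it exchanges the bits of its
# first argument selected by $mask with the bits of its second argument
# selected by $mask << $n, using one temporary register.  As a reference,
# here is a scalar model of the same XOR/AND/shift sequence (a hypothetical
# helper for illustration only -- it is not used by the generated code):
sub swapmove_model {
my ($x,$y,$n,$mask)=@_;
my $t = (($y >> $n) ^ $x) & $mask;	# differing bits, aligned to $x's position
return ($x ^ $t, $y ^ ($t << $n));	# exchange them between $x and $y
}
# swapmove2x() is simply two independent swapmove()s interleaved to give the
# CPU more instruction-level parallelism.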

sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
	movdqa	$b,$t
	psrlq	\$$n,$b
	pxor	$a,$b
	pand	$mask,$b
	pxor	$b,$a
	psllq	\$$n,$b
	pxor	$t,$b
___
}

sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
	movdqa	$b0,$t0
	psrlq	\$$n,$b0
	movdqa	$b1,$t1
	psrlq	\$$n,$b1
	pxor	$a0,$b0
	pxor	$a1,$b1
	pand	$mask,$b0
	pand	$mask,$b1
	pxor	$b0,$a0
	psllq	\$$n,$b0
	pxor	$b1,$a1
	psllq	\$$n,$b1
	pxor	$t0,$b0
	pxor	$t1,$b1
___
}
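
# bitslice() transposes the eight 128-bit registers into bit-sliced form:
# after three rounds of swapmove2x with shift/mask pairs (1, .LBS0=0x55..),
# (2, .LBS1=0x33..) and (4, .LBS2=0x0f..), register i holds bit i of every
# byte of all eight input blocks.  Applied again to already bit-sliced data
# it converts back, which is how .Lenc_done uses it.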

sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
	movdqa	0x00($const),$t0	# .LBS0
	movdqa	0x10($const),$t1	# .LBS1
___
	&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
	&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
	movdqa	0x20($const),$t0	# .LBS2
___
	&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
	&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);

	&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
	&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}

$code.=<<___;
.text

.extern	AES_encrypt

.type	_bsaes_encrypt8,\@abi-omnipotent
.align	64
_bsaes_encrypt8:
	lea	.LBS0(%rip), $const	# constants table

	movdqa	($key), @XMM[9]		# round 0 key
	lea	0x10($key), $key
	movdqa	0x60($const), @XMM[8]	# .LM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	pshufb	@XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
	&bitslice	(@XMM[0..7, 8..11]);
$code.=<<___;
	dec	$rounds
	jmp	.Lenc_sbox
.align	16
.Lenc_loop:
___
	&shiftrows	(@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
	&sbox		(@XMM[0..7, 8..15]);
$code.=<<___;
	dec	$rounds
	jl	.Lenc_done
___
	&mixcolumns	(@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
	movdqa	0x30($const), @XMM[8]	# .LSR
	jnz	.Lenc_loop
	movdqa	0x40($const), @XMM[8]	# .LSRM0
	jmp	.Lenc_loop
.align	16
.Lenc_done:
___
	# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
	&bitslice	(@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
	movdqa	($key), @XMM[8]		# last round key
	pxor	@XMM[8], @XMM[0]
	pxor	@XMM[8], @XMM[1]
	pxor	@XMM[8], @XMM[4]
	pxor	@XMM[8], @XMM[6]
	pxor	@XMM[8], @XMM[3]
	pxor	@XMM[8], @XMM[7]
	pxor	@XMM[8], @XMM[2]
	pxor	@XMM[8], @XMM[5]
	ret
.size	_bsaes_encrypt8,.-_bsaes_encrypt8
___
}
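
# Calling convention of _bsaes_encrypt8 (inferred from its use below): the
# eight 16-byte blocks are expected in @XMM[0..7] (i.e. %xmm15,%xmm0..6),
# the bit-sliced key schedule pointer in %rax and the round count in %r10d;
# the eight ciphertext blocks come back in @XMM[0,1,4,6,3,7,2,5], which is
# the store order used by bsaes_encrypt_128 and the ECB/CTR code below.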
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");

sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];

	&swapmove	(@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
	#&swapmove(@x[2,3],1,$t0,$t2,$t3);
	movdqa	@x[0], @x[2]
	movdqa	@x[1], @x[3]
___
	#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);

	&swapmove2x	(@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
	#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
	movdqa	@x[0], @x[4]
	movdqa	@x[2], @x[6]
	movdqa	@x[1], @x[5]
	movdqa	@x[3], @x[7]
___
	&swapmove2x	(@x[0,4,1,5],4,$bs2,$t2,$t3);
	&swapmove2x	(@x[2,6,3,7],4,$bs2,$t2,$t3);
}
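
# _bsaes_enc_key_convert turns the "conventional" schedule produced by
# AES_set_encrypt_key (at $inp, round count in $rounds) into the bit-sliced
# schedule at $out: the round-0 key is stored as-is, every middle round key
# is expanded to eight registers (128 bytes, hence the "shl \\$7" in the
# callers), and the last round key is only XORed with the S-box constant
# 0x63 (.L63), matching the special first/last round treatment mentioned
# at the top of this file.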

$code.=<<___;
.type	_bsaes_enc_key_convert,\@abi-omnipotent
.align	16
_bsaes_enc_key_convert:
	lea	.LBS1(%rip), $const
	movdqu	($inp), %xmm7		# load round 0 key
	movdqa	-0x10($const), %xmm8	# .LBS0
	movdqa	0x00($const), %xmm9	# .LBS1
	movdqa	0x10($const), %xmm10	# .LBS2
	movdqa	0x40($const), %xmm13	# .LM0
	movdqa	0x60($const), %xmm14	# .LNOT
	movdqu	0x10($inp), %xmm6	# load round 1 key
	lea	0x10($inp), $inp
	movdqa	%xmm7, ($out)		# save round 0 key
	lea	0x10($out), $out
	dec	$rounds
	jmp	.Lkey_loop
.align	16
.Lkey_loop:
	pshufb	%xmm13, %xmm6
	movdqa	%xmm6, %xmm7
___
	&bitslice_key	(map("%xmm$_",(0..7, 8..12)));
$code.=<<___;
	pxor	%xmm14, %xmm5		# "pnot"
	pxor	%xmm14, %xmm6
	pxor	%xmm14, %xmm0
	pxor	%xmm14, %xmm1
	lea	0x10($inp), $inp
	movdqa	%xmm0, 0x00($out)	# write bit-sliced round key
	movdqa	%xmm1, 0x10($out)
	movdqa	%xmm2, 0x20($out)
	movdqa	%xmm3, 0x30($out)
	movdqa	%xmm4, 0x40($out)
	movdqa	%xmm5, 0x50($out)
	movdqa	%xmm6, 0x60($out)
	movdqa	%xmm7, 0x70($out)
	lea	0x80($out),$out
	movdqu	($inp), %xmm6		# load next round key
	dec	$rounds
	jnz	.Lkey_loop
	pxor	0x70($const), %xmm6	# .L63
	movdqa	%xmm6, ($out)		# save last round key
	ret
.size	_bsaes_enc_key_convert,.-_bsaes_enc_key_convert
___
}

if (1 && !$win64) {	# the following two functions are an unsupported
			# interface used for benchmarking...
$code.=<<___;
.globl	bsaes_enc_key_convert
.type	bsaes_enc_key_convert,\@function,2
.align	16
bsaes_enc_key_convert:
	mov	240($inp),%r10d		# pass rounds
	mov	$inp,%rcx		# pass key
	mov	$out,%rax		# pass key schedule
	call	_bsaes_enc_key_convert
	ret
.size	bsaes_enc_key_convert,.-bsaes_enc_key_convert

.globl	bsaes_encrypt_128
.type	bsaes_encrypt_128,\@function,4
.align	16
bsaes_encrypt_128:
.Lenc128_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	movdqu	0x60($inp), @XMM[6]
	movdqu	0x70($inp), @XMM[7]
	mov	$key, %rax		# pass the $key
	lea	0x80($inp), $inp
	mov	\$10,%r10d
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$0x80,$len
	ja	.Lenc128_loop
	ret
.size	bsaes_encrypt_128,.-bsaes_encrypt_128
___
}

{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5) = $win64	? ("%rcx","%rdx","%r8","%r9","%r10")
						: ("%rdi","%rsi","%rdx","%rcx","%r8");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
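
# Argument layout of the two entry points below (as used by the code, not a
# formal prototype): $arg1 = input pointer, $arg2 = output pointer,
# $arg3 = length counted in 16-byte blocks, $arg4 = expanded AES_KEY
# (240($arg4) holds the round count), and for bsaes_ctr32_encrypt_blocks
# $arg5 = 16-byte counter block.  Both convert the key schedule on the
# stack per call and wipe it before returning (.Lecb_enc_bzero /
# .Lctr_enc_bzero).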

$code.=<<___;
.globl	bsaes_ecb_encrypt_blocks
.type	bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ecb_encrypt_blocks:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	mov	%rsp,%rbp		# backup %rsp
	mov	240($arg4),%eax		# rounds
	mov	$arg1,$inp		# backup arguments
	mov	$arg2,$out
	mov	$arg3,$len
	mov	$arg4,$key
	cmp	\$8,$arg3
	jb	.Lecb_enc_short
	mov	%eax,%ebx		# backup rounds
	shl	\$7,%rax		# 128 bytes per inner round key
	sub	\$`128-32`,%rax		# size of bit-sliced key schedule
	sub	%rax,%rsp
	mov	%rsp,%rax		# pass key schedule
	mov	$key,%rcx		# pass key
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_enc_key_convert
	sub	\$8,$len
.Lecb_enc_loop:
	movdqu	0x00($inp), @XMM[0]	# load input
	movdqu	0x10($inp), @XMM[1]
	movdqu	0x20($inp), @XMM[2]
	movdqu	0x30($inp), @XMM[3]
	movdqu	0x40($inp), @XMM[4]
	movdqu	0x50($inp), @XMM[5]
	mov	%rsp, %rax		# pass key schedule
	movdqu	0x60($inp), @XMM[6]
	mov	%ebx,%r10d		# pass rounds
	movdqu	0x70($inp), @XMM[7]
	lea	0x80($inp), $inp
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	sub	\$8,$len
	jnc	.Lecb_enc_loop
	add	\$8,$len
	jz	.Lecb_enc_done
	movdqu	0x00($inp), @XMM[0]	# load input
	mov	%rsp, %rax		# pass key schedule
	mov	%ebx,%r10d		# pass rounds
	cmp	\$2,$len
	jb	.Lecb_enc_one
	movdqu	0x10($inp), @XMM[1]
	je	.Lecb_enc_two
	movdqu	0x20($inp), @XMM[2]
	cmp	\$4,$len
	jb	.Lecb_enc_three
	movdqu	0x30($inp), @XMM[3]
	je	.Lecb_enc_four
	movdqu	0x40($inp), @XMM[4]
	cmp	\$6,$len
	jb	.Lecb_enc_five
	movdqu	0x50($inp), @XMM[5]
	je	.Lecb_enc_six
	movdqu	0x60($inp), @XMM[6]
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_six:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	movdqu	@XMM[7], 0x50($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_five:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	movdqu	@XMM[3], 0x40($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_four:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	movdqu	@XMM[6], 0x30($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_three:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	movdqu	@XMM[4], 0x20($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_two:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	movdqu	@XMM[1], 0x10($out)
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_one:
	call	_bsaes_encrypt8
	movdqu	@XMM[0], 0x00($out)	# write output
	jmp	.Lecb_enc_done
.align	16
.Lecb_enc_short:
	lea	($inp), $arg1
	lea	($out), $arg2
	lea	($key), $arg3
	call	AES_encrypt
	lea	16($inp), $inp
	lea	16($out), $out
	dec	$len
	jnz	.Lecb_enc_short
.Lecb_enc_done:
	lea	(%rsp),%rax
	pxor	%xmm0, %xmm0
.Lecb_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lecb_enc_bzero
	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rbp
	lea	0x78(%rsp), %rsp
.Lecb_enc_epilogue:
	ret
.size	bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks

.globl	bsaes_ctr32_encrypt_blocks
.type	bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align	16
bsaes_ctr32_encrypt_blocks:
	push	%rbp
	push	%rbx
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	lea	-0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
	mov	0xa0(%rsp),$arg5	# pull ivp
	lea	-0xa0(%rsp), %rsp
	movaps	%xmm6, 0x40(%rsp)
	movaps	%xmm7, 0x50(%rsp)
	movaps	%xmm8, 0x60(%rsp)
	movaps	%xmm9, 0x70(%rsp)
	movaps	%xmm10, 0x80(%rsp)
	movaps	%xmm11, 0x90(%rsp)
	movaps	%xmm12, 0xa0(%rsp)
	movaps	%xmm13, 0xb0(%rsp)
	movaps	%xmm14, 0xc0(%rsp)
	movaps	%xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
	mov	%rsp, %rbp		# backup %rsp
	movdqu	($arg5), %xmm0		# load counter
	mov	240($arg4), %eax	# rounds
	mov	$arg1, $inp		# backup arguments
	mov	$arg2, $out
	mov	$arg3, $len
	mov	$arg4, $key
	movdqa	%xmm0, 0x20(%rbp)	# copy counter
	cmp	\$8, $arg3
	jb	.Lctr_enc_short
	mov	%eax, %ebx		# rounds
	shl	\$7, %rax		# 128 bytes per inner round key
	sub	\$`128-32`, %rax	# size of bit-sliced key schedule
	sub	%rax, %rsp
	mov	%rsp, %rax		# pass key schedule
	mov	$key, %rcx		# pass key
	mov	%ebx, %r10d		# pass rounds
	call	_bsaes_enc_key_convert
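	# The counter block is byte-swapped with .LSWPUP so that the 32-bit
	# counter ends up in the top dword as a native little-endian integer;
	# the eight per-block counters can then be produced with plain paddd
	# against .LADD1..LADD7 (and .LADD8 for the next iteration).  The
	# .LSWPUPM0SR shuffle in the loop below undoes that byte swap while
	# at the same time applying the usual .LM0SR load permutation, which
	# is why the round-0 key is pre-shuffled with .LSWPUP here as well.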
	movdqa	(%rsp), @XMM[9]		# load round0 key
	lea	.LADD1(%rip), %r11
	movdqa	0x20(%rbp), @XMM[0]	# counter copy
	movdqa	-0x20(%r11), @XMM[8]	# .LSWPUP
	pshufb	@XMM[8], @XMM[9]	# byte swap upper part
	pshufb	@XMM[8], @XMM[0]
	movdqa	@XMM[9], (%rsp)		# save adjusted round0 key
	jmp	.Lctr_enc_loop
.align	16
.Lctr_enc_loop:
	movdqa	@XMM[0], 0x20(%rbp)	# save counter
	movdqa	@XMM[0], @XMM[1]	# prepare 8 counter values
	movdqa	@XMM[0], @XMM[2]
	paddd	0x00(%r11), @XMM[1]	# .LADD1
	movdqa	@XMM[0], @XMM[3]
	paddd	0x10(%r11), @XMM[2]	# .LADD2
	movdqa	@XMM[0], @XMM[4]
	paddd	0x20(%r11), @XMM[3]	# .LADD3
	movdqa	@XMM[0], @XMM[5]
	paddd	0x30(%r11), @XMM[4]	# .LADD4
	movdqa	@XMM[0], @XMM[6]
	paddd	0x40(%r11), @XMM[5]	# .LADD5
	movdqa	@XMM[0], @XMM[7]
	paddd	0x50(%r11), @XMM[6]	# .LADD6
	paddd	0x60(%r11), @XMM[7]	# .LADD7

	# Borrow prologue from _bsaes_encrypt8 to use the opportunity
	# to flip byte order in 32-bit counter
	movdqa	(%rsp), @XMM[9]		# round 0 key
	lea	0x10(%rsp), %rax	# pass key schedule
	movdqa	-0x10(%r11), @XMM[8]	# .LSWPUPM0SR
	pxor	@XMM[9], @XMM[0]	# xor with round0 key
	pxor	@XMM[9], @XMM[1]
	pshufb	@XMM[8], @XMM[0]
	pxor	@XMM[9], @XMM[2]
	pshufb	@XMM[8], @XMM[1]
	pxor	@XMM[9], @XMM[3]
	pshufb	@XMM[8], @XMM[2]
	pxor	@XMM[9], @XMM[4]
	pshufb	@XMM[8], @XMM[3]
	pxor	@XMM[9], @XMM[5]
	pshufb	@XMM[8], @XMM[4]
	pxor	@XMM[9], @XMM[6]
	pshufb	@XMM[8], @XMM[5]
	pxor	@XMM[9], @XMM[7]
	pshufb	@XMM[8], @XMM[6]
	lea	.LBS0(%rip), %r11	# constants table
	pshufb	@XMM[8], @XMM[7]
	mov	%ebx,%r10d		# pass rounds
	call	_bsaes_encrypt8_bitslice
	sub	\$8,$len
	jc	.Lctr_enc_loop_done
	movdqu	0x00($inp), @XMM[8]	# load input
	movdqu	0x10($inp), @XMM[9]
	movdqu	0x20($inp), @XMM[10]
	movdqu	0x30($inp), @XMM[11]
	movdqu	0x40($inp), @XMM[12]
	movdqu	0x50($inp), @XMM[13]
	movdqu	0x60($inp), @XMM[14]
	movdqu	0x70($inp), @XMM[15]
	lea	0x80($inp),$inp
	pxor	@XMM[0], @XMM[8]
	movdqa	0x20(%rbp), @XMM[0]	# load counter
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[8], 0x00($out)	# write output
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[1], 0x10($out)
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[4], 0x20($out)
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[6], 0x30($out)
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[3], 0x40($out)
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[7], 0x50($out)
	pxor	@XMM[15], @XMM[5]
	movdqu	@XMM[2], 0x60($out)
	lea	.LADD1(%rip), %r11
	movdqu	@XMM[5], 0x70($out)
	lea	0x80($out), $out
	paddd	0x70(%r11), @XMM[0]	# .LADD8
	jnz	.Lctr_enc_loop
	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_loop_done:
	movdqu	0x00($inp), @XMM[8]	# load input
	pxor	@XMM[8], @XMM[0]
	movdqu	@XMM[0], 0x00($out)	# write output
	cmp	\$2,$len
	jb	.Lctr_enc_done
	movdqu	0x10($inp), @XMM[9]
	pxor	@XMM[9], @XMM[1]
	movdqu	@XMM[1], 0x10($out)
	je	.Lctr_enc_done
	movdqu	0x20($inp), @XMM[10]
	pxor	@XMM[10], @XMM[4]
	movdqu	@XMM[4], 0x20($out)
	cmp	\$4,$len
	jb	.Lctr_enc_done
	movdqu	0x30($inp), @XMM[11]
	pxor	@XMM[11], @XMM[6]
	movdqu	@XMM[6], 0x30($out)
	je	.Lctr_enc_done
	movdqu	0x40($inp), @XMM[12]
	pxor	@XMM[12], @XMM[3]
	movdqu	@XMM[3], 0x40($out)
	cmp	\$6,$len
	jb	.Lctr_enc_done
	movdqu	0x50($inp), @XMM[13]
	pxor	@XMM[13], @XMM[7]
	movdqu	@XMM[7], 0x50($out)
	je	.Lctr_enc_done
	movdqu	0x60($inp), @XMM[14]
	pxor	@XMM[14], @XMM[2]
	movdqu	@XMM[2], 0x60($out)
	jmp	.Lctr_enc_done
.align	16
.Lctr_enc_short:
	lea	0x20(%rbp), $arg1
	lea	0x30(%rbp), $arg2
	lea	($key), $arg3
	call	AES_encrypt
	movdqu	($inp), @XMM[1]
	lea	16($inp), $inp
	mov	0x2c(%rbp), %eax	# load 32-bit counter
	bswap	%eax
	pxor	0x30(%rbp), @XMM[1]
	inc	%eax			# increment
	movdqu	@XMM[1], ($out)
	bswap	%eax
	lea	16($out), $out
	mov	%eax, 0x2c(%rsp)	# save 32-bit counter
	dec	$len
	jnz	.Lctr_enc_short
.Lctr_enc_done:
	lea	(%rsp), %rax
	pxor	%xmm0, %xmm0
.Lctr_enc_bzero:			# wipe key schedule [if any]
	movdqa	%xmm0, 0x00(%rax)
	movdqa	%xmm0, 0x10(%rax)
	lea	0x20(%rax), %rax
	cmp	%rax, %rbp
	ja	.Lctr_enc_bzero
	lea	(%rbp),%rsp		# restore %rsp
___
$code.=<<___ if ($win64);
	movaps	0x40(%rbp), %xmm6
	movaps	0x50(%rbp), %xmm7
	movaps	0x60(%rbp), %xmm8
	movaps	0x70(%rbp), %xmm9
	movaps	0x80(%rbp), %xmm10
	movaps	0x90(%rbp), %xmm11
	movaps	0xa0(%rbp), %xmm12
	movaps	0xb0(%rbp), %xmm13
	movaps	0xc0(%rbp), %xmm14
	movaps	0xd0(%rbp), %xmm15
	lea	0xa0(%rbp), %rsp
___
$code.=<<___;
	mov	0x48(%rsp), %r15
	mov	0x50(%rsp), %r14
	mov	0x58(%rsp), %r13
	mov	0x60(%rsp), %r12
	mov	0x68(%rsp), %rbx
	mov	0x70(%rsp), %rbp
	lea	0x78(%rsp), %rsp
.Lctr_enc_epilogue:
	ret
.size	bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
}
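
# Constant pool: .LBS0-2 are the swapmove masks used by bitslice();
# .LSR/.LSRM0 are the ShiftRows byte shuffles for middle and last rounds,
# .LM0 the bit-ordering permutation applied to round keys, and .LM0SR its
# composition with ShiftRows used when loading input; .LNOT and .L63 patch
# the key schedule (see _bsaes_enc_key_convert), while .LSWPUP/.LSWPUPM0SR
# and .LADD1-8 serve the 32-bit counter handling in the CTR code above.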

$code.=<<___;
.align	64
.LBS0:	.quad	0x5555555555555555, 0x5555555555555555
.LBS1:	.quad	0x3333333333333333, 0x3333333333333333
.LBS2:	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR:	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
.LM0:	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
.LM0SR:	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
.LNOT:	.quad	0xffffffffffffffff, 0xffffffffffffffff
.L63:	.quad	0x6363636363636363, 0x6363636363636363
.LSWPUP:
	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1:	.quad	0x0000000000000000, 0x0000000100000000
.LADD2:	.quad	0x0000000000000000, 0x0000000200000000
.LADD3:	.quad	0x0000000000000000, 0x0000000300000000
.LADD4:	.quad	0x0000000000000000, 0x0000000400000000
.LADD5:	.quad	0x0000000000000000, 0x0000000500000000
.LADD6:	.quad	0x0000000000000000, 0x0000000600000000
.LADD7:	.quad	0x0000000000000000, 0x0000000700000000
.LADD8:	.quad	0x0000000000000000, 0x0000000800000000
.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper and Peter Schwabe"
.align	64
___

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;