cmll-x86_64.pl 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. #!/usr/bin/env perl
  2. # ====================================================================
  3. # Copyright (c) 2008 Andy Polyakov <appro@openssl.org>
  4. #
  5. # This module may be used under the terms of either the GNU General
  6. # Public License version 2 or later, the GNU Lesser General Public
  7. # License version 2.1 or later, the Mozilla Public License version
  8. # 1.1 or the BSD License. The exact terms of either license are
  9. # distributed along with this module. For further details see
  10. # http://www.openssl.org/~appro/camellia/.
  11. # ====================================================================
  12. # Performance in cycles per processed byte (less is better) in
  13. # 'openssl speed ...' benchmark:
  14. #
  15. # AMD64 Core2 EM64T
  16. # -evp camellia-128-ecb 16.7 21.0 22.7
  17. # + over gcc 3.4.6 +25% +5% 0%
  18. #
  19. # camellia-128-cbc 15.7 20.4 21.1
  20. #
  21. # 128-bit key setup 128 216 205 cycles/key
  22. # + over gcc 3.4.6 +54% +39% +15%
  23. #
  24. # Numbers in "+" rows represent performance improvement over compiler
  25. # generated code. Key setup timings are impressive on AMD and Core2
  26. # thanks to 64-bit operations being covertly deployed. Improvement on
  27. # EM64T, pre-Core2 Intel x86_64 CPU, is not as impressive, because it
  28. # apparently emulates some of 64-bit operations in [32-bit] microcode.
  # Command line: [flavour] [output].  $flavour names the assembler dialect
  # and is handed on to x86_64-xlate.pl; $output is the file to generate.
  $flavour = shift;
  $output  = shift;
  # A single argument containing a dot is taken to be the output file name.
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

  # Win64 argument registers and SEH tables are selected for the nasm/masm/
  # mingw64 flavours or when the output file has an .asm suffix.
  $win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

  # Directory part of this script's path, including the trailing separator.
  # Tested explicitly so that a failed match cannot pick up a stale $1.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

  # The translator lives either next to this script or in ../../perlasm.
  ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";

  # Everything printed from here on is piped through the translator.  The
  # interpreter and script paths are quoted so directories with spaces work;
  # $flavour and $output stay unquoted so that an absent argument is not
  # passed on as an empty string.
  open STDOUT,"| \"$^X\" \"$xlate\" $flavour $output"
      or die "can't call $xlate: $!";
  # Return the high-byte alias (%ah, %bh, %ch, %dh) of a legacy register
  # given by its 32- or 64-bit name (%eax/%rax .. %edx/%rdx).  Any other
  # operand is returned unchanged.  No prototype: the sub takes one argument.
  sub hi { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}h/; $r; }
  # Return the low-byte alias of a register operand:
  #   %eax/%rax .. %edx/%rdx -> %al .. %dl
  #   %esi/%rsi, %edi/%rdi   -> %sil, %dil
  #   %rN or %rNd            -> %rNb
  # No prototype: the sub takes one argument.
  sub lo {
      my $r=shift;
      $r =~ s/%[er]([a-d])x/%${1}l/;
      $r =~ s/%[er]([sd]i)/%${1}l/;
      $r =~ s/%(r[0-9]+)[d]?/%${1}b/;
      $r;
  }
  # Register allocation shared by all code templates in this file.
  ($t0,$t1,$t2,$t3) = ("%eax","%ebx","%ecx","%edx");	# temporaries
  @S = qw(%r8d %r9d %r10d %r11d);			# cipher state, four dwords
  ($i0,$i1) = ("%esi","%edi");				# table indices
  $Tbl = "%rbp";					# size optimization
  ($inp,$out,$key,$keyend) = ("%r12","%r13","%r14","%r15");
  # 32-bit view of the first integer argument in the target calling convention.
  if ($win64) { $arg0d="%ecx"; } else { $arg0d="%edi"; }

  # const unsigned int Camellia_SBOX[4][256];
  # Well, sort of... Camellia_SBOX[0][] is interleaved with [1][],
  # and [2][] - with [3][]. This is done to minimize code size.
  # The values below are byte offsets from $Tbl; entries are addressed
  # with a scale of 8, i.e. one interleaved pair per index.
  ($SBOX1_1110,$SBOX4_4404) = (0,4);		# Camellia_SBOX[0], Camellia_SBOX[1]
  ($SBOX2_0222,$SBOX3_3033) = (2048,2052);	# Camellia_SBOX[2], Camellia_SBOX[3]
  # Append the code for one Feistel round to $code.
  #
  #   $i    - round index.  Its parity selects the roles of the state words:
  #           for even $i @S[0,1] are the round input and @S[2,3] receive
  #           the result, for odd $i the halves are exchanged.
  #   $seed - optional byte displacement from $key used when prefetching the
  #           next subkey (default 0).  A negative seed makes the prefetch
  #           step backwards, -8 instead of +8 bytes per round.
  #
  # The generated code expects the subkey for this round in $t1/$t0 and
  # leaves the subkey for round $i+1 there.  It uses $t2, $t3, $i0 and $i1
  # as scratch.
  sub Camellia_Feistel {
  my ($i,$seed)=@_;
  $seed=0 if (!defined($seed));
  my $scale=$seed<0?-8:8;
  my $j=($i&1)*2;
  # All four names are lexical; nothing leaks into package globals.
  my ($s0,$s1,$s2,$s3)=@S[$j%4,($j+1)%4,($j+2)%4,($j+3)%4];

  $code.=<<___;
xor $s0,$t0 # t0^=key[0]
xor $s1,$t1 # t1^=key[1]
movz `&hi("$t0")`,$i0 # (t0>>8)&0xff
movz `&lo("$t1")`,$i1 # (t1>>0)&0xff
mov $SBOX3_3033($Tbl,$i0,8),$t3 # t3=SBOX3_3033[0]
mov $SBOX1_1110($Tbl,$i1,8),$t2 # t2=SBOX1_1110[1]
movz `&lo("$t0")`,$i0 # (t0>>0)&0xff
shr \$16,$t0
movz `&hi("$t1")`,$i1 # (t1>>8)&0xff
xor $SBOX4_4404($Tbl,$i0,8),$t3 # t3^=SBOX4_4404[0]
shr \$16,$t1
xor $SBOX4_4404($Tbl,$i1,8),$t2 # t2^=SBOX4_4404[1]
movz `&hi("$t0")`,$i0 # (t0>>24)&0xff
movz `&lo("$t1")`,$i1 # (t1>>16)&0xff
xor $SBOX1_1110($Tbl,$i0,8),$t3 # t3^=SBOX1_1110[0]
xor $SBOX3_3033($Tbl,$i1,8),$t2 # t2^=SBOX3_3033[1]
movz `&lo("$t0")`,$i0 # (t0>>16)&0xff
movz `&hi("$t1")`,$i1 # (t1>>24)&0xff
xor $SBOX2_0222($Tbl,$i0,8),$t3 # t3^=SBOX2_0222[0]
xor $SBOX2_0222($Tbl,$i1,8),$t2 # t2^=SBOX2_0222[1]
mov `$seed+($i+1)*$scale`($key),$t1 # prefetch key[i+1]
mov `$seed+($i+1)*$scale+4`($key),$t0
xor $t3,$t2 # t2^=t3
ror \$8,$t3 # t3=RightRotate(t3,8)
xor $t2,$s2
xor $t2,$s3
xor $t3,$s3
___
  }
  95. # void Camellia_EncryptBlock_Rounds(
  96. # int grandRounds,
  97. # const Byte plaintext[],
  98. # const KEY_TABLE_TYPE keyTable,
  99. # Byte ciphertext[])
  #
  # This template starts the generated output.  It emits:
  #  - Camellia_EncryptBlock (V1.x API): turns the key length in the first
  #    argument into a grandRounds count (3 for 128, otherwise 4) and jumps
  #    into the V2 entry point;
  #  - Camellia_EncryptBlock_Rounds (V2): saves %rbx, %rbp, %r13-%r15, sets
  #    $keyend to $key + grandRounds*64, loads and byte-swaps the 16-byte
  #    input, calls _x86_64_Camellia_encrypt and stores the byte-swapped
  #    result;
  #  - the head of _x86_64_Camellia_encrypt up to the first subkey prefetch
  #    in .Leloop.  The Feistel rounds are appended by the loop that follows
  #    this template.
  100. $code=<<___;
  101. .text
  102. # V1.x API
  103. .globl Camellia_EncryptBlock
  104. .type Camellia_EncryptBlock,\@abi-omnipotent
  105. .align 16
  106. Camellia_EncryptBlock:
  107. movl \$128,%eax
  108. subl $arg0d,%eax
  109. movl \$3,$arg0d
  110. adcl \$0,$arg0d # keyBitLength==128?3:4
  111. jmp .Lenc_rounds
  112. .size Camellia_EncryptBlock,.-Camellia_EncryptBlock
  113. # V2
  114. .globl Camellia_EncryptBlock_Rounds
  115. .type Camellia_EncryptBlock_Rounds,\@function,4
  116. .align 16
  117. .Lenc_rounds:
  118. Camellia_EncryptBlock_Rounds:
  119. push %rbx
  120. push %rbp
  121. push %r13
  122. push %r14
  123. push %r15
  124. .Lenc_prologue:
  125. #mov %rsi,$inp # put away arguments
  126. mov %rcx,$out
  127. mov %rdx,$key
  128. shl \$6,%edi # process grandRounds
  129. lea .LCamellia_SBOX(%rip),$Tbl
  130. lea ($key,%rdi),$keyend
  131. mov 0(%rsi),@S[0] # load plaintext
  132. mov 4(%rsi),@S[1]
  133. mov 8(%rsi),@S[2]
  134. bswap @S[0]
  135. mov 12(%rsi),@S[3]
  136. bswap @S[1]
  137. bswap @S[2]
  138. bswap @S[3]
  139. call _x86_64_Camellia_encrypt
  140. bswap @S[0]
  141. bswap @S[1]
  142. bswap @S[2]
  143. mov @S[0],0($out)
  144. bswap @S[3]
  145. mov @S[1],4($out)
  146. mov @S[2],8($out)
  147. mov @S[3],12($out)
  148. mov 0(%rsp),%r15
  149. mov 8(%rsp),%r14
  150. mov 16(%rsp),%r13
  151. mov 24(%rsp),%rbp
  152. mov 32(%rsp),%rbx
  153. lea 40(%rsp),%rsp
  154. .Lenc_epilogue:
  155. ret
  156. .size Camellia_EncryptBlock_Rounds,.-Camellia_EncryptBlock_Rounds
  157. .type _x86_64_Camellia_encrypt,\@abi-omnipotent
  158. .align 16
  159. _x86_64_Camellia_encrypt:
  160. xor 0($key),@S[1]
  161. xor 4($key),@S[0] # ^=key[0-3]
  162. xor 8($key),@S[3]
  163. xor 12($key),@S[2]
  164. .align 16
  165. .Leloop:
  166. mov 16($key),$t1 # prefetch key[4-5]
  167. mov 20($key),$t0
  168. ___
  # Body of .Leloop: six Feistel rounds per iteration.  The seed of 16
  # matches the subkey prefetched from 16($key) just before the loop body.
  169. for ($i=0;$i<6;$i++) { Camellia_Feistel($i,16); }
  # Rest of _x86_64_Camellia_encrypt: advance $key by 64 bytes, leave via
  # .Ledone once $keyend is reached, otherwise mix the state with the
  # prefetched key words (and/or/rol/xor) and run another six rounds.
  # The same template then emits the decrypt entry points, which mirror
  # the encrypt ones except that the caller's key table pointer goes to
  # $keyend and $key starts at the far end (table + grandRounds*64).
  170. $code.=<<___;
  171. lea 16*4($key),$key
  172. cmp $keyend,$key
  173. mov 8($key),$t3 # prefetch key[2-3]
  174. mov 12($key),$t2
  175. je .Ledone
  176. and @S[0],$t0
  177. or @S[3],$t3
  178. rol \$1,$t0
  179. xor $t3,@S[2] # s2^=s3|key[3];
  180. xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
  181. and @S[2],$t2
  182. or @S[1],$t1
  183. rol \$1,$t2
  184. xor $t1,@S[0] # s0^=s1|key[1];
  185. xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
  186. jmp .Leloop
  187. .align 16
  188. .Ledone:
  189. xor @S[2],$t0 # SwapHalf
  190. xor @S[3],$t1
  191. xor @S[0],$t2
  192. xor @S[1],$t3
  193. mov $t0,@S[0]
  194. mov $t1,@S[1]
  195. mov $t2,@S[2]
  196. mov $t3,@S[3]
  197. .byte 0xf3,0xc3 # rep ret
  198. .size _x86_64_Camellia_encrypt,.-_x86_64_Camellia_encrypt
  199. # V1.x API
  200. .globl Camellia_DecryptBlock
  201. .type Camellia_DecryptBlock,\@abi-omnipotent
  202. .align 16
  203. Camellia_DecryptBlock:
  204. movl \$128,%eax
  205. subl $arg0d,%eax
  206. movl \$3,$arg0d
  207. adcl \$0,$arg0d # keyBitLength==128?3:4
  208. jmp .Ldec_rounds
  209. .size Camellia_DecryptBlock,.-Camellia_DecryptBlock
  210. # V2
  211. .globl Camellia_DecryptBlock_Rounds
  212. .type Camellia_DecryptBlock_Rounds,\@function,4
  213. .align 16
  214. .Ldec_rounds:
  215. Camellia_DecryptBlock_Rounds:
  216. push %rbx
  217. push %rbp
  218. push %r13
  219. push %r14
  220. push %r15
  221. .Ldec_prologue:
  222. #mov %rsi,$inp # put away arguments
  223. mov %rcx,$out
  224. mov %rdx,$keyend
  225. shl \$6,%edi # process grandRounds
  226. lea .LCamellia_SBOX(%rip),$Tbl
  227. lea ($keyend,%rdi),$key
  228. mov 0(%rsi),@S[0] # load plaintext
  229. mov 4(%rsi),@S[1]
  230. mov 8(%rsi),@S[2]
  231. bswap @S[0]
  232. mov 12(%rsi),@S[3]
  233. bswap @S[1]
  234. bswap @S[2]
  235. bswap @S[3]
  236. call _x86_64_Camellia_decrypt
  237. bswap @S[0]
  238. bswap @S[1]
  239. bswap @S[2]
  240. mov @S[0],0($out)
  241. bswap @S[3]
  242. mov @S[1],4($out)
  243. mov @S[2],8($out)
  244. mov @S[3],12($out)
  245. mov 0(%rsp),%r15
  246. mov 8(%rsp),%r14
  247. mov 16(%rsp),%r13
  248. mov 24(%rsp),%rbp
  249. mov 32(%rsp),%rbx
  250. lea 40(%rsp),%rsp
  251. .Ldec_epilogue:
  252. ret
  253. .size Camellia_DecryptBlock_Rounds,.-Camellia_DecryptBlock_Rounds
  254. .type _x86_64_Camellia_decrypt,\@abi-omnipotent
  255. .align 16
  256. _x86_64_Camellia_decrypt:
  257. xor 0($key),@S[1]
  258. xor 4($key),@S[0] # ^=key[0-3]
  259. xor 8($key),@S[3]
  260. xor 12($key),@S[2]
  261. .align 16
  262. .Ldloop:
  263. mov -8($key),$t1 # prefetch key[4-5]
  264. mov -4($key),$t0
  265. ___
  # Body of .Ldloop: the same six Feistel rounds, but with a negative seed
  # so that the subkeys are prefetched walking backwards from $key.
  266. for ($i=0;$i<6;$i++) { Camellia_Feistel($i,-8); }
  # Rest of _x86_64_Camellia_decrypt: step $key back by 64 bytes, leave via
  # .Lddone once $keyend is reached, otherwise mix the state with the
  # prefetched key words and loop.
  267. $code.=<<___;
  268. lea -16*4($key),$key
  269. cmp $keyend,$key
  270. mov 0($key),$t3 # prefetch key[2-3]
  271. mov 4($key),$t2
  272. je .Lddone
  273. and @S[0],$t0
  274. or @S[3],$t3
  275. rol \$1,$t0
  276. xor $t3,@S[2] # s2^=s3|key[3];
  277. xor $t0,@S[1] # s1^=LeftRotate(s0&key[0],1);
  278. and @S[2],$t2
  279. or @S[1],$t1
  280. rol \$1,$t2
  281. xor $t1,@S[0] # s0^=s1|key[1];
  282. xor $t2,@S[3] # s3^=LeftRotate(s2&key[2],1);
  283. jmp .Ldloop
  284. .align 16
  285. .Lddone:
  286. xor @S[2],$t2
  287. xor @S[3],$t3
  288. xor @S[0],$t0
  289. xor @S[1],$t1
  290. mov $t2,@S[0] # SwapHalf
  291. mov $t3,@S[1]
  292. mov $t0,@S[2]
  293. mov $t1,@S[3]
  294. .byte 0xf3,0xc3 # rep ret
  295. .size _x86_64_Camellia_decrypt,.-_x86_64_Camellia_decrypt
  296. ___
  # Append stores of key material to $code.
  #
  #   _saveround($rnd,$key[,$bias],@regs)
  #
  # $rnd selects an 8-byte slot relative to $key.  $bias is an optional
  # leading element of the register list that evaluates to a non-zero
  # integer; it is added to every displacement.  With four registers the
  # 32-bit words are stored pairwise swapped (regs 1,0,3,2 at +0,+4,+8,+12).
  # Otherwise the first register goes to +0 and a second one, if present,
  # to +8.  Displacements are left as `...` expressions for the final pass.
  sub _saveround {
      my ($rnd,$key,@T)=@_;
      my $bias=0;
      $bias=shift(@T) if (int($T[0]));
      my $slot="$bias+$rnd*8";

      if (scalar(@T)==4) {
          $code.="mov $T[1],`$slot+0`($key)\n"
                ."mov $T[0],`$slot+4`($key)\n"
                ."mov $T[3],`$slot+8`($key)\n"
                ."mov $T[2],`$slot+12`($key)\n";
      } else {
          $code.=" mov $T[0],`$slot+0`($key)\n";
          $code.=" mov $T[1],`$slot+8`($key)\n" if (scalar(@T)>=2);
      }
  }
  # Append loads of key material to $code; the counterpart of _saveround
  # for one or two registers.
  #
  #   _loadround($rnd,$key[,$bias],@regs)
  #
  # The first register is loaded from slot $rnd (8 bytes per slot, plus the
  # optional non-zero integer $bias) and a second one, if given, from 8
  # bytes further on.
  sub _loadround {
      my ($rnd,$key,@T)=@_;
      my $bias=0;
      $bias=shift(@T) if (int($T[0]));
      my @disp=(0,8);
      my $last=(scalar(@T)>=2)?1:0;

      foreach my $n (0..$last) {
          $code.=" mov `$bias+$rnd*8+$disp[$n]`($key),$T[$n]\n";
      }
  }
  318. # shld is very slow on Intel EM64T family. Even on AMD it limits
  319. # instruction decode rate [because it's VectorPath] and consequently
  320. # performance...
  # shld-based 128-bit left rotate of the register pair $upper:$lower by
  # $count bits; emits nothing when $count is zero.  The generated code
  # uses %r11 as scratch.
  sub __rotl128 {
      my ($upper,$lower,$count)=@_;

      $count and $code.=join("",map("$_\n",
          "mov $upper,%r11",
          "shld \$$count,$lower,$upper",
          "shld \$$count,%r11,$lower"));
  }
  331. # ... Implementing 128-bit rotate without shld gives 80% better
  332. # performance on EM64T, +15% on AMD64 and only ~7% degradation on
  333. # Core2. This is therefore preferred.
  # 128-bit left rotate of the register pair $upper:$lower by $count bits
  # built from plain shifts and ors; emits nothing when $count is zero.
  # The generated code uses %r9 and %r11 as scratch.  The complementary
  # shift count is left as a `64-N` expression for the final pass.
  sub _rotl128 {
      my ($upper,$lower,$count)=@_;

      $count and $code.=join("",map("$_\n",
          "mov $upper,%r11",
          "shl \$$count,$upper",
          "mov $lower,%r9",
          "shr \$`64-$count`,%r9",
          "shr \$`64-$count`,%r11",
          "or %r9,$upper",
          "shl \$$count,$lower",
          "or %r11,$lower"));
  }
  # Generator for Camellia_Ekeygen.  Judging by the inline comments the
  # arguments are keyBitLength (%rdi), the raw key (%rsi, read 4 bytes at a
  # time) and keyTable (%rdx).  The routine returns 3 in %eax for 128-bit
  # keys and 4 otherwise.
  #
  # $step counts the Feistel rounds emitted so far; it is passed as the
  # round index so that each round prefetches the next pair of constants
  # from .LCamellia_SIGMA (addressed through $key).
  349. { my $step=0;
  350. $code.=<<___;
  351. .globl Camellia_Ekeygen
  352. .type Camellia_Ekeygen,\@function,3
  353. .align 16
  354. Camellia_Ekeygen:
  355. push %rbx
  356. push %rbp
  357. push %r13
  358. push %r14
  359. push %r15
  360. .Lkey_prologue:
  361. mov %rdi,$keyend # put away arguments, keyBitLength
  362. mov %rdx,$out # keyTable
  363. mov 0(%rsi),@S[0] # load 0-127 bits
  364. mov 4(%rsi),@S[1]
  365. mov 8(%rsi),@S[2]
  366. mov 12(%rsi),@S[3]
  367. bswap @S[0]
  368. bswap @S[1]
  369. bswap @S[2]
  370. bswap @S[3]
  371. ___
  372. &_saveround (0,$out,@S); # KL<<<0
  373. $code.=<<___;
  374. cmp \$128,$keyend # check keyBitLength
  375. je .L1st128
  376. mov 16(%rsi),@S[0] # load 128-191 bits
  377. mov 20(%rsi),@S[1]
  378. cmp \$192,$keyend
  379. je .L1st192
  380. mov 24(%rsi),@S[2] # load 192-255 bits
  381. mov 28(%rsi),@S[3]
  382. jmp .L1st256
  383. .L1st192:
  384. mov @S[0],@S[2]
  385. mov @S[1],@S[3]
  386. not @S[2]
  387. not @S[3]
  388. .L1st256:
  389. bswap @S[0]
  390. bswap @S[1]
  391. bswap @S[2]
  392. bswap @S[3]
  393. ___
  394. &_saveround (4,$out,@S); # temp storage for KR!
  395. $code.=<<___;
  396. xor 0($out),@S[1] # KR^KL
  397. xor 4($out),@S[0]
  398. xor 8($out),@S[3]
  399. xor 12($out),@S[2]
  400. .L1st128:
  401. lea .LCamellia_SIGMA(%rip),$key
  402. lea .LCamellia_SBOX(%rip),$Tbl
  403. mov 0($key),$t1
  404. mov 4($key),$t0
  405. ___
  406. &Camellia_Feistel($step++);
  407. &Camellia_Feistel($step++);
  408. $code.=<<___;
  409. xor 0($out),@S[1] # ^KL
  410. xor 4($out),@S[0]
  411. xor 8($out),@S[3]
  412. xor 12($out),@S[2]
  413. ___
  414. &Camellia_Feistel($step++);
  415. &Camellia_Feistel($step++);
  416. $code.=<<___;
  417. cmp \$128,$keyend
  418. jne .L2nd256
  419. lea 128($out),$out # size optimization
  420. shl \$32,%r8 # @S[0]||
  421. shl \$32,%r10 # @S[2]||
  422. or %r9,%r8 # ||@S[1]
  423. or %r11,%r10 # ||@S[3]
  424. ___
  # 128-bit keys: every table entry is a rotation of KL (%rax:%rbx) or of
  # KA (%r8:%r10).  $out was advanced by 128 above, hence the -128 bias.
  425. &_loadround (0,$out,-128,"%rax","%rbx"); # KL
  426. &_saveround (2,$out,-128,"%r8","%r10"); # KA<<<0
  427. &_rotl128 ("%rax","%rbx",15);
  428. &_saveround (4,$out,-128,"%rax","%rbx"); # KL<<<15
  429. &_rotl128 ("%r8","%r10",15);
  430. &_saveround (6,$out,-128,"%r8","%r10"); # KA<<<15
  431. &_rotl128 ("%r8","%r10",15); # 15+15=30
  432. &_saveround (8,$out,-128,"%r8","%r10"); # KA<<<30
  433. &_rotl128 ("%rax","%rbx",30); # 15+30=45
  434. &_saveround (10,$out,-128,"%rax","%rbx"); # KL<<<45
  435. &_rotl128 ("%r8","%r10",15); # 30+15=45
  436. &_saveround (12,$out,-128,"%r8"); # KA<<<45
  437. &_rotl128 ("%rax","%rbx",15); # 45+15=60
  438. &_saveround (13,$out,-128,"%rbx"); # KL<<<60
  439. &_rotl128 ("%r8","%r10",15); # 45+15=60
  440. &_saveround (14,$out,-128,"%r8","%r10"); # KA<<<60
  441. &_rotl128 ("%rax","%rbx",17); # 60+17=77
  442. &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<77
  443. &_rotl128 ("%rax","%rbx",17); # 77+17=94
  444. &_saveround (18,$out,-128,"%rax","%rbx"); # KL<<<94
  445. &_rotl128 ("%r8","%r10",34); # 60+34=94
  446. &_saveround (20,$out,-128,"%r8","%r10"); # KA<<<94
  447. &_rotl128 ("%rax","%rbx",17); # 94+17=111
  448. &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<111
  449. &_rotl128 ("%r8","%r10",17); # 94+17=111
  450. &_saveround (24,$out,-128,"%r8","%r10"); # KA<<<111
  451. $code.=<<___;
  452. mov \$3,%eax
  453. jmp .Ldone
  454. .align 16
  455. .L2nd256:
  456. ___
  # 192/256-bit keys: park KA in the table, run two more Feistel rounds on
  # KA^KR to obtain KB, then emit rotations of KL, KR, KA and KB.
  457. &_saveround (6,$out,@S); # temp storage for KA!
  458. $code.=<<___;
  459. xor `4*8+0`($out),@S[1] # KA^KR
  460. xor `4*8+4`($out),@S[0]
  461. xor `5*8+0`($out),@S[3]
  462. xor `5*8+4`($out),@S[2]
  463. ___
  464. &Camellia_Feistel($step++);
  465. &Camellia_Feistel($step++);
  466. &_loadround (0,$out,"%rax","%rbx"); # KL
  467. &_loadround (4,$out,"%rcx","%rdx"); # KR
  468. &_loadround (6,$out,"%r14","%r15"); # KA
  469. $code.=<<___;
  470. lea 128($out),$out # size optimization
  471. shl \$32,%r8 # @S[0]||
  472. shl \$32,%r10 # @S[2]||
  473. or %r9,%r8 # ||@S[1]
  474. or %r11,%r10 # ||@S[3]
  475. ___
  476. &_saveround (2,$out,-128,"%r8","%r10"); # KB<<<0
  477. &_rotl128 ("%rcx","%rdx",15);
  478. &_saveround (4,$out,-128,"%rcx","%rdx"); # KR<<<15
  479. &_rotl128 ("%r14","%r15",15);
  480. &_saveround (6,$out,-128,"%r14","%r15"); # KA<<<15
  481. &_rotl128 ("%rcx","%rdx",15); # 15+15=30
  482. &_saveround (8,$out,-128,"%rcx","%rdx"); # KR<<<30
  483. &_rotl128 ("%r8","%r10",30);
  484. &_saveround (10,$out,-128,"%r8","%r10"); # KB<<<30
  485. &_rotl128 ("%rax","%rbx",45);
  486. &_saveround (12,$out,-128,"%rax","%rbx"); # KL<<<45
  487. &_rotl128 ("%r14","%r15",30); # 15+30=45
  488. &_saveround (14,$out,-128,"%r14","%r15"); # KA<<<45
  489. &_rotl128 ("%rax","%rbx",15); # 45+15=60
  490. &_saveround (16,$out,-128,"%rax","%rbx"); # KL<<<60
  491. &_rotl128 ("%rcx","%rdx",30); # 30+30=60
  492. &_saveround (18,$out,-128,"%rcx","%rdx"); # KR<<<60
  493. &_rotl128 ("%r8","%r10",30); # 30+30=60
  494. &_saveround (20,$out,-128,"%r8","%r10"); # KB<<<60
  495. &_rotl128 ("%rax","%rbx",17); # 60+17=77
  496. &_saveround (22,$out,-128,"%rax","%rbx"); # KL<<<77
  497. &_rotl128 ("%r14","%r15",32); # 45+32=77
  498. &_saveround (24,$out,-128,"%r14","%r15"); # KA<<<77
  499. &_rotl128 ("%rcx","%rdx",34); # 60+34=94
  500. &_saveround (26,$out,-128,"%rcx","%rdx"); # KR<<<94
  501. &_rotl128 ("%r14","%r15",17); # 77+17=94
  502. &_saveround (28,$out,-128,"%r14","%r15"); # KA<<<94
  503. &_rotl128 ("%rax","%rbx",34); # 77+34=111
  504. &_saveround (30,$out,-128,"%rax","%rbx"); # KL<<<111
  505. &_rotl128 ("%r8","%r10",51); # 60+51=111
  506. &_saveround (32,$out,-128,"%r8","%r10"); # KB<<<111
  507. $code.=<<___;
  508. mov \$4,%eax
  509. .Ldone:
  510. mov 0(%rsp),%r15
  511. mov 8(%rsp),%r14
  512. mov 16(%rsp),%r13
  513. mov 24(%rsp),%rbp
  514. mov 32(%rsp),%rbx
  515. lea 40(%rsp),%rsp
  516. .Lkey_epilogue:
  517. ret
  518. .size Camellia_Ekeygen,.-Camellia_Ekeygen
  519. ___
  520. }
  # 256-entry byte substitution table.  The four 32-bit lookup tables
  # emitted after .LCamellia_SBOX are derived from it by S1110, S4404,
  # S0222 and S3033.
  521. @SBOX=(
  522. 112,130, 44,236,179, 39,192,229,228,133, 87, 53,234, 12,174, 65,
  523. 35,239,107,147, 69, 25,165, 33,237, 14, 79, 78, 29,101,146,189,
  524. 134,184,175,143,124,235, 31,206, 62, 48,220, 95, 94,197, 11, 26,
  525. 166,225, 57,202,213, 71, 93, 61,217, 1, 90,214, 81, 86,108, 77,
  526. 139, 13,154,102,251,204,176, 45,116, 18, 43, 32,240,177,132,153,
  527. 223, 76,203,194, 52,126,118, 5,109,183,169, 49,209, 23, 4,215,
  528. 20, 88, 58, 97,222, 27, 17, 28, 50, 15,156, 22, 83, 24,242, 34,
  529. 254, 68,207,178,195,181,122,145, 36, 8,232,168, 96,252,105, 80,
  530. 170,208,160,125,161,137, 98,151, 84, 91, 30,149,224,255,100,210,
  531. 16,196, 0, 72,163,247,117,219,138, 3,230,218, 9, 63,221,148,
  532. 135, 92,131, 2,205, 74,144, 51,115,103,246,243,157,127,191,226,
  533. 82,155,216, 38,200, 55,198, 59,129,150,111, 75, 19,190, 99, 46,
  534. 233,121,167,140,159,110,188,142, 41,245,249,182, 47,253,180, 89,
  535. 120,152, 6,106,231, 70,113,186,212, 37,171, 66,136,162,141,250,
  536. 114, 7,185, 85,248,238,172, 10, 54, 73, 42,104, 60, 56,241,164,
  537. 64, 40,211,123,187,201, 67,193, 21,227,173,244,119,199,128,158);
  # Table word for index $_[0]: s=SBOX[i] placed in the three upper bytes
  # (s,s,s,0), formatted as a 0x%08x literal.
  sub S1110 {
      my $s=$SBOX[$_[0]];
      return sprintf("0x%08x",($s<<24)|($s<<16)|($s<<8));
  }
  # Table word for index $_[0]: the index is first rotated left by one bit
  # within a byte, then s=SBOX[that] fills bytes 3, 2 and 0 (s,s,0,s).
  sub S4404 {
      my $idx=(($_[0]<<1)|($_[0]>>7))&0xff;
      my $s=$SBOX[$idx];
      return sprintf("0x%08x",($s<<24)|($s<<16)|$s);
  }
  # Table word for index $_[0]: s=SBOX[i] rotated left by one bit within a
  # byte, placed in the three lower bytes (0,s,s,s).
  sub S0222 {
      my $s=$SBOX[$_[0]];
      $s=(($s<<1)|($s>>7))&0xff;
      return sprintf("0x%08x",($s<<16)|($s<<8)|$s);
  }
  # Table word for index $_[0]: s=SBOX[i] rotated right by one bit within a
  # byte, placed in bytes 3, 1 and 0 (s,0,s,s).
  sub S3033 {
      my $s=$SBOX[$_[0]];
      $s=(($s>>1)|($s<<7))&0xff;
      return sprintf("0x%08x",($s<<24)|($s<<8)|$s);
  }
  # Constant data.  .LCamellia_SIGMA holds the constants that
  # Camellia_Ekeygen feeds to its Feistel rounds as subkeys, followed by
  # four zero words.  The .LCamellia_SBOX label is emitted here and the
  # table contents are appended by the loops further down.
  542. $code.=<<___;
  543. .align 64
  544. .LCamellia_SIGMA:
  545. .long 0x3bcc908b, 0xa09e667f, 0x4caa73b2, 0xb67ae858
  546. .long 0xe94f82be, 0xc6ef372f, 0xf1d36f1c, 0x54ff53a5
  547. .long 0xde682d1d, 0x10e527fa, 0xb3e6c1fd, 0xb05688c2
  548. .long 0, 0, 0, 0
  549. .LCamellia_SBOX:
  550. ___
  551. # tables are interleaved, remember?
  # Append one ".long" directive listing the given values, comma separated,
  # to $code.
  sub data_word {
      my @words=@_;
      $code.=sprintf(".long\t%s\n",join(',',@words));
  }
  # First 256 rows pair SBOX1_1110 with SBOX4_4404, the next 256 rows pair
  # SBOX2_0222 with SBOX3_3033; each row is one .long directive.
  $i=0;
  while ($i<256) { data_word(S1110($i),S4404($i)); $i++; }
  $i=0;
  while ($i<256) { data_word(S0222($i),S3033($i)); $i++; }
  555. # void Camellia_cbc_encrypt (const unsigned char *inp, unsigned char *out,
  556. # size_t length, const CAMELLIA_KEY *key,
  557. # unsigned char *ivp,const int enc);
  #
  # The variables below name slots of the stack frame that the prologue
  # carves out below the saved registers: $_key keeps the key pointer used
  # to "rewind" $key after each block, $ivec is a 16-byte scratch block,
  # $_ivp keeps the caller's ivp argument and $_rsp the stack pointer value
  # taken right after the register pushes (used by the epilogue).
  # A zero enc argument selects the .LCBC_DECRYPT path.
  558. {
  559. $_key="0(%rsp)";
  560. $_end="8(%rsp)"; # inp+len&~15
  561. $_res="16(%rsp)"; # len&15
  562. $ivec="24(%rsp)";
  563. $_ivp="40(%rsp)";
  564. $_rsp="48(%rsp)";
  565. $code.=<<___;
  566. .globl Camellia_cbc_encrypt
  567. .type Camellia_cbc_encrypt,\@function,6
  568. .align 16
  569. Camellia_cbc_encrypt:
  570. cmp \$0,%rdx
  571. je .Lcbc_abort
  572. push %rbx
  573. push %rbp
  574. push %r12
  575. push %r13
  576. push %r14
  577. push %r15
  578. .Lcbc_prologue:
  579. mov %rsp,%rbp
  580. sub \$64,%rsp
  581. and \$-64,%rsp
  582. # place stack frame just "above mod 1024" the key schedule,
  583. # this ensures that cache associativity suffices
  584. lea -64-63(%rcx),%r10
  585. sub %rsp,%r10
  586. neg %r10
  587. and \$0x3C0,%r10
  588. sub %r10,%rsp
  589. #add \$8,%rsp # 8 is reserved for callee's ra
  590. mov %rdi,$inp # inp argument
  591. mov %rsi,$out # out argument
  592. mov %r8,%rbx # ivp argument
  593. mov %rcx,$key # key argument
  594. mov 272(%rcx),${keyend}d # grandRounds
  595. mov %r8,$_ivp
  596. mov %rbp,$_rsp
  597. .Lcbc_body:
  598. lea .LCamellia_SBOX(%rip),$Tbl
  599. mov \$32,%ecx
  600. .align 4
  601. .Lcbc_prefetch_sbox:
  602. mov 0($Tbl),%rax
  603. mov 32($Tbl),%rsi
  604. mov 64($Tbl),%rdi
  605. mov 96($Tbl),%r11
  606. lea 128($Tbl),$Tbl
  607. loop .Lcbc_prefetch_sbox
  608. sub \$4096,$Tbl
  609. shl \$6,$keyend
  610. mov %rdx,%rcx # len argument
  611. lea ($key,$keyend),$keyend
  612. cmp \$0,%r9d # enc argument
  613. je .LCBC_DECRYPT
  614. and \$-16,%rdx
  615. and \$15,%rcx # length residue
  616. lea ($inp,%rdx),%rdx
  617. mov $key,$_key
  618. mov %rdx,$_end
  619. mov %rcx,$_res
  620. cmp $inp,%rdx
  621. mov 0(%rbx),@S[0] # load IV
  622. mov 4(%rbx),@S[1]
  623. mov 8(%rbx),@S[2]
  624. mov 12(%rbx),@S[3]
  625. je .Lcbc_enc_tail
  626. jmp .Lcbc_eloop
  627. .align 16
  628. .Lcbc_eloop:
  629. xor 0($inp),@S[0]
  630. xor 4($inp),@S[1]
  631. xor 8($inp),@S[2]
  632. bswap @S[0]
  633. xor 12($inp),@S[3]
  634. bswap @S[1]
  635. bswap @S[2]
  636. bswap @S[3]
  637. call _x86_64_Camellia_encrypt
  638. mov $_key,$key # "rewind" the key
  639. bswap @S[0]
  640. mov $_end,%rdx
  641. bswap @S[1]
  642. mov $_res,%rcx
  643. bswap @S[2]
  644. mov @S[0],0($out)
  645. bswap @S[3]
  646. mov @S[1],4($out)
  647. mov @S[2],8($out)
  648. lea 16($inp),$inp
  649. mov @S[3],12($out)
  650. cmp %rdx,$inp
  651. lea 16($out),$out
  652. jne .Lcbc_eloop
  653. cmp \$0,%rcx
  654. jne .Lcbc_enc_tail
  655. mov $_ivp,$out
  656. mov @S[0],0($out) # write out IV residue
  657. mov @S[1],4($out)
  658. mov @S[2],8($out)
  659. mov @S[3],12($out)
  660. jmp .Lcbc_done
  661. .align 16
  662. .Lcbc_enc_tail:
  663. xor %rax,%rax
  664. mov %rax,0+$ivec
  665. mov %rax,8+$ivec
  666. mov %rax,$_res
  667. .Lcbc_enc_pushf:
  668. pushfq
  669. cld
  670. mov $inp,%rsi
  671. lea 8+$ivec,%rdi
  672. .long 0x9066A4F3 # rep movsb
  673. popfq
  674. .Lcbc_enc_popf:
  675. lea $ivec,$inp
  676. lea 16+$ivec,%rax
  677. mov %rax,$_end
  678. jmp .Lcbc_eloop # one more time
  679. .align 16
  680. .LCBC_DECRYPT:
  681. xchg $key,$keyend
  682. add \$15,%rdx
  683. and \$15,%rcx # length residue
  684. and \$-16,%rdx
  685. mov $key,$_key
  686. lea ($inp,%rdx),%rdx
  687. mov %rdx,$_end
  688. mov %rcx,$_res
  689. mov (%rbx),%rax # load IV
  690. mov 8(%rbx),%rbx
  691. jmp .Lcbc_dloop
  692. .align 16
  693. .Lcbc_dloop:
  694. mov 0($inp),@S[0]
  695. mov 4($inp),@S[1]
  696. mov 8($inp),@S[2]
  697. bswap @S[0]
  698. mov 12($inp),@S[3]
  699. bswap @S[1]
  700. mov %rax,0+$ivec # save IV to temporary storage
  701. bswap @S[2]
  702. mov %rbx,8+$ivec
  703. bswap @S[3]
  704. call _x86_64_Camellia_decrypt
  705. mov $_key,$key # "rewind" the key
  706. mov $_end,%rdx
  707. mov $_res,%rcx
  708. bswap @S[0]
  709. mov ($inp),%rax # load IV for next iteration
  710. bswap @S[1]
  711. mov 8($inp),%rbx
  712. bswap @S[2]
  713. xor 0+$ivec,@S[0]
  714. bswap @S[3]
  715. xor 4+$ivec,@S[1]
  716. xor 8+$ivec,@S[2]
  717. lea 16($inp),$inp
  718. xor 12+$ivec,@S[3]
  719. cmp %rdx,$inp
  720. je .Lcbc_ddone
  721. mov @S[0],0($out)
  722. mov @S[1],4($out)
  723. mov @S[2],8($out)
  724. mov @S[3],12($out)
  725. lea 16($out),$out
  726. jmp .Lcbc_dloop
  727. .align 16
  728. .Lcbc_ddone:
  729. mov $_ivp,%rdx
  730. cmp \$0,%rcx
  731. jne .Lcbc_dec_tail
  732. mov @S[0],0($out)
  733. mov @S[1],4($out)
  734. mov @S[2],8($out)
  735. mov @S[3],12($out)
  736. mov %rax,(%rdx) # write out IV residue
  737. mov %rbx,8(%rdx)
  738. jmp .Lcbc_done
  739. .align 16
  740. .Lcbc_dec_tail:
  741. mov @S[0],0+$ivec
  742. mov @S[1],4+$ivec
  743. mov @S[2],8+$ivec
  744. mov @S[3],12+$ivec
  745. .Lcbc_dec_pushf:
  746. pushfq
  747. cld
  748. lea 8+$ivec,%rsi
  749. lea ($out),%rdi
  750. .long 0x9066A4F3 # rep movsb
  751. popfq
  752. .Lcbc_dec_popf:
  753. mov %rax,(%rdx) # write out IV residue
  754. mov %rbx,8(%rdx)
  755. jmp .Lcbc_done
  756. .align 16
  757. .Lcbc_done:
  758. mov $_rsp,%rcx
  759. mov 0(%rcx),%r15
  760. mov 8(%rcx),%r14
  761. mov 16(%rcx),%r13
  762. mov 24(%rcx),%r12
  763. mov 32(%rcx),%rbp
  764. mov 40(%rcx),%rbx
  765. lea 48(%rcx),%rsp
  766. .Lcbc_abort:
  767. ret
  768. .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
  769. .asciz "Camellia for x86_64 by <appro\@openssl.org>"
  770. ___
  771. }
  772. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  773. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  #
  # Exception-handling support, emitted only when $win64 is set.
  # common_se_handler is registered in .xdata for the two *_Rounds entry
  # points and for Camellia_Ekeygen; it takes the prologue and epilogue
  # label RVAs from HandlerData[] to decide whether the five pushed
  # registers must be recovered from the stack.  cbc_se_handler is the
  # dedicated handler for Camellia_cbc_encrypt: it compares context->Rip
  # against that function's own labels, accounts for the extra stack slot
  # between its pushfq/popfq pairs and follows the saved stack pointer slot
  # of the frame.  Both handlers finish in .Lcommon_seh_exit, which copies
  # the context, calls RtlVirtualUnwind and returns
  # ExceptionContinueSearch.
  # $rec and $frame are assigned for completeness and not used in the
  # template below.
  774. if ($win64) {
  775. $rec="%rcx";
  776. $frame="%rdx";
  777. $context="%r8";
  778. $disp="%r9";
  779. $code.=<<___;
  780. .extern __imp_RtlVirtualUnwind
  781. .type common_se_handler,\@abi-omnipotent
  782. .align 16
  783. common_se_handler:
  784. push %rsi
  785. push %rdi
  786. push %rbx
  787. push %rbp
  788. push %r12
  789. push %r13
  790. push %r14
  791. push %r15
  792. pushfq
  793. lea -64(%rsp),%rsp
  794. mov 120($context),%rax # pull context->Rax
  795. mov 248($context),%rbx # pull context->Rip
  796. mov 8($disp),%rsi # disp->ImageBase
  797. mov 56($disp),%r11 # disp->HandlerData
  798. mov 0(%r11),%r10d # HandlerData[0]
  799. lea (%rsi,%r10),%r10 # prologue label
  800. cmp %r10,%rbx # context->Rip<prologue label
  801. jb .Lin_prologue
  802. mov 152($context),%rax # pull context->Rsp
  803. mov 4(%r11),%r10d # HandlerData[1]
  804. lea (%rsi,%r10),%r10 # epilogue label
  805. cmp %r10,%rbx # context->Rip>=epilogue label
  806. jae .Lin_prologue
  807. lea 40(%rax),%rax
  808. mov -8(%rax),%rbx
  809. mov -16(%rax),%rbp
  810. mov -24(%rax),%r13
  811. mov -32(%rax),%r14
  812. mov -40(%rax),%r15
  813. mov %rbx,144($context) # restore context->Rbx
  814. mov %rbp,160($context) # restore context->Rbp
  815. mov %r13,224($context) # restore context->R13
  816. mov %r14,232($context) # restore context->R14
  817. mov %r15,240($context) # restore context->R15
  818. .Lin_prologue:
  819. mov 8(%rax),%rdi
  820. mov 16(%rax),%rsi
  821. mov %rax,152($context) # restore context->Rsp
  822. mov %rsi,168($context) # restore context->Rsi
  823. mov %rdi,176($context) # restore context->Rdi
  824. jmp .Lcommon_seh_exit
  825. .size common_se_handler,.-common_se_handler
  826. .type cbc_se_handler,\@abi-omnipotent
  827. .align 16
  828. cbc_se_handler:
  829. push %rsi
  830. push %rdi
  831. push %rbx
  832. push %rbp
  833. push %r12
  834. push %r13
  835. push %r14
  836. push %r15
  837. pushfq
  838. lea -64(%rsp),%rsp
  839. mov 120($context),%rax # pull context->Rax
  840. mov 248($context),%rbx # pull context->Rip
  841. lea .Lcbc_prologue(%rip),%r10
  842. cmp %r10,%rbx # context->Rip<.Lcbc_prologue
  843. jb .Lin_cbc_prologue
  844. lea .Lcbc_body(%rip),%r10
  845. cmp %r10,%rbx # context->Rip<.Lcbc_body
  846. jb .Lin_cbc_frame_setup
  847. mov 152($context),%rax # pull context->Rsp
  848. lea .Lcbc_abort(%rip),%r10
  849. cmp %r10,%rbx # context->Rip>=.Lcbc_abort
  850. jae .Lin_cbc_prologue
  851. # handle pushf/popf in Camellia_cbc_encrypt
  852. lea .Lcbc_enc_pushf(%rip),%r10
  853. cmp %r10,%rbx # context->Rip<=.Lcbc_enc_pushf
  854. jbe .Lin_cbc_no_flag
  855. lea 8(%rax),%rax
  856. lea .Lcbc_enc_popf(%rip),%r10
  857. cmp %r10,%rbx # context->Rip<.Lcbc_enc_popf
  858. jb .Lin_cbc_no_flag
  859. lea -8(%rax),%rax
  860. lea .Lcbc_dec_pushf(%rip),%r10
  861. cmp %r10,%rbx # context->Rip<=.Lcbc_dec_pushf
  862. jbe .Lin_cbc_no_flag
  863. lea 8(%rax),%rax
  864. lea .Lcbc_dec_popf(%rip),%r10
  865. cmp %r10,%rbx # context->Rip<.Lcbc_dec_popf
  866. jb .Lin_cbc_no_flag
  867. lea -8(%rax),%rax
  868. .Lin_cbc_no_flag:
  869. mov 48(%rax),%rax # $_rsp
  870. lea 48(%rax),%rax
  871. .Lin_cbc_frame_setup:
  872. mov -8(%rax),%rbx
  873. mov -16(%rax),%rbp
  874. mov -24(%rax),%r12
  875. mov -32(%rax),%r13
  876. mov -40(%rax),%r14
  877. mov -48(%rax),%r15
  878. mov %rbx,144($context) # restore context->Rbx
  879. mov %rbp,160($context) # restore context->Rbp
  880. mov %r12,216($context) # restore context->R12
  881. mov %r13,224($context) # restore context->R13
  882. mov %r14,232($context) # restore context->R14
  883. mov %r15,240($context) # restore context->R15
  884. .Lin_cbc_prologue:
  885. mov 8(%rax),%rdi
  886. mov 16(%rax),%rsi
  887. mov %rax,152($context) # restore context->Rsp
  888. mov %rsi,168($context) # restore context->Rsi
  889. mov %rdi,176($context) # restore context->Rdi
  890. .align 4
  891. .Lcommon_seh_exit:
  892. mov 40($disp),%rdi # disp->ContextRecord
  893. mov $context,%rsi # context
  894. mov \$`1232/8`,%ecx # sizeof(CONTEXT)
  895. .long 0xa548f3fc # cld; rep movsq
  896. mov $disp,%rsi
  897. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  898. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  899. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  900. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  901. mov 40(%rsi),%r10 # disp->ContextRecord
  902. lea 56(%rsi),%r11 # &disp->HandlerData
  903. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  904. mov %r10,32(%rsp) # arg5
  905. mov %r11,40(%rsp) # arg6
  906. mov %r12,48(%rsp) # arg7
  907. mov %rcx,56(%rsp) # arg8, (NULL)
  908. call *__imp_RtlVirtualUnwind(%rip)
  909. mov \$1,%eax # ExceptionContinueSearch
  910. lea 64(%rsp),%rsp
  911. popfq
  912. pop %r15
  913. pop %r14
  914. pop %r13
  915. pop %r12
  916. pop %rbp
  917. pop %rbx
  918. pop %rdi
  919. pop %rsi
  920. ret
  921. .size cbc_se_handler,.-cbc_se_handler
  922. .section .pdata
  923. .align 4
  924. .rva .LSEH_begin_Camellia_EncryptBlock_Rounds
  925. .rva .LSEH_end_Camellia_EncryptBlock_Rounds
  926. .rva .LSEH_info_Camellia_EncryptBlock_Rounds
  927. .rva .LSEH_begin_Camellia_DecryptBlock_Rounds
  928. .rva .LSEH_end_Camellia_DecryptBlock_Rounds
  929. .rva .LSEH_info_Camellia_DecryptBlock_Rounds
  930. .rva .LSEH_begin_Camellia_Ekeygen
  931. .rva .LSEH_end_Camellia_Ekeygen
  932. .rva .LSEH_info_Camellia_Ekeygen
  933. .rva .LSEH_begin_Camellia_cbc_encrypt
  934. .rva .LSEH_end_Camellia_cbc_encrypt
  935. .rva .LSEH_info_Camellia_cbc_encrypt
  936. .section .xdata
  937. .align 8
  938. .LSEH_info_Camellia_EncryptBlock_Rounds:
  939. .byte 9,0,0,0
  940. .rva common_se_handler
  941. .rva .Lenc_prologue,.Lenc_epilogue # HandlerData[]
  942. .LSEH_info_Camellia_DecryptBlock_Rounds:
  943. .byte 9,0,0,0
  944. .rva common_se_handler
  945. .rva .Ldec_prologue,.Ldec_epilogue # HandlerData[]
  946. .LSEH_info_Camellia_Ekeygen:
  947. .byte 9,0,0,0
  948. .rva common_se_handler
  949. .rva .Lkey_prologue,.Lkey_epilogue # HandlerData[]
  950. .LSEH_info_Camellia_cbc_encrypt:
  951. .byte 9,0,0,0
  952. .rva cbc_se_handler
  953. ___
  954. }
  # Final pass: replace every `...` placeholder in the accumulated template
  # with the value of the Perl expression inside it (displacement
  # arithmetic, &hi()/&lo() register aliases), then send the result to the
  # translator.
  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  print $code;
  # STDOUT is the pipe to x86_64-xlate.pl: close() waits for the child and
  # fails if it exited with an error, which must not pass silently.
  close STDOUT or die "error closing pipe to x86_64-xlate.pl: $! (status $?)";