#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD
# license. October 2012. All rights reserved.
# ====================================================================

######################################################################
# Camellia for SPARC T4.
#
# As with AES, the results below [for aligned data] are virtually
# identical to critical path lengths for 3-cycle instruction latency:
#
#               128-bit key     192/256-
# CBC encrypt   4.14/4.21(*)    5.46/5.52
#                        (*) numbers after slash are for
#                            misaligned data.
#
# As with Intel AES-NI, the question is whether it's possible to improve
# the performance of parallelizable modes by interleaving round
# instructions. In Camellia every instruction depends on the previous
# one, which means that there is room for 2 additional ones between
# two dependent instructions. Can we expect a 3x performance
# improvement? At least one can argue that it should be possible to
# break the 2x barrier... For some reason not even 2x appears to be
# possible:
#
#               128-bit key     192/256-
# CBC decrypt   2.21/2.74       2.99/3.40
# CTR           2.15/2.68(*)    2.93/3.34
#                        (*) numbers after slash are for
#                            misaligned data.
#
# This is for 2x interleave. But compared to 1x interleave, CBC decrypt
# improved by ... 0% for a 128-bit key, and 11% for a 192/256-bit one.
# So out-of-order execution logic can take non-interleaved code to
# 1.87x, but can't take the 2x interleaved one any further. There
# surely is some explanation... As a result, 3x interleave was not even
# attempted. Instead an effort was made to share specific mode
# implementations with the AES module (therefore sparct4_modes.pl).
#
# To anchor to something else, the software C implementation processes
# one byte in 38 cycles with a 128-bit key on the same processor.

  48. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  49. push(@INC,"${dir}","${dir}../../perlasm");
  50. require "sparcv9_modes.pl";
  51. $output = pop and open STDOUT,">$output";
  52. $::evp=1; # if $evp is set to 0, script generates module with
  53. # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt
  54. # entry points. These are fully compatible with openssl/camellia.h.
  55. ######################################################################
  56. # single-round subroutines
  57. #
  58. {
  59. my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
  60. $code=<<___;
  61. #include "sparc_arch.h"
  62. .text
  63. .globl cmll_t4_encrypt
  64. .align 32
  65. cmll_t4_encrypt:
  66. andcc $inp, 7, %g1 ! is input aligned?
  67. andn $inp, 7, $inp
  68. ldx [$key + 0], %g4
  69. ldx [$key + 8], %g5
  70. ldx [$inp + 0], %o4
  71. bz,pt %icc, 1f
  72. ldx [$inp + 8], %o5
  73. ldx [$inp + 16], $inp
  74. sll %g1, 3, %g1
  75. sub %g0, %g1, %o3
  76. sllx %o4, %g1, %o4
  77. sllx %o5, %g1, %g1
  78. srlx %o5, %o3, %o5
  79. srlx $inp, %o3, %o3
  80. or %o5, %o4, %o4
  81. or %o3, %g1, %o5
  82. 1:
  83. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  84. ldd [$key + 16], %f12
  85. ldd [$key + 24], %f14
  86. xor %g4, %o4, %o4
  87. xor %g5, %o5, %o5
  88. ldd [$key + 32], %f16
  89. ldd [$key + 40], %f18
  90. movxtod %o4, %f0
  91. movxtod %o5, %f2
  92. ldd [$key + 48], %f20
  93. ldd [$key + 56], %f22
  94. sub $rounds, 1, $rounds
  95. ldd [$key + 64], %f24
  96. ldd [$key + 72], %f26
  97. add $key, 80, $key
  98. .Lenc:
  99. camellia_f %f12, %f2, %f0, %f2
  100. ldd [$key + 0], %f12
  101. sub $rounds,1,$rounds
  102. camellia_f %f14, %f0, %f2, %f0
  103. ldd [$key + 8], %f14
  104. camellia_f %f16, %f2, %f0, %f2
  105. ldd [$key + 16], %f16
  106. camellia_f %f18, %f0, %f2, %f0
  107. ldd [$key + 24], %f18
  108. camellia_f %f20, %f2, %f0, %f2
  109. ldd [$key + 32], %f20
  110. camellia_f %f22, %f0, %f2, %f0
  111. ldd [$key + 40], %f22
  112. camellia_fl %f24, %f0, %f0
  113. ldd [$key + 48], %f24
  114. camellia_fli %f26, %f2, %f2
  115. ldd [$key + 56], %f26
  116. brnz,pt $rounds, .Lenc
  117. add $key, 64, $key
  118. andcc $out, 7, $tmp ! is output aligned?
  119. camellia_f %f12, %f2, %f0, %f2
  120. camellia_f %f14, %f0, %f2, %f0
  121. camellia_f %f16, %f2, %f0, %f2
  122. camellia_f %f18, %f0, %f2, %f0
  123. camellia_f %f20, %f2, %f0, %f4
  124. camellia_f %f22, %f0, %f4, %f2
  125. fxor %f24, %f4, %f0
  126. fxor %f26, %f2, %f2
  127. bnz,pn %icc, 2f
  128. nop
  129. std %f0, [$out + 0]
  130. retl
  131. std %f2, [$out + 8]
  132. 2: alignaddrl $out, %g0, $out
  133. mov 0xff, $mask
  134. srl $mask, $tmp, $mask
  135. faligndata %f0, %f0, %f4
  136. faligndata %f0, %f2, %f6
  137. faligndata %f2, %f2, %f8
  138. stda %f4, [$out + $mask]0xc0 ! partial store
  139. std %f6, [$out + 8]
  140. add $out, 16, $out
  141. orn %g0, $mask, $mask
  142. retl
  143. stda %f8, [$out + $mask]0xc0 ! partial store
  144. .type cmll_t4_encrypt,#function
  145. .size cmll_t4_encrypt,.-cmll_t4_encrypt
  146. .globl cmll_t4_decrypt
  147. .align 32
  148. cmll_t4_decrypt:
  149. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  150. andcc $inp, 7, %g1 ! is input aligned?
  151. andn $inp, 7, $inp
  152. sll $rounds, 6, $rounds
  153. add $rounds, $key, $key
  154. ldx [$inp + 0], %o4
  155. bz,pt %icc, 1f
  156. ldx [$inp + 8], %o5
  157. ldx [$inp + 16], $inp
  158. sll %g1, 3, %g1
  159. sub %g0, %g1, %g4
  160. sllx %o4, %g1, %o4
  161. sllx %o5, %g1, %g1
  162. srlx %o5, %g4, %o5
  163. srlx $inp, %g4, %g4
  164. or %o5, %o4, %o4
  165. or %g4, %g1, %o5
  166. 1:
  167. ldx [$key + 0], %g4
  168. ldx [$key + 8], %g5
  169. ldd [$key - 8], %f12
  170. ldd [$key - 16], %f14
  171. xor %g4, %o4, %o4
  172. xor %g5, %o5, %o5
  173. ldd [$key - 24], %f16
  174. ldd [$key - 32], %f18
  175. movxtod %o4, %f0
  176. movxtod %o5, %f2
  177. ldd [$key - 40], %f20
  178. ldd [$key - 48], %f22
  179. sub $rounds, 64, $rounds
  180. ldd [$key - 56], %f24
  181. ldd [$key - 64], %f26
  182. sub $key, 64, $key
  183. .Ldec:
  184. camellia_f %f12, %f2, %f0, %f2
  185. ldd [$key - 8], %f12
  186. sub $rounds, 64, $rounds
  187. camellia_f %f14, %f0, %f2, %f0
  188. ldd [$key - 16], %f14
  189. camellia_f %f16, %f2, %f0, %f2
  190. ldd [$key - 24], %f16
  191. camellia_f %f18, %f0, %f2, %f0
  192. ldd [$key - 32], %f18
  193. camellia_f %f20, %f2, %f0, %f2
  194. ldd [$key - 40], %f20
  195. camellia_f %f22, %f0, %f2, %f0
  196. ldd [$key - 48], %f22
  197. camellia_fl %f24, %f0, %f0
  198. ldd [$key - 56], %f24
  199. camellia_fli %f26, %f2, %f2
  200. ldd [$key - 64], %f26
  201. brnz,pt $rounds, .Ldec
  202. sub $key, 64, $key
  203. andcc $out, 7, $tmp ! is output aligned?
  204. camellia_f %f12, %f2, %f0, %f2
  205. camellia_f %f14, %f0, %f2, %f0
  206. camellia_f %f16, %f2, %f0, %f2
  207. camellia_f %f18, %f0, %f2, %f0
  208. camellia_f %f20, %f2, %f0, %f4
  209. camellia_f %f22, %f0, %f4, %f2
  210. fxor %f26, %f4, %f0
  211. fxor %f24, %f2, %f2
  212. bnz,pn %icc, 2f
  213. nop
  214. std %f0, [$out + 0]
  215. retl
  216. std %f2, [$out + 8]
  217. 2: alignaddrl $out, %g0, $out
  218. mov 0xff, $mask
  219. srl $mask, $tmp, $mask
  220. faligndata %f0, %f0, %f4
  221. faligndata %f0, %f2, %f6
  222. faligndata %f2, %f2, %f8
  223. stda %f4, [$out + $mask]0xc0 ! partial store
  224. std %f6, [$out + 8]
  225. add $out, 16, $out
  226. orn %g0, $mask, $mask
  227. retl
  228. stda %f8, [$out + $mask]0xc0 ! partial store
  229. .type cmll_t4_decrypt,#function
  230. .size cmll_t4_decrypt,.-cmll_t4_decrypt
  231. ___
  232. }
  233. ######################################################################
  234. # key setup subroutines
  235. #
  236. {
  237. sub ROTL128 {
  238. my $rot = shift;
  239. "srlx %o4, 64-$rot, %g4\n\t".
  240. "sllx %o4, $rot, %o4\n\t".
  241. "srlx %o5, 64-$rot, %g5\n\t".
  242. "sllx %o5, $rot, %o5\n\t".
  243. "or %o4, %g5, %o4\n\t".
  244. "or %o5, %g4, %o5";
  245. }
  246. my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
  247. $code.=<<___;
  248. .globl cmll_t4_set_key
  249. .align 32
  250. cmll_t4_set_key:
  251. and $inp, 7, $tmp
  252. alignaddr $inp, %g0, $inp
  253. cmp $bits, 192
  254. ldd [$inp + 0], %f0
  255. bl,pt %icc,.L128
  256. ldd [$inp + 8], %f2
  257. be,pt %icc,.L192
  258. ldd [$inp + 16], %f4
  259. brz,pt $tmp, .L256aligned
  260. ldd [$inp + 24], %f6
  261. ldd [$inp + 32], %f8
  262. faligndata %f0, %f2, %f0
  263. faligndata %f2, %f4, %f2
  264. faligndata %f4, %f6, %f4
  265. b .L256aligned
  266. faligndata %f6, %f8, %f6
  267. .align 16
  268. .L192:
  269. brz,a,pt $tmp, .L256aligned
  270. fnot2 %f4, %f6
  271. ldd [$inp + 24], %f6
  272. nop
  273. faligndata %f0, %f2, %f0
  274. faligndata %f2, %f4, %f2
  275. faligndata %f4, %f6, %f4
  276. fnot2 %f4, %f6
  277. .L256aligned:
  278. std %f0, [$out + 0] ! k[0, 1]
  279. fsrc2 %f0, %f28
  280. std %f2, [$out + 8] ! k[2, 3]
  281. fsrc2 %f2, %f30
  282. fxor %f4, %f0, %f0
  283. b .L128key
  284. fxor %f6, %f2, %f2
  285. .align 16
  286. .L128:
  287. brz,pt $tmp, .L128aligned
  288. nop
  289. ldd [$inp + 16], %f4
  290. nop
  291. faligndata %f0, %f2, %f0
  292. faligndata %f2, %f4, %f2
  293. .L128aligned:
  294. std %f0, [$out + 0] ! k[0, 1]
  295. fsrc2 %f0, %f28
  296. std %f2, [$out + 8] ! k[2, 3]
  297. fsrc2 %f2, %f30
  298. .L128key:
  299. mov %o7, %o5
  300. 1: call .+8
  301. add %o7, SIGMA-1b, %o4
  302. mov %o5, %o7
  303. ldd [%o4 + 0], %f16
  304. ldd [%o4 + 8], %f18
  305. ldd [%o4 + 16], %f20
  306. ldd [%o4 + 24], %f22
  307. camellia_f %f16, %f2, %f0, %f2
  308. camellia_f %f18, %f0, %f2, %f0
  309. fxor %f28, %f0, %f0
  310. fxor %f30, %f2, %f2
  311. camellia_f %f20, %f2, %f0, %f2
  312. camellia_f %f22, %f0, %f2, %f0
  313. bge,pn %icc, .L256key
  314. nop
  315. std %f0, [$out + 0x10] ! k[ 4, 5]
  316. std %f2, [$out + 0x18] ! k[ 6, 7]
  317. movdtox %f0, %o4
  318. movdtox %f2, %o5
  319. `&ROTL128(15)`
  320. stx %o4, [$out + 0x30] ! k[12, 13]
  321. stx %o5, [$out + 0x38] ! k[14, 15]
  322. `&ROTL128(15)`
  323. stx %o4, [$out + 0x40] ! k[16, 17]
  324. stx %o5, [$out + 0x48] ! k[18, 19]
  325. `&ROTL128(15)`
  326. stx %o4, [$out + 0x60] ! k[24, 25]
  327. `&ROTL128(15)`
  328. stx %o4, [$out + 0x70] ! k[28, 29]
  329. stx %o5, [$out + 0x78] ! k[30, 31]
  330. `&ROTL128(34)`
  331. stx %o4, [$out + 0xa0] ! k[40, 41]
  332. stx %o5, [$out + 0xa8] ! k[42, 43]
  333. `&ROTL128(17)`
  334. stx %o4, [$out + 0xc0] ! k[48, 49]
  335. stx %o5, [$out + 0xc8] ! k[50, 51]
  336. movdtox %f28, %o4 ! k[ 0, 1]
  337. movdtox %f30, %o5 ! k[ 2, 3]
  338. `&ROTL128(15)`
  339. stx %o4, [$out + 0x20] ! k[ 8, 9]
  340. stx %o5, [$out + 0x28] ! k[10, 11]
  341. `&ROTL128(30)`
  342. stx %o4, [$out + 0x50] ! k[20, 21]
  343. stx %o5, [$out + 0x58] ! k[22, 23]
  344. `&ROTL128(15)`
  345. stx %o5, [$out + 0x68] ! k[26, 27]
  346. `&ROTL128(17)`
  347. stx %o4, [$out + 0x80] ! k[32, 33]
  348. stx %o5, [$out + 0x88] ! k[34, 35]
  349. `&ROTL128(17)`
  350. stx %o4, [$out + 0x90] ! k[36, 37]
  351. stx %o5, [$out + 0x98] ! k[38, 39]
  352. `&ROTL128(17)`
  353. stx %o4, [$out + 0xb0] ! k[44, 45]
  354. stx %o5, [$out + 0xb8] ! k[46, 47]
  355. mov 3, $tmp
  356. st $tmp, [$out + 0x110]
  357. retl
  358. xor %o0, %o0, %o0
  359. .align 16
  360. .L256key:
  361. ldd [%o4 + 32], %f24
  362. ldd [%o4 + 40], %f26
  363. std %f0, [$out + 0x30] ! k[12, 13]
  364. std %f2, [$out + 0x38] ! k[14, 15]
  365. fxor %f4, %f0, %f0
  366. fxor %f6, %f2, %f2
  367. camellia_f %f24, %f2, %f0, %f2
  368. camellia_f %f26, %f0, %f2, %f0
  369. std %f0, [$out + 0x10] ! k[ 4, 5]
  370. std %f2, [$out + 0x18] ! k[ 6, 7]
  371. movdtox %f0, %o4
  372. movdtox %f2, %o5
  373. `&ROTL128(30)`
  374. stx %o4, [$out + 0x50] ! k[20, 21]
  375. stx %o5, [$out + 0x58] ! k[22, 23]
  376. `&ROTL128(30)`
  377. stx %o4, [$out + 0xa0] ! k[40, 41]
  378. stx %o5, [$out + 0xa8] ! k[42, 43]
  379. `&ROTL128(51)`
  380. stx %o4, [$out + 0x100] ! k[64, 65]
  381. stx %o5, [$out + 0x108] ! k[66, 67]
  382. movdtox %f4, %o4 ! k[ 8, 9]
  383. movdtox %f6, %o5 ! k[10, 11]
  384. `&ROTL128(15)`
  385. stx %o4, [$out + 0x20] ! k[ 8, 9]
  386. stx %o5, [$out + 0x28] ! k[10, 11]
  387. `&ROTL128(15)`
  388. stx %o4, [$out + 0x40] ! k[16, 17]
  389. stx %o5, [$out + 0x48] ! k[18, 19]
  390. `&ROTL128(30)`
  391. stx %o4, [$out + 0x90] ! k[36, 37]
  392. stx %o5, [$out + 0x98] ! k[38, 39]
  393. `&ROTL128(34)`
  394. stx %o4, [$out + 0xd0] ! k[52, 53]
  395. stx %o5, [$out + 0xd8] ! k[54, 55]
  396. ldx [$out + 0x30], %o4 ! k[12, 13]
  397. ldx [$out + 0x38], %o5 ! k[14, 15]
  398. `&ROTL128(15)`
  399. stx %o4, [$out + 0x30] ! k[12, 13]
  400. stx %o5, [$out + 0x38] ! k[14, 15]
  401. `&ROTL128(30)`
  402. stx %o4, [$out + 0x70] ! k[28, 29]
  403. stx %o5, [$out + 0x78] ! k[30, 31]
  404. srlx %o4, 32, %g4
  405. srlx %o5, 32, %g5
  406. st %o4, [$out + 0xc0] ! k[48]
  407. st %g5, [$out + 0xc4] ! k[49]
  408. st %o5, [$out + 0xc8] ! k[50]
  409. st %g4, [$out + 0xcc] ! k[51]
  410. `&ROTL128(49)`
  411. stx %o4, [$out + 0xe0] ! k[56, 57]
  412. stx %o5, [$out + 0xe8] ! k[58, 59]
  413. movdtox %f28, %o4 ! k[ 0, 1]
  414. movdtox %f30, %o5 ! k[ 2, 3]
  415. `&ROTL128(45)`
  416. stx %o4, [$out + 0x60] ! k[24, 25]
  417. stx %o5, [$out + 0x68] ! k[26, 27]
  418. `&ROTL128(15)`
  419. stx %o4, [$out + 0x80] ! k[32, 33]
  420. stx %o5, [$out + 0x88] ! k[34, 35]
  421. `&ROTL128(17)`
  422. stx %o4, [$out + 0xb0] ! k[44, 45]
  423. stx %o5, [$out + 0xb8] ! k[46, 47]
  424. `&ROTL128(34)`
  425. stx %o4, [$out + 0xf0] ! k[60, 61]
  426. stx %o5, [$out + 0xf8] ! k[62, 63]
  427. mov 4, $tmp
  428. st $tmp, [$out + 0x110]
  429. retl
  430. xor %o0, %o0, %o0
  431. .type cmll_t4_set_key,#function
  432. .size cmll_t4_set_key,.-cmll_t4_set_key
  433. .align 32
  434. SIGMA:
  435. .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2
  436. .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c
  437. .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd
  438. .type SIGMA,#object
  439. .size SIGMA,.-SIGMA
  440. .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov"
  441. ___
  442. }
  443. {{{
  444. my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  445. my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
  446. $code.=<<___;
  447. .align 32
  448. _cmll128_load_enckey:
  449. ldx [$key + 0], %g4
  450. ldx [$key + 8], %g5
  451. ___
  452. for ($i=2; $i<26;$i++) { # load key schedule
  453. $code.=<<___;
  454. ldd [$key + `8*$i`], %f`12+2*$i`
  455. ___
  456. }
  457. $code.=<<___;
  458. retl
  459. nop
  460. .type _cmll128_load_enckey,#function
  461. .size _cmll128_load_enckey,.-_cmll128_load_enckey
  462. _cmll256_load_enckey=_cmll128_load_enckey
  463. .align 32
  464. _cmll256_load_deckey:
  465. ldd [$key + 64], %f62
  466. ldd [$key + 72], %f60
  467. b .Load_deckey
  468. add $key, 64, $key
  469. _cmll128_load_deckey:
  470. ldd [$key + 0], %f60
  471. ldd [$key + 8], %f62
  472. .Load_deckey:
  473. ___
  474. for ($i=2; $i<24;$i++) { # load key schedule
  475. $code.=<<___;
  476. ldd [$key + `8*$i`], %f`62-2*$i`
  477. ___
  478. }
  479. $code.=<<___;
  480. ldx [$key + 192], %g4
  481. retl
  482. ldx [$key + 200], %g5
  483. .type _cmll256_load_deckey,#function
  484. .size _cmll256_load_deckey,.-_cmll256_load_deckey
  485. .align 32
  486. _cmll128_encrypt_1x:
  487. ___
  488. for ($i=0; $i<3; $i++) {
  489. $code.=<<___;
  490. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  491. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  492. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  493. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  494. ___
  495. $code.=<<___ if ($i<2);
  496. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  497. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  498. camellia_fl %f`16+16*$i+12`, %f0, %f0
  499. camellia_fli %f`16+16*$i+14`, %f2, %f2
  500. ___
  501. }
  502. $code.=<<___;
  503. camellia_f %f56, %f2, %f0, %f4
  504. camellia_f %f58, %f0, %f4, %f2
  505. fxor %f60, %f4, %f0
  506. retl
  507. fxor %f62, %f2, %f2
  508. .type _cmll128_encrypt_1x,#function
  509. .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x
  510. _cmll128_decrypt_1x=_cmll128_encrypt_1x
  511. .align 32
  512. _cmll128_encrypt_2x:
  513. ___
  514. for ($i=0; $i<3; $i++) {
  515. $code.=<<___;
  516. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  517. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  518. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  519. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  520. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  521. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  522. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  523. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  524. ___
  525. $code.=<<___ if ($i<2);
  526. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  527. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  528. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  529. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  530. camellia_fl %f`16+16*$i+12`, %f0, %f0
  531. camellia_fl %f`16+16*$i+12`, %f4, %f4
  532. camellia_fli %f`16+16*$i+14`, %f2, %f2
  533. camellia_fli %f`16+16*$i+14`, %f6, %f6
  534. ___
  535. }
  536. $code.=<<___;
  537. camellia_f %f56, %f2, %f0, %f8
  538. camellia_f %f56, %f6, %f4, %f10
  539. camellia_f %f58, %f0, %f8, %f2
  540. camellia_f %f58, %f4, %f10, %f6
  541. fxor %f60, %f8, %f0
  542. fxor %f60, %f10, %f4
  543. fxor %f62, %f2, %f2
  544. retl
  545. fxor %f62, %f6, %f6
  546. .type _cmll128_encrypt_2x,#function
  547. .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x
  548. _cmll128_decrypt_2x=_cmll128_encrypt_2x
  549. .align 32
  550. _cmll256_encrypt_1x:
  551. camellia_f %f16, %f2, %f0, %f2
  552. camellia_f %f18, %f0, %f2, %f0
  553. ldd [$key + 208], %f16
  554. ldd [$key + 216], %f18
  555. camellia_f %f20, %f2, %f0, %f2
  556. camellia_f %f22, %f0, %f2, %f0
  557. ldd [$key + 224], %f20
  558. ldd [$key + 232], %f22
  559. camellia_f %f24, %f2, %f0, %f2
  560. camellia_f %f26, %f0, %f2, %f0
  561. ldd [$key + 240], %f24
  562. ldd [$key + 248], %f26
  563. camellia_fl %f28, %f0, %f0
  564. camellia_fli %f30, %f2, %f2
  565. ldd [$key + 256], %f28
  566. ldd [$key + 264], %f30
  567. ___
  568. for ($i=1; $i<3; $i++) {
  569. $code.=<<___;
  570. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  571. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  572. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  573. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  574. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  575. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  576. camellia_fl %f`16+16*$i+12`, %f0, %f0
  577. camellia_fli %f`16+16*$i+14`, %f2, %f2
  578. ___
  579. }
  580. $code.=<<___;
  581. camellia_f %f16, %f2, %f0, %f2
  582. camellia_f %f18, %f0, %f2, %f0
  583. ldd [$key + 16], %f16
  584. ldd [$key + 24], %f18
  585. camellia_f %f20, %f2, %f0, %f2
  586. camellia_f %f22, %f0, %f2, %f0
  587. ldd [$key + 32], %f20
  588. ldd [$key + 40], %f22
  589. camellia_f %f24, %f2, %f0, %f4
  590. camellia_f %f26, %f0, %f4, %f2
  591. ldd [$key + 48], %f24
  592. ldd [$key + 56], %f26
  593. fxor %f28, %f4, %f0
  594. fxor %f30, %f2, %f2
  595. ldd [$key + 64], %f28
  596. retl
  597. ldd [$key + 72], %f30
  598. .type _cmll256_encrypt_1x,#function
  599. .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x
  600. .align 32
  601. _cmll256_encrypt_2x:
  602. camellia_f %f16, %f2, %f0, %f2
  603. camellia_f %f16, %f6, %f4, %f6
  604. camellia_f %f18, %f0, %f2, %f0
  605. camellia_f %f18, %f4, %f6, %f4
  606. ldd [$key + 208], %f16
  607. ldd [$key + 216], %f18
  608. camellia_f %f20, %f2, %f0, %f2
  609. camellia_f %f20, %f6, %f4, %f6
  610. camellia_f %f22, %f0, %f2, %f0
  611. camellia_f %f22, %f4, %f6, %f4
  612. ldd [$key + 224], %f20
  613. ldd [$key + 232], %f22
  614. camellia_f %f24, %f2, %f0, %f2
  615. camellia_f %f24, %f6, %f4, %f6
  616. camellia_f %f26, %f0, %f2, %f0
  617. camellia_f %f26, %f4, %f6, %f4
  618. ldd [$key + 240], %f24
  619. ldd [$key + 248], %f26
  620. camellia_fl %f28, %f0, %f0
  621. camellia_fl %f28, %f4, %f4
  622. camellia_fli %f30, %f2, %f2
  623. camellia_fli %f30, %f6, %f6
  624. ldd [$key + 256], %f28
  625. ldd [$key + 264], %f30
  626. ___
  627. for ($i=1; $i<3; $i++) {
  628. $code.=<<___;
  629. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  630. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  631. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  632. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  633. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  634. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  635. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  636. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  637. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  638. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  639. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  640. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  641. camellia_fl %f`16+16*$i+12`, %f0, %f0
  642. camellia_fl %f`16+16*$i+12`, %f4, %f4
  643. camellia_fli %f`16+16*$i+14`, %f2, %f2
  644. camellia_fli %f`16+16*$i+14`, %f6, %f6
  645. ___
  646. }
  647. $code.=<<___;
  648. camellia_f %f16, %f2, %f0, %f2
  649. camellia_f %f16, %f6, %f4, %f6
  650. camellia_f %f18, %f0, %f2, %f0
  651. camellia_f %f18, %f4, %f6, %f4
  652. ldd [$key + 16], %f16
  653. ldd [$key + 24], %f18
  654. camellia_f %f20, %f2, %f0, %f2
  655. camellia_f %f20, %f6, %f4, %f6
  656. camellia_f %f22, %f0, %f2, %f0
  657. camellia_f %f22, %f4, %f6, %f4
  658. ldd [$key + 32], %f20
  659. ldd [$key + 40], %f22
  660. camellia_f %f24, %f2, %f0, %f8
  661. camellia_f %f24, %f6, %f4, %f10
  662. camellia_f %f26, %f0, %f8, %f2
  663. camellia_f %f26, %f4, %f10, %f6
  664. ldd [$key + 48], %f24
  665. ldd [$key + 56], %f26
  666. fxor %f28, %f8, %f0
  667. fxor %f28, %f10, %f4
  668. fxor %f30, %f2, %f2
  669. fxor %f30, %f6, %f6
  670. ldd [$key + 64], %f28
  671. retl
  672. ldd [$key + 72], %f30
  673. .type _cmll256_encrypt_2x,#function
  674. .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x
  675. .align 32
  676. _cmll256_decrypt_1x:
  677. camellia_f %f16, %f2, %f0, %f2
  678. camellia_f %f18, %f0, %f2, %f0
  679. ldd [$key - 8], %f16
  680. ldd [$key - 16], %f18
  681. camellia_f %f20, %f2, %f0, %f2
  682. camellia_f %f22, %f0, %f2, %f0
  683. ldd [$key - 24], %f20
  684. ldd [$key - 32], %f22
  685. camellia_f %f24, %f2, %f0, %f2
  686. camellia_f %f26, %f0, %f2, %f0
  687. ldd [$key - 40], %f24
  688. ldd [$key - 48], %f26
  689. camellia_fl %f28, %f0, %f0
  690. camellia_fli %f30, %f2, %f2
  691. ldd [$key - 56], %f28
  692. ldd [$key - 64], %f30
  693. ___
  694. for ($i=1; $i<3; $i++) {
  695. $code.=<<___;
  696. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  697. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  698. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  699. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  700. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  701. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  702. camellia_fl %f`16+16*$i+12`, %f0, %f0
  703. camellia_fli %f`16+16*$i+14`, %f2, %f2
  704. ___
  705. }
  706. $code.=<<___;
  707. camellia_f %f16, %f2, %f0, %f2
  708. camellia_f %f18, %f0, %f2, %f0
  709. ldd [$key + 184], %f16
  710. ldd [$key + 176], %f18
  711. camellia_f %f20, %f2, %f0, %f2
  712. camellia_f %f22, %f0, %f2, %f0
  713. ldd [$key + 168], %f20
  714. ldd [$key + 160], %f22
  715. camellia_f %f24, %f2, %f0, %f4
  716. camellia_f %f26, %f0, %f4, %f2
  717. ldd [$key + 152], %f24
  718. ldd [$key + 144], %f26
  719. fxor %f30, %f4, %f0
  720. fxor %f28, %f2, %f2
  721. ldd [$key + 136], %f28
  722. retl
  723. ldd [$key + 128], %f30
  724. .type _cmll256_decrypt_1x,#function
  725. .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x
  726. .align 32
  727. _cmll256_decrypt_2x:
  728. camellia_f %f16, %f2, %f0, %f2
  729. camellia_f %f16, %f6, %f4, %f6
  730. camellia_f %f18, %f0, %f2, %f0
  731. camellia_f %f18, %f4, %f6, %f4
  732. ldd [$key - 8], %f16
  733. ldd [$key - 16], %f18
  734. camellia_f %f20, %f2, %f0, %f2
  735. camellia_f %f20, %f6, %f4, %f6
  736. camellia_f %f22, %f0, %f2, %f0
  737. camellia_f %f22, %f4, %f6, %f4
  738. ldd [$key - 24], %f20
  739. ldd [$key - 32], %f22
  740. camellia_f %f24, %f2, %f0, %f2
  741. camellia_f %f24, %f6, %f4, %f6
  742. camellia_f %f26, %f0, %f2, %f0
  743. camellia_f %f26, %f4, %f6, %f4
  744. ldd [$key - 40], %f24
  745. ldd [$key - 48], %f26
  746. camellia_fl %f28, %f0, %f0
  747. camellia_fl %f28, %f4, %f4
  748. camellia_fli %f30, %f2, %f2
  749. camellia_fli %f30, %f6, %f6
  750. ldd [$key - 56], %f28
  751. ldd [$key - 64], %f30
  752. ___
  753. for ($i=1; $i<3; $i++) {
  754. $code.=<<___;
  755. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  756. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  757. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  758. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  759. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  760. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  761. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  762. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  763. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  764. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  765. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  766. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  767. camellia_fl %f`16+16*$i+12`, %f0, %f0
  768. camellia_fl %f`16+16*$i+12`, %f4, %f4
  769. camellia_fli %f`16+16*$i+14`, %f2, %f2
  770. camellia_fli %f`16+16*$i+14`, %f6, %f6
  771. ___
  772. }
  773. $code.=<<___;
  774. camellia_f %f16, %f2, %f0, %f2
  775. camellia_f %f16, %f6, %f4, %f6
  776. camellia_f %f18, %f0, %f2, %f0
  777. camellia_f %f18, %f4, %f6, %f4
  778. ldd [$key + 184], %f16
  779. ldd [$key + 176], %f18
  780. camellia_f %f20, %f2, %f0, %f2
  781. camellia_f %f20, %f6, %f4, %f6
  782. camellia_f %f22, %f0, %f2, %f0
  783. camellia_f %f22, %f4, %f6, %f4
  784. ldd [$key + 168], %f20
  785. ldd [$key + 160], %f22
  786. camellia_f %f24, %f2, %f0, %f8
  787. camellia_f %f24, %f6, %f4, %f10
  788. camellia_f %f26, %f0, %f8, %f2
  789. camellia_f %f26, %f4, %f10, %f6
  790. ldd [$key + 152], %f24
  791. ldd [$key + 144], %f26
  792. fxor %f30, %f8, %f0
  793. fxor %f30, %f10, %f4
  794. fxor %f28, %f2, %f2
  795. fxor %f28, %f6, %f6
  796. ldd [$key + 136], %f28
  797. retl
  798. ldd [$key + 128], %f30
  799. .type _cmll256_decrypt_2x,#function
  800. .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x
  801. ___
  802. &alg_cbc_encrypt_implement("cmll",128);
  803. &alg_cbc_encrypt_implement("cmll",256);
  804. &alg_cbc_decrypt_implement("cmll",128);
  805. &alg_cbc_decrypt_implement("cmll",256);
  806. if ($::evp) {
  807. &alg_ctr32_implement("cmll",128);
  808. &alg_ctr32_implement("cmll",256);
  809. }
  810. }}}
  811. if (!$::evp) {
  812. $code.=<<___;
  813. .global Camellia_encrypt
  814. Camellia_encrypt=cmll_t4_encrypt
  815. .global Camellia_decrypt
  816. Camellia_decrypt=cmll_t4_decrypt
  817. .global Camellia_set_key
  818. .align 32
  819. Camellia_set_key:
  820. andcc %o2, 7, %g0 ! double-check alignment
  821. bnz,a,pn %icc, 1f
  822. mov -1, %o0
  823. brz,a,pn %o0, 1f
  824. mov -1, %o0
  825. brz,a,pn %o2, 1f
  826. mov -1, %o0
  827. andncc %o1, 0x1c0, %g0
  828. bnz,a,pn %icc, 1f
  829. mov -2, %o0
  830. cmp %o1, 128
  831. bl,a,pn %icc, 1f
  832. mov -2, %o0
  833. b cmll_t4_set_key
  834. nop
  835. 1: retl
  836. nop
  837. .type Camellia_set_key,#function
  838. .size Camellia_set_key,.-Camellia_set_key
  839. ___
  840. my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
  841. $code.=<<___;
  842. .globl Camellia_cbc_encrypt
  843. .align 32
  844. Camellia_cbc_encrypt:
  845. ld [$key + 272], %g1
  846. nop
  847. brz $enc, .Lcbc_decrypt
  848. cmp %g1, 3
  849. be,pt %icc, cmll128_t4_cbc_encrypt
  850. nop
  851. ba cmll256_t4_cbc_encrypt
  852. nop
  853. .Lcbc_decrypt:
  854. be,pt %icc, cmll128_t4_cbc_decrypt
  855. nop
  856. ba cmll256_t4_cbc_decrypt
  857. nop
  858. .type Camellia_cbc_encrypt,#function
  859. .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
  860. ___
  861. }
  862. &emit_assembler();
  863. close STDOUT or die "error closing STDOUT: $!";