cmllt4-sparcv9.pl 23 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941
  1. #! /usr/bin/env perl
  2. # Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by David S. Miller and Andy Polyakov.
  10. # The module is licensed under 2-clause BSD
  11. # license. October 2012. All rights reserved.
  12. # ====================================================================
  13. ######################################################################
  14. # Camellia for SPARC T4.
  15. #
  16. # As with AES, the results below [for aligned data] are virtually identical
  17. # to the critical path lengths for a 3-cycle instruction latency:
  18. #
  19. # 128-bit key 192/256-
  20. # CBC encrypt 4.14/4.21(*) 5.46/5.52
  21. # (*) numbers after slash are for
  22. # misaligned data.
  23. #
  24. # As with Intel AES-NI, the question is whether it's possible to improve
  25. # performance of parallelizable modes by interleaving round
  26. # instructions. In Camellia every instruction depends on the
  27. # previous one, which means that there is room for 2 additional ones
  28. # in between two dependent instructions. Can we expect 3x performance improvement?
  29. # At least one can argue that it should be possible to break 2x
  30. # barrier... For some reason not even 2x appears to be possible:
  31. #
  32. # 128-bit key 192/256-
  33. # CBC decrypt 2.21/2.74 2.99/3.40
  34. # CTR 2.15/2.68(*) 2.93/3.34
  35. # (*) numbers after slash are for
  36. # misaligned data.
  37. #
  38. # This is for 2x interleave. But compared to 1x interleave CBC decrypt
  39. # improved by ... 0% for 128-bit key, and 11% for 192/256-bit one.
  40. # So that out-of-order execution logic can take non-interleaved code
  41. # to 1.87x, but can't take 2x interleaved one any further. There
  42. # surely is some explanation... As result 3x interleave was not even
  43. # attempted. Instead an effort was made to share specific modes
  44. # implementations with AES module (therefore sparcv9_modes.pl).
  45. #
  46. # To anchor to something else, software C implementation processes
  47. # one byte in 38 cycles with 128-bit key on same processor.
  # Work out the directory this script lives in (including the trailing path
  # separator) so the shared perlasm helpers can be found relative to it.
  # When $0 carries no directory component, $dir is the empty string.
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
  push(@INC,"${dir}","${dir}../../perlasm");
  require "sparcv9_modes.pl";

  # The last command-line argument, if present, names the output file.
  # Three-argument open keeps the name from being parsed for mode characters,
  # and a failed open is fatal rather than silently leaving the generated
  # assembly on the original STDOUT (matching the checked close at the end of
  # this script).
  $output = pop;
  if ($output) {
      open(STDOUT, '>', $output) or die "can't open $output: $!";
  }

  $::evp=1; # if $::evp is set to 0, script generates module with
  # Camellia_[en|de]crypt, Camellia_set_key and Camellia_cbc_encrypt
  # entry points. These are fully compatible with openssl/camellia.h.
  55. ######################################################################
  56. # single-round subroutines
  57. #
  58. {
  # Generates the two single-block entry points, cmll_t4_encrypt and
  # cmll_t4_decrypt.  Register names are bound by the map below:
  #   $inp=%o0  $out=%o1  $key=%o2  $rounds=%o3  $tmp=%o4  $mask=%o5
  # Both routines:
  #   - read the 16-byte input as 64-bit words from the 8-byte-aligned
  #     address, loading a third word and shifting when ($inp & 7) != 0;
  #   - XOR the block with two 64-bit words of the key schedule, then loop
  #     over camellia_f/camellia_fl/camellia_fli with the loop count derived
  #     from the 32-bit word at key+272;
  #   - store with two std when $out is 8-byte aligned, otherwise with
  #     faligndata and the "stda ... 0xc0" partial stores.
  # Encrypt walks the schedule upwards starting at key+16; decrypt first adds
  # rounds*64 to $key and then reads subkeys at negative offsets.
  # Note the plain assignment (=) below: this statement starts the $code buffer.
  59. my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
  60. $code=<<___;
  61. #ifndef __ASSEMBLER__
  62. # define __ASSEMBLER__ 1
  63. #endif
  64. #include "crypto/sparc_arch.h"
  65. .text
  66. .globl cmll_t4_encrypt
  67. .align 32
  68. cmll_t4_encrypt:
  69. andcc $inp, 7, %g1 ! is input aligned?
  70. andn $inp, 7, $inp
  71. ldx [$key + 0], %g4
  72. ldx [$key + 8], %g5
  73. ldx [$inp + 0], %o4
  74. bz,pt %icc, 1f
  75. ldx [$inp + 8], %o5
  76. ldx [$inp + 16], $inp
  77. sll %g1, 3, %g1
  78. sub %g0, %g1, %o3
  79. sllx %o4, %g1, %o4
  80. sllx %o5, %g1, %g1
  81. srlx %o5, %o3, %o5
  82. srlx $inp, %o3, %o3
  83. or %o5, %o4, %o4
  84. or %o3, %g1, %o5
  85. 1:
  86. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  87. ldd [$key + 16], %f12
  88. ldd [$key + 24], %f14
  89. xor %g4, %o4, %o4
  90. xor %g5, %o5, %o5
  91. ldd [$key + 32], %f16
  92. ldd [$key + 40], %f18
  93. movxtod %o4, %f0
  94. movxtod %o5, %f2
  95. ldd [$key + 48], %f20
  96. ldd [$key + 56], %f22
  97. sub $rounds, 1, $rounds
  98. ldd [$key + 64], %f24
  99. ldd [$key + 72], %f26
  100. add $key, 80, $key
  101. .Lenc:
  102. camellia_f %f12, %f2, %f0, %f2
  103. ldd [$key + 0], %f12
  104. sub $rounds,1,$rounds
  105. camellia_f %f14, %f0, %f2, %f0
  106. ldd [$key + 8], %f14
  107. camellia_f %f16, %f2, %f0, %f2
  108. ldd [$key + 16], %f16
  109. camellia_f %f18, %f0, %f2, %f0
  110. ldd [$key + 24], %f18
  111. camellia_f %f20, %f2, %f0, %f2
  112. ldd [$key + 32], %f20
  113. camellia_f %f22, %f0, %f2, %f0
  114. ldd [$key + 40], %f22
  115. camellia_fl %f24, %f0, %f0
  116. ldd [$key + 48], %f24
  117. camellia_fli %f26, %f2, %f2
  118. ldd [$key + 56], %f26
  119. brnz,pt $rounds, .Lenc
  120. add $key, 64, $key
  121. andcc $out, 7, $tmp ! is output aligned?
  122. camellia_f %f12, %f2, %f0, %f2
  123. camellia_f %f14, %f0, %f2, %f0
  124. camellia_f %f16, %f2, %f0, %f2
  125. camellia_f %f18, %f0, %f2, %f0
  126. camellia_f %f20, %f2, %f0, %f4
  127. camellia_f %f22, %f0, %f4, %f2
  128. fxor %f24, %f4, %f0
  129. fxor %f26, %f2, %f2
  130. bnz,pn %icc, 2f
  131. nop
  132. std %f0, [$out + 0]
  133. retl
  134. std %f2, [$out + 8]
  135. 2: alignaddrl $out, %g0, $out
  136. mov 0xff, $mask
  137. srl $mask, $tmp, $mask
  138. faligndata %f0, %f0, %f4
  139. faligndata %f0, %f2, %f6
  140. faligndata %f2, %f2, %f8
  141. stda %f4, [$out + $mask]0xc0 ! partial store
  142. std %f6, [$out + 8]
  143. add $out, 16, $out
  144. orn %g0, $mask, $mask
  145. retl
  146. stda %f8, [$out + $mask]0xc0 ! partial store
  147. .type cmll_t4_encrypt,#function
  148. .size cmll_t4_encrypt,.-cmll_t4_encrypt
  149. .globl cmll_t4_decrypt
  150. .align 32
  151. cmll_t4_decrypt:
  152. ld [$key + 272], $rounds ! grandRounds, 3 or 4
  153. andcc $inp, 7, %g1 ! is input aligned?
  154. andn $inp, 7, $inp
  155. sll $rounds, 6, $rounds
  156. add $rounds, $key, $key
  157. ldx [$inp + 0], %o4
  158. bz,pt %icc, 1f
  159. ldx [$inp + 8], %o5
  160. ldx [$inp + 16], $inp
  161. sll %g1, 3, %g1
  162. sub %g0, %g1, %g4
  163. sllx %o4, %g1, %o4
  164. sllx %o5, %g1, %g1
  165. srlx %o5, %g4, %o5
  166. srlx $inp, %g4, %g4
  167. or %o5, %o4, %o4
  168. or %g4, %g1, %o5
  169. 1:
  170. ldx [$key + 0], %g4
  171. ldx [$key + 8], %g5
  172. ldd [$key - 8], %f12
  173. ldd [$key - 16], %f14
  174. xor %g4, %o4, %o4
  175. xor %g5, %o5, %o5
  176. ldd [$key - 24], %f16
  177. ldd [$key - 32], %f18
  178. movxtod %o4, %f0
  179. movxtod %o5, %f2
  180. ldd [$key - 40], %f20
  181. ldd [$key - 48], %f22
  182. sub $rounds, 64, $rounds
  183. ldd [$key - 56], %f24
  184. ldd [$key - 64], %f26
  185. sub $key, 64, $key
  186. .Ldec:
  187. camellia_f %f12, %f2, %f0, %f2
  188. ldd [$key - 8], %f12
  189. sub $rounds, 64, $rounds
  190. camellia_f %f14, %f0, %f2, %f0
  191. ldd [$key - 16], %f14
  192. camellia_f %f16, %f2, %f0, %f2
  193. ldd [$key - 24], %f16
  194. camellia_f %f18, %f0, %f2, %f0
  195. ldd [$key - 32], %f18
  196. camellia_f %f20, %f2, %f0, %f2
  197. ldd [$key - 40], %f20
  198. camellia_f %f22, %f0, %f2, %f0
  199. ldd [$key - 48], %f22
  200. camellia_fl %f24, %f0, %f0
  201. ldd [$key - 56], %f24
  202. camellia_fli %f26, %f2, %f2
  203. ldd [$key - 64], %f26
  204. brnz,pt $rounds, .Ldec
  205. sub $key, 64, $key
  206. andcc $out, 7, $tmp ! is output aligned?
  207. camellia_f %f12, %f2, %f0, %f2
  208. camellia_f %f14, %f0, %f2, %f0
  209. camellia_f %f16, %f2, %f0, %f2
  210. camellia_f %f18, %f0, %f2, %f0
  211. camellia_f %f20, %f2, %f0, %f4
  212. camellia_f %f22, %f0, %f4, %f2
  213. fxor %f26, %f4, %f0
  214. fxor %f24, %f2, %f2
  215. bnz,pn %icc, 2f
  216. nop
  217. std %f0, [$out + 0]
  218. retl
  219. std %f2, [$out + 8]
  220. 2: alignaddrl $out, %g0, $out
  221. mov 0xff, $mask
  222. srl $mask, $tmp, $mask
  223. faligndata %f0, %f0, %f4
  224. faligndata %f0, %f2, %f6
  225. faligndata %f2, %f2, %f8
  226. stda %f4, [$out + $mask]0xc0 ! partial store
  227. std %f6, [$out + 8]
  228. add $out, 16, $out
  229. orn %g0, $mask, $mask
  230. retl
  231. stda %f8, [$out + $mask]0xc0 ! partial store
  232. .type cmll_t4_decrypt,#function
  233. .size cmll_t4_decrypt,.-cmll_t4_decrypt
  234. ___
  235. }
  236. ######################################################################
  237. # key setup subroutines
  238. #
  239. {
  # ROTL128($rot)
  #
  # Returns the SPARC instruction sequence (as text) that rotates the 128-bit
  # value held in %o4 (more significant half) and %o5 (less significant half)
  # left by $rot bits.  %g4 and %g5 are used as scratch and are overwritten.
  #
  # The six instructions are joined with "\n\t" and the last one carries no
  # trailing newline.  The right-shift counts are emitted as the literal text
  # "64-$rot" and left for the assembler to evaluate.
  #
  # $rot must be an integer in 1..63.  A count of 0 would emit a shift by 64
  # and a count of 64 or more a negative shift, neither of which yields a
  # rotate, so such values are rejected at generation time instead of
  # producing wrong assembly.
  sub ROTL128 {
      my $rot = shift;

      die "ROTL128: rotation count must be an integer in 1..63\n"
          unless defined($rot)
              && $rot =~ /\A\d+\z/
              && $rot >= 1
              && $rot <= 63;

      return join("\n\t",
          "srlx %o4, 64-$rot, %g4",    # bits leaving the top of the high half
          "sllx %o4, $rot, %o4",
          "srlx %o5, 64-$rot, %g5",    # bits leaving the top of the low half
          "sllx %o5, $rot, %o5",
          "or %o4, %g5, %o4",          # low half's carry-out enters the high half
          "or %o5, %g4, %o5",          # high half's carry-out wraps into the low half
      );
  }
  # Generates cmll_t4_set_key followed by the SIGMA constant table it loads
  # from.  Only four of the six names produced by the map are bound:
  #   $inp=%o0 (input key bytes)  $bits=%o1  $out=%o2 (key schedule)  $tmp=%o3
  # $bits is compared with 192: less-than takes .L128, equal takes .L192 and
  # anything else falls through to the 256-bit loads.  A misaligned $inp is
  # handled with alignaddr/faligndata.
  # The routine stores 3 (path for $bits < 192) or 4 (.L256key path) at
  # $out+0x110, i.e. offset 272, and returns 0 in %o0.
  # The `&ROTL128(n)` spans are kept verbatim in $code at this point;
  # presumably they are evaluated when emit_assembler() post-processes $code
  # (that routine is not visible in this file).
  249. my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
  250. $code.=<<___;
  251. .globl cmll_t4_set_key
  252. .align 32
  253. cmll_t4_set_key:
  254. and $inp, 7, $tmp
  255. alignaddr $inp, %g0, $inp
  256. cmp $bits, 192
  257. ldd [$inp + 0], %f0
  258. bl,pt %icc,.L128
  259. ldd [$inp + 8], %f2
  260. be,pt %icc,.L192
  261. ldd [$inp + 16], %f4
  262. brz,pt $tmp, .L256aligned
  263. ldd [$inp + 24], %f6
  264. ldd [$inp + 32], %f8
  265. faligndata %f0, %f2, %f0
  266. faligndata %f2, %f4, %f2
  267. faligndata %f4, %f6, %f4
  268. b .L256aligned
  269. faligndata %f6, %f8, %f6
  270. .align 16
  271. .L192:
  272. brz,a,pt $tmp, .L256aligned
  273. fnot2 %f4, %f6
  274. ldd [$inp + 24], %f6
  275. nop
  276. faligndata %f0, %f2, %f0
  277. faligndata %f2, %f4, %f2
  278. faligndata %f4, %f6, %f4
  279. fnot2 %f4, %f6
  280. .L256aligned:
  281. std %f0, [$out + 0] ! k[0, 1]
  282. fsrc2 %f0, %f28
  283. std %f2, [$out + 8] ! k[2, 3]
  284. fsrc2 %f2, %f30
  285. fxor %f4, %f0, %f0
  286. b .L128key
  287. fxor %f6, %f2, %f2
  288. .align 16
  289. .L128:
  290. brz,pt $tmp, .L128aligned
  291. nop
  292. ldd [$inp + 16], %f4
  293. nop
  294. faligndata %f0, %f2, %f0
  295. faligndata %f2, %f4, %f2
  296. .L128aligned:
  297. std %f0, [$out + 0] ! k[0, 1]
  298. fsrc2 %f0, %f28
  299. std %f2, [$out + 8] ! k[2, 3]
  300. fsrc2 %f2, %f30
  301. .L128key:
  302. mov %o7, %o5
  303. 1: call .+8
  304. add %o7, SIGMA-1b, %o4
  305. mov %o5, %o7
  306. ldd [%o4 + 0], %f16
  307. ldd [%o4 + 8], %f18
  308. ldd [%o4 + 16], %f20
  309. ldd [%o4 + 24], %f22
  310. camellia_f %f16, %f2, %f0, %f2
  311. camellia_f %f18, %f0, %f2, %f0
  312. fxor %f28, %f0, %f0
  313. fxor %f30, %f2, %f2
  314. camellia_f %f20, %f2, %f0, %f2
  315. camellia_f %f22, %f0, %f2, %f0
  316. bge,pn %icc, .L256key
  317. nop
  318. std %f0, [$out + 0x10] ! k[ 4, 5]
  319. std %f2, [$out + 0x18] ! k[ 6, 7]
  320. movdtox %f0, %o4
  321. movdtox %f2, %o5
  322. `&ROTL128(15)`
  323. stx %o4, [$out + 0x30] ! k[12, 13]
  324. stx %o5, [$out + 0x38] ! k[14, 15]
  325. `&ROTL128(15)`
  326. stx %o4, [$out + 0x40] ! k[16, 17]
  327. stx %o5, [$out + 0x48] ! k[18, 19]
  328. `&ROTL128(15)`
  329. stx %o4, [$out + 0x60] ! k[24, 25]
  330. `&ROTL128(15)`
  331. stx %o4, [$out + 0x70] ! k[28, 29]
  332. stx %o5, [$out + 0x78] ! k[30, 31]
  333. `&ROTL128(34)`
  334. stx %o4, [$out + 0xa0] ! k[40, 41]
  335. stx %o5, [$out + 0xa8] ! k[42, 43]
  336. `&ROTL128(17)`
  337. stx %o4, [$out + 0xc0] ! k[48, 49]
  338. stx %o5, [$out + 0xc8] ! k[50, 51]
  339. movdtox %f28, %o4 ! k[ 0, 1]
  340. movdtox %f30, %o5 ! k[ 2, 3]
  341. `&ROTL128(15)`
  342. stx %o4, [$out + 0x20] ! k[ 8, 9]
  343. stx %o5, [$out + 0x28] ! k[10, 11]
  344. `&ROTL128(30)`
  345. stx %o4, [$out + 0x50] ! k[20, 21]
  346. stx %o5, [$out + 0x58] ! k[22, 23]
  347. `&ROTL128(15)`
  348. stx %o5, [$out + 0x68] ! k[26, 27]
  349. `&ROTL128(17)`
  350. stx %o4, [$out + 0x80] ! k[32, 33]
  351. stx %o5, [$out + 0x88] ! k[34, 35]
  352. `&ROTL128(17)`
  353. stx %o4, [$out + 0x90] ! k[36, 37]
  354. stx %o5, [$out + 0x98] ! k[38, 39]
  355. `&ROTL128(17)`
  356. stx %o4, [$out + 0xb0] ! k[44, 45]
  357. stx %o5, [$out + 0xb8] ! k[46, 47]
  358. mov 3, $tmp
  359. st $tmp, [$out + 0x110]
  360. retl
  361. xor %o0, %o0, %o0
  362. .align 16
  363. .L256key:
  364. ldd [%o4 + 32], %f24
  365. ldd [%o4 + 40], %f26
  366. std %f0, [$out + 0x30] ! k[12, 13]
  367. std %f2, [$out + 0x38] ! k[14, 15]
  368. fxor %f4, %f0, %f0
  369. fxor %f6, %f2, %f2
  370. camellia_f %f24, %f2, %f0, %f2
  371. camellia_f %f26, %f0, %f2, %f0
  372. std %f0, [$out + 0x10] ! k[ 4, 5]
  373. std %f2, [$out + 0x18] ! k[ 6, 7]
  374. movdtox %f0, %o4
  375. movdtox %f2, %o5
  376. `&ROTL128(30)`
  377. stx %o4, [$out + 0x50] ! k[20, 21]
  378. stx %o5, [$out + 0x58] ! k[22, 23]
  379. `&ROTL128(30)`
  380. stx %o4, [$out + 0xa0] ! k[40, 41]
  381. stx %o5, [$out + 0xa8] ! k[42, 43]
  382. `&ROTL128(51)`
  383. stx %o4, [$out + 0x100] ! k[64, 65]
  384. stx %o5, [$out + 0x108] ! k[66, 67]
  385. movdtox %f4, %o4 ! k[ 8, 9]
  386. movdtox %f6, %o5 ! k[10, 11]
  387. `&ROTL128(15)`
  388. stx %o4, [$out + 0x20] ! k[ 8, 9]
  389. stx %o5, [$out + 0x28] ! k[10, 11]
  390. `&ROTL128(15)`
  391. stx %o4, [$out + 0x40] ! k[16, 17]
  392. stx %o5, [$out + 0x48] ! k[18, 19]
  393. `&ROTL128(30)`
  394. stx %o4, [$out + 0x90] ! k[36, 37]
  395. stx %o5, [$out + 0x98] ! k[38, 39]
  396. `&ROTL128(34)`
  397. stx %o4, [$out + 0xd0] ! k[52, 53]
  398. stx %o5, [$out + 0xd8] ! k[54, 55]
  399. ldx [$out + 0x30], %o4 ! k[12, 13]
  400. ldx [$out + 0x38], %o5 ! k[14, 15]
  401. `&ROTL128(15)`
  402. stx %o4, [$out + 0x30] ! k[12, 13]
  403. stx %o5, [$out + 0x38] ! k[14, 15]
  404. `&ROTL128(30)`
  405. stx %o4, [$out + 0x70] ! k[28, 29]
  406. stx %o5, [$out + 0x78] ! k[30, 31]
  407. srlx %o4, 32, %g4
  408. srlx %o5, 32, %g5
  409. st %o4, [$out + 0xc0] ! k[48]
  410. st %g5, [$out + 0xc4] ! k[49]
  411. st %o5, [$out + 0xc8] ! k[50]
  412. st %g4, [$out + 0xcc] ! k[51]
  413. `&ROTL128(49)`
  414. stx %o4, [$out + 0xe0] ! k[56, 57]
  415. stx %o5, [$out + 0xe8] ! k[58, 59]
  416. movdtox %f28, %o4 ! k[ 0, 1]
  417. movdtox %f30, %o5 ! k[ 2, 3]
  418. `&ROTL128(45)`
  419. stx %o4, [$out + 0x60] ! k[24, 25]
  420. stx %o5, [$out + 0x68] ! k[26, 27]
  421. `&ROTL128(15)`
  422. stx %o4, [$out + 0x80] ! k[32, 33]
  423. stx %o5, [$out + 0x88] ! k[34, 35]
  424. `&ROTL128(17)`
  425. stx %o4, [$out + 0xb0] ! k[44, 45]
  426. stx %o5, [$out + 0xb8] ! k[46, 47]
  427. `&ROTL128(34)`
  428. stx %o4, [$out + 0xf0] ! k[60, 61]
  429. stx %o5, [$out + 0xf8] ! k[62, 63]
  430. mov 4, $tmp
  431. st $tmp, [$out + 0x110]
  432. retl
  433. xor %o0, %o0, %o0
  434. .type cmll_t4_set_key,#function
  435. .size cmll_t4_set_key,.-cmll_t4_set_key
  436. .align 32
  437. SIGMA:
  438. .long 0xa09e667f, 0x3bcc908b, 0xb67ae858, 0x4caa73b2
  439. .long 0xc6ef372f, 0xe94f82be, 0x54ff53a5, 0xf1d36f1c
  440. .long 0x10e527fa, 0xde682d1d, 0xb05688c2, 0xb3e6c1fd
  441. .type SIGMA,#object
  442. .size SIGMA,.-SIGMA
  443. .asciz "Camellia for SPARC T4, David S. Miller, Andy Polyakov"
  444. ___
  445. }
  446. {{{
  # Helper routines (key-schedule loaders and 1x/2x round sequences) followed
  # by the calls that generate the CBC and CTR entry points.
  # $inp..$enc are bound to %i0..%i5 and $ileft..$ivoff to %l1..%l5 (the second
  # map yields %l1..%l7; the last two names are dropped).  Of these, only $key
  # (%i3) is referenced by the code in this section.
  447. my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  448. my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
  # _cmll128_load_enckey: the first two 64-bit schedule words go to %g4/%g5.
  449. $code.=<<___;
  450. .align 32
  451. _cmll128_load_enckey:
  452. ldx [$key + 0], %g4
  453. ldx [$key + 8], %g5
  454. ___
  # Schedule offsets 16..200 are loaded into %f16..%f62 in ascending order.
  455. for ($i=2; $i<26;$i++) { # load key schedule
  456. $code.=<<___;
  457. ldd [$key + `8*$i`], %f`12+2*$i`
  458. ___
  459. }
  # Close the encrypt loader (also reachable as _cmll256_load_enckey) and open
  # the decrypt loaders.  The 256-bit entry loads key+64/key+72 into %f62/%f60
  # and adds 64 to $key in the branch delay slot; the 128-bit entry loads
  # key+0/key+8 into %f60/%f62.  Both continue at .Load_deckey.
  460. $code.=<<___;
  461. retl
  462. nop
  463. .type _cmll128_load_enckey,#function
  464. .size _cmll128_load_enckey,.-_cmll128_load_enckey
  465. _cmll256_load_enckey=_cmll128_load_enckey
  466. .align 32
  467. _cmll256_load_deckey:
  468. ldd [$key + 64], %f62
  469. ldd [$key + 72], %f60
  470. b .Load_deckey
  471. add $key, 64, $key
  472. _cmll128_load_deckey:
  473. ldd [$key + 0], %f60
  474. ldd [$key + 8], %f62
  475. .Load_deckey:
  476. ___
  # Offsets 16..184 are loaded into %f58 down to %f16, i.e. in the opposite
  # register order to the encrypt loader.
  477. for ($i=2; $i<24;$i++) { # load key schedule
  478. $code.=<<___;
  479. ldd [$key + `8*$i`], %f`62-2*$i`
  480. ___
  481. }
  # Finish the decrypt loader with key+192/key+200 into %g4/%g5, then open
  # _cmll128_encrypt_1x.
  482. $code.=<<___;
  483. ldx [$key + 192], %g4
  484. retl
  485. ldx [$key + 200], %g5
  486. .type _cmll256_load_deckey,#function
  487. .size _cmll256_load_deckey,.-_cmll256_load_deckey
  488. .align 32
  489. _cmll128_encrypt_1x:
  490. ___
  # Three passes with subkeys starting at %f16, %f32 and %f48: four camellia_f
  # each, and on the first two passes two more camellia_f followed by
  # camellia_fl/camellia_fli.  The block lives in %f0/%f2.
  491. for ($i=0; $i<3; $i++) {
  492. $code.=<<___;
  493. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  494. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  495. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  496. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  497. ___
  498. $code.=<<___ if ($i<2);
  499. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  500. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  501. camellia_fl %f`16+16*$i+12`, %f0, %f0
  502. camellia_fli %f`16+16*$i+14`, %f2, %f2
  503. ___
  504. }
  # Last two rounds (%f56, %f58) and the final XOR with %f60/%f62.  The decrypt
  # name is an alias for the same code.  Then open _cmll128_encrypt_2x.
  505. $code.=<<___;
  506. camellia_f %f56, %f2, %f0, %f4
  507. camellia_f %f58, %f0, %f4, %f2
  508. fxor %f60, %f4, %f0
  509. retl
  510. fxor %f62, %f2, %f2
  511. .type _cmll128_encrypt_1x,#function
  512. .size _cmll128_encrypt_1x,.-_cmll128_encrypt_1x
  513. _cmll128_decrypt_1x=_cmll128_encrypt_1x
  514. .align 32
  515. _cmll128_encrypt_2x:
  516. ___
  # Same sequence as the 1x loop, with each instruction issued a second time
  # for a second block held in %f4/%f6.
  517. for ($i=0; $i<3; $i++) {
  518. $code.=<<___;
  519. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  520. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  521. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  522. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  523. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  524. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  525. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  526. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  527. ___
  528. $code.=<<___ if ($i<2);
  529. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  530. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  531. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  532. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  533. camellia_fl %f`16+16*$i+12`, %f0, %f0
  534. camellia_fl %f`16+16*$i+12`, %f4, %f4
  535. camellia_fli %f`16+16*$i+14`, %f2, %f2
  536. camellia_fli %f`16+16*$i+14`, %f6, %f6
  537. ___
  538. }
  # Tail of the 2x routine (%f8 and %f10 serve as temporaries for the two
  # blocks) and its decrypt alias, then the opening part of
  # _cmll256_encrypt_1x.  That routine overwrites %f16-%f30 with
  # key+208..key+264 as soon as each register pair has been used.
  539. $code.=<<___;
  540. camellia_f %f56, %f2, %f0, %f8
  541. camellia_f %f56, %f6, %f4, %f10
  542. camellia_f %f58, %f0, %f8, %f2
  543. camellia_f %f58, %f4, %f10, %f6
  544. fxor %f60, %f8, %f0
  545. fxor %f60, %f10, %f4
  546. fxor %f62, %f2, %f2
  547. retl
  548. fxor %f62, %f6, %f6
  549. .type _cmll128_encrypt_2x,#function
  550. .size _cmll128_encrypt_2x,.-_cmll128_encrypt_2x
  551. _cmll128_decrypt_2x=_cmll128_encrypt_2x
  552. .align 32
  553. _cmll256_encrypt_1x:
  554. camellia_f %f16, %f2, %f0, %f2
  555. camellia_f %f18, %f0, %f2, %f0
  556. ldd [$key + 208], %f16
  557. ldd [$key + 216], %f18
  558. camellia_f %f20, %f2, %f0, %f2
  559. camellia_f %f22, %f0, %f2, %f0
  560. ldd [$key + 224], %f20
  561. ldd [$key + 232], %f22
  562. camellia_f %f24, %f2, %f0, %f2
  563. camellia_f %f26, %f0, %f2, %f0
  564. ldd [$key + 240], %f24
  565. ldd [$key + 248], %f26
  566. camellia_fl %f28, %f0, %f0
  567. camellia_fli %f30, %f2, %f2
  568. ldd [$key + 256], %f28
  569. ldd [$key + 264], %f30
  570. ___
  # Two full passes with subkeys starting at %f32 and %f48: six camellia_f
  # followed by camellia_fl/camellia_fli each.
  571. for ($i=1; $i<3; $i++) {
  572. $code.=<<___;
  573. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  574. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  575. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  576. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  577. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  578. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  579. camellia_fl %f`16+16*$i+12`, %f0, %f0
  580. camellia_fli %f`16+16*$i+14`, %f2, %f2
  581. ___
  582. }
  # The final rounds use the reloaded %f16-%f30; each pair is then restored
  # from key+16..key+72 before returning.  After that, the opening part of
  # _cmll256_encrypt_2x follows the same pattern for two blocks.
  583. $code.=<<___;
  584. camellia_f %f16, %f2, %f0, %f2
  585. camellia_f %f18, %f0, %f2, %f0
  586. ldd [$key + 16], %f16
  587. ldd [$key + 24], %f18
  588. camellia_f %f20, %f2, %f0, %f2
  589. camellia_f %f22, %f0, %f2, %f0
  590. ldd [$key + 32], %f20
  591. ldd [$key + 40], %f22
  592. camellia_f %f24, %f2, %f0, %f4
  593. camellia_f %f26, %f0, %f4, %f2
  594. ldd [$key + 48], %f24
  595. ldd [$key + 56], %f26
  596. fxor %f28, %f4, %f0
  597. fxor %f30, %f2, %f2
  598. ldd [$key + 64], %f28
  599. retl
  600. ldd [$key + 72], %f30
  601. .type _cmll256_encrypt_1x,#function
  602. .size _cmll256_encrypt_1x,.-_cmll256_encrypt_1x
  603. .align 32
  604. _cmll256_encrypt_2x:
  605. camellia_f %f16, %f2, %f0, %f2
  606. camellia_f %f16, %f6, %f4, %f6
  607. camellia_f %f18, %f0, %f2, %f0
  608. camellia_f %f18, %f4, %f6, %f4
  609. ldd [$key + 208], %f16
  610. ldd [$key + 216], %f18
  611. camellia_f %f20, %f2, %f0, %f2
  612. camellia_f %f20, %f6, %f4, %f6
  613. camellia_f %f22, %f0, %f2, %f0
  614. camellia_f %f22, %f4, %f6, %f4
  615. ldd [$key + 224], %f20
  616. ldd [$key + 232], %f22
  617. camellia_f %f24, %f2, %f0, %f2
  618. camellia_f %f24, %f6, %f4, %f6
  619. camellia_f %f26, %f0, %f2, %f0
  620. camellia_f %f26, %f4, %f6, %f4
  621. ldd [$key + 240], %f24
  622. ldd [$key + 248], %f26
  623. camellia_fl %f28, %f0, %f0
  624. camellia_fl %f28, %f4, %f4
  625. camellia_fli %f30, %f2, %f2
  626. camellia_fli %f30, %f6, %f6
  627. ldd [$key + 256], %f28
  628. ldd [$key + 264], %f30
  629. ___
  # Two-block version of the two middle passes.
  630. for ($i=1; $i<3; $i++) {
  631. $code.=<<___;
  632. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  633. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  634. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  635. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  636. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  637. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  638. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  639. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  640. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  641. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  642. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  643. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  644. camellia_fl %f`16+16*$i+12`, %f0, %f0
  645. camellia_fl %f`16+16*$i+12`, %f4, %f4
  646. camellia_fli %f`16+16*$i+14`, %f2, %f2
  647. camellia_fli %f`16+16*$i+14`, %f6, %f6
  648. ___
  649. }
  # Tail of _cmll256_encrypt_2x, then the opening part of _cmll256_decrypt_1x,
  # which replaces %f16-%f30 with the words at key-8 down to key-64.
  650. $code.=<<___;
  651. camellia_f %f16, %f2, %f0, %f2
  652. camellia_f %f16, %f6, %f4, %f6
  653. camellia_f %f18, %f0, %f2, %f0
  654. camellia_f %f18, %f4, %f6, %f4
  655. ldd [$key + 16], %f16
  656. ldd [$key + 24], %f18
  657. camellia_f %f20, %f2, %f0, %f2
  658. camellia_f %f20, %f6, %f4, %f6
  659. camellia_f %f22, %f0, %f2, %f0
  660. camellia_f %f22, %f4, %f6, %f4
  661. ldd [$key + 32], %f20
  662. ldd [$key + 40], %f22
  663. camellia_f %f24, %f2, %f0, %f8
  664. camellia_f %f24, %f6, %f4, %f10
  665. camellia_f %f26, %f0, %f8, %f2
  666. camellia_f %f26, %f4, %f10, %f6
  667. ldd [$key + 48], %f24
  668. ldd [$key + 56], %f26
  669. fxor %f28, %f8, %f0
  670. fxor %f28, %f10, %f4
  671. fxor %f30, %f2, %f2
  672. fxor %f30, %f6, %f6
  673. ldd [$key + 64], %f28
  674. retl
  675. ldd [$key + 72], %f30
  676. .type _cmll256_encrypt_2x,#function
  677. .size _cmll256_encrypt_2x,.-_cmll256_encrypt_2x
  678. .align 32
  679. _cmll256_decrypt_1x:
  680. camellia_f %f16, %f2, %f0, %f2
  681. camellia_f %f18, %f0, %f2, %f0
  682. ldd [$key - 8], %f16
  683. ldd [$key - 16], %f18
  684. camellia_f %f20, %f2, %f0, %f2
  685. camellia_f %f22, %f0, %f2, %f0
  686. ldd [$key - 24], %f20
  687. ldd [$key - 32], %f22
  688. camellia_f %f24, %f2, %f0, %f2
  689. camellia_f %f26, %f0, %f2, %f0
  690. ldd [$key - 40], %f24
  691. ldd [$key - 48], %f26
  692. camellia_fl %f28, %f0, %f0
  693. camellia_fli %f30, %f2, %f2
  694. ldd [$key - 56], %f28
  695. ldd [$key - 64], %f30
  696. ___
  # Middle passes of _cmll256_decrypt_1x (same instruction pattern as encrypt).
  697. for ($i=1; $i<3; $i++) {
  698. $code.=<<___;
  699. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  700. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  701. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  702. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  703. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  704. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  705. camellia_fl %f`16+16*$i+12`, %f0, %f0
  706. camellia_fli %f`16+16*$i+14`, %f2, %f2
  707. ___
  708. }
  # Tail of _cmll256_decrypt_1x: the final XOR uses %f30 then %f28 (swapped
  # relative to the encrypt routine) and %f16-%f30 are restored from key+184
  # down to key+128.  Then the opening part of _cmll256_decrypt_2x.
  709. $code.=<<___;
  710. camellia_f %f16, %f2, %f0, %f2
  711. camellia_f %f18, %f0, %f2, %f0
  712. ldd [$key + 184], %f16
  713. ldd [$key + 176], %f18
  714. camellia_f %f20, %f2, %f0, %f2
  715. camellia_f %f22, %f0, %f2, %f0
  716. ldd [$key + 168], %f20
  717. ldd [$key + 160], %f22
  718. camellia_f %f24, %f2, %f0, %f4
  719. camellia_f %f26, %f0, %f4, %f2
  720. ldd [$key + 152], %f24
  721. ldd [$key + 144], %f26
  722. fxor %f30, %f4, %f0
  723. fxor %f28, %f2, %f2
  724. ldd [$key + 136], %f28
  725. retl
  726. ldd [$key + 128], %f30
  727. .type _cmll256_decrypt_1x,#function
  728. .size _cmll256_decrypt_1x,.-_cmll256_decrypt_1x
  729. .align 32
  730. _cmll256_decrypt_2x:
  731. camellia_f %f16, %f2, %f0, %f2
  732. camellia_f %f16, %f6, %f4, %f6
  733. camellia_f %f18, %f0, %f2, %f0
  734. camellia_f %f18, %f4, %f6, %f4
  735. ldd [$key - 8], %f16
  736. ldd [$key - 16], %f18
  737. camellia_f %f20, %f2, %f0, %f2
  738. camellia_f %f20, %f6, %f4, %f6
  739. camellia_f %f22, %f0, %f2, %f0
  740. camellia_f %f22, %f4, %f6, %f4
  741. ldd [$key - 24], %f20
  742. ldd [$key - 32], %f22
  743. camellia_f %f24, %f2, %f0, %f2
  744. camellia_f %f24, %f6, %f4, %f6
  745. camellia_f %f26, %f0, %f2, %f0
  746. camellia_f %f26, %f4, %f6, %f4
  747. ldd [$key - 40], %f24
  748. ldd [$key - 48], %f26
  749. camellia_fl %f28, %f0, %f0
  750. camellia_fl %f28, %f4, %f4
  751. camellia_fli %f30, %f2, %f2
  752. camellia_fli %f30, %f6, %f6
  753. ldd [$key - 56], %f28
  754. ldd [$key - 64], %f30
  755. ___
  # Two-block version of the middle decrypt passes.
  756. for ($i=1; $i<3; $i++) {
  757. $code.=<<___;
  758. camellia_f %f`16+16*$i+0`, %f2, %f0, %f2
  759. camellia_f %f`16+16*$i+0`, %f6, %f4, %f6
  760. camellia_f %f`16+16*$i+2`, %f0, %f2, %f0
  761. camellia_f %f`16+16*$i+2`, %f4, %f6, %f4
  762. camellia_f %f`16+16*$i+4`, %f2, %f0, %f2
  763. camellia_f %f`16+16*$i+4`, %f6, %f4, %f6
  764. camellia_f %f`16+16*$i+6`, %f0, %f2, %f0
  765. camellia_f %f`16+16*$i+6`, %f4, %f6, %f4
  766. camellia_f %f`16+16*$i+8`, %f2, %f0, %f2
  767. camellia_f %f`16+16*$i+8`, %f6, %f4, %f6
  768. camellia_f %f`16+16*$i+10`, %f0, %f2, %f0
  769. camellia_f %f`16+16*$i+10`, %f4, %f6, %f4
  770. camellia_fl %f`16+16*$i+12`, %f0, %f0
  771. camellia_fl %f`16+16*$i+12`, %f4, %f4
  772. camellia_fli %f`16+16*$i+14`, %f2, %f2
  773. camellia_fli %f`16+16*$i+14`, %f6, %f6
  774. ___
  775. }
  # Tail of _cmll256_decrypt_2x, restoring %f16-%f30 from key+184 down to
  # key+128 as in the 1x routine.
  776. $code.=<<___;
  777. camellia_f %f16, %f2, %f0, %f2
  778. camellia_f %f16, %f6, %f4, %f6
  779. camellia_f %f18, %f0, %f2, %f0
  780. camellia_f %f18, %f4, %f6, %f4
  781. ldd [$key + 184], %f16
  782. ldd [$key + 176], %f18
  783. camellia_f %f20, %f2, %f0, %f2
  784. camellia_f %f20, %f6, %f4, %f6
  785. camellia_f %f22, %f0, %f2, %f0
  786. camellia_f %f22, %f4, %f6, %f4
  787. ldd [$key + 168], %f20
  788. ldd [$key + 160], %f22
  789. camellia_f %f24, %f2, %f0, %f8
  790. camellia_f %f24, %f6, %f4, %f10
  791. camellia_f %f26, %f0, %f8, %f2
  792. camellia_f %f26, %f4, %f10, %f6
  793. ldd [$key + 152], %f24
  794. ldd [$key + 144], %f26
  795. fxor %f30, %f8, %f0
  796. fxor %f30, %f10, %f4
  797. fxor %f28, %f2, %f2
  798. fxor %f28, %f6, %f6
  799. ldd [$key + 136], %f28
  800. retl
  801. ldd [$key + 128], %f30
  802. .type _cmll256_decrypt_2x,#function
  803. .size _cmll256_decrypt_2x,.-_cmll256_decrypt_2x
  804. ___
  # The alg_*_implement generators are not defined in this file; presumably
  # they come from sparcv9_modes.pl (required at the top) and append the CBC
  # and CTR routines for the "cmll" prefix -- confirm there.  The CTR
  # variants are generated only when $::evp is true.
  805. &alg_cbc_encrypt_implement("cmll",128);
  806. &alg_cbc_encrypt_implement("cmll",256);
  807. &alg_cbc_decrypt_implement("cmll",128);
  808. &alg_cbc_decrypt_implement("cmll",256);
  809. if ($::evp) {
  810. &alg_ctr32_implement("cmll",128);
  811. &alg_ctr32_implement("cmll",256);
  812. }
  813. }}}
  # When $::evp is false, additionally emit the Camellia_* entry points:
  #   - Camellia_encrypt / Camellia_decrypt are plain aliases for the
  #     cmll_t4_* single-block routines;
  #   - Camellia_set_key validates its arguments and then branches to
  #     cmll_t4_set_key.  It returns -1 when %o2 is not 8-byte aligned or when
  #     %o0 or %o2 is zero, and -2 when %o1 has bits outside 0x1c0 set or is
  #     below 128;
  #   - Camellia_cbc_encrypt reads the 32-bit word at key+272, compares it
  #     with 3 and dispatches on $enc (zero selects decrypt) to the
  #     cmll128_t4_cbc_* or cmll256_t4_cbc_* routine.  Those labels are not
  #     defined in this file; presumably they are produced by the
  #     alg_cbc_*_implement calls above.
  814. if (!$::evp) {
  815. $code.=<<___;
  816. .global Camellia_encrypt
  817. Camellia_encrypt=cmll_t4_encrypt
  818. .global Camellia_decrypt
  819. Camellia_decrypt=cmll_t4_decrypt
  820. .global Camellia_set_key
  821. .align 32
  822. Camellia_set_key:
  823. andcc %o2, 7, %g0 ! double-check alignment
  824. bnz,a,pn %icc, 1f
  825. mov -1, %o0
  826. brz,a,pn %o0, 1f
  827. mov -1, %o0
  828. brz,a,pn %o2, 1f
  829. mov -1, %o0
  830. andncc %o1, 0x1c0, %g0
  831. bnz,a,pn %icc, 1f
  832. mov -2, %o0
  833. cmp %o1, 128
  834. bl,a,pn %icc, 1f
  835. mov -2, %o0
  836. b cmll_t4_set_key
  837. nop
  838. 1: retl
  839. nop
  840. .type Camellia_set_key,#function
  841. .size Camellia_set_key,.-Camellia_set_key
  842. ___
  # Argument registers for Camellia_cbc_encrypt: %o0..%o5 in declaration order
  # (only $key and $enc are used below).
  843. my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
  844. $code.=<<___;
  845. .globl Camellia_cbc_encrypt
  846. .align 32
  847. Camellia_cbc_encrypt:
  848. ld [$key + 272], %g1
  849. nop
  850. brz $enc, .Lcbc_decrypt
  851. cmp %g1, 3
  852. be,pt %icc, cmll128_t4_cbc_encrypt
  853. nop
  854. ba cmll256_t4_cbc_encrypt
  855. nop
  856. .Lcbc_decrypt:
  857. be,pt %icc, cmll128_t4_cbc_decrypt
  858. nop
  859. ba cmll256_t4_cbc_decrypt
  860. nop
  861. .type Camellia_cbc_encrypt,#function
  862. .size Camellia_cbc_encrypt,.-Camellia_cbc_encrypt
  863. ___
  864. }
  # emit_assembler() is not defined in this file; presumably it is provided by
  # sparcv9_modes.pl and writes the accumulated $code to STDOUT -- confirm there.
  865. &emit_assembler();
  # Buffered write errors only surface at close, so a failed close is fatal.
  866. close STDOUT or die "error closing STDOUT: $!";