#! /usr/bin/env perl
# Copyright 2012-2021 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================
######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
#	|0 |1 |2 |3 |4
#	|01|01|01|
#	   |23|23|23|
#	            |01|01|...
#	               |23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
#		128-bit key	192-		256-
# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
#		128-bit key	192-		256-
# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
#			 (*) numbers after slash are for
#			     misaligned data.
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
#
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts byte in 5.07 cycles in CBC mode and decrypts
# in 0.93, naturally with AES-NI.
  69. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  70. push(@INC,"${dir}","${dir}../../perlasm");
  71. require "sparcv9_modes.pl";
  72. $output = pop and open STDOUT,">$output";
  73. $::evp=1; # if $evp is set to 0, script generates module with
  74. # AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
  75. # points. These however are not fully compatible with openssl/aes.h,
  76. # because they expect AES_KEY to be aligned at 64-bit boundary. When
  77. # used through EVP, alignment is arranged at EVP layer. Second thing
  78. # that is arranged by EVP is at least 32-bit alignment of IV.
  79. ######################################################################
  80. # single-round subroutines
  81. #
  82. {
  83. my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));
  84. $code.=<<___;
  85. #ifndef __ASSEMBLER__
  86. # define __ASSEMBLER__ 1
  87. #endif
  88. #include "crypto/sparc_arch.h"
  89. #ifdef __arch64__
  90. .register %g2,#scratch
  91. .register %g3,#scratch
  92. #endif
  93. .text
  94. .globl aes_t4_encrypt
  95. .align 32
  96. aes_t4_encrypt:
  97. andcc $inp, 7, %g1 ! is input aligned?
  98. andn $inp, 7, $inp
  99. ldx [$key + 0], %g4
  100. ldx [$key + 8], %g5
  101. ldx [$inp + 0], %o4
  102. bz,pt %icc, 1f
  103. ldx [$inp + 8], %o5
  104. ldx [$inp + 16], $inp
  105. sll %g1, 3, %g1
  106. sub %g0, %g1, %o3
  107. sllx %o4, %g1, %o4
  108. sllx %o5, %g1, %g1
  109. srlx %o5, %o3, %o5
  110. srlx $inp, %o3, %o3
  111. or %o5, %o4, %o4
  112. or %o3, %g1, %o5
  113. 1:
  114. ld [$key + 240], $rounds
  115. ldd [$key + 16], %f12
  116. ldd [$key + 24], %f14
  117. xor %g4, %o4, %o4
  118. xor %g5, %o5, %o5
  119. movxtod %o4, %f0
  120. movxtod %o5, %f2
  121. srl $rounds, 1, $rounds
  122. ldd [$key + 32], %f16
  123. sub $rounds, 1, $rounds
  124. ldd [$key + 40], %f18
  125. add $key, 48, $key
  126. .Lenc:
  127. aes_eround01 %f12, %f0, %f2, %f4
  128. aes_eround23 %f14, %f0, %f2, %f2
  129. ldd [$key + 0], %f12
  130. ldd [$key + 8], %f14
  131. sub $rounds,1,$rounds
  132. aes_eround01 %f16, %f4, %f2, %f0
  133. aes_eround23 %f18, %f4, %f2, %f2
  134. ldd [$key + 16], %f16
  135. ldd [$key + 24], %f18
  136. brnz,pt $rounds, .Lenc
  137. add $key, 32, $key
  138. andcc $out, 7, $tmp ! is output aligned?
  139. aes_eround01 %f12, %f0, %f2, %f4
  140. aes_eround23 %f14, %f0, %f2, %f2
  141. aes_eround01_l %f16, %f4, %f2, %f0
  142. aes_eround23_l %f18, %f4, %f2, %f2
  143. bnz,pn %icc, 2f
  144. nop
  145. std %f0, [$out + 0]
  146. retl
  147. std %f2, [$out + 8]
  148. 2: alignaddrl $out, %g0, $out
  149. mov 0xff, $mask
  150. srl $mask, $tmp, $mask
  151. faligndata %f0, %f0, %f4
  152. faligndata %f0, %f2, %f6
  153. faligndata %f2, %f2, %f8
  154. stda %f4, [$out + $mask]0xc0 ! partial store
  155. std %f6, [$out + 8]
  156. add $out, 16, $out
  157. orn %g0, $mask, $mask
  158. retl
  159. stda %f8, [$out + $mask]0xc0 ! partial store
  160. .type aes_t4_encrypt,#function
  161. .size aes_t4_encrypt,.-aes_t4_encrypt
  162. .globl aes_t4_decrypt
  163. .align 32
  164. aes_t4_decrypt:
  165. andcc $inp, 7, %g1 ! is input aligned?
  166. andn $inp, 7, $inp
  167. ldx [$key + 0], %g4
  168. ldx [$key + 8], %g5
  169. ldx [$inp + 0], %o4
  170. bz,pt %icc, 1f
  171. ldx [$inp + 8], %o5
  172. ldx [$inp + 16], $inp
  173. sll %g1, 3, %g1
  174. sub %g0, %g1, %o3
  175. sllx %o4, %g1, %o4
  176. sllx %o5, %g1, %g1
  177. srlx %o5, %o3, %o5
  178. srlx $inp, %o3, %o3
  179. or %o5, %o4, %o4
  180. or %o3, %g1, %o5
  181. 1:
  182. ld [$key + 240], $rounds
  183. ldd [$key + 16], %f12
  184. ldd [$key + 24], %f14
  185. xor %g4, %o4, %o4
  186. xor %g5, %o5, %o5
  187. movxtod %o4, %f0
  188. movxtod %o5, %f2
  189. srl $rounds, 1, $rounds
  190. ldd [$key + 32], %f16
  191. sub $rounds, 1, $rounds
  192. ldd [$key + 40], %f18
  193. add $key, 48, $key
  194. .Ldec:
  195. aes_dround01 %f12, %f0, %f2, %f4
  196. aes_dround23 %f14, %f0, %f2, %f2
  197. ldd [$key + 0], %f12
  198. ldd [$key + 8], %f14
  199. sub $rounds,1,$rounds
  200. aes_dround01 %f16, %f4, %f2, %f0
  201. aes_dround23 %f18, %f4, %f2, %f2
  202. ldd [$key + 16], %f16
  203. ldd [$key + 24], %f18
  204. brnz,pt $rounds, .Ldec
  205. add $key, 32, $key
  206. andcc $out, 7, $tmp ! is output aligned?
  207. aes_dround01 %f12, %f0, %f2, %f4
  208. aes_dround23 %f14, %f0, %f2, %f2
  209. aes_dround01_l %f16, %f4, %f2, %f0
  210. aes_dround23_l %f18, %f4, %f2, %f2
  211. bnz,pn %icc, 2f
  212. nop
  213. std %f0, [$out + 0]
  214. retl
  215. std %f2, [$out + 8]
  216. 2: alignaddrl $out, %g0, $out
  217. mov 0xff, $mask
  218. srl $mask, $tmp, $mask
  219. faligndata %f0, %f0, %f4
  220. faligndata %f0, %f2, %f6
  221. faligndata %f2, %f2, %f8
  222. stda %f4, [$out + $mask]0xc0 ! partial store
  223. std %f6, [$out + 8]
  224. add $out, 16, $out
  225. orn %g0, $mask, $mask
  226. retl
  227. stda %f8, [$out + $mask]0xc0 ! partial store
  228. .type aes_t4_decrypt,#function
  229. .size aes_t4_decrypt,.-aes_t4_decrypt
  230. ___
  231. }
  232. ######################################################################
  233. # key setup subroutines
  234. #
  235. {
  236. my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));
  237. $code.=<<___;
  238. .globl aes_t4_set_encrypt_key
  239. .align 32
  240. aes_t4_set_encrypt_key:
  241. .Lset_encrypt_key:
  242. and $inp, 7, $tmp
  243. alignaddr $inp, %g0, $inp
  244. cmp $bits, 192
  245. ldd [$inp + 0], %f0
  246. bl,pt %icc,.L128
  247. ldd [$inp + 8], %f2
  248. be,pt %icc,.L192
  249. ldd [$inp + 16], %f4
  250. brz,pt $tmp, .L256aligned
  251. ldd [$inp + 24], %f6
  252. ldd [$inp + 32], %f8
  253. faligndata %f0, %f2, %f0
  254. faligndata %f2, %f4, %f2
  255. faligndata %f4, %f6, %f4
  256. faligndata %f6, %f8, %f6
  257. .L256aligned:
  258. ___
  259. for ($i=0; $i<6; $i++) {
  260. $code.=<<___;
  261. std %f0, [$out + `32*$i+0`]
  262. aes_kexpand1 %f0, %f6, $i, %f0
  263. std %f2, [$out + `32*$i+8`]
  264. aes_kexpand2 %f2, %f0, %f2
  265. std %f4, [$out + `32*$i+16`]
  266. aes_kexpand0 %f4, %f2, %f4
  267. std %f6, [$out + `32*$i+24`]
  268. aes_kexpand2 %f6, %f4, %f6
  269. ___
  270. }
  271. $code.=<<___;
  272. std %f0, [$out + `32*$i+0`]
  273. aes_kexpand1 %f0, %f6, $i, %f0
  274. std %f2, [$out + `32*$i+8`]
  275. aes_kexpand2 %f2, %f0, %f2
  276. std %f4, [$out + `32*$i+16`]
  277. std %f6, [$out + `32*$i+24`]
  278. std %f0, [$out + `32*$i+32`]
  279. std %f2, [$out + `32*$i+40`]
  280. mov 14, $tmp
  281. st $tmp, [$out + 240]
  282. retl
  283. xor %o0, %o0, %o0
  284. .align 16
  285. .L192:
  286. brz,pt $tmp, .L192aligned
  287. nop
  288. ldd [$inp + 24], %f6
  289. faligndata %f0, %f2, %f0
  290. faligndata %f2, %f4, %f2
  291. faligndata %f4, %f6, %f4
  292. .L192aligned:
  293. ___
  294. for ($i=0; $i<7; $i++) {
  295. $code.=<<___;
  296. std %f0, [$out + `24*$i+0`]
  297. aes_kexpand1 %f0, %f4, $i, %f0
  298. std %f2, [$out + `24*$i+8`]
  299. aes_kexpand2 %f2, %f0, %f2
  300. std %f4, [$out + `24*$i+16`]
  301. aes_kexpand2 %f4, %f2, %f4
  302. ___
  303. }
  304. $code.=<<___;
  305. std %f0, [$out + `24*$i+0`]
  306. aes_kexpand1 %f0, %f4, $i, %f0
  307. std %f2, [$out + `24*$i+8`]
  308. aes_kexpand2 %f2, %f0, %f2
  309. std %f4, [$out + `24*$i+16`]
  310. std %f0, [$out + `24*$i+24`]
  311. std %f2, [$out + `24*$i+32`]
  312. mov 12, $tmp
  313. st $tmp, [$out + 240]
  314. retl
  315. xor %o0, %o0, %o0
  316. .align 16
  317. .L128:
  318. brz,pt $tmp, .L128aligned
  319. nop
  320. ldd [$inp + 16], %f4
  321. faligndata %f0, %f2, %f0
  322. faligndata %f2, %f4, %f2
  323. .L128aligned:
  324. ___
  325. for ($i=0; $i<10; $i++) {
  326. $code.=<<___;
  327. std %f0, [$out + `16*$i+0`]
  328. aes_kexpand1 %f0, %f2, $i, %f0
  329. std %f2, [$out + `16*$i+8`]
  330. aes_kexpand2 %f2, %f0, %f2
  331. ___
  332. }
  333. $code.=<<___;
  334. std %f0, [$out + `16*$i+0`]
  335. std %f2, [$out + `16*$i+8`]
  336. mov 10, $tmp
  337. st $tmp, [$out + 240]
  338. retl
  339. xor %o0, %o0, %o0
  340. .type aes_t4_set_encrypt_key,#function
  341. .size aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key
  342. .globl aes_t4_set_decrypt_key
  343. .align 32
  344. aes_t4_set_decrypt_key:
  345. mov %o7, %o5
  346. call .Lset_encrypt_key
  347. nop
  348. mov %o5, %o7
  349. sll $tmp, 4, $inp ! $tmp is number of rounds
  350. add $tmp, 2, $tmp
  351. add $out, $inp, $inp ! $inp=$out+16*rounds
  352. srl $tmp, 2, $tmp ! $tmp=(rounds+2)/4
  353. .Lkey_flip:
  354. ldd [$out + 0], %f0
  355. ldd [$out + 8], %f2
  356. ldd [$out + 16], %f4
  357. ldd [$out + 24], %f6
  358. ldd [$inp + 0], %f8
  359. ldd [$inp + 8], %f10
  360. ldd [$inp - 16], %f12
  361. ldd [$inp - 8], %f14
  362. sub $tmp, 1, $tmp
  363. std %f0, [$inp + 0]
  364. std %f2, [$inp + 8]
  365. std %f4, [$inp - 16]
  366. std %f6, [$inp - 8]
  367. std %f8, [$out + 0]
  368. std %f10, [$out + 8]
  369. std %f12, [$out + 16]
  370. std %f14, [$out + 24]
  371. add $out, 32, $out
  372. brnz $tmp, .Lkey_flip
  373. sub $inp, 32, $inp
  374. retl
  375. xor %o0, %o0, %o0
  376. .type aes_t4_set_decrypt_key,#function
  377. .size aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
  378. ___
  379. }
  380. {{{
  381. my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  382. my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));
  383. $code.=<<___;
  384. .align 32
  385. _aes128_encrypt_1x:
  386. ___
  387. for ($i=0; $i<4; $i++) {
  388. $code.=<<___;
  389. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
  390. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  391. aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
  392. aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
  393. ___
  394. }
  395. $code.=<<___;
  396. aes_eround01 %f48, %f0, %f2, %f4
  397. aes_eround23 %f50, %f0, %f2, %f2
  398. aes_eround01_l %f52, %f4, %f2, %f0
  399. retl
  400. aes_eround23_l %f54, %f4, %f2, %f2
  401. .type _aes128_encrypt_1x,#function
  402. .size _aes128_encrypt_1x,.-_aes128_encrypt_1x
  403. .align 32
  404. _aes128_encrypt_2x:
  405. ___
  406. for ($i=0; $i<4; $i++) {
  407. $code.=<<___;
  408. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
  409. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  410. aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
  411. aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
  412. aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
  413. aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
  414. aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
  415. aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
  416. ___
  417. }
  418. $code.=<<___;
  419. aes_eround01 %f48, %f0, %f2, %f8
  420. aes_eround23 %f50, %f0, %f2, %f2
  421. aes_eround01 %f48, %f4, %f6, %f10
  422. aes_eround23 %f50, %f4, %f6, %f6
  423. aes_eround01_l %f52, %f8, %f2, %f0
  424. aes_eround23_l %f54, %f8, %f2, %f2
  425. aes_eround01_l %f52, %f10, %f6, %f4
  426. retl
  427. aes_eround23_l %f54, %f10, %f6, %f6
  428. .type _aes128_encrypt_2x,#function
  429. .size _aes128_encrypt_2x,.-_aes128_encrypt_2x
  430. .align 32
  431. _aes128_loadkey:
  432. ldx [$key + 0], %g4
  433. ldx [$key + 8], %g5
  434. ___
  435. for ($i=2; $i<22;$i++) { # load key schedule
  436. $code.=<<___;
  437. ldd [$key + `8*$i`], %f`12+2*$i`
  438. ___
  439. }
  440. $code.=<<___;
  441. retl
  442. nop
  443. .type _aes128_loadkey,#function
  444. .size _aes128_loadkey,.-_aes128_loadkey
  445. _aes128_load_enckey=_aes128_loadkey
  446. _aes128_load_deckey=_aes128_loadkey
  447. ___
  448. &alg_cbc_encrypt_implement("aes",128);
  449. if ($::evp) {
  450. &alg_ctr32_implement("aes",128);
  451. &alg_xts_implement("aes",128,"en");
  452. &alg_xts_implement("aes",128,"de");
  453. }
  454. &alg_cbc_decrypt_implement("aes",128);
  455. $code.=<<___;
  456. .align 32
  457. _aes128_decrypt_1x:
  458. ___
  459. for ($i=0; $i<4; $i++) {
  460. $code.=<<___;
  461. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
  462. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  463. aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
  464. aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
  465. ___
  466. }
  467. $code.=<<___;
  468. aes_dround01 %f48, %f0, %f2, %f4
  469. aes_dround23 %f50, %f0, %f2, %f2
  470. aes_dround01_l %f52, %f4, %f2, %f0
  471. retl
  472. aes_dround23_l %f54, %f4, %f2, %f2
  473. .type _aes128_decrypt_1x,#function
  474. .size _aes128_decrypt_1x,.-_aes128_decrypt_1x
  475. .align 32
  476. _aes128_decrypt_2x:
  477. ___
  478. for ($i=0; $i<4; $i++) {
  479. $code.=<<___;
  480. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
  481. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  482. aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
  483. aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
  484. aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
  485. aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
  486. aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
  487. aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
  488. ___
  489. }
  490. $code.=<<___;
  491. aes_dround01 %f48, %f0, %f2, %f8
  492. aes_dround23 %f50, %f0, %f2, %f2
  493. aes_dround01 %f48, %f4, %f6, %f10
  494. aes_dround23 %f50, %f4, %f6, %f6
  495. aes_dround01_l %f52, %f8, %f2, %f0
  496. aes_dround23_l %f54, %f8, %f2, %f2
  497. aes_dround01_l %f52, %f10, %f6, %f4
  498. retl
  499. aes_dround23_l %f54, %f10, %f6, %f6
  500. .type _aes128_decrypt_2x,#function
  501. .size _aes128_decrypt_2x,.-_aes128_decrypt_2x
  502. ___
  503. $code.=<<___;
  504. .align 32
  505. _aes192_encrypt_1x:
  506. ___
  507. for ($i=0; $i<5; $i++) {
  508. $code.=<<___;
  509. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
  510. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  511. aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
  512. aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
  513. ___
  514. }
  515. $code.=<<___;
  516. aes_eround01 %f56, %f0, %f2, %f4
  517. aes_eround23 %f58, %f0, %f2, %f2
  518. aes_eround01_l %f60, %f4, %f2, %f0
  519. retl
  520. aes_eround23_l %f62, %f4, %f2, %f2
  521. .type _aes192_encrypt_1x,#function
  522. .size _aes192_encrypt_1x,.-_aes192_encrypt_1x
  523. .align 32
  524. _aes192_encrypt_2x:
  525. ___
  526. for ($i=0; $i<5; $i++) {
  527. $code.=<<___;
  528. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
  529. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  530. aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
  531. aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
  532. aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
  533. aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
  534. aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
  535. aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
  536. ___
  537. }
  538. $code.=<<___;
  539. aes_eround01 %f56, %f0, %f2, %f8
  540. aes_eround23 %f58, %f0, %f2, %f2
  541. aes_eround01 %f56, %f4, %f6, %f10
  542. aes_eround23 %f58, %f4, %f6, %f6
  543. aes_eround01_l %f60, %f8, %f2, %f0
  544. aes_eround23_l %f62, %f8, %f2, %f2
  545. aes_eround01_l %f60, %f10, %f6, %f4
  546. retl
  547. aes_eround23_l %f62, %f10, %f6, %f6
  548. .type _aes192_encrypt_2x,#function
  549. .size _aes192_encrypt_2x,.-_aes192_encrypt_2x
  550. .align 32
  551. _aes256_encrypt_1x:
  552. aes_eround01 %f16, %f0, %f2, %f4
  553. aes_eround23 %f18, %f0, %f2, %f2
  554. ldd [$key + 208], %f16
  555. ldd [$key + 216], %f18
  556. aes_eround01 %f20, %f4, %f2, %f0
  557. aes_eround23 %f22, %f4, %f2, %f2
  558. ldd [$key + 224], %f20
  559. ldd [$key + 232], %f22
  560. ___
  561. for ($i=1; $i<6; $i++) {
  562. $code.=<<___;
  563. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f4
  564. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  565. aes_eround01 %f`16+8*$i+4`, %f4, %f2, %f0
  566. aes_eround23 %f`16+8*$i+6`, %f4, %f2, %f2
  567. ___
  568. }
  569. $code.=<<___;
  570. aes_eround01 %f16, %f0, %f2, %f4
  571. aes_eround23 %f18, %f0, %f2, %f2
  572. ldd [$key + 16], %f16
  573. ldd [$key + 24], %f18
  574. aes_eround01_l %f20, %f4, %f2, %f0
  575. aes_eround23_l %f22, %f4, %f2, %f2
  576. ldd [$key + 32], %f20
  577. retl
  578. ldd [$key + 40], %f22
  579. .type _aes256_encrypt_1x,#function
  580. .size _aes256_encrypt_1x,.-_aes256_encrypt_1x
  581. .align 32
  582. _aes256_encrypt_2x:
  583. aes_eround01 %f16, %f0, %f2, %f8
  584. aes_eround23 %f18, %f0, %f2, %f2
  585. aes_eround01 %f16, %f4, %f6, %f10
  586. aes_eround23 %f18, %f4, %f6, %f6
  587. ldd [$key + 208], %f16
  588. ldd [$key + 216], %f18
  589. aes_eround01 %f20, %f8, %f2, %f0
  590. aes_eround23 %f22, %f8, %f2, %f2
  591. aes_eround01 %f20, %f10, %f6, %f4
  592. aes_eround23 %f22, %f10, %f6, %f6
  593. ldd [$key + 224], %f20
  594. ldd [$key + 232], %f22
  595. ___
  596. for ($i=1; $i<6; $i++) {
  597. $code.=<<___;
  598. aes_eround01 %f`16+8*$i+0`, %f0, %f2, %f8
  599. aes_eround23 %f`16+8*$i+2`, %f0, %f2, %f2
  600. aes_eround01 %f`16+8*$i+0`, %f4, %f6, %f10
  601. aes_eround23 %f`16+8*$i+2`, %f4, %f6, %f6
  602. aes_eround01 %f`16+8*$i+4`, %f8, %f2, %f0
  603. aes_eround23 %f`16+8*$i+6`, %f8, %f2, %f2
  604. aes_eround01 %f`16+8*$i+4`, %f10, %f6, %f4
  605. aes_eround23 %f`16+8*$i+6`, %f10, %f6, %f6
  606. ___
  607. }
  608. $code.=<<___;
  609. aes_eround01 %f16, %f0, %f2, %f8
  610. aes_eround23 %f18, %f0, %f2, %f2
  611. aes_eround01 %f16, %f4, %f6, %f10
  612. aes_eround23 %f18, %f4, %f6, %f6
  613. ldd [$key + 16], %f16
  614. ldd [$key + 24], %f18
  615. aes_eround01_l %f20, %f8, %f2, %f0
  616. aes_eround23_l %f22, %f8, %f2, %f2
  617. aes_eround01_l %f20, %f10, %f6, %f4
  618. aes_eround23_l %f22, %f10, %f6, %f6
  619. ldd [$key + 32], %f20
  620. retl
  621. ldd [$key + 40], %f22
  622. .type _aes256_encrypt_2x,#function
  623. .size _aes256_encrypt_2x,.-_aes256_encrypt_2x
  624. .align 32
  625. _aes192_loadkey:
  626. ldx [$key + 0], %g4
  627. ldx [$key + 8], %g5
  628. ___
  629. for ($i=2; $i<26;$i++) { # load key schedule
  630. $code.=<<___;
  631. ldd [$key + `8*$i`], %f`12+2*$i`
  632. ___
  633. }
  634. $code.=<<___;
  635. retl
  636. nop
  637. .type _aes192_loadkey,#function
  638. .size _aes192_loadkey,.-_aes192_loadkey
  639. _aes256_loadkey=_aes192_loadkey
  640. _aes192_load_enckey=_aes192_loadkey
  641. _aes192_load_deckey=_aes192_loadkey
  642. _aes256_load_enckey=_aes192_loadkey
  643. _aes256_load_deckey=_aes192_loadkey
  644. ___
  645. &alg_cbc_encrypt_implement("aes",256);
  646. &alg_cbc_encrypt_implement("aes",192);
  647. if ($::evp) {
  648. &alg_ctr32_implement("aes",256);
  649. &alg_xts_implement("aes",256,"en");
  650. &alg_xts_implement("aes",256,"de");
  651. &alg_ctr32_implement("aes",192);
  652. }
  653. &alg_cbc_decrypt_implement("aes",192);
  654. &alg_cbc_decrypt_implement("aes",256);
  655. $code.=<<___;
  656. .align 32
  657. _aes256_decrypt_1x:
  658. aes_dround01 %f16, %f0, %f2, %f4
  659. aes_dround23 %f18, %f0, %f2, %f2
  660. ldd [$key + 208], %f16
  661. ldd [$key + 216], %f18
  662. aes_dround01 %f20, %f4, %f2, %f0
  663. aes_dround23 %f22, %f4, %f2, %f2
  664. ldd [$key + 224], %f20
  665. ldd [$key + 232], %f22
  666. ___
  667. for ($i=1; $i<6; $i++) {
  668. $code.=<<___;
  669. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
  670. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  671. aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
  672. aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
  673. ___
  674. }
  675. $code.=<<___;
  676. aes_dround01 %f16, %f0, %f2, %f4
  677. aes_dround23 %f18, %f0, %f2, %f2
  678. ldd [$key + 16], %f16
  679. ldd [$key + 24], %f18
  680. aes_dround01_l %f20, %f4, %f2, %f0
  681. aes_dround23_l %f22, %f4, %f2, %f2
  682. ldd [$key + 32], %f20
  683. retl
  684. ldd [$key + 40], %f22
  685. .type _aes256_decrypt_1x,#function
  686. .size _aes256_decrypt_1x,.-_aes256_decrypt_1x
  687. .align 32
  688. _aes256_decrypt_2x:
  689. aes_dround01 %f16, %f0, %f2, %f8
  690. aes_dround23 %f18, %f0, %f2, %f2
  691. aes_dround01 %f16, %f4, %f6, %f10
  692. aes_dround23 %f18, %f4, %f6, %f6
  693. ldd [$key + 208], %f16
  694. ldd [$key + 216], %f18
  695. aes_dround01 %f20, %f8, %f2, %f0
  696. aes_dround23 %f22, %f8, %f2, %f2
  697. aes_dround01 %f20, %f10, %f6, %f4
  698. aes_dround23 %f22, %f10, %f6, %f6
  699. ldd [$key + 224], %f20
  700. ldd [$key + 232], %f22
  701. ___
  702. for ($i=1; $i<6; $i++) {
  703. $code.=<<___;
  704. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
  705. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  706. aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
  707. aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
  708. aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
  709. aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
  710. aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
  711. aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
  712. ___
  713. }
  714. $code.=<<___;
  715. aes_dround01 %f16, %f0, %f2, %f8
  716. aes_dround23 %f18, %f0, %f2, %f2
  717. aes_dround01 %f16, %f4, %f6, %f10
  718. aes_dround23 %f18, %f4, %f6, %f6
  719. ldd [$key + 16], %f16
  720. ldd [$key + 24], %f18
  721. aes_dround01_l %f20, %f8, %f2, %f0
  722. aes_dround23_l %f22, %f8, %f2, %f2
  723. aes_dround01_l %f20, %f10, %f6, %f4
  724. aes_dround23_l %f22, %f10, %f6, %f6
  725. ldd [$key + 32], %f20
  726. retl
  727. ldd [$key + 40], %f22
  728. .type _aes256_decrypt_2x,#function
  729. .size _aes256_decrypt_2x,.-_aes256_decrypt_2x
  730. .align 32
  731. _aes192_decrypt_1x:
  732. ___
  733. for ($i=0; $i<5; $i++) {
  734. $code.=<<___;
  735. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f4
  736. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  737. aes_dround01 %f`16+8*$i+4`, %f4, %f2, %f0
  738. aes_dround23 %f`16+8*$i+6`, %f4, %f2, %f2
  739. ___
  740. }
  741. $code.=<<___;
  742. aes_dround01 %f56, %f0, %f2, %f4
  743. aes_dround23 %f58, %f0, %f2, %f2
  744. aes_dround01_l %f60, %f4, %f2, %f0
  745. retl
  746. aes_dround23_l %f62, %f4, %f2, %f2
  747. .type _aes192_decrypt_1x,#function
  748. .size _aes192_decrypt_1x,.-_aes192_decrypt_1x
  749. .align 32
  750. _aes192_decrypt_2x:
  751. ___
  752. for ($i=0; $i<5; $i++) {
  753. $code.=<<___;
  754. aes_dround01 %f`16+8*$i+0`, %f0, %f2, %f8
  755. aes_dround23 %f`16+8*$i+2`, %f0, %f2, %f2
  756. aes_dround01 %f`16+8*$i+0`, %f4, %f6, %f10
  757. aes_dround23 %f`16+8*$i+2`, %f4, %f6, %f6
  758. aes_dround01 %f`16+8*$i+4`, %f8, %f2, %f0
  759. aes_dround23 %f`16+8*$i+6`, %f8, %f2, %f2
  760. aes_dround01 %f`16+8*$i+4`, %f10, %f6, %f4
  761. aes_dround23 %f`16+8*$i+6`, %f10, %f6, %f6
  762. ___
  763. }
  764. $code.=<<___;
  765. aes_dround01 %f56, %f0, %f2, %f8
  766. aes_dround23 %f58, %f0, %f2, %f2
  767. aes_dround01 %f56, %f4, %f6, %f10
  768. aes_dround23 %f58, %f4, %f6, %f6
  769. aes_dround01_l %f60, %f8, %f2, %f0
  770. aes_dround23_l %f62, %f8, %f2, %f2
  771. aes_dround01_l %f60, %f10, %f6, %f4
  772. retl
  773. aes_dround23_l %f62, %f10, %f6, %f6
  774. .type _aes192_decrypt_2x,#function
  775. .size _aes192_decrypt_2x,.-_aes192_decrypt_2x
  776. ___
  777. }}}
  778. if (!$::evp) {
  779. $code.=<<___;
  780. .global AES_encrypt
  781. AES_encrypt=aes_t4_encrypt
  782. .global AES_decrypt
  783. AES_decrypt=aes_t4_decrypt
  784. .global AES_set_encrypt_key
  785. .align 32
  786. AES_set_encrypt_key:
  787. andcc %o2, 7, %g0 ! check alignment
  788. bnz,a,pn %icc, 1f
  789. mov -1, %o0
  790. brz,a,pn %o0, 1f
  791. mov -1, %o0
  792. brz,a,pn %o2, 1f
  793. mov -1, %o0
  794. andncc %o1, 0x1c0, %g0
  795. bnz,a,pn %icc, 1f
  796. mov -2, %o0
  797. cmp %o1, 128
  798. bl,a,pn %icc, 1f
  799. mov -2, %o0
  800. b aes_t4_set_encrypt_key
  801. nop
  802. 1: retl
  803. nop
  804. .type AES_set_encrypt_key,#function
  805. .size AES_set_encrypt_key,.-AES_set_encrypt_key
  806. .global AES_set_decrypt_key
  807. .align 32
  808. AES_set_decrypt_key:
  809. andcc %o2, 7, %g0 ! check alignment
  810. bnz,a,pn %icc, 1f
  811. mov -1, %o0
  812. brz,a,pn %o0, 1f
  813. mov -1, %o0
  814. brz,a,pn %o2, 1f
  815. mov -1, %o0
  816. andncc %o1, 0x1c0, %g0
  817. bnz,a,pn %icc, 1f
  818. mov -2, %o0
  819. cmp %o1, 128
  820. bl,a,pn %icc, 1f
  821. mov -2, %o0
  822. b aes_t4_set_decrypt_key
  823. nop
  824. 1: retl
  825. nop
  826. .type AES_set_decrypt_key,#function
  827. .size AES_set_decrypt_key,.-AES_set_decrypt_key
  828. ___
  829. my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));
  830. $code.=<<___;
  831. .globl AES_cbc_encrypt
  832. .align 32
  833. AES_cbc_encrypt:
  834. ld [$key + 240], %g1
  835. nop
  836. brz $enc, .Lcbc_decrypt
  837. cmp %g1, 12
  838. bl,pt %icc, aes128_t4_cbc_encrypt
  839. nop
  840. be,pn %icc, aes192_t4_cbc_encrypt
  841. nop
  842. ba aes256_t4_cbc_encrypt
  843. nop
  844. .Lcbc_decrypt:
  845. bl,pt %icc, aes128_t4_cbc_decrypt
  846. nop
  847. be,pn %icc, aes192_t4_cbc_decrypt
  848. nop
  849. ba aes256_t4_cbc_decrypt
  850. nop
  851. .type AES_cbc_encrypt,#function
  852. .size AES_cbc_encrypt,.-AES_cbc_encrypt
  853. ___
  854. }
  855. $code.=<<___;
  856. .asciz "AES for SPARC T4, David S. Miller, Andy Polyakov"
  857. .align 4
  858. ___
  859. &emit_assembler();
  860. close STDOUT or die "error closing STDOUT: $!";