
#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by David S. Miller and Andy Polyakov.
# The module is licensed under 2-clause BSD license. October 2012.
# All rights reserved.
# ====================================================================

######################################################################
# AES for SPARC T4.
#
# AES round instructions complete in 3 cycles and can be issued every
# cycle. It means that round calculations should take 4*rounds cycles,
# because any given round instruction depends on result of *both*
# previous instructions:
#
#	|0 |1 |2 |3 |4
#	|01|01|01|
#	|23|23|23|
#	|01|01|...
#	|23|...
#
# Provided that fxor [with IV] takes 3 cycles to complete, critical
# path length for CBC encrypt would be 3+4*rounds, or in other words
# it should process one byte in at least (3+4*rounds)/16 cycles. This
# estimate doesn't account for "collateral" instructions, such as
# fetching input from memory, xor-ing it with zero-round key and
# storing the result. Yet, *measured* performance [for data aligned
# at 64-bit boundary!] deviates from this equation by less than 0.5%:
#
#		128-bit key	192-		256-
# CBC encrypt	2.70/2.90(*)	3.20/3.40	3.70/3.90
#			(*) numbers after slash are for
#			    misaligned data.
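#
# For example, with 128-bit key [10 rounds] the formula gives
# (3+4*10)/16 = 43/16 = ~2.69 cycles per byte vs. 2.70 measured,
# and with 256-bit key [14 rounds] - (3+4*14)/16 = ~3.69 vs. 3.70.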
#
# Out-of-order execution logic managed to fully overlap "collateral"
# instructions with those on critical path. Amazing!
#
# As with Intel AES-NI, question is if it's possible to improve
# performance of parallelizable modes by interleaving round
# instructions. Provided round instruction latency and throughput,
# optimal interleave factor is 2. But can we expect 2x performance
# improvement? Well, as round instructions can be issued one per
# cycle, they don't saturate the 2-way issue pipeline and therefore
# there is room for "collateral" calculations... Yet, 2x speed-up
# over CBC encrypt remains unattainable:
#
#		128-bit key	192-		256-
# CBC decrypt	1.64/2.11	1.89/2.37	2.23/2.61
# CTR		1.64/2.08(*)	1.89/2.33	2.23/2.61
#			(*) numbers after slash are for
#			    misaligned data.
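#
# E.g. for 128-bit key the actual gain over CBC encrypt is
# 2.70/1.64 = ~1.65x rather than 2x.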
#
# Estimates based on amount of instructions under assumption that
# round instructions are not pairable with any other instruction
# suggest that the latter is the actual case and pipeline runs
# underutilized. It should be noted that T4 out-of-order execution
# logic is so capable that performance gain from 2x interleave is
# not even impressive, ~7-13% over non-interleaved code, largest
# for 256-bit keys.
#
# To anchor to something else, software implementation processes
# one byte in 29 cycles with 128-bit key on same processor. Intel
# Sandy Bridge encrypts one byte in 5.07 cycles in CBC mode and
# decrypts in 0.93, naturally with AES-NI.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "sparcv9_modes.pl";

$output = pop and open STDOUT,">$output";

$::evp=1;	# if $evp is set to 0, script generates module with
# AES_[en|de]crypt, AES_set_[en|de]crypt_key and AES_cbc_encrypt entry
# points. These however are not fully compatible with openssl/aes.h,
# because they expect AES_KEY to be aligned at 64-bit boundary. When
# used through EVP, alignment is arranged at EVP layer. Second thing
# that is arranged by EVP is at least 32-bit alignment of IV.
######################################################################
# single-round subroutines
#
{
my ($inp,$out,$key,$rounds,$tmp,$mask)=map("%o$_",(0..5));

$code.=<<___;
#include "sparc_arch.h"

#ifdef	__arch64__
.register	%g2,#scratch
.register	%g3,#scratch
#endif

.text

.globl	aes_t4_encrypt
.align	32
aes_t4_encrypt:
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5
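	! %o4:%o5 now hold the input block, merged to 64-bit alignment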
1:	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14

	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5

	movxtod		%o4, %f0
	movxtod		%o5, %f2

	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key
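	! each loop iteration computes two rounds and preloads the next
	! two round keys; the counter was set to rounds/2-1 above, the
	! last two rounds are handled separately below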
.Lenc:
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_eround01	%f16, %f4, %f2, %f0
	aes_eround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Lenc
	add		$key, 32, $key
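	! last two rounds, the _l forms perform the final-round
	! transformation; output-alignment test overlaps with them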
	andcc		$out, 7, $tmp		! is output aligned?
	aes_eround01	%f12, %f0, %f2, %f4
	aes_eround23	%f14, %f0, %f2, %f2
	aes_eround01_l	%f16, %f4, %f2, %f0
	aes_eround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
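	! misaligned output: shift the result across the 8-byte
	! boundary with faligndata and trim the edges with
	! mask-controlled partial stores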
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store

.type	aes_t4_encrypt,#function
.size	aes_t4_encrypt,.-aes_t4_encrypt
.globl	aes_t4_decrypt
.align	32
aes_t4_decrypt:
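	! same flow as aes_t4_encrypt, with dround instructions and
	! the decrypt key schedule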
	andcc		$inp, 7, %g1		! is input aligned?
	andn		$inp, 7, $inp

	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5

	ldx		[$inp + 0], %o4
	bz,pt		%icc, 1f
	ldx		[$inp + 8], %o5
	ldx		[$inp + 16], $inp
	sll		%g1, 3, %g1
	sub		%g0, %g1, %o3
	sllx		%o4, %g1, %o4
	sllx		%o5, %g1, %g1
	srlx		%o5, %o3, %o5
	srlx		$inp, %o3, %o3
	or		%o5, %o4, %o4
	or		%o3, %g1, %o5

1:	ld		[$key + 240], $rounds
	ldd		[$key + 16], %f12
	ldd		[$key + 24], %f14

	xor		%g4, %o4, %o4
	xor		%g5, %o5, %o5

	movxtod		%o4, %f0
	movxtod		%o5, %f2

	srl		$rounds, 1, $rounds
	ldd		[$key + 32], %f16
	sub		$rounds, 1, $rounds
	ldd		[$key + 40], %f18
	add		$key, 48, $key

.Ldec:
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	ldd		[$key + 0], %f12
	ldd		[$key + 8], %f14
	sub		$rounds,1,$rounds
	aes_dround01	%f16, %f4, %f2, %f0
	aes_dround23	%f18, %f4, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	brnz,pt		$rounds, .Ldec
	add		$key, 32, $key

	andcc		$out, 7, $tmp		! is output aligned?
	aes_dround01	%f12, %f0, %f2, %f4
	aes_dround23	%f14, %f0, %f2, %f2
	aes_dround01_l	%f16, %f4, %f2, %f0
	aes_dround23_l	%f18, %f4, %f2, %f2

	bnz,pn		%icc, 2f
	nop

	std		%f0, [$out + 0]
	retl
	std		%f2, [$out + 8]

2:	alignaddrl	$out, %g0, $out
	mov		0xff, $mask
	srl		$mask, $tmp, $mask

	faligndata	%f0, %f0, %f4
	faligndata	%f0, %f2, %f6
	faligndata	%f2, %f2, %f8

	stda		%f4, [$out + $mask]0xc0	! partial store
	std		%f6, [$out + 8]
	add		$out, 16, $out
	orn		%g0, $mask, $mask
	retl
	stda		%f8, [$out + $mask]0xc0	! partial store

.type	aes_t4_decrypt,#function
.size	aes_t4_decrypt,.-aes_t4_decrypt
___
}
######################################################################
# key setup subroutines
#
{
my ($inp,$bits,$out,$tmp)=map("%o$_",(0..5));

$code.=<<___;
.globl	aes_t4_set_encrypt_key
.align	32
aes_t4_set_encrypt_key:
.Lset_encrypt_key:
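	! dispatch on key size: <192 -> .L128, =192 -> .L192, larger
	! falls through to the 256-bit path; input is realigned with
	! faligndata when not 8-byte aligned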
	and		$inp, 7, $tmp
	alignaddr	$inp, %g0, $inp
	cmp		$bits, 192
	ldd		[$inp + 0], %f0
	bl,pt		%icc,.L128
	ldd		[$inp + 8], %f2

	be,pt		%icc,.L192
	ldd		[$inp + 16], %f4

	brz,pt		$tmp, .L256aligned
	ldd		[$inp + 24], %f6

	ldd		[$inp + 32], %f8
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4
	faligndata	%f6, %f8, %f6

.L256aligned:
___
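# each iteration stores one 256-bit key-schedule group and expands
# the next one with aes_kexpand[012]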
for ($i=0; $i<6; $i++) {
$code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	aes_kexpand0	%f4, %f2, %f4
	std		%f6, [$out + `32*$i+24`]
	aes_kexpand2	%f6, %f4, %f6
___
}
$code.=<<___;
	std		%f0, [$out + `32*$i+0`]
	aes_kexpand1	%f0, %f6, $i, %f0
	std		%f2, [$out + `32*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `32*$i+16`]
	std		%f6, [$out + `32*$i+24`]
	std		%f0, [$out + `32*$i+32`]
	std		%f2, [$out + `32*$i+40`]

	mov		14, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L192:
	brz,pt		$tmp, .L192aligned
	nop

	ldd		[$inp + 24], %f6
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2
	faligndata	%f4, %f6, %f4

.L192aligned:
___
for ($i=0; $i<7; $i++) {
$code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	aes_kexpand2	%f4, %f2, %f4
___
}
$code.=<<___;
	std		%f0, [$out + `24*$i+0`]
	aes_kexpand1	%f0, %f4, $i, %f0
	std		%f2, [$out + `24*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
	std		%f4, [$out + `24*$i+16`]
	std		%f0, [$out + `24*$i+24`]
	std		%f2, [$out + `24*$i+32`]

	mov		12, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0

.align	16
.L128:
	brz,pt		$tmp, .L128aligned
	nop

	ldd		[$inp + 16], %f4
	faligndata	%f0, %f2, %f0
	faligndata	%f2, %f4, %f2

.L128aligned:
___
for ($i=0; $i<10; $i++) {
$code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	aes_kexpand1	%f0, %f2, $i, %f0
	std		%f2, [$out + `16*$i+8`]
	aes_kexpand2	%f2, %f0, %f2
___
}
$code.=<<___;
	std		%f0, [$out + `16*$i+0`]
	std		%f2, [$out + `16*$i+8`]

	mov		10, $tmp
	st		$tmp, [$out + 240]
	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_encrypt_key,#function
.size	aes_t4_set_encrypt_key,.-aes_t4_set_encrypt_key

.globl	aes_t4_set_decrypt_key
.align	32
aes_t4_set_decrypt_key:
	mov		%o7, %o5
	call		.Lset_encrypt_key
	nop

	mov		%o5, %o7
	sll		$tmp, 4, $inp		! $tmp is number of rounds
	add		$tmp, 2, $tmp
	add		$out, $inp, $inp	! $inp=$out+16*rounds
	srl		$tmp, 2, $tmp		! $tmp=(rounds+2)/4
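	! the decrypt schedule is the encrypt schedule with round keys
	! in reverse order: swap 32-byte groups from both ends inwards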
.Lkey_flip:
	ldd		[$out + 0],  %f0
	ldd		[$out + 8],  %f2
	ldd		[$out + 16], %f4
	ldd		[$out + 24], %f6
	ldd		[$inp + 0],  %f8
	ldd		[$inp + 8],  %f10
	ldd		[$inp - 16], %f12
	ldd		[$inp - 8],  %f14
	sub		$tmp, 1, $tmp
	std		%f0, [$inp + 0]
	std		%f2, [$inp + 8]
	std		%f4, [$inp - 16]
	std		%f6, [$inp - 8]
	std		%f8, [$out + 0]
	std		%f10, [$out + 8]
	std		%f12, [$out + 16]
	std		%f14, [$out + 24]
	add		$out, 32, $out
	brnz		$tmp, .Lkey_flip
	sub		$inp, 32, $inp

	retl
	xor		%o0, %o0, %o0
.type	aes_t4_set_decrypt_key,#function
.size	aes_t4_set_decrypt_key,.-aes_t4_set_decrypt_key
___
}
{{{
my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
my ($ileft,$iright,$ooff,$omask,$ivoff)=map("%l$_",(1..7));

$code.=<<___;
.align	32
_aes128_encrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f4
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01_l	%f52, %f4, %f2, %f0
	retl
	aes_eround23_l	%f54, %f4, %f2, %f2
.type	_aes128_encrypt_1x,#function
.size	_aes128_encrypt_1x,.-_aes128_encrypt_1x

.align	32
_aes128_encrypt_2x:
___
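# 2x interleave: two independent blocks in %f0-%f2 and %f4-%f6 keep
# the 3-cycle round-instruction latency covered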
for ($i=0; $i<4; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f48, %f0, %f2, %f8
	aes_eround23	%f50, %f0, %f2, %f2
	aes_eround01	%f48, %f4, %f6, %f10
	aes_eround23	%f50, %f4, %f6, %f6
	aes_eround01_l	%f52, %f8, %f2, %f0
	aes_eround23_l	%f54, %f8, %f2, %f2
	aes_eround01_l	%f52, %f10, %f6, %f4
	retl
	aes_eround23_l	%f54, %f10, %f6, %f6
.type	_aes128_encrypt_2x,#function
.size	_aes128_encrypt_2x,.-_aes128_encrypt_2x

.align	32
_aes128_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
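# preload the remaining 160 bytes of the 128-bit key schedule, as
# 20 8-byte halves, into %f16-%f54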
for ($i=2; $i<22;$i++) {			# load key schedule
$code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes128_loadkey,#function
.size	_aes128_loadkey,.-_aes128_loadkey
_aes128_load_enckey=_aes128_loadkey
_aes128_load_deckey=_aes128_loadkey
___
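# CBC/CTR/XTS entry points are instantiated from sparcv9_modes.pl
# around the _1x/_2x and _load*key subroutines above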
&alg_cbc_encrypt_implement("aes",128);
if ($::evp) {
	&alg_ctr32_implement("aes",128);
	&alg_xts_implement("aes",128,"en");
	&alg_xts_implement("aes",128,"de");
}
&alg_cbc_decrypt_implement("aes",128);
$code.=<<___;
.align	32
_aes128_decrypt_1x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f4
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01_l	%f52, %f4, %f2, %f0
	retl
	aes_dround23_l	%f54, %f4, %f2, %f2
.type	_aes128_decrypt_1x,#function
.size	_aes128_decrypt_1x,.-_aes128_decrypt_1x

.align	32
_aes128_decrypt_2x:
___
for ($i=0; $i<4; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f48, %f0, %f2, %f8
	aes_dround23	%f50, %f0, %f2, %f2
	aes_dround01	%f48, %f4, %f6, %f10
	aes_dround23	%f50, %f4, %f6, %f6
	aes_dround01_l	%f52, %f8, %f2, %f0
	aes_dround23_l	%f54, %f8, %f2, %f2
	aes_dround01_l	%f52, %f10, %f6, %f4
	retl
	aes_dround23_l	%f54, %f10, %f6, %f6
.type	_aes128_decrypt_2x,#function
.size	_aes128_decrypt_2x,.-_aes128_decrypt_2x
___
$code.=<<___;
.align	32
_aes192_encrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f4
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01_l	%f60, %f4, %f2, %f0
	retl
	aes_eround23_l	%f62, %f4, %f2, %f2
.type	_aes192_encrypt_1x,#function
.size	_aes192_encrypt_1x,.-_aes192_encrypt_1x

.align	32
_aes192_encrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f56, %f0, %f2, %f8
	aes_eround23	%f58, %f0, %f2, %f2
	aes_eround01	%f56, %f4, %f6, %f10
	aes_eround23	%f58, %f4, %f6, %f6
	aes_eround01_l	%f60, %f8, %f2, %f0
	aes_eround23_l	%f62, %f8, %f2, %f2
	aes_eround01_l	%f60, %f10, %f6, %f4
	retl
	aes_eround23_l	%f62, %f10, %f6, %f6
.type	_aes192_encrypt_2x,#function
.size	_aes192_encrypt_2x,.-_aes192_encrypt_2x

.align	32
_aes256_encrypt_1x:
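	! the 15 256-bit round keys do not fit in %f16-%f62, so
	! %f16-%f22 double up for the last rounds: they are reloaded
	! on the fly and restored before returning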
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f4, %f2, %f0
	aes_eround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f4
	aes_eround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f4, %f2, %f0
	aes_eround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_1x,#function
.size	_aes256_encrypt_1x,.-_aes256_encrypt_1x

.align	32
_aes256_encrypt_2x:
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_eround01	%f20, %f8, %f2, %f0
	aes_eround23	%f22, %f8, %f2, %f2
	aes_eround01	%f20, %f10, %f6, %f4
	aes_eround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
	aes_eround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_eround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_eround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_eround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_eround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_eround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_eround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_eround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_eround01	%f16, %f0, %f2, %f8
	aes_eround23	%f18, %f0, %f2, %f2
	aes_eround01	%f16, %f4, %f6, %f10
	aes_eround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_eround01_l	%f20, %f8, %f2, %f0
	aes_eround23_l	%f22, %f8, %f2, %f2
	aes_eround01_l	%f20, %f10, %f6, %f4
	aes_eround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_encrypt_2x,#function
.size	_aes256_encrypt_2x,.-_aes256_encrypt_2x

.align	32
_aes192_loadkey:
	ldx		[$key + 0], %g4
	ldx		[$key + 8], %g5
___
for ($i=2; $i<26;$i++) {			# load key schedule
$code.=<<___;
	ldd		[$key + `8*$i`], %f`12+2*$i`
___
}
$code.=<<___;
	retl
	nop
.type	_aes192_loadkey,#function
.size	_aes192_loadkey,.-_aes192_loadkey
_aes256_loadkey=_aes192_loadkey
_aes192_load_enckey=_aes192_loadkey
_aes192_load_deckey=_aes192_loadkey
_aes256_load_enckey=_aes192_loadkey
_aes256_load_deckey=_aes192_loadkey
___
&alg_cbc_encrypt_implement("aes",256);
&alg_cbc_encrypt_implement("aes",192);
if ($::evp) {
	&alg_ctr32_implement("aes",256);
	&alg_xts_implement("aes",256,"en");
	&alg_xts_implement("aes",256,"de");
	&alg_ctr32_implement("aes",192);
}
&alg_cbc_decrypt_implement("aes",192);
&alg_cbc_decrypt_implement("aes",256);
$code.=<<___;
.align	32
_aes256_decrypt_1x:
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f4, %f2, %f0
	aes_dround23	%f22, %f4, %f2, %f2
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f4
	aes_dround23	%f18, %f0, %f2, %f2
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f4, %f2, %f0
	aes_dround23_l	%f22, %f4, %f2, %f2
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_1x,#function
.size	_aes256_decrypt_1x,.-_aes256_decrypt_1x

.align	32
_aes256_decrypt_2x:
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 208], %f16
	ldd		[$key + 216], %f18
	aes_dround01	%f20, %f8, %f2, %f0
	aes_dround23	%f22, %f8, %f2, %f2
	aes_dround01	%f20, %f10, %f6, %f4
	aes_dround23	%f22, %f10, %f6, %f6
	ldd		[$key + 224], %f20
	ldd		[$key + 232], %f22
___
for ($i=1; $i<6; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f16, %f0, %f2, %f8
	aes_dround23	%f18, %f0, %f2, %f2
	aes_dround01	%f16, %f4, %f6, %f10
	aes_dround23	%f18, %f4, %f6, %f6
	ldd		[$key + 16], %f16
	ldd		[$key + 24], %f18
	aes_dround01_l	%f20, %f8, %f2, %f0
	aes_dround23_l	%f22, %f8, %f2, %f2
	aes_dround01_l	%f20, %f10, %f6, %f4
	aes_dround23_l	%f22, %f10, %f6, %f6
	ldd		[$key + 32], %f20
	retl
	ldd		[$key + 40], %f22
.type	_aes256_decrypt_2x,#function
.size	_aes256_decrypt_2x,.-_aes256_decrypt_2x

.align	32
_aes192_decrypt_1x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f4
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f4, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f4, %f2, %f2
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f4
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01_l	%f60, %f4, %f2, %f0
	retl
	aes_dround23_l	%f62, %f4, %f2, %f2
.type	_aes192_decrypt_1x,#function
.size	_aes192_decrypt_1x,.-_aes192_decrypt_1x

.align	32
_aes192_decrypt_2x:
___
for ($i=0; $i<5; $i++) {
$code.=<<___;
	aes_dround01	%f`16+8*$i+0`, %f0, %f2, %f8
	aes_dround23	%f`16+8*$i+2`, %f0, %f2, %f2
	aes_dround01	%f`16+8*$i+0`, %f4, %f6, %f10
	aes_dround23	%f`16+8*$i+2`, %f4, %f6, %f6
	aes_dround01	%f`16+8*$i+4`, %f8, %f2, %f0
	aes_dround23	%f`16+8*$i+6`, %f8, %f2, %f2
	aes_dround01	%f`16+8*$i+4`, %f10, %f6, %f4
	aes_dround23	%f`16+8*$i+6`, %f10, %f6, %f6
___
}
$code.=<<___;
	aes_dround01	%f56, %f0, %f2, %f8
	aes_dround23	%f58, %f0, %f2, %f2
	aes_dround01	%f56, %f4, %f6, %f10
	aes_dround23	%f58, %f4, %f6, %f6
	aes_dround01_l	%f60, %f8, %f2, %f0
	aes_dround23_l	%f62, %f8, %f2, %f2
	aes_dround01_l	%f60, %f10, %f6, %f4
	retl
	aes_dround23_l	%f62, %f10, %f6, %f6
.type	_aes192_decrypt_2x,#function
.size	_aes192_decrypt_2x,.-_aes192_decrypt_2x
___
}}}
if (!$::evp) {
$code.=<<___;
.global	AES_encrypt
AES_encrypt=aes_t4_encrypt
.global	AES_decrypt
AES_decrypt=aes_t4_decrypt
.global	AES_set_encrypt_key
.align	32
AES_set_encrypt_key:
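	! argument checks: 64-bit aligned key-schedule pointer,
	! non-NULL input and key pointers, supported key size;
	! returns -1 or -2 respectively on failure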
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_encrypt_key
	nop
1:	retl
	nop
.type	AES_set_encrypt_key,#function
.size	AES_set_encrypt_key,.-AES_set_encrypt_key

.global	AES_set_decrypt_key
.align	32
AES_set_decrypt_key:
	andcc		%o2, 7, %g0		! check alignment
	bnz,a,pn	%icc, 1f
	mov		-1, %o0
	brz,a,pn	%o0, 1f
	mov		-1, %o0
	brz,a,pn	%o2, 1f
	mov		-1, %o0
	andncc		%o1, 0x1c0, %g0
	bnz,a,pn	%icc, 1f
	mov		-2, %o0
	cmp		%o1, 128
	bl,a,pn		%icc, 1f
	mov		-2, %o0
	b		aes_t4_set_decrypt_key
	nop
1:	retl
	nop
.type	AES_set_decrypt_key,#function
.size	AES_set_decrypt_key,.-AES_set_decrypt_key
___

my ($inp,$out,$len,$key,$ivec,$enc)=map("%o$_",(0..5));

$code.=<<___;
.globl	AES_cbc_encrypt
.align	32
AES_cbc_encrypt:
	ld		[$key + 240], %g1
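	! %g1 = number of rounds; 10/12/14 selects the 128/192/256-bit
	! code path, the comparison result is shared by both branches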
	nop
	brz		$enc, .Lcbc_decrypt
	cmp		%g1, 12

	bl,pt		%icc, aes128_t4_cbc_encrypt
	nop
	be,pn		%icc, aes192_t4_cbc_encrypt
	nop
	ba		aes256_t4_cbc_encrypt
	nop

.Lcbc_decrypt:
	bl,pt		%icc, aes128_t4_cbc_decrypt
	nop
	be,pn		%icc, aes192_t4_cbc_decrypt
	nop
	ba		aes256_t4_cbc_decrypt
	nop
.type	AES_cbc_encrypt,#function
.size	AES_cbc_encrypt,.-AES_cbc_encrypt
___
}
$code.=<<___;
.asciz	"AES for SPARC T4, David S. Miller, Andy Polyakov"
.align	4
___

&emit_assembler();

close STDOUT or die "error closing STDOUT: $!";