#! /usr/bin/env perl
# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# March 2016
#
# Initial support for Fujitsu SPARC64 X/X+ comprises the minimally
# required key setup and single-block procedures.
#
# April 2016
#
# Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
# that the parallelizable nature of CBC decrypt and CTR is not utilized
# yet. CBC encrypt, on the other hand, is as good as it can possibly
# get, processing one byte in 4.1 cycles with a 128-bit key on
# SPARC64 X. This is ~6x faster than the pure software implementation...
#
# July 2016
#
# Switch from faligndata to fshiftorx, which makes it possible to omit
# alignaddr instructions and improves single-block and short-input
# performance with misaligned data.
$output = pop;
open STDOUT, ">$output" or die "can't open $output: $!";

{
my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));

$code.=<<___;
#include "sparc_arch.h"

#define LOCALS (STACK_BIAS+STACK_FRAME)

.text
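
! void aes_fx_encrypt(const unsigned char *in, unsigned char *out,
!                     const AES_KEY *key);
! (assumed C prototype, noted here for reference: in=%o0, out=%o1,
! key=%o2, with the round count at key offset 240)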
.globl	aes_fx_encrypt
.align	32
aes_fx_encrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Lenc_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f14, %f0
	fshiftorx %f2, %f4, %f14, %f2

.Lenc_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8
	brnz,a	$rounds, .Loop_enc
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	bnz,pn	%icc, .Lenc_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Lenc_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx %f0, %f0, %f14, %f4
	fshiftorx %f0, %f2, %f14, %f6
	fshiftorx %f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_encrypt,#function
.size	aes_fx_encrypt,.-aes_fx_encrypt
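
! void aes_fx_decrypt(const unsigned char *in, unsigned char *out,
!                     const AES_KEY *key);
! (assumed C prototype, noted here for reference)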
.globl	aes_fx_decrypt
.align	32
aes_fx_decrypt:
	and	$inp, 7, $tmp		! is input aligned?
	andn	$inp, 7, $inp
	ldd	[$key + 0], %f6		! round[0]
	ldd	[$key + 8], %f8
	mov	%o7, %g1
	ld	[$key + 240], $rounds

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	sll	$tmp, 3, $tmp
	ldd	[$inp + 0], %f0		! load input
	brz,pt	$tmp, .Ldec_inp_aligned
	ldd	[$inp + 8], %f2

	ldd	[%o7 + $tmp], %f14	! shift left params
	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f14, %f0
	fshiftorx %f2, %f4, %f14, %f2

.Ldec_inp_aligned:
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	%f0, %f6, %f0		! ^=round[0]
	fxor	%f2, %f8, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $key
	sub	$rounds, 4, $rounds

.Loop_dec:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10
	ldd	[$key + 24], %f12
	add	$key, 32, $key

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$key + 0], %f6
	ldd	[$key + 8], %f8
	brnz,a	$rounds, .Loop_dec
	sub	$rounds, 2, $rounds

	andcc	$out, 7, $tmp		! is output aligned?
	andn	$out, 7, $out
	mov	0xff, $mask
	srl	$mask, $tmp, $mask
	add	%o7, 64, %o7
	sll	$tmp, 3, $tmp

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[%o7 + $tmp], %f14	! shift right params

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	bnz,pn	%icc, .Ldec_out_unaligned
	mov	%g1, %o7

	std	%f0, [$out + 0]
	retl
	std	%f2, [$out + 8]

.align	16
.Ldec_out_unaligned:
	add	$out, 16, $inp
	orn	%g0, $mask, $tmp
	fshiftorx %f0, %f0, %f14, %f4
	fshiftorx %f0, %f2, %f14, %f6
	fshiftorx %f2, %f2, %f14, %f8

	stda	%f4, [$out + $mask]0xc0	! partial store
	std	%f6, [$out + 8]
	stda	%f8, [$inp + $tmp]0xc0	! partial store
	retl
	nop
.type	aes_fx_decrypt,#function
.size	aes_fx_decrypt,.-aes_fx_decrypt
___
}

{
my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));

$code.=<<___;
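! int aes_fx_set_encrypt_key(const unsigned char *userKey, const int bits,
!                            AES_KEY *key);
! int aes_fx_set_decrypt_key(const unsigned char *userKey, const int bits,
!                            AES_KEY *key);
! (assumed C prototypes, noted here for reference; both entry points
! share .Lset_encrypt_key and always return 0)
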
.globl	aes_fx_set_decrypt_key
.align	32
aes_fx_set_decrypt_key:
	b	.Lset_encrypt_key
	mov	-1, $inc
	retl
	nop
.type	aes_fx_set_decrypt_key,#function
.size	aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key

.globl	aes_fx_set_encrypt_key
.align	32
aes_fx_set_encrypt_key:
	mov	1, $inc
	nop
.Lset_encrypt_key:
	and	$inp, 7, $tmp
	andn	$inp, 7, $inp
	sll	$tmp, 3, $tmp
	mov	%o7, %g1

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ldd	[%o7 + $tmp], %f10	! shift left params
	mov	%g1, %o7

	cmp	$bits, 192
	ldd	[$inp + 0], %f0
	bl,pt	%icc, .L128
	ldd	[$inp + 8], %f2

	be,pt	%icc, .L192
	ldd	[$inp + 16], %f4
	brz,pt	$tmp, .L256aligned
	ldd	[$inp + 24], %f6

	ldd	[$inp + 32], %f8
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2
	fshiftorx %f4, %f6, %f10, %f4
	fshiftorx %f6, %f8, %f10, %f6

.L256aligned:
	mov	14, $bits
	and	$inc, `14*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<6; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx %f2, 0x01, %f4
	std	%f6, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f4, 0x00, %f6
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f6, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 0]
	std	%f6, [$out + 8]
	add	$out, $inc, $out
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L192:
	brz,pt	$tmp, .L192aligned
	nop

	ldd	[$inp + 24], %f6
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2
	fshiftorx %f4, %f6, %f10, %f4

.L192aligned:
	mov	12, $bits
	and	$inc, `12*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<8; $i+=2) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f4, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 0]
	faeskeyx %f2, 0x00, %f4
	std	%f0, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f4, `0x10+$i+1`, %f0
	std	%f2, [$out + 0]
	faeskeyx %f0, 0x00, %f2
	std	%f4, [$out + 8]
	add	$out, $inc, $out
___
    $code.=<<___ if ($i<6);
	faeskeyx %f2, 0x00, %f4
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0

.align	16
.L128:
	brz,pt	$tmp, .L128aligned
	nop

	ldd	[$inp + 16], %f4
	fshiftorx %f0, %f2, %f10, %f0
	fshiftorx %f2, %f4, %f10, %f2

.L128aligned:
	mov	10, $bits
	and	$inc, `10*16`, $tmp
	st	$bits, [$out + 240]	! store rounds
	add	$out, $tmp, $out	! start or end of key schedule
	sllx	$inc, 4, $inc		! 16 or -16
___
for ($i=0; $i<10; $i++) {
    $code.=<<___;
	std	%f0, [$out + 0]
	faeskeyx %f2, `0x10+$i`, %f0
	std	%f2, [$out + 8]
	add	$out, $inc, $out
	faeskeyx %f0, 0x00, %f2
___
}
$code.=<<___;
	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	retl
	xor	%o0, %o0, %o0		! return 0
.type	aes_fx_set_encrypt_key,#function
.size	aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
___
}

{
my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign,$oalign);

$code.=<<___;
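! void aes_fx_cbc_encrypt(const unsigned char *in, unsigned char *out,
!                         size_t length, const AES_KEY *key,
!                         unsigned char *ivec, const int enc);
! (assumed C prototype, noted here for reference; length is in bytes
! and is divided by 16 on entry, enc selects encrypt vs. decrypt)
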
.globl	aes_fx_cbc_encrypt
.align	32
aes_fx_cbc_encrypt:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 4, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lcbc_no_data
	sll	$ialign, 3, $ileft

1:	call	.+8
	add	%o7, .Linp_align-1b, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], %f0		! load ivec
	andn	$out, 7, $out
	ld	[$ivp + 4], %f1
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], %f2
	ld	[$ivp + 12], %f3

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	ldd	[%o7 + $ileft], $fshift	! shift left params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	brz	$dir, .Lcbc_decrypt
	add	$inp, $inc, $inp	! inp+=16

	fxor	$r0hi, %f0, %f0		! ivec^=round[0]
	fxor	$r0lo, %f2, %f2
	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	nop

.Loop_cbc_enc:
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lcbc_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx %f2, $rlhi, %f0
	faesenclx %f4, $rllo, %f2

	brnz,pn	$oalign, .Lcbc_enc_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_enc
	sub	$len, 1, $len

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]

.Lcbc_no_data:
	ret
	restore

.align	32
.Lcbc_enc_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_enc_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_enc_unaligned_out
	nop

.align	32
.Loop_cbc_enc_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, %f0, %f0		! inp^ivec^round[0]
	fxor	$in1, %f2, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_enc_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_enc_aligned_inp:
	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_enc_unaligned:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lcbc_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fxor	$r0hi, $in0, $in0	! inp^=round[0]
	fxor	$r0lo, $in1, $in1

	fmovd	%f0, %f4
	faesenclx %f2, $rlhi, %f0
	faesenclx %f4, $rllo, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brnz,a	$len, .Loop_cbc_enc_unaligned_out
	sub	$len, 1, $len

.Lcbc_enc_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	%f0, [$ivp + 0]		! output ivec
	st	%f1, [$ivp + 4]
	st	%f2, [$ivp + 8]
	st	%f3, [$ivp + 12]
	ret
	restore

.align	32
.Lcbc_decrypt:
	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	fmovd	%f0, $iv0
	fmovd	%f2, $iv1

.Loop_cbc_dec:
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lcbc_dec:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lcbc_dec
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	brnz,pn	$oalign, .Lcbc_dec_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_cbc_dec
	sub	$len, 1, $len

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]
	ret
	restore

.align	32
.Lcbc_dec_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lcbc_dec_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_cbc_dec_unaligned_out
	nop

.align	32
.Loop_cbc_dec_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$in0, $r0hi, %f0	! inp^round[0]
	fxor	$in1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lcbc_dec_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lcbc_dec_aligned_inp:
	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lcbc_dec_unaligned:
	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lcbc_dec_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesdecx %f2, %f6, %f0
	faesdecx %f4, %f8, %f2
	fxor	$iv0, $rlhi, %f6	! ivec^round[last]
	fxor	$iv1, $rllo, %f8
	fmovd	$in0, $iv0
	fmovd	$in1, $iv1
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesdecx %f2, %f10, %f0
	faesdecx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesdeclx %f2, %f6, %f0
	faesdeclx %f4, %f8, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brnz,a	$len, .Loop_cbc_dec_unaligned_out
	sub	$len, 1, $len

.Lcbc_dec_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	st	$iv0, [$ivp + 0]	! output ivec
	st	$iv0#lo, [$ivp + 4]
	st	$iv1, [$ivp + 8]
	st	$iv1#lo, [$ivp + 12]
	ret
	restore
.type	aes_fx_cbc_encrypt,#function
.size	aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
___
}

{
my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
my ($ileft,$iright) = ($ialign, $oalign);
my $one = "%f14";

$code.=<<___;
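! void aes_fx_ctr32_encrypt_blocks(const unsigned char *in,
!                                  unsigned char *out, size_t blocks,
!                                  const AES_KEY *key,
!                                  const unsigned char *ivec);
! (assumed C prototype, noted here for reference; blocks counts 16-byte
! blocks, and only the low 32-bit word of the counter is incremented)
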
.globl	aes_fx_ctr32_encrypt_blocks
.align	32
aes_fx_ctr32_encrypt_blocks:
	save	%sp, -STACK_FRAME-16, %sp
	srln	$len, 0, $len
	and	$inp, 7, $ialign
	andn	$inp, 7, $inp
	brz,pn	$len, .Lctr32_no_data
	sll	$ialign, 3, $ileft

.Lpic:	call	.+8
	add	%o7, .Linp_align - .Lpic, %o7

	ld	[$key + 240], $rounds
	and	$out, 7, $oalign
	ld	[$ivp + 0], $ctr0	! load counter
	andn	$out, 7, $out
	ld	[$ivp + 4], $ctr0#lo
	sll	$oalign, 3, $mask
	ld	[$ivp + 8], $ctr1
	ld	[$ivp + 12], $ctr1#lo
	ldd	[%o7 + 128], $one

	sll	$rounds, 4, $rounds
	add	$rounds, $key, $end
	ldd	[$key + 0], $r0hi	! round[0]
	ldd	[$key + 8], $r0lo

	add	$inp, 16, $inp
	sub	$len, 1, $len
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	mov	16, $inc
	movrz	$len, 0, $inc
	ldd	[$end + 0], $rlhi	! round[last]
	ldd	[$end + 8], $rllo

	ldd	[%o7 + $ileft], $fshift	! shift left params
	add	%o7, 64, %o7
	ldd	[$inp - 16], $in0	! load input
	ldd	[$inp - 8], $in1
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1

.Loop_ctr32:
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8
	add	$key, 32, $end
	sub	$rounds, 16*6, $inner

.Lctr32_enc:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lctr32_enc
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8

	movrz	$len, 0, $inc
	fmovd	$intail, $in0
	ldd	[$inp - 8], $in1	! load next input block
	ldda	[$inp]0x82, $intail	! non-faulting load
	add	$inp, $inc, $inp	! inp+=16

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fshiftorx $in0, $in1, $fshift, $in0
	fshiftorx $in1, $intail, $fshift, $in1
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	brnz,pn	$oalign, .Lctr32_unaligned_out
	nop

	std	%f0, [$out + 0]
	std	%f2, [$out + 8]
	add	$out, 16, $out

	brnz,a	$len, .Loop_ctr32
	sub	$len, 1, $len

.Lctr32_no_data:
	ret
	restore

.align	32
.Lctr32_unaligned_out:
	ldd	[%o7 + $mask], $fshift	! shift right params
	mov	0xff, $mask
	srl	$mask, $oalign, $mask
	sub	%g0, $ileft, $iright

	fshiftorx %f0, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8

	stda	%f6, [$out + $mask]0xc0	! partial store
	orn	%g0, $mask, $mask
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brz	$len, .Lctr32_unaligned_out_done
	sub	$len, 1, $len
	b	.Loop_ctr32_unaligned_out
	nop

.align	32
.Loop_ctr32_unaligned_out:
	fmovd	%f2, $outhead
	fxor	$ctr0, $r0hi, %f0	! counter^round[0]
	fxor	$ctr1, $r0lo, %f2
	ldd	[$key + 32], %f6	! round[2]
	ldd	[$key + 40], %f8

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 48], %f10	! round[3]
	ldd	[$key + 56], %f12

	ldx	[$inp - 16], %o0
	ldx	[$inp - 8], %o1
	brz	$ileft, .Lctr32_aligned_inp
	movrz	$len, 0, $inc

	ldx	[$inp], %o2
	sllx	%o0, $ileft, %o0
	srlx	%o1, $iright, %g1
	sllx	%o1, $ileft, %o1
	or	%g1, %o0, %o0
	srlx	%o2, $iright, %o2
	or	%o2, %o1, %o1

.Lctr32_aligned_inp:
	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$key + 64], %f6	! round[4]
	ldd	[$key + 72], %f8
	add	$key, 64, $end
	sub	$rounds, 16*8, $inner

	stx	%o0, [%sp + LOCALS + 0]
	stx	%o1, [%sp + LOCALS + 8]
	add	$inp, $inc, $inp	! inp+=16
	nop

.Lctr32_enc_unaligned:
	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10
	ldd	[$end + 24], %f12
	add	$end, 32, $end

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	ldd	[$end + 0], %f6
	ldd	[$end + 8], %f8
	brnz,a	$inner, .Lctr32_enc_unaligned
	sub	$inner, 16*2, $inner

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$end + 16], %f10	! round[last-1]
	ldd	[$end + 24], %f12
	fpadd32	$ctr1, $one, $ctr1	! increment counter

	fmovd	%f0, %f4
	faesencx %f2, %f6, %f0
	faesencx %f4, %f8, %f2
	fxor	$in0, $rlhi, %f6	! inp^round[last]
	fxor	$in1, $rllo, %f8
	ldd	[%sp + LOCALS + 0], $in0
	ldd	[%sp + LOCALS + 8], $in1

	fmovd	%f0, %f4
	faesencx %f2, %f10, %f0
	faesencx %f4, %f12, %f2
	ldd	[$key + 16], %f10	! round[1]
	ldd	[$key + 24], %f12

	fmovd	%f0, %f4
	faesenclx %f2, %f6, %f0
	faesenclx %f4, %f8, %f2

	fshiftorx $outhead, %f0, $fshift, %f6
	fshiftorx %f0, %f2, $fshift, %f8
	std	%f6, [$out + 0]
	std	%f8, [$out + 8]
	add	$out, 16, $out
	brnz,a	$len, .Loop_ctr32_unaligned_out
	sub	$len, 1, $len

.Lctr32_unaligned_out_done:
	fshiftorx %f2, %f2, $fshift, %f8
	stda	%f8, [$out + $mask]0xc0	! partial store

	ret
	restore
.type	aes_fx_ctr32_encrypt_blocks,#function
.size	aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks

.align	32
.Linp_align:		! fshiftorx parameters for left shift toward %rs1
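			! eight 8-byte rows, one per byte misalignment
			! 0-7; the code above selects a row with
			! ldd [%o7 + (address & 7)*8]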
	.byte	0, 0, 64,  0, 0, 64,  0, -64
	.byte	0, 0, 56,  8, 0, 56,  8, -56
	.byte	0, 0, 48, 16, 0, 48, 16, -48
	.byte	0, 0, 40, 24, 0, 40, 24, -40
	.byte	0, 0, 32, 32, 0, 32, 32, -32
	.byte	0, 0, 24, 40, 0, 24, 40, -24
	.byte	0, 0, 16, 48, 0, 16, 48, -16
	.byte	0, 0,  8, 56, 0,  8, 56,  -8
.Lout_align:		! fshiftorx parameters for right shift toward %rs2
	.byte	0, 0,  0, 64, 0,  0, 64,   0
	.byte	0, 0,  8, 56, 0,  8, 56,  -8
	.byte	0, 0, 16, 48, 0, 16, 48, -16
	.byte	0, 0, 24, 40, 0, 24, 40, -24
	.byte	0, 0, 32, 32, 0, 32, 32, -32
	.byte	0, 0, 40, 24, 0, 40, 24, -40
	.byte	0, 0, 48, 16, 0, 48, 16, -48
	.byte	0, 0, 56,  8, 0, 56,  8, -56
.Lone:
	.word	0, 1
.asciz	"AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
.align	4
___
}
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that the module can be assembled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# preserve the option of producing a "universal" binary and to let the
# programmer detect at run-time whether the current CPU is VIS-capable.
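#
# For example (a sketch worked from the encoding formula in unfx() below,
# not literal output captured from this module): "faesencx %f2, %f10, %f0"
# has rd=0, rs1=2, opf=0x90 and rs2=10, so it would be emitted as
#	.word	0x81b0920a	! faesencx %f2,%f10,%f0
# i.e. 2<<30|rd<<25|0x36<<19|rs1<<14|opf<<5|rs2.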
sub unvis {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %visopf = (	"faligndata"	=> 0x048,
		"bshuffle"	=> 0x04c,
		"fpadd32"	=> 0x052,
		"fxor"		=> 0x06c,
		"fsrc2"		=> 0x078	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (	"alignaddr"	=> 0x018,
		"bmask"		=> 0x019,
		"alignaddrl"	=> 0x01a	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
	foreach ($rs1,$rs2,$rd) {
	    return $ref if (!/%([goli])([0-9])/);
	    $_=$bias{$1}+$2;
	}

	return	sprintf ".word\t0x%08x !%s",
			0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfx {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"faesencx"	=> 0x90,
		"faesdecx"	=> 0x91,
		"faesenclx"	=> 0x92,
		"faesdeclx"	=> 0x93,
		"faeskeyx"	=> 0x94	);

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	$rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
	$rs2 = oct($rs2) if ($rs2 =~ /^0/);

	foreach ($rs1,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}

sub unfx3src {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %aesopf = (	"fshiftorx"	=> 0x0b	);

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if (defined($opf=$aesopf{$mnemonic})) {
	foreach ($rs1,$rs2,$rs3,$rd) {
	    return $ref if (!/%f([0-9]{1,2})/);
	    $_=$1;
	    if ($1>=32) {
		return $ref if ($1&1);
		# re-encode for upper double register addressing
		$_=($1|$1>>5)&31;
	    }
	}

	return	sprintf ".word\t0x%08x !%s",
			2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
			$ref;
    } else {
	return $ref;
    }
}
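
# Post-process the generated code: evaluate `...` expressions, rewrite the
# %fN#lo shorthand as the odd half of a double register (%f(N+1)), and pass
# AES-FX and VIS mnemonics through the encoders above so that stock
# assemblers accept the output; anything unmatched is printed as-is.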
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
		&unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
		&unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
		&unvis3($1,$2,$3,$4)
     /ge;

    print $_,"\n";
}

close STDOUT;