2
0

sparcv9_modes.pl 38 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # Specific modes implementations for SPARC Architecture 2011. There
  9. # is T4 dependency though, an ASI value that is not specified in the
  10. # Architecture Manual. But as SPARC universe is rather monocultural,
  11. # we imply that processor capable of executing crypto instructions
  12. # can handle the ASI in question as well. This means that we ought to
  13. # keep eyes open when new processors emerge...
  14. #
  15. # As for above mentioned ASI. It's so called "block initializing
  16. # store" which cancels "read" in "read-update-write" on cache lines.
  17. # This is "cooperative" optimization, as it reduces overall pressure
  18. # on memory interface. Benefits can't be observed/quantified with
  19. # usual benchmarks, on the contrary you can notice that single-thread
  20. # performance for parallelizable modes is ~1.5% worse for largest
  21. # block sizes [though few percent better for not so long ones]. All
  22. # this based on suggestions from David Miller.
  23. $::bias="STACK_BIAS";
  24. $::frame="STACK_FRAME";
  25. $::size_t_cc="SIZE_T_CC";
  26. sub asm_init { # to be called with @ARGV as argument
  27. for (@_) { $::abibits=64 if (/\-m64/ || /\-xarch\=v9/); }
  28. if ($::abibits==64) { $::bias=2047; $::frame=192; $::size_t_cc="%xcc"; }
  29. else { $::bias=0; $::frame=112; $::size_t_cc="%icc"; }
  30. }
  31. # unified interface
  32. my ($inp,$out,$len,$key,$ivec)=map("%i$_",(0..5));
  33. # local variables
  34. my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
  35. sub alg_cbc_encrypt_implement {
  36. my ($alg,$bits) = @_;
  37. $::code.=<<___;
  38. .globl ${alg}${bits}_t4_cbc_encrypt
  39. .align 32
  40. ${alg}${bits}_t4_cbc_encrypt:
  41. save %sp, -$::frame, %sp
  42. cmp $len, 0
  43. be,pn $::size_t_cc, .L${bits}_cbc_enc_abort
  44. srln $len, 0, $len ! needed on v8+, "nop" on v9
  45. sub $inp, $out, $blk_init ! $inp!=$out
  46. ___
  47. $::code.=<<___ if (!$::evp);
  48. andcc $ivec, 7, $ivoff
  49. alignaddr $ivec, %g0, $ivec
  50. ldd [$ivec + 0], %f0 ! load ivec
  51. bz,pt %icc, 1f
  52. ldd [$ivec + 8], %f2
  53. ldd [$ivec + 16], %f4
  54. faligndata %f0, %f2, %f0
  55. faligndata %f2, %f4, %f2
  56. 1:
  57. ___
  58. $::code.=<<___ if ($::evp);
  59. ld [$ivec + 0], %f0
  60. ld [$ivec + 4], %f1
  61. ld [$ivec + 8], %f2
  62. ld [$ivec + 12], %f3
  63. ___
  64. $::code.=<<___;
  65. prefetch [$inp], 20
  66. prefetch [$inp + 63], 20
  67. call _${alg}${bits}_load_enckey
  68. and $inp, 7, $ileft
  69. andn $inp, 7, $inp
  70. sll $ileft, 3, $ileft
  71. mov 64, $iright
  72. mov 0xff, $omask
  73. sub $iright, $ileft, $iright
  74. and $out, 7, $ooff
  75. cmp $len, 127
  76. movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
  77. movleu $::size_t_cc, 0, $blk_init ! $len<128 ||
  78. brnz,pn $blk_init, .L${bits}cbc_enc_blk ! $inp==$out)
  79. srl $omask, $ooff, $omask
  80. alignaddrl $out, %g0, $out
  81. srlx $len, 4, $len
  82. prefetch [$out], 22
  83. .L${bits}_cbc_enc_loop:
  84. ldx [$inp + 0], %o0
  85. brz,pt $ileft, 4f
  86. ldx [$inp + 8], %o1
  87. ldx [$inp + 16], %o2
  88. sllx %o0, $ileft, %o0
  89. srlx %o1, $iright, %g1
  90. sllx %o1, $ileft, %o1
  91. or %g1, %o0, %o0
  92. srlx %o2, $iright, %o2
  93. or %o2, %o1, %o1
  94. 4:
  95. xor %g4, %o0, %o0 ! ^= rk[0]
  96. xor %g5, %o1, %o1
  97. movxtod %o0, %f12
  98. movxtod %o1, %f14
  99. fxor %f12, %f0, %f0 ! ^= ivec
  100. fxor %f14, %f2, %f2
  101. prefetch [$out + 63], 22
  102. prefetch [$inp + 16+63], 20
  103. call _${alg}${bits}_encrypt_1x
  104. add $inp, 16, $inp
  105. brnz,pn $ooff, 2f
  106. sub $len, 1, $len
  107. std %f0, [$out + 0]
  108. std %f2, [$out + 8]
  109. brnz,pt $len, .L${bits}_cbc_enc_loop
  110. add $out, 16, $out
  111. ___
  112. $::code.=<<___ if ($::evp);
  113. st %f0, [$ivec + 0]
  114. st %f1, [$ivec + 4]
  115. st %f2, [$ivec + 8]
  116. st %f3, [$ivec + 12]
  117. ___
  118. $::code.=<<___ if (!$::evp);
  119. brnz,pn $ivoff, 3f
  120. nop
  121. std %f0, [$ivec + 0] ! write out ivec
  122. std %f2, [$ivec + 8]
  123. ___
  124. $::code.=<<___;
  125. .L${bits}_cbc_enc_abort:
  126. ret
  127. restore
  128. .align 16
  129. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  130. ! and ~3x deterioration
  131. ! in inp==out case
  132. faligndata %f0, %f0, %f4 ! handle unaligned output
  133. faligndata %f0, %f2, %f6
  134. faligndata %f2, %f2, %f8
  135. stda %f4, [$out + $omask]0xc0 ! partial store
  136. std %f6, [$out + 8]
  137. add $out, 16, $out
  138. orn %g0, $omask, $omask
  139. stda %f8, [$out + $omask]0xc0 ! partial store
  140. brnz,pt $len, .L${bits}_cbc_enc_loop+4
  141. orn %g0, $omask, $omask
  142. ___
  143. $::code.=<<___ if ($::evp);
  144. st %f0, [$ivec + 0]
  145. st %f1, [$ivec + 4]
  146. st %f2, [$ivec + 8]
  147. st %f3, [$ivec + 12]
  148. ___
  149. $::code.=<<___ if (!$::evp);
  150. brnz,pn $ivoff, 3f
  151. nop
  152. std %f0, [$ivec + 0] ! write out ivec
  153. std %f2, [$ivec + 8]
  154. ret
  155. restore
  156. .align 16
  157. 3: alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
  158. mov 0xff, $omask
  159. srl $omask, $ivoff, $omask
  160. faligndata %f0, %f0, %f4
  161. faligndata %f0, %f2, %f6
  162. faligndata %f2, %f2, %f8
  163. stda %f4, [$ivec + $omask]0xc0
  164. std %f6, [$ivec + 8]
  165. add $ivec, 16, $ivec
  166. orn %g0, $omask, $omask
  167. stda %f8, [$ivec + $omask]0xc0
  168. ___
  169. $::code.=<<___;
  170. ret
  171. restore
  172. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  173. .align 32
  174. .L${bits}cbc_enc_blk:
  175. add $out, $len, $blk_init
  176. and $blk_init, 63, $blk_init ! tail
  177. sub $len, $blk_init, $len
  178. add $blk_init, 15, $blk_init ! round up to 16n
  179. srlx $len, 4, $len
  180. srl $blk_init, 4, $blk_init
  181. .L${bits}_cbc_enc_blk_loop:
  182. ldx [$inp + 0], %o0
  183. brz,pt $ileft, 5f
  184. ldx [$inp + 8], %o1
  185. ldx [$inp + 16], %o2
  186. sllx %o0, $ileft, %o0
  187. srlx %o1, $iright, %g1
  188. sllx %o1, $ileft, %o1
  189. or %g1, %o0, %o0
  190. srlx %o2, $iright, %o2
  191. or %o2, %o1, %o1
  192. 5:
  193. xor %g4, %o0, %o0 ! ^= rk[0]
  194. xor %g5, %o1, %o1
  195. movxtod %o0, %f12
  196. movxtod %o1, %f14
  197. fxor %f12, %f0, %f0 ! ^= ivec
  198. fxor %f14, %f2, %f2
  199. prefetch [$inp + 16+63], 20
  200. call _${alg}${bits}_encrypt_1x
  201. add $inp, 16, $inp
  202. sub $len, 1, $len
  203. stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  204. add $out, 8, $out
  205. stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  206. brnz,pt $len, .L${bits}_cbc_enc_blk_loop
  207. add $out, 8, $out
  208. membar #StoreLoad|#StoreStore
  209. brnz,pt $blk_init, .L${bits}_cbc_enc_loop
  210. mov $blk_init, $len
  211. ___
  212. $::code.=<<___ if ($::evp);
  213. st %f0, [$ivec + 0]
  214. st %f1, [$ivec + 4]
  215. st %f2, [$ivec + 8]
  216. st %f3, [$ivec + 12]
  217. ___
  218. $::code.=<<___ if (!$::evp);
  219. brnz,pn $ivoff, 3b
  220. nop
  221. std %f0, [$ivec + 0] ! write out ivec
  222. std %f2, [$ivec + 8]
  223. ___
  224. $::code.=<<___;
  225. ret
  226. restore
  227. .type ${alg}${bits}_t4_cbc_encrypt,#function
  228. .size ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
  229. ___
  230. }
  231. sub alg_cbc_decrypt_implement {
  232. my ($alg,$bits) = @_;
  233. $::code.=<<___;
  234. .globl ${alg}${bits}_t4_cbc_decrypt
  235. .align 32
  236. ${alg}${bits}_t4_cbc_decrypt:
  237. save %sp, -$::frame, %sp
  238. cmp $len, 0
  239. be,pn $::size_t_cc, .L${bits}_cbc_dec_abort
  240. srln $len, 0, $len ! needed on v8+, "nop" on v9
  241. sub $inp, $out, $blk_init ! $inp!=$out
  242. ___
  243. $::code.=<<___ if (!$::evp);
  244. andcc $ivec, 7, $ivoff
  245. alignaddr $ivec, %g0, $ivec
  246. ldd [$ivec + 0], %f12 ! load ivec
  247. bz,pt %icc, 1f
  248. ldd [$ivec + 8], %f14
  249. ldd [$ivec + 16], %f0
  250. faligndata %f12, %f14, %f12
  251. faligndata %f14, %f0, %f14
  252. 1:
  253. ___
  254. $::code.=<<___ if ($::evp);
  255. ld [$ivec + 0], %f12 ! load ivec
  256. ld [$ivec + 4], %f13
  257. ld [$ivec + 8], %f14
  258. ld [$ivec + 12], %f15
  259. ___
  260. $::code.=<<___;
  261. prefetch [$inp], 20
  262. prefetch [$inp + 63], 20
  263. call _${alg}${bits}_load_deckey
  264. and $inp, 7, $ileft
  265. andn $inp, 7, $inp
  266. sll $ileft, 3, $ileft
  267. mov 64, $iright
  268. mov 0xff, $omask
  269. sub $iright, $ileft, $iright
  270. and $out, 7, $ooff
  271. cmp $len, 255
  272. movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
  273. movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
  274. brnz,pn $blk_init, .L${bits}cbc_dec_blk ! $inp==$out)
  275. srl $omask, $ooff, $omask
  276. andcc $len, 16, %g0 ! is number of blocks even?
  277. srlx $len, 4, $len
  278. alignaddrl $out, %g0, $out
  279. bz %icc, .L${bits}_cbc_dec_loop2x
  280. prefetch [$out], 22
  281. .L${bits}_cbc_dec_loop:
  282. ldx [$inp + 0], %o0
  283. brz,pt $ileft, 4f
  284. ldx [$inp + 8], %o1
  285. ldx [$inp + 16], %o2
  286. sllx %o0, $ileft, %o0
  287. srlx %o1, $iright, %g1
  288. sllx %o1, $ileft, %o1
  289. or %g1, %o0, %o0
  290. srlx %o2, $iright, %o2
  291. or %o2, %o1, %o1
  292. 4:
  293. xor %g4, %o0, %o2 ! ^= rk[0]
  294. xor %g5, %o1, %o3
  295. movxtod %o2, %f0
  296. movxtod %o3, %f2
  297. prefetch [$out + 63], 22
  298. prefetch [$inp + 16+63], 20
  299. call _${alg}${bits}_decrypt_1x
  300. add $inp, 16, $inp
  301. fxor %f12, %f0, %f0 ! ^= ivec
  302. fxor %f14, %f2, %f2
  303. movxtod %o0, %f12
  304. movxtod %o1, %f14
  305. brnz,pn $ooff, 2f
  306. sub $len, 1, $len
  307. std %f0, [$out + 0]
  308. std %f2, [$out + 8]
  309. brnz,pt $len, .L${bits}_cbc_dec_loop2x
  310. add $out, 16, $out
  311. ___
  312. $::code.=<<___ if ($::evp);
  313. st %f12, [$ivec + 0]
  314. st %f13, [$ivec + 4]
  315. st %f14, [$ivec + 8]
  316. st %f15, [$ivec + 12]
  317. ___
  318. $::code.=<<___ if (!$::evp);
  319. brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
  320. nop
  321. std %f12, [$ivec + 0] ! write out ivec
  322. std %f14, [$ivec + 8]
  323. ___
  324. $::code.=<<___;
  325. .L${bits}_cbc_dec_abort:
  326. ret
  327. restore
  328. .align 16
  329. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  330. ! and ~3x deterioration
  331. ! in inp==out case
  332. faligndata %f0, %f0, %f4 ! handle unaligned output
  333. faligndata %f0, %f2, %f6
  334. faligndata %f2, %f2, %f8
  335. stda %f4, [$out + $omask]0xc0 ! partial store
  336. std %f6, [$out + 8]
  337. add $out, 16, $out
  338. orn %g0, $omask, $omask
  339. stda %f8, [$out + $omask]0xc0 ! partial store
  340. brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
  341. orn %g0, $omask, $omask
  342. ___
  343. $::code.=<<___ if ($::evp);
  344. st %f12, [$ivec + 0]
  345. st %f13, [$ivec + 4]
  346. st %f14, [$ivec + 8]
  347. st %f15, [$ivec + 12]
  348. ___
  349. $::code.=<<___ if (!$::evp);
  350. brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
  351. nop
  352. std %f12, [$ivec + 0] ! write out ivec
  353. std %f14, [$ivec + 8]
  354. ___
  355. $::code.=<<___;
  356. ret
  357. restore
  358. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  359. .align 32
  360. .L${bits}_cbc_dec_loop2x:
  361. ldx [$inp + 0], %o0
  362. ldx [$inp + 8], %o1
  363. ldx [$inp + 16], %o2
  364. brz,pt $ileft, 4f
  365. ldx [$inp + 24], %o3
  366. ldx [$inp + 32], %o4
  367. sllx %o0, $ileft, %o0
  368. srlx %o1, $iright, %g1
  369. or %g1, %o0, %o0
  370. sllx %o1, $ileft, %o1
  371. srlx %o2, $iright, %g1
  372. or %g1, %o1, %o1
  373. sllx %o2, $ileft, %o2
  374. srlx %o3, $iright, %g1
  375. or %g1, %o2, %o2
  376. sllx %o3, $ileft, %o3
  377. srlx %o4, $iright, %o4
  378. or %o4, %o3, %o3
  379. 4:
  380. xor %g4, %o0, %o4 ! ^= rk[0]
  381. xor %g5, %o1, %o5
  382. movxtod %o4, %f0
  383. movxtod %o5, %f2
  384. xor %g4, %o2, %o4
  385. xor %g5, %o3, %o5
  386. movxtod %o4, %f4
  387. movxtod %o5, %f6
  388. prefetch [$out + 63], 22
  389. prefetch [$inp + 32+63], 20
  390. call _${alg}${bits}_decrypt_2x
  391. add $inp, 32, $inp
  392. movxtod %o0, %f8
  393. movxtod %o1, %f10
  394. fxor %f12, %f0, %f0 ! ^= ivec
  395. fxor %f14, %f2, %f2
  396. movxtod %o2, %f12
  397. movxtod %o3, %f14
  398. fxor %f8, %f4, %f4
  399. fxor %f10, %f6, %f6
  400. brnz,pn $ooff, 2f
  401. sub $len, 2, $len
  402. std %f0, [$out + 0]
  403. std %f2, [$out + 8]
  404. std %f4, [$out + 16]
  405. std %f6, [$out + 24]
  406. brnz,pt $len, .L${bits}_cbc_dec_loop2x
  407. add $out, 32, $out
  408. ___
  409. $::code.=<<___ if ($::evp);
  410. st %f12, [$ivec + 0]
  411. st %f13, [$ivec + 4]
  412. st %f14, [$ivec + 8]
  413. st %f15, [$ivec + 12]
  414. ___
  415. $::code.=<<___ if (!$::evp);
  416. brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
  417. nop
  418. std %f12, [$ivec + 0] ! write out ivec
  419. std %f14, [$ivec + 8]
  420. ___
  421. $::code.=<<___;
  422. ret
  423. restore
  424. .align 16
  425. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  426. ! and ~3x deterioration
  427. ! in inp==out case
  428. faligndata %f0, %f0, %f8 ! handle unaligned output
  429. faligndata %f0, %f2, %f0
  430. faligndata %f2, %f4, %f2
  431. faligndata %f4, %f6, %f4
  432. faligndata %f6, %f6, %f6
  433. stda %f8, [$out + $omask]0xc0 ! partial store
  434. std %f0, [$out + 8]
  435. std %f2, [$out + 16]
  436. std %f4, [$out + 24]
  437. add $out, 32, $out
  438. orn %g0, $omask, $omask
  439. stda %f6, [$out + $omask]0xc0 ! partial store
  440. brnz,pt $len, .L${bits}_cbc_dec_loop2x+4
  441. orn %g0, $omask, $omask
  442. ___
  443. $::code.=<<___ if ($::evp);
  444. st %f12, [$ivec + 0]
  445. st %f13, [$ivec + 4]
  446. st %f14, [$ivec + 8]
  447. st %f15, [$ivec + 12]
  448. ___
  449. $::code.=<<___ if (!$::evp);
  450. brnz,pn $ivoff, .L${bits}_cbc_dec_unaligned_ivec
  451. nop
  452. std %f12, [$ivec + 0] ! write out ivec
  453. std %f14, [$ivec + 8]
  454. ret
  455. restore
  456. .align 16
  457. .L${bits}_cbc_dec_unaligned_ivec:
  458. alignaddrl $ivec, $ivoff, %g0 ! handle unaligned ivec
  459. mov 0xff, $omask
  460. srl $omask, $ivoff, $omask
  461. faligndata %f12, %f12, %f0
  462. faligndata %f12, %f14, %f2
  463. faligndata %f14, %f14, %f4
  464. stda %f0, [$ivec + $omask]0xc0
  465. std %f2, [$ivec + 8]
  466. add $ivec, 16, $ivec
  467. orn %g0, $omask, $omask
  468. stda %f4, [$ivec + $omask]0xc0
  469. ___
  470. $::code.=<<___;
  471. ret
  472. restore
  473. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  474. .align 32
  475. .L${bits}cbc_dec_blk:
  476. add $out, $len, $blk_init
  477. and $blk_init, 63, $blk_init ! tail
  478. sub $len, $blk_init, $len
  479. add $blk_init, 15, $blk_init ! round up to 16n
  480. srlx $len, 4, $len
  481. srl $blk_init, 4, $blk_init
  482. sub $len, 1, $len
  483. add $blk_init, 1, $blk_init
  484. .L${bits}_cbc_dec_blk_loop2x:
  485. ldx [$inp + 0], %o0
  486. ldx [$inp + 8], %o1
  487. ldx [$inp + 16], %o2
  488. brz,pt $ileft, 5f
  489. ldx [$inp + 24], %o3
  490. ldx [$inp + 32], %o4
  491. sllx %o0, $ileft, %o0
  492. srlx %o1, $iright, %g1
  493. or %g1, %o0, %o0
  494. sllx %o1, $ileft, %o1
  495. srlx %o2, $iright, %g1
  496. or %g1, %o1, %o1
  497. sllx %o2, $ileft, %o2
  498. srlx %o3, $iright, %g1
  499. or %g1, %o2, %o2
  500. sllx %o3, $ileft, %o3
  501. srlx %o4, $iright, %o4
  502. or %o4, %o3, %o3
  503. 5:
  504. xor %g4, %o0, %o4 ! ^= rk[0]
  505. xor %g5, %o1, %o5
  506. movxtod %o4, %f0
  507. movxtod %o5, %f2
  508. xor %g4, %o2, %o4
  509. xor %g5, %o3, %o5
  510. movxtod %o4, %f4
  511. movxtod %o5, %f6
  512. prefetch [$inp + 32+63], 20
  513. call _${alg}${bits}_decrypt_2x
  514. add $inp, 32, $inp
  515. subcc $len, 2, $len
  516. movxtod %o0, %f8
  517. movxtod %o1, %f10
  518. fxor %f12, %f0, %f0 ! ^= ivec
  519. fxor %f14, %f2, %f2
  520. movxtod %o2, %f12
  521. movxtod %o3, %f14
  522. fxor %f8, %f4, %f4
  523. fxor %f10, %f6, %f6
  524. stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  525. add $out, 8, $out
  526. stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  527. add $out, 8, $out
  528. stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  529. add $out, 8, $out
  530. stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  531. bgu,pt $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
  532. add $out, 8, $out
  533. add $blk_init, $len, $len
  534. andcc $len, 1, %g0 ! is number of blocks even?
  535. membar #StoreLoad|#StoreStore
  536. bnz,pt %icc, .L${bits}_cbc_dec_loop
  537. srl $len, 0, $len
  538. brnz,pn $len, .L${bits}_cbc_dec_loop2x
  539. nop
  540. ___
  541. $::code.=<<___ if ($::evp);
  542. st %f12, [$ivec + 0] ! write out ivec
  543. st %f13, [$ivec + 4]
  544. st %f14, [$ivec + 8]
  545. st %f15, [$ivec + 12]
  546. ___
  547. $::code.=<<___ if (!$::evp);
  548. brnz,pn $ivoff, 3b
  549. nop
  550. std %f12, [$ivec + 0] ! write out ivec
  551. std %f14, [$ivec + 8]
  552. ___
  553. $::code.=<<___;
  554. ret
  555. restore
  556. .type ${alg}${bits}_t4_cbc_decrypt,#function
  557. .size ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
  558. ___
  559. }
  560. sub alg_ctr32_implement {
  561. my ($alg,$bits) = @_;
  562. $::code.=<<___;
  563. .globl ${alg}${bits}_t4_ctr32_encrypt
  564. .align 32
  565. ${alg}${bits}_t4_ctr32_encrypt:
  566. save %sp, -$::frame, %sp
  567. srln $len, 0, $len ! needed on v8+, "nop" on v9
  568. prefetch [$inp], 20
  569. prefetch [$inp + 63], 20
  570. call _${alg}${bits}_load_enckey
  571. sllx $len, 4, $len
  572. ld [$ivec + 0], %l4 ! counter
  573. ld [$ivec + 4], %l5
  574. ld [$ivec + 8], %l6
  575. ld [$ivec + 12], %l7
  576. sllx %l4, 32, %o5
  577. or %l5, %o5, %o5
  578. sllx %l6, 32, %g1
  579. xor %o5, %g4, %g4 ! ^= rk[0]
  580. xor %g1, %g5, %g5
  581. movxtod %g4, %f14 ! most significant 64 bits
  582. sub $inp, $out, $blk_init ! $inp!=$out
  583. and $inp, 7, $ileft
  584. andn $inp, 7, $inp
  585. sll $ileft, 3, $ileft
  586. mov 64, $iright
  587. mov 0xff, $omask
  588. sub $iright, $ileft, $iright
  589. and $out, 7, $ooff
  590. cmp $len, 255
  591. movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
  592. movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
  593. brnz,pn $blk_init, .L${bits}_ctr32_blk ! $inp==$out)
  594. srl $omask, $ooff, $omask
  595. andcc $len, 16, %g0 ! is number of blocks even?
  596. alignaddrl $out, %g0, $out
  597. bz %icc, .L${bits}_ctr32_loop2x
  598. srlx $len, 4, $len
  599. .L${bits}_ctr32_loop:
  600. ldx [$inp + 0], %o0
  601. brz,pt $ileft, 4f
  602. ldx [$inp + 8], %o1
  603. ldx [$inp + 16], %o2
  604. sllx %o0, $ileft, %o0
  605. srlx %o1, $iright, %g1
  606. sllx %o1, $ileft, %o1
  607. or %g1, %o0, %o0
  608. srlx %o2, $iright, %o2
  609. or %o2, %o1, %o1
  610. 4:
  611. xor %g5, %l7, %g1 ! ^= rk[0]
  612. add %l7, 1, %l7
  613. movxtod %g1, %f2
  614. srl %l7, 0, %l7 ! clruw
  615. prefetch [$out + 63], 22
  616. prefetch [$inp + 16+63], 20
  617. ___
  618. $::code.=<<___ if ($alg eq "aes");
  619. aes_eround01 %f16, %f14, %f2, %f4
  620. aes_eround23 %f18, %f14, %f2, %f2
  621. ___
  622. $::code.=<<___ if ($alg eq "cmll");
  623. camellia_f %f16, %f2, %f14, %f2
  624. camellia_f %f18, %f14, %f2, %f0
  625. ___
  626. $::code.=<<___;
  627. call _${alg}${bits}_encrypt_1x+8
  628. add $inp, 16, $inp
  629. movxtod %o0, %f10
  630. movxtod %o1, %f12
  631. fxor %f10, %f0, %f0 ! ^= inp
  632. fxor %f12, %f2, %f2
  633. brnz,pn $ooff, 2f
  634. sub $len, 1, $len
  635. std %f0, [$out + 0]
  636. std %f2, [$out + 8]
  637. brnz,pt $len, .L${bits}_ctr32_loop2x
  638. add $out, 16, $out
  639. ret
  640. restore
  641. .align 16
  642. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  643. ! and ~3x deterioration
  644. ! in inp==out case
  645. faligndata %f0, %f0, %f4 ! handle unaligned output
  646. faligndata %f0, %f2, %f6
  647. faligndata %f2, %f2, %f8
  648. stda %f4, [$out + $omask]0xc0 ! partial store
  649. std %f6, [$out + 8]
  650. add $out, 16, $out
  651. orn %g0, $omask, $omask
  652. stda %f8, [$out + $omask]0xc0 ! partial store
  653. brnz,pt $len, .L${bits}_ctr32_loop2x+4
  654. orn %g0, $omask, $omask
  655. ret
  656. restore
  657. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  658. .align 32
  659. .L${bits}_ctr32_loop2x:
  660. ldx [$inp + 0], %o0
  661. ldx [$inp + 8], %o1
  662. ldx [$inp + 16], %o2
  663. brz,pt $ileft, 4f
  664. ldx [$inp + 24], %o3
  665. ldx [$inp + 32], %o4
  666. sllx %o0, $ileft, %o0
  667. srlx %o1, $iright, %g1
  668. or %g1, %o0, %o0
  669. sllx %o1, $ileft, %o1
  670. srlx %o2, $iright, %g1
  671. or %g1, %o1, %o1
  672. sllx %o2, $ileft, %o2
  673. srlx %o3, $iright, %g1
  674. or %g1, %o2, %o2
  675. sllx %o3, $ileft, %o3
  676. srlx %o4, $iright, %o4
  677. or %o4, %o3, %o3
  678. 4:
  679. xor %g5, %l7, %g1 ! ^= rk[0]
  680. add %l7, 1, %l7
  681. movxtod %g1, %f2
  682. srl %l7, 0, %l7 ! clruw
  683. xor %g5, %l7, %g1
  684. add %l7, 1, %l7
  685. movxtod %g1, %f6
  686. srl %l7, 0, %l7 ! clruw
  687. prefetch [$out + 63], 22
  688. prefetch [$inp + 32+63], 20
  689. ___
  690. $::code.=<<___ if ($alg eq "aes");
  691. aes_eround01 %f16, %f14, %f2, %f8
  692. aes_eround23 %f18, %f14, %f2, %f2
  693. aes_eround01 %f16, %f14, %f6, %f10
  694. aes_eround23 %f18, %f14, %f6, %f6
  695. ___
  696. $::code.=<<___ if ($alg eq "cmll");
  697. camellia_f %f16, %f2, %f14, %f2
  698. camellia_f %f16, %f6, %f14, %f6
  699. camellia_f %f18, %f14, %f2, %f0
  700. camellia_f %f18, %f14, %f6, %f4
  701. ___
  702. $::code.=<<___;
  703. call _${alg}${bits}_encrypt_2x+16
  704. add $inp, 32, $inp
  705. movxtod %o0, %f8
  706. movxtod %o1, %f10
  707. movxtod %o2, %f12
  708. fxor %f8, %f0, %f0 ! ^= inp
  709. movxtod %o3, %f8
  710. fxor %f10, %f2, %f2
  711. fxor %f12, %f4, %f4
  712. fxor %f8, %f6, %f6
  713. brnz,pn $ooff, 2f
  714. sub $len, 2, $len
  715. std %f0, [$out + 0]
  716. std %f2, [$out + 8]
  717. std %f4, [$out + 16]
  718. std %f6, [$out + 24]
  719. brnz,pt $len, .L${bits}_ctr32_loop2x
  720. add $out, 32, $out
  721. ret
  722. restore
  723. .align 16
  724. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  725. ! and ~3x deterioration
  726. ! in inp==out case
  727. faligndata %f0, %f0, %f8 ! handle unaligned output
  728. faligndata %f0, %f2, %f0
  729. faligndata %f2, %f4, %f2
  730. faligndata %f4, %f6, %f4
  731. faligndata %f6, %f6, %f6
  732. stda %f8, [$out + $omask]0xc0 ! partial store
  733. std %f0, [$out + 8]
  734. std %f2, [$out + 16]
  735. std %f4, [$out + 24]
  736. add $out, 32, $out
  737. orn %g0, $omask, $omask
  738. stda %f6, [$out + $omask]0xc0 ! partial store
  739. brnz,pt $len, .L${bits}_ctr32_loop2x+4
  740. orn %g0, $omask, $omask
  741. ret
  742. restore
  743. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  744. .align 32
  745. .L${bits}_ctr32_blk:
  746. add $out, $len, $blk_init
  747. and $blk_init, 63, $blk_init ! tail
  748. sub $len, $blk_init, $len
  749. add $blk_init, 15, $blk_init ! round up to 16n
  750. srlx $len, 4, $len
  751. srl $blk_init, 4, $blk_init
  752. sub $len, 1, $len
  753. add $blk_init, 1, $blk_init
  754. .L${bits}_ctr32_blk_loop2x:
  755. ldx [$inp + 0], %o0
  756. ldx [$inp + 8], %o1
  757. ldx [$inp + 16], %o2
  758. brz,pt $ileft, 5f
  759. ldx [$inp + 24], %o3
  760. ldx [$inp + 32], %o4
  761. sllx %o0, $ileft, %o0
  762. srlx %o1, $iright, %g1
  763. or %g1, %o0, %o0
  764. sllx %o1, $ileft, %o1
  765. srlx %o2, $iright, %g1
  766. or %g1, %o1, %o1
  767. sllx %o2, $ileft, %o2
  768. srlx %o3, $iright, %g1
  769. or %g1, %o2, %o2
  770. sllx %o3, $ileft, %o3
  771. srlx %o4, $iright, %o4
  772. or %o4, %o3, %o3
  773. 5:
  774. xor %g5, %l7, %g1 ! ^= rk[0]
  775. add %l7, 1, %l7
  776. movxtod %g1, %f2
  777. srl %l7, 0, %l7 ! clruw
  778. xor %g5, %l7, %g1
  779. add %l7, 1, %l7
  780. movxtod %g1, %f6
  781. srl %l7, 0, %l7 ! clruw
  782. prefetch [$inp + 32+63], 20
  783. ___
  784. $::code.=<<___ if ($alg eq "aes");
  785. aes_eround01 %f16, %f14, %f2, %f8
  786. aes_eround23 %f18, %f14, %f2, %f2
  787. aes_eround01 %f16, %f14, %f6, %f10
  788. aes_eround23 %f18, %f14, %f6, %f6
  789. ___
  790. $::code.=<<___ if ($alg eq "cmll");
  791. camellia_f %f16, %f2, %f14, %f2
  792. camellia_f %f16, %f6, %f14, %f6
  793. camellia_f %f18, %f14, %f2, %f0
  794. camellia_f %f18, %f14, %f6, %f4
  795. ___
  796. $::code.=<<___;
  797. call _${alg}${bits}_encrypt_2x+16
  798. add $inp, 32, $inp
  799. subcc $len, 2, $len
  800. movxtod %o0, %f8
  801. movxtod %o1, %f10
  802. movxtod %o2, %f12
  803. fxor %f8, %f0, %f0 ! ^= inp
  804. movxtod %o3, %f8
  805. fxor %f10, %f2, %f2
  806. fxor %f12, %f4, %f4
  807. fxor %f8, %f6, %f6
  808. stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  809. add $out, 8, $out
  810. stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  811. add $out, 8, $out
  812. stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  813. add $out, 8, $out
  814. stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  815. bgu,pt $::size_t_cc, .L${bits}_ctr32_blk_loop2x
  816. add $out, 8, $out
  817. add $blk_init, $len, $len
  818. andcc $len, 1, %g0 ! is number of blocks even?
  819. membar #StoreLoad|#StoreStore
  820. bnz,pt %icc, .L${bits}_ctr32_loop
  821. srl $len, 0, $len
  822. brnz,pn $len, .L${bits}_ctr32_loop2x
  823. nop
  824. ret
  825. restore
  826. .type ${alg}${bits}_t4_ctr32_encrypt,#function
  827. .size ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
  828. ___
  829. }
  830. sub alg_xts_implement {
  831. my ($alg,$bits,$dir) = @_;
  832. my ($inp,$out,$len,$key1,$key2,$ivec)=map("%i$_",(0..5));
  833. my $rem=$ivec;
  834. $::code.=<<___;
  835. .globl ${alg}${bits}_t4_xts_${dir}crypt
  836. .align 32
  837. ${alg}${bits}_t4_xts_${dir}crypt:
  838. save %sp, -$::frame-16, %sp
  839. srln $len, 0, $len ! needed on v8+, "nop" on v9
  840. mov $ivec, %o0
  841. add %fp, $::bias-16, %o1
  842. call ${alg}_t4_encrypt
  843. mov $key2, %o2
  844. add %fp, $::bias-16, %l7
  845. ldxa [%l7]0x88, %g2
  846. add %fp, $::bias-8, %l7
  847. ldxa [%l7]0x88, %g3 ! %g3:%g2 is tweak
  848. sethi %hi(0x76543210), %l7
  849. or %l7, %lo(0x76543210), %l7
  850. bmask %l7, %g0, %g0 ! byte swap mask
  851. prefetch [$inp], 20
  852. prefetch [$inp + 63], 20
  853. call _${alg}${bits}_load_${dir}ckey
  854. and $len, 15, $rem
  855. and $len, -16, $len
  856. ___
  857. $code.=<<___ if ($dir eq "de");
  858. mov 0, %l7
  859. movrnz $rem, 16, %l7
  860. sub $len, %l7, $len
  861. ___
  862. $code.=<<___;
  863. sub $inp, $out, $blk_init ! $inp!=$out
  864. and $inp, 7, $ileft
  865. andn $inp, 7, $inp
  866. sll $ileft, 3, $ileft
  867. mov 64, $iright
  868. mov 0xff, $omask
  869. sub $iright, $ileft, $iright
  870. and $out, 7, $ooff
  871. cmp $len, 255
  872. movrnz $ooff, 0, $blk_init ! if ( $out&7 ||
  873. movleu $::size_t_cc, 0, $blk_init ! $len<256 ||
  874. brnz,pn $blk_init, .L${bits}_xts_${dir}blk ! $inp==$out)
  875. srl $omask, $ooff, $omask
  876. andcc $len, 16, %g0 ! is number of blocks even?
  877. ___
  878. $code.=<<___ if ($dir eq "de");
  879. brz,pn $len, .L${bits}_xts_${dir}steal
  880. ___
  881. $code.=<<___;
  882. alignaddrl $out, %g0, $out
  883. bz %icc, .L${bits}_xts_${dir}loop2x
  884. srlx $len, 4, $len
  885. .L${bits}_xts_${dir}loop:
  886. ldx [$inp + 0], %o0
  887. brz,pt $ileft, 4f
  888. ldx [$inp + 8], %o1
  889. ldx [$inp + 16], %o2
  890. sllx %o0, $ileft, %o0
  891. srlx %o1, $iright, %g1
  892. sllx %o1, $ileft, %o1
  893. or %g1, %o0, %o0
  894. srlx %o2, $iright, %o2
  895. or %o2, %o1, %o1
  896. 4:
  897. movxtod %g2, %f12
  898. movxtod %g3, %f14
  899. bshuffle %f12, %f12, %f12
  900. bshuffle %f14, %f14, %f14
  901. xor %g4, %o0, %o0 ! ^= rk[0]
  902. xor %g5, %o1, %o1
  903. movxtod %o0, %f0
  904. movxtod %o1, %f2
  905. fxor %f12, %f0, %f0 ! ^= tweak[0]
  906. fxor %f14, %f2, %f2
  907. prefetch [$out + 63], 22
  908. prefetch [$inp + 16+63], 20
  909. call _${alg}${bits}_${dir}crypt_1x
  910. add $inp, 16, $inp
  911. fxor %f12, %f0, %f0 ! ^= tweak[0]
  912. fxor %f14, %f2, %f2
  913. srax %g3, 63, %l7 ! next tweak value
  914. addcc %g2, %g2, %g2
  915. and %l7, 0x87, %l7
  916. addxc %g3, %g3, %g3
  917. xor %l7, %g2, %g2
  918. brnz,pn $ooff, 2f
  919. sub $len, 1, $len
  920. std %f0, [$out + 0]
  921. std %f2, [$out + 8]
  922. brnz,pt $len, .L${bits}_xts_${dir}loop2x
  923. add $out, 16, $out
  924. brnz,pn $rem, .L${bits}_xts_${dir}steal
  925. nop
  926. ret
  927. restore
  928. .align 16
  929. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  930. ! and ~3x deterioration
  931. ! in inp==out case
  932. faligndata %f0, %f0, %f4 ! handle unaligned output
  933. faligndata %f0, %f2, %f6
  934. faligndata %f2, %f2, %f8
  935. stda %f4, [$out + $omask]0xc0 ! partial store
  936. std %f6, [$out + 8]
  937. add $out, 16, $out
  938. orn %g0, $omask, $omask
  939. stda %f8, [$out + $omask]0xc0 ! partial store
  940. brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
  941. orn %g0, $omask, $omask
  942. brnz,pn $rem, .L${bits}_xts_${dir}steal
  943. nop
  944. ret
  945. restore
  946. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  947. .align 32
  948. .L${bits}_xts_${dir}loop2x:
  949. ldx [$inp + 0], %o0
  950. ldx [$inp + 8], %o1
  951. ldx [$inp + 16], %o2
  952. brz,pt $ileft, 4f
  953. ldx [$inp + 24], %o3
  954. ldx [$inp + 32], %o4
  955. sllx %o0, $ileft, %o0
  956. srlx %o1, $iright, %g1
  957. or %g1, %o0, %o0
  958. sllx %o1, $ileft, %o1
  959. srlx %o2, $iright, %g1
  960. or %g1, %o1, %o1
  961. sllx %o2, $ileft, %o2
  962. srlx %o3, $iright, %g1
  963. or %g1, %o2, %o2
  964. sllx %o3, $ileft, %o3
  965. srlx %o4, $iright, %o4
  966. or %o4, %o3, %o3
  967. 4:
  968. movxtod %g2, %f12
  969. movxtod %g3, %f14
  970. bshuffle %f12, %f12, %f12
  971. bshuffle %f14, %f14, %f14
  972. srax %g3, 63, %l7 ! next tweak value
  973. addcc %g2, %g2, %g2
  974. and %l7, 0x87, %l7
  975. addxc %g3, %g3, %g3
  976. xor %l7, %g2, %g2
  977. movxtod %g2, %f8
  978. movxtod %g3, %f10
  979. bshuffle %f8, %f8, %f8
  980. bshuffle %f10, %f10, %f10
  981. xor %g4, %o0, %o0 ! ^= rk[0]
  982. xor %g5, %o1, %o1
  983. xor %g4, %o2, %o2 ! ^= rk[0]
  984. xor %g5, %o3, %o3
  985. movxtod %o0, %f0
  986. movxtod %o1, %f2
  987. movxtod %o2, %f4
  988. movxtod %o3, %f6
  989. fxor %f12, %f0, %f0 ! ^= tweak[0]
  990. fxor %f14, %f2, %f2
  991. fxor %f8, %f4, %f4 ! ^= tweak[0]
  992. fxor %f10, %f6, %f6
  993. prefetch [$out + 63], 22
  994. prefetch [$inp + 32+63], 20
  995. call _${alg}${bits}_${dir}crypt_2x
  996. add $inp, 32, $inp
  997. movxtod %g2, %f8
  998. movxtod %g3, %f10
  999. srax %g3, 63, %l7 ! next tweak value
  1000. addcc %g2, %g2, %g2
  1001. and %l7, 0x87, %l7
  1002. addxc %g3, %g3, %g3
  1003. xor %l7, %g2, %g2
  1004. bshuffle %f8, %f8, %f8
  1005. bshuffle %f10, %f10, %f10
  1006. fxor %f12, %f0, %f0 ! ^= tweak[0]
  1007. fxor %f14, %f2, %f2
  1008. fxor %f8, %f4, %f4
  1009. fxor %f10, %f6, %f6
  1010. brnz,pn $ooff, 2f
  1011. sub $len, 2, $len
  1012. std %f0, [$out + 0]
  1013. std %f2, [$out + 8]
  1014. std %f4, [$out + 16]
  1015. std %f6, [$out + 24]
  1016. brnz,pt $len, .L${bits}_xts_${dir}loop2x
  1017. add $out, 32, $out
  1018. fsrc2 %f4, %f0
  1019. fsrc2 %f6, %f2
  1020. brnz,pn $rem, .L${bits}_xts_${dir}steal
  1021. nop
  1022. ret
  1023. restore
  1024. .align 16
  1025. 2: ldxa [$inp]0x82, %o0 ! avoid read-after-write hazard
  1026. ! and ~3x deterioration
  1027. ! in inp==out case
  1028. faligndata %f0, %f0, %f8 ! handle unaligned output
  1029. faligndata %f0, %f2, %f10
  1030. faligndata %f2, %f4, %f12
  1031. faligndata %f4, %f6, %f14
  1032. faligndata %f6, %f6, %f0
  1033. stda %f8, [$out + $omask]0xc0 ! partial store
  1034. std %f10, [$out + 8]
  1035. std %f12, [$out + 16]
  1036. std %f14, [$out + 24]
  1037. add $out, 32, $out
  1038. orn %g0, $omask, $omask
  1039. stda %f0, [$out + $omask]0xc0 ! partial store
  1040. brnz,pt $len, .L${bits}_xts_${dir}loop2x+4
  1041. orn %g0, $omask, $omask
  1042. fsrc2 %f4, %f0
  1043. fsrc2 %f6, %f2
  1044. brnz,pn $rem, .L${bits}_xts_${dir}steal
  1045. nop
  1046. ret
  1047. restore
  1048. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  1049. .align 32
  1050. .L${bits}_xts_${dir}blk:
  1051. add $out, $len, $blk_init
  1052. and $blk_init, 63, $blk_init ! tail
  1053. sub $len, $blk_init, $len
  1054. add $blk_init, 15, $blk_init ! round up to 16n
  1055. srlx $len, 4, $len
  1056. srl $blk_init, 4, $blk_init
  1057. sub $len, 1, $len
  1058. add $blk_init, 1, $blk_init
  1059. .L${bits}_xts_${dir}blk2x:
  1060. ldx [$inp + 0], %o0
  1061. ldx [$inp + 8], %o1
  1062. ldx [$inp + 16], %o2
  1063. brz,pt $ileft, 5f
  1064. ldx [$inp + 24], %o3
  1065. ldx [$inp + 32], %o4
  1066. sllx %o0, $ileft, %o0
  1067. srlx %o1, $iright, %g1
  1068. or %g1, %o0, %o0
  1069. sllx %o1, $ileft, %o1
  1070. srlx %o2, $iright, %g1
  1071. or %g1, %o1, %o1
  1072. sllx %o2, $ileft, %o2
  1073. srlx %o3, $iright, %g1
  1074. or %g1, %o2, %o2
  1075. sllx %o3, $ileft, %o3
  1076. srlx %o4, $iright, %o4
  1077. or %o4, %o3, %o3
  1078. 5:
  1079. movxtod %g2, %f12
  1080. movxtod %g3, %f14
  1081. bshuffle %f12, %f12, %f12
  1082. bshuffle %f14, %f14, %f14
  1083. srax %g3, 63, %l7 ! next tweak value
  1084. addcc %g2, %g2, %g2
  1085. and %l7, 0x87, %l7
  1086. addxc %g3, %g3, %g3
  1087. xor %l7, %g2, %g2
  1088. movxtod %g2, %f8
  1089. movxtod %g3, %f10
  1090. bshuffle %f8, %f8, %f8
  1091. bshuffle %f10, %f10, %f10
  1092. xor %g4, %o0, %o0 ! ^= rk[0]
  1093. xor %g5, %o1, %o1
  1094. xor %g4, %o2, %o2 ! ^= rk[0]
  1095. xor %g5, %o3, %o3
  1096. movxtod %o0, %f0
  1097. movxtod %o1, %f2
  1098. movxtod %o2, %f4
  1099. movxtod %o3, %f6
  1100. fxor %f12, %f0, %f0 ! ^= tweak[0]
  1101. fxor %f14, %f2, %f2
  1102. fxor %f8, %f4, %f4 ! ^= tweak[0]
  1103. fxor %f10, %f6, %f6
  1104. prefetch [$inp + 32+63], 20
  1105. call _${alg}${bits}_${dir}crypt_2x
  1106. add $inp, 32, $inp
  1107. movxtod %g2, %f8
  1108. movxtod %g3, %f10
  1109. srax %g3, 63, %l7 ! next tweak value
  1110. addcc %g2, %g2, %g2
  1111. and %l7, 0x87, %l7
  1112. addxc %g3, %g3, %g3
  1113. xor %l7, %g2, %g2
  1114. bshuffle %f8, %f8, %f8
  1115. bshuffle %f10, %f10, %f10
  1116. fxor %f12, %f0, %f0 ! ^= tweak[0]
  1117. fxor %f14, %f2, %f2
  1118. fxor %f8, %f4, %f4
  1119. fxor %f10, %f6, %f6
  1120. subcc $len, 2, $len
  1121. stda %f0, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  1122. add $out, 8, $out
  1123. stda %f2, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  1124. add $out, 8, $out
  1125. stda %f4, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  1126. add $out, 8, $out
  1127. stda %f6, [$out]0xe2 ! ASI_BLK_INIT, T4-specific
  1128. bgu,pt $::size_t_cc, .L${bits}_xts_${dir}blk2x
  1129. add $out, 8, $out
  1130. add $blk_init, $len, $len
  1131. andcc $len, 1, %g0 ! is number of blocks even?
  1132. membar #StoreLoad|#StoreStore
  1133. bnz,pt %icc, .L${bits}_xts_${dir}loop
  1134. srl $len, 0, $len
  1135. brnz,pn $len, .L${bits}_xts_${dir}loop2x
  1136. nop
  1137. fsrc2 %f4, %f0
  1138. fsrc2 %f6, %f2
  1139. brnz,pn $rem, .L${bits}_xts_${dir}steal
  1140. nop
  1141. ret
  1142. restore
  1143. !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  1144. ___
  1145. $code.=<<___ if ($dir eq "en");
  1146. .align 32
  1147. .L${bits}_xts_${dir}steal:
  1148. std %f0, [%fp + $::bias-16] ! copy of output
  1149. std %f2, [%fp + $::bias-8]
  1150. srl $ileft, 3, $ileft
  1151. add %fp, $::bias-16, %l7
  1152. add $inp, $ileft, $inp ! original $inp+$len&-15
  1153. add $out, $ooff, $out ! original $out+$len&-15
  1154. mov 0, $ileft
  1155. nop ! align
  1156. .L${bits}_xts_${dir}stealing:
  1157. ldub [$inp + $ileft], %o0
  1158. ldub [%l7 + $ileft], %o1
  1159. dec $rem
  1160. stb %o0, [%l7 + $ileft]
  1161. stb %o1, [$out + $ileft]
  1162. brnz $rem, .L${bits}_xts_${dir}stealing
  1163. inc $ileft
  1164. mov %l7, $inp
  1165. sub $out, 16, $out
  1166. mov 0, $ileft
  1167. sub $out, $ooff, $out
  1168. ba .L${bits}_xts_${dir}loop ! one more time
  1169. mov 1, $len ! $rem is 0
  1170. ___
  1171. $code.=<<___ if ($dir eq "de");
  1172. .align 32
  1173. .L${bits}_xts_${dir}steal:
  1174. ldx [$inp + 0], %o0
  1175. brz,pt $ileft, 8f
  1176. ldx [$inp + 8], %o1
  1177. ldx [$inp + 16], %o2
  1178. sllx %o0, $ileft, %o0
  1179. srlx %o1, $iright, %g1
  1180. sllx %o1, $ileft, %o1
  1181. or %g1, %o0, %o0
  1182. srlx %o2, $iright, %o2
  1183. or %o2, %o1, %o1
  1184. 8:
  1185. srax %g3, 63, %l7 ! next tweak value
  1186. addcc %g2, %g2, %o2
  1187. and %l7, 0x87, %l7
  1188. addxc %g3, %g3, %o3
  1189. xor %l7, %o2, %o2
  1190. movxtod %o2, %f12
  1191. movxtod %o3, %f14
  1192. bshuffle %f12, %f12, %f12
  1193. bshuffle %f14, %f14, %f14
  1194. xor %g4, %o0, %o0 ! ^= rk[0]
  1195. xor %g5, %o1, %o1
  1196. movxtod %o0, %f0
  1197. movxtod %o1, %f2
  1198. fxor %f12, %f0, %f0 ! ^= tweak[0]
  1199. fxor %f14, %f2, %f2
  1200. call _${alg}${bits}_${dir}crypt_1x
  1201. add $inp, 16, $inp
  1202. fxor %f12, %f0, %f0 ! ^= tweak[0]
  1203. fxor %f14, %f2, %f2
  1204. std %f0, [%fp + $::bias-16]
  1205. std %f2, [%fp + $::bias-8]
  1206. srl $ileft, 3, $ileft
  1207. add %fp, $::bias-16, %l7
  1208. add $inp, $ileft, $inp ! original $inp+$len&-15
  1209. add $out, $ooff, $out ! original $out+$len&-15
  1210. mov 0, $ileft
  1211. add $out, 16, $out
  1212. nop ! align
  1213. .L${bits}_xts_${dir}stealing:
  1214. ldub [$inp + $ileft], %o0
  1215. ldub [%l7 + $ileft], %o1
  1216. dec $rem
  1217. stb %o0, [%l7 + $ileft]
  1218. stb %o1, [$out + $ileft]
  1219. brnz $rem, .L${bits}_xts_${dir}stealing
  1220. inc $ileft
  1221. mov %l7, $inp
  1222. sub $out, 16, $out
  1223. mov 0, $ileft
  1224. sub $out, $ooff, $out
  1225. ba .L${bits}_xts_${dir}loop ! one more time
  1226. mov 1, $len ! $rem is 0
  1227. ___
  1228. $code.=<<___;
  1229. ret
  1230. restore
  1231. .type ${alg}${bits}_t4_xts_${dir}crypt,#function
  1232. .size ${alg}${bits}_t4_xts_${dir}crypt,.-${alg}${bits}_t4_xts_${dir}crypt
  1233. ___
  1234. }
# The purpose of these subroutines is to encode VIS instructions explicitly,
# so that the module can be compiled without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# letting the programmer detect at run-time whether the current CPU is VIS
# capable.
  1240. sub unvis {
  1241. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  1242. my ($ref,$opf);
  1243. my %visopf = ( "faligndata" => 0x048,
  1244. "bshuffle" => 0x04c,
  1245. "fnot2" => 0x066,
  1246. "fxor" => 0x06c,
  1247. "fsrc2" => 0x078 );
  1248. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  1249. if ($opf=$visopf{$mnemonic}) {
  1250. foreach ($rs1,$rs2,$rd) {
  1251. return $ref if (!/%f([0-9]{1,2})/);
  1252. $_=$1;
  1253. if ($1>=32) {
  1254. return $ref if ($1&1);
  1255. # re-encode for upper double register addressing
  1256. $_=($1|$1>>5)&31;
  1257. }
  1258. }
  1259. return sprintf ".word\t0x%08x !%s",
  1260. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  1261. $ref;
  1262. } else {
  1263. return $ref;
  1264. }
  1265. }
  1266. sub unvis3 {
  1267. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  1268. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  1269. my ($ref,$opf);
  1270. my %visopf = ( "addxc" => 0x011,
  1271. "addxccc" => 0x013,
  1272. "umulxhi" => 0x016,
  1273. "alignaddr" => 0x018,
  1274. "bmask" => 0x019,
  1275. "alignaddrl" => 0x01a );
  1276. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  1277. if ($opf=$visopf{$mnemonic}) {
  1278. foreach ($rs1,$rs2,$rd) {
  1279. return $ref if (!/%([goli])([0-9])/);
  1280. $_=$bias{$1}+$2;
  1281. }
  1282. return sprintf ".word\t0x%08x !%s",
  1283. 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
  1284. $ref;
  1285. } else {
  1286. return $ref;
  1287. }
  1288. }
  1289. sub unaes_round { # 4-argument instructions
  1290. my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
  1291. my ($ref,$opf);
  1292. my %aesopf = ( "aes_eround01" => 0,
  1293. "aes_eround23" => 1,
  1294. "aes_dround01" => 2,
  1295. "aes_dround23" => 3,
  1296. "aes_eround01_l"=> 4,
  1297. "aes_eround23_l"=> 5,
  1298. "aes_dround01_l"=> 6,
  1299. "aes_dround23_l"=> 7,
  1300. "aes_kexpand1" => 8 );
  1301. $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
  1302. if (defined($opf=$aesopf{$mnemonic})) {
  1303. $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
  1304. foreach ($rs1,$rs2,$rd) {
  1305. return $ref if (!/%f([0-9]{1,2})/);
  1306. $_=$1;
  1307. if ($1>=32) {
  1308. return $ref if ($1&1);
  1309. # re-encode for upper double register addressing
  1310. $_=($1|$1>>5)&31;
  1311. }
  1312. }
  1313. return sprintf ".word\t0x%08x !%s",
  1314. 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
  1315. $ref;
  1316. } else {
  1317. return $ref;
  1318. }
  1319. }
  1320. sub unaes_kexpand { # 3-argument instructions
  1321. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  1322. my ($ref,$opf);
  1323. my %aesopf = ( "aes_kexpand0" => 0x130,
  1324. "aes_kexpand2" => 0x131 );
  1325. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  1326. if (defined($opf=$aesopf{$mnemonic})) {
  1327. foreach ($rs1,$rs2,$rd) {
  1328. return $ref if (!/%f([0-9]{1,2})/);
  1329. $_=$1;
  1330. if ($1>=32) {
  1331. return $ref if ($1&1);
  1332. # re-encode for upper double register addressing
  1333. $_=($1|$1>>5)&31;
  1334. }
  1335. }
  1336. return sprintf ".word\t0x%08x !%s",
  1337. 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
  1338. $ref;
  1339. } else {
  1340. return $ref;
  1341. }
  1342. }
  1343. sub uncamellia_f { # 4-argument instructions
  1344. my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
  1345. my ($ref,$opf);
  1346. $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
  1347. if (1) {
  1348. $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
  1349. foreach ($rs1,$rs2,$rd) {
  1350. return $ref if (!/%f([0-9]{1,2})/);
  1351. $_=$1;
  1352. if ($1>=32) {
  1353. return $ref if ($1&1);
  1354. # re-encode for upper double register addressing
  1355. $_=($1|$1>>5)&31;
  1356. }
  1357. }
  1358. return sprintf ".word\t0x%08x !%s",
  1359. 2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
  1360. $ref;
  1361. } else {
  1362. return $ref;
  1363. }
  1364. }
  1365. sub uncamellia3 { # 3-argument instructions
  1366. my ($mnemonic,$rs1,$rs2,$rd)=@_;
  1367. my ($ref,$opf);
  1368. my %cmllopf = ( "camellia_fl" => 0x13c,
  1369. "camellia_fli" => 0x13d );
  1370. $ref = "$mnemonic\t$rs1,$rs2,$rd";
  1371. if (defined($opf=$cmllopf{$mnemonic})) {
  1372. foreach ($rs1,$rs2,$rd) {
  1373. return $ref if (!/%f([0-9]{1,2})/);
  1374. $_=$1;
  1375. if ($1>=32) {
  1376. return $ref if ($1&1);
  1377. # re-encode for upper double register addressing
  1378. $_=($1|$1>>5)&31;
  1379. }
  1380. }
  1381. return sprintf ".word\t0x%08x !%s",
  1382. 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
  1383. $ref;
  1384. } else {
  1385. return $ref;
  1386. }
  1387. }
  1388. sub unmovxtox { # 2-argument instructions
  1389. my ($mnemonic,$rs,$rd)=@_;
  1390. my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
  1391. my ($ref,$opf);
  1392. my %movxopf = ( "movdtox" => 0x110,
  1393. "movstouw" => 0x111,
  1394. "movstosw" => 0x113,
  1395. "movxtod" => 0x118,
  1396. "movwtos" => 0x119 );
  1397. $ref = "$mnemonic\t$rs,$rd";
  1398. if (defined($opf=$movxopf{$mnemonic})) {
  1399. foreach ($rs,$rd) {
  1400. return $ref if (!/%([fgoli])([0-9]{1,2})/);
  1401. $_=$bias{$1}+$2;
  1402. if ($2>=32) {
  1403. return $ref if ($2&1);
  1404. # re-encode for upper double register addressing
  1405. $_=($2|$2>>5)&31;
  1406. }
  1407. }
  1408. return sprintf ".word\t0x%08x !%s",
  1409. 2<<30|$rd<<25|0x36<<19|$opf<<5|$rs,
  1410. $ref;
  1411. } else {
  1412. return $ref;
  1413. }
  1414. }
  1415. sub undes {
  1416. my ($mnemonic)=shift;
  1417. my @args=@_;
  1418. my ($ref,$opf);
  1419. my %desopf = ( "des_round" => 0b1001,
  1420. "des_ip" => 0b100110100,
  1421. "des_iip" => 0b100110101,
  1422. "des_kexpand" => 0b100110110 );
  1423. $ref = "$mnemonic\t".join(",",@_);
  1424. if (defined($opf=$desopf{$mnemonic})) { # 4-arg
  1425. if ($mnemonic eq "des_round") {
  1426. foreach (@args[0..3]) {
  1427. return $ref if (!/%f([0-9]{1,2})/);
  1428. $_=$1;
  1429. if ($1>=32) {
  1430. return $ref if ($1&1);
  1431. # re-encode for upper double register addressing
  1432. $_=($1|$1>>5)&31;
  1433. }
  1434. }
  1435. return sprintf ".word\t0x%08x !%s",
  1436. 2<<30|0b011001<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<9|$args[3]<<25,
  1437. $ref;
  1438. } elsif ($mnemonic eq "des_kexpand") { # 3-arg
  1439. foreach (@args[0..2]) {
  1440. return $ref if (!/(%f)?([0-9]{1,2})/);
  1441. $_=$2;
  1442. if ($2>=32) {
  1443. return $ref if ($2&1);
  1444. # re-encode for upper double register addressing
  1445. $_=($2|$2>>5)&31;
  1446. }
  1447. }
  1448. return sprintf ".word\t0x%08x !%s",
  1449. 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]|$args[2]<<25,
  1450. $ref;
  1451. } else { # 2-arg
  1452. foreach (@args[0..1]) {
  1453. return $ref if (!/%f([0-9]{1,2})/);
  1454. $_=$1;
  1455. if ($1>=32) {
  1456. return $ref if ($2&1);
  1457. # re-encode for upper double register addressing
  1458. $_=($1|$1>>5)&31;
  1459. }
  1460. }
  1461. return sprintf ".word\t0x%08x !%s",
  1462. 2<<30|0b110110<<19|$opf<<5|$args[0]<<14|$args[1]<<25,
  1463. $ref;
  1464. }
  1465. } else {
  1466. return $ref;
  1467. }
  1468. }
  1469. sub emit_assembler {
  1470. foreach (split("\n",$::code)) {
  1471. s/\`([^\`]*)\`/eval $1/ge;
  1472. s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/go;
  1473. s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
  1474. &unaes_round($1,$2,$3,$4,$5)
  1475. /geo or
  1476. s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  1477. &unaes_kexpand($1,$2,$3,$4)
  1478. /geo or
  1479. s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
  1480. &uncamellia_f($1,$2,$3,$4,$5)
  1481. /geo or
  1482. s/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  1483. &uncamellia3($1,$2,$3,$4)
  1484. /geo or
  1485. s/\b(des_\w+)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+)(?:,\s*(%f[0-9]{1,2})(?:,\s*(%f[0-9]{1,2}))?)?/
  1486. &undes($1,$2,$3,$4,$5)
  1487. /geo or
  1488. s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
  1489. &unmovxtox($1,$2,$3)
  1490. /geo or
  1491. s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
  1492. &unmovxtox($1,$2,$3)
  1493. /geo or
  1494. s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
  1495. &unvis($1,$2,$3,$4)
  1496. /geo or
  1497. s/\b(umulxhi|bmask|addxc[c]{0,2}|alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
  1498. &unvis3($1,$2,$3,$4)
  1499. /geo;
  1500. print $_,"\n";
  1501. }
  1502. }
  1503. 1;