chacha-riscv64-zvkb.pl
#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Zicclsm (main memory supports misaligned loads/stores)
use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT, ">$output";

my $code = <<___;
.text
___

# void ChaCha20_ctr32_zvkb(unsigned char *out, const unsigned char *inp,
#                          size_t len, const unsigned int key[8],
#                          const unsigned int counter[4]);
################################################################################
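# A minimal caller sketch (C, shown here only as a comment): this routine
# encrypts whole 64-byte blocks and leaves any tail untouched, so a caller
# finishes the remainder itself. The helper `chacha20_tail` is hypothetical,
# named only to illustrate that split.
#
#   size_t blocks = len / 64;
#   /* counter[0] is the 32-bit block counter, counter[1..3] the nonce words */
#   unsigned int counter[4] = { 1, nonce0, nonce1, nonce2 };
#   ChaCha20_ctr32_zvkb(out, inp, blocks * 64, key, counter);
#   counter[0] += (unsigned int)blocks;
#   if (len % 64)
#       chacha20_tail(out + blocks * 64, inp + blocks * 64, len % 64, key, counter);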
my ( $OUTPUT, $INPUT, $LEN, $KEY, $COUNTER ) = ( "a0", "a1", "a2", "a3", "a4" );
my ( $T0 ) = ( "t0" );
my ( $CONST_DATA0, $CONST_DATA1, $CONST_DATA2, $CONST_DATA3 ) =
  ( "a5", "a6", "a7", "t1" );
my (
    $KEY0, $KEY1, $KEY2, $KEY3, $KEY4, $KEY5, $KEY6, $KEY7,
    $COUNTER0, $COUNTER1, $NONCE0, $NONCE1
) = ( "s0", "s1", "s2", "s3", "s4", "s5", "s6",
      "s7", "s8", "s9", "s10", "s11" );
my ( $VL, $STRIDE, $CHACHA_LOOP_COUNT ) = ( "t2", "t3", "t4" );
my (
    $V0,  $V1,  $V2,  $V3,  $V4,  $V5,  $V6,  $V7,  $V8,  $V9,  $V10,
    $V11, $V12, $V13, $V14, $V15, $V16, $V17, $V18, $V19, $V20, $V21,
    $V22, $V23, $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map( "v$_", ( 0 .. 31 ) );
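
# Strategy: each 32-bit vector lane carries one word of an independent ChaCha
# block, so V0..V15 hold VL interleaved block states (one state word per
# register) while V16..V31 stage the input data. The helper below emits four
# quarter-rounds interleaved across the four column/diagonal groups to expose
# instruction-level parallelism.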
sub chacha_quad_round_group {
    my (
        $A0, $B0, $C0, $D0, $A1, $B1, $C1, $D1,
        $A2, $B2, $C2, $D2, $A3, $B3, $C3, $D3
    ) = @_;

    my $code = <<___;
    # a += b; d ^= a; d <<<= 16;
    @{[vadd_vv $A0, $A0, $B0]}
    @{[vadd_vv $A1, $A1, $B1]}
    @{[vadd_vv $A2, $A2, $B2]}
    @{[vadd_vv $A3, $A3, $B3]}
    @{[vxor_vv $D0, $D0, $A0]}
    @{[vxor_vv $D1, $D1, $A1]}
    @{[vxor_vv $D2, $D2, $A2]}
    @{[vxor_vv $D3, $D3, $A3]}
    @{[vror_vi $D0, $D0, 32 - 16]}
    @{[vror_vi $D1, $D1, 32 - 16]}
    @{[vror_vi $D2, $D2, 32 - 16]}
    @{[vror_vi $D3, $D3, 32 - 16]}

    # c += d; b ^= c; b <<<= 12;
    @{[vadd_vv $C0, $C0, $D0]}
    @{[vadd_vv $C1, $C1, $D1]}
    @{[vadd_vv $C2, $C2, $D2]}
    @{[vadd_vv $C3, $C3, $D3]}
    @{[vxor_vv $B0, $B0, $C0]}
    @{[vxor_vv $B1, $B1, $C1]}
    @{[vxor_vv $B2, $B2, $C2]}
    @{[vxor_vv $B3, $B3, $C3]}
    @{[vror_vi $B0, $B0, 32 - 12]}
    @{[vror_vi $B1, $B1, 32 - 12]}
    @{[vror_vi $B2, $B2, 32 - 12]}
    @{[vror_vi $B3, $B3, 32 - 12]}

    # a += b; d ^= a; d <<<= 8;
    @{[vadd_vv $A0, $A0, $B0]}
    @{[vadd_vv $A1, $A1, $B1]}
    @{[vadd_vv $A2, $A2, $B2]}
    @{[vadd_vv $A3, $A3, $B3]}
    @{[vxor_vv $D0, $D0, $A0]}
    @{[vxor_vv $D1, $D1, $A1]}
    @{[vxor_vv $D2, $D2, $A2]}
    @{[vxor_vv $D3, $D3, $A3]}
    @{[vror_vi $D0, $D0, 32 - 8]}
    @{[vror_vi $D1, $D1, 32 - 8]}
    @{[vror_vi $D2, $D2, 32 - 8]}
    @{[vror_vi $D3, $D3, 32 - 8]}

    # c += d; b ^= c; b <<<= 7;
    @{[vadd_vv $C0, $C0, $D0]}
    @{[vadd_vv $C1, $C1, $D1]}
    @{[vadd_vv $C2, $C2, $D2]}
    @{[vadd_vv $C3, $C3, $D3]}
    @{[vxor_vv $B0, $B0, $C0]}
    @{[vxor_vv $B1, $B1, $C1]}
    @{[vxor_vv $B2, $B2, $C2]}
    @{[vxor_vv $B3, $B3, $C3]}
    @{[vror_vi $B0, $B0, 32 - 7]}
    @{[vror_vi $B1, $B1, 32 - 7]}
    @{[vror_vi $B2, $B2, 32 - 7]}
    @{[vror_vi $B3, $B3, 32 - 7]}
___

    return $code;
}
$code .= <<___;
.p2align 3
.globl ChaCha20_ctr32_zvkb
.type ChaCha20_ctr32_zvkb,\@function
ChaCha20_ctr32_zvkb:
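    # convert the byte count into a 64-byte block count; any tail shorter
    # than one block is left for the caller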
    srli $LEN, $LEN, 6
    beqz $LEN, .Lend

    addi sp, sp, -96
    sd s0, 0(sp)
    sd s1, 8(sp)
    sd s2, 16(sp)
    sd s3, 24(sp)
    sd s4, 32(sp)
    sd s5, 40(sp)
    sd s6, 48(sp)
    sd s7, 56(sp)
    sd s8, 64(sp)
    sd s9, 72(sp)
    sd s10, 80(sp)
    sd s11, 88(sp)

    li $STRIDE, 64

    #### chacha block data
    # "expa" little endian
    li $CONST_DATA0, 0x61707865
    # "nd 3" little endian
    li $CONST_DATA1, 0x3320646e
    # "2-by" little endian
    li $CONST_DATA2, 0x79622d32
    # "te k" little endian
    li $CONST_DATA3, 0x6b206574

    lw $KEY0, 0($KEY)
    lw $KEY1, 4($KEY)
    lw $KEY2, 8($KEY)
    lw $KEY3, 12($KEY)
    lw $KEY4, 16($KEY)
    lw $KEY5, 20($KEY)
    lw $KEY6, 24($KEY)
    lw $KEY7, 28($KEY)

    lw $COUNTER0, 0($COUNTER)
    lw $COUNTER1, 4($COUNTER)
    lw $NONCE0, 8($COUNTER)
    lw $NONCE1, 12($COUNTER)

.Lblock_loop:
    @{[vsetvli $VL, $LEN, "e32", "m1", "ta", "ma"]}
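    # VL = number of active 32-bit lanes, i.e. how many independent ChaCha
    # blocks (up to VLEN/32 with e32/m1) this iteration processes in parallel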
    # init chacha const states
    @{[vmv_v_x $V0, $CONST_DATA0]}
    @{[vmv_v_x $V1, $CONST_DATA1]}
    @{[vmv_v_x $V2, $CONST_DATA2]}
    @{[vmv_v_x $V3, $CONST_DATA3]}

    # init chacha key states
    @{[vmv_v_x $V4, $KEY0]}
    @{[vmv_v_x $V5, $KEY1]}
    @{[vmv_v_x $V6, $KEY2]}
    @{[vmv_v_x $V7, $KEY3]}
    @{[vmv_v_x $V8, $KEY4]}
    @{[vmv_v_x $V9, $KEY5]}
    @{[vmv_v_x $V10, $KEY6]}
    @{[vmv_v_x $V11, $KEY7]}

    # init chacha counter states
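    # vid_v fills V12 with the sequence 0,1,...,VL-1, so after adding the
    # scalar counter, lane i holds counter+i and encrypts its own block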
    @{[vid_v $V12]}
    @{[vadd_vx $V12, $V12, $COUNTER0]}
    @{[vmv_v_x $V13, $COUNTER1]}

    # init chacha nonce states
    @{[vmv_v_x $V14, $NONCE0]}
    @{[vmv_v_x $V15, $NONCE1]}

    # load the top-half of input data
    @{[vlsseg_nf_e32_v 8, $V16, $INPUT, $STRIDE]}
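    # the 8-field strided segment load steps through the input in 64-byte
    # strides, de-interleaving words 0..7 of each block into V16..V23 with
    # one block per lane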
    li $CHACHA_LOOP_COUNT, 10
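    # each pass below runs one column round plus one diagonal round, so 10
    # iterations yield the 20 rounds of ChaCha20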
.Lround_loop:
    addi $CHACHA_LOOP_COUNT, $CHACHA_LOOP_COUNT, -1
    @{[chacha_quad_round_group
        $V0, $V4, $V8, $V12,
        $V1, $V5, $V9, $V13,
        $V2, $V6, $V10, $V14,
        $V3, $V7, $V11, $V15]}
    @{[chacha_quad_round_group
        $V0, $V5, $V10, $V15,
        $V1, $V6, $V11, $V12,
        $V2, $V7, $V8, $V13,
        $V3, $V4, $V9, $V14]}
    bnez $CHACHA_LOOP_COUNT, .Lround_loop

    # load the bottom-half of input data
    addi $T0, $INPUT, 32
    @{[vlsseg_nf_e32_v 8, $V24, $T0, $STRIDE]}

    # add chacha top-half initial block states
    @{[vadd_vx $V0, $V0, $CONST_DATA0]}
    @{[vadd_vx $V1, $V1, $CONST_DATA1]}
    @{[vadd_vx $V2, $V2, $CONST_DATA2]}
    @{[vadd_vx $V3, $V3, $CONST_DATA3]}
    @{[vadd_vx $V4, $V4, $KEY0]}
    @{[vadd_vx $V5, $V5, $KEY1]}
    @{[vadd_vx $V6, $V6, $KEY2]}
    @{[vadd_vx $V7, $V7, $KEY3]}

    # xor with the top-half input
    @{[vxor_vv $V16, $V16, $V0]}
    @{[vxor_vv $V17, $V17, $V1]}
    @{[vxor_vv $V18, $V18, $V2]}
    @{[vxor_vv $V19, $V19, $V3]}
    @{[vxor_vv $V20, $V20, $V4]}
    @{[vxor_vv $V21, $V21, $V5]}
    @{[vxor_vv $V22, $V22, $V6]}
    @{[vxor_vv $V23, $V23, $V7]}

    # save the top-half of output
    @{[vssseg_nf_e32_v 8, $V16, $OUTPUT, $STRIDE]}

    # add chacha bottom-half initial block states
    @{[vadd_vx $V8, $V8, $KEY4]}
    @{[vadd_vx $V9, $V9, $KEY5]}
    @{[vadd_vx $V10, $V10, $KEY6]}
    @{[vadd_vx $V11, $V11, $KEY7]}
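    # V0 is free after the top-half xor, so reuse it to rebuild the per-lane
    # counter offsets 0,1,...,VL-1 that were folded into V12 before the rounds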
    @{[vid_v $V0]}
    @{[vadd_vx $V12, $V12, $COUNTER0]}
    @{[vadd_vx $V13, $V13, $COUNTER1]}
    @{[vadd_vx $V14, $V14, $NONCE0]}
    @{[vadd_vx $V15, $V15, $NONCE1]}
    @{[vadd_vv $V12, $V12, $V0]}

    # xor with the bottom-half input
    @{[vxor_vv $V24, $V24, $V8]}
    @{[vxor_vv $V25, $V25, $V9]}
    @{[vxor_vv $V26, $V26, $V10]}
    @{[vxor_vv $V27, $V27, $V11]}
    @{[vxor_vv $V28, $V28, $V12]}
    @{[vxor_vv $V29, $V29, $V13]}
    @{[vxor_vv $V30, $V30, $V14]}
    @{[vxor_vv $V31, $V31, $V15]}
    # save the bottom-half of output
    addi $T0, $OUTPUT, 32
    @{[vssseg_nf_e32_v 8, $V24, $T0, $STRIDE]}

    # update counter
    add $COUNTER0, $COUNTER0, $VL
    sub $LEN, $LEN, $VL
    # advance the input/output pointers by `4 * 16 * VL = 64 * VL` bytes
    slli $T0, $VL, 6
    add $INPUT, $INPUT, $T0
    add $OUTPUT, $OUTPUT, $T0

    bnez $LEN, .Lblock_loop
    ld s0, 0(sp)
    ld s1, 8(sp)
    ld s2, 16(sp)
    ld s3, 24(sp)
    ld s4, 32(sp)
    ld s5, 40(sp)
    ld s6, 48(sp)
    ld s7, 56(sp)
    ld s8, 64(sp)
    ld s9, 72(sp)
    ld s10, 80(sp)
    ld s11, 88(sp)
    addi sp, sp, 96

.Lend:
    ret
.size ChaCha20_ctr32_zvkb,.-ChaCha20_ctr32_zvkb
___

print $code;

close STDOUT or die "error closing STDOUT: $!";