aes-gcm-riscv64-zvkb-zvkg-zvkned.pl

#! /usr/bin/env perl
# This file is dual-licensed, meaning that you can use it under your
# choice of either of the following two licenses:
#
# Copyright 2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You can obtain
# a copy in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# or
#
# Copyright (c) 2023, Jerry Shih <jerry.shih@sifive.com>
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# The generated code of this file depends on the following RISC-V extensions:
# - RV64I
# - RISC-V Vector ('V') with VLEN >= 128
# - RISC-V Vector Cryptography Bit-manipulation extension ('Zvkb')
# - RISC-V Vector GCM/GMAC extension ('Zvkg')
# - RISC-V Vector AES Block Cipher extension ('Zvkned')
# - RISC-V Zicclsm (Main memory supports misaligned loads/stores)
# Reference: https://github.com/riscv/riscv-crypto/issues/192#issuecomment-1270447575
#
# Assume we have 12 GCM blocks and we parallelize the GCM computation 4 blocks
# at a time:
#   Tag = M0*H^12 + M1*H^11 + M2*H^10 + M3*H^9 +
#         M4*H^8  + M5*H^7  + M6*H^6  + M7*H^5 +
#         M8*H^4  + M9*H^3  + M10*H^2 + M11*H^1
# We could rewrite the formula into:
#   T0 = 0
#   T1 = (T0+M0)*H^4   T2 = (T0+M1)*H^4   T3 = (T0+M2)*H^4    T4 = (T0+M3)*H^4
#   T5 = (T1+M4)*H^4   T6 = (T2+M5)*H^4   T7 = (T3+M6)*H^4    T8 = (T4+M7)*H^4
#   T9 = (T5+M8)*H^4   T10 = (T6+M9)*H^3  T11 = (T7+M10)*H^2  T12 = (T8+M11)*H^1
#   Tag = T9 + T10 + T11 + T12
#
# We multiply with [H^4, H^4, H^4, H^4] in every step except the last iteration.
# The last iteration multiplies with [H^4, H^3, H^2, H^1].
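#
# As a quick sanity check of the rewrite above (12 blocks, 4-wide
# parallelism), expand the first column of the T-table:
#   T1 = (T0+M0)*H^4 = M0*H^4
#   T5 = (T1+M4)*H^4 = M0*H^8  + M4*H^4
#   T9 = (T5+M8)*H^4 = M0*H^12 + M4*H^8 + M8*H^4
# which matches the M0/M4/M8 terms of the Tag sum; the other three columns
# recover the remaining terms in the same way, so Tag = T9 + T10 + T11 + T12.
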
use strict;
use warnings;

use FindBin qw($Bin);
use lib "$Bin";
use lib "$Bin/../../perlasm";
use riscv;

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$output and open STDOUT,">$output";

my $code=<<___;
.text
___

{
my ($INP, $OUTP, $LEN, $KEYP, $IVP, $XIP) = ("a0", "a1", "a2", "a3", "a4", "a5");
my ($T0, $T1, $T2, $T3) = ("t0", "t1", "t2", "t3");
my ($PADDING_LEN32) = ("t4");
my ($LEN32) = ("t5");
my ($CTR) = ("t6");
my ($FULL_BLOCK_LEN32) = ("a6");
my ($ORIGINAL_LEN32) = ("a7");
my ($PROCESSED_LEN) = ("a0");
my ($CTR_MASK) = ("v0");
my ($INPUT_PADDING_MASK) = ("v0");
my ($V0, $V1, $V2, $V3, $V4, $V5, $V6, $V7,
    $V8, $V9, $V10, $V11, $V12, $V13, $V14, $V15,
    $V16, $V17, $V18, $V19, $V20, $V21, $V22, $V23,
    $V24, $V25, $V26, $V27, $V28, $V29, $V30, $V31,
) = map("v$_",(0..31));
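
# Rough register allocation used below, as implied by the subroutines that
# follow:
#   v1-v11  AES round keys (for AES-192/256, v11 doubles as a scratch slot
#           for the round keys that are reloaded on the fly)
#   v12     little-endian counter blocks
#   v16     H^n multiplier splatted to every block position
#   v20     partial GHASH tags
#   v24     plaintext/ciphertext input
#   v28     AES-CTR input / keystream output
#   v0      mask register (ctr mask or input padding mask)
#   v2, v4, v8 and v31 serve as temporaries during setup and the final tag
#   computation.
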
# Do aes-128 enc.
sub aes_128_cipher_body {
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}

# Do aes-192 enc.
sub aes_192_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 4
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V11]}
# Load key 8
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 13
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}

# Do aes-256 enc.
sub aes_256_cipher_body {
my $TMP_REG = shift;
my $code=<<___;
# Load key 3
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesz_vs $V28, $V1]}
@{[vaesem_vs $V28, $V2]}
@{[vaesem_vs $V28, $V11]}
# Load key 6
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V3]}
@{[vaesem_vs $V28, $V4]}
@{[vaesem_vs $V28, $V11]}
# Load key 9
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V5]}
@{[vaesem_vs $V28, $V6]}
@{[vaesem_vs $V28, $V11]}
# Load key 12
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V7]}
@{[vaesem_vs $V28, $V8]}
@{[vaesem_vs $V28, $V11]}
# Load key 15
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
addi $TMP_REG, $KEYP, 224
@{[vle32_v $V11, $TMP_REG]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vaesem_vs $V28, $V9]}
@{[vaesem_vs $V28, $V10]}
@{[vaesef_vs $V28, $V11]}
___
return $code;
}
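
# Note on the byte offsets used above: the round keys are laid out
# contiguously at a 16-byte stride from $KEYP, so the (1-based) k-th round
# key lives at byte offset 16*(k-1). For example, "key 13" of AES-192 is
# loaded from offset 16*12 = 192, and "key 15" of AES-256 from offset
# 16*14 = 224.
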
sub handle_padding_in_first_round {
my $TMP_REG = shift;
my $code=<<___;
bnez $PADDING_LEN32, 1f

## without padding
# Store ciphertext/plaintext
@{[vse32_v $V28, $OUTP]}
j 2f

## with padding
1:
# Store ciphertext/plaintext using mask
@{[vse32_v $V28, $OUTP, $INPUT_PADDING_MASK]}
# Fill zero for the padding blocks
@{[vsetvli "zero", $PADDING_LEN32, "e32", "m4", "tu", "ma"]}
@{[vmv_v_i $V28, 0]}
# We have used the mask register for `INPUT_PADDING_MASK` before. We need to
# set up the ctr mask again.
# ctr mask : [000100010001....]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $TMP_REG, 0b10001000
@{[vmv_v_x $CTR_MASK, $TMP_REG]}
2:
___
return $code;
}
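
# About the ctr mask written above: `vmv.v.x` with 0b10001000 under `e8` sets
# mask bits 3 and 7 of every byte, so under SEW=32 the active elements are the
# indices equal to 3 mod 4, i.e. the last 32-bit word of each 128-bit block,
# which is exactly the big-endian counter field of a GCM counter block.
# With VLEN=128, for instance, the active e32 indices are 3, 7, 11 and 15.
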
# Do aes-128 enc for first round.
sub aes_128_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# Load all 11 aes round keys to v1-v11 registers.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V2, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V3, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V4, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V5, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V6, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V7, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V8, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V9, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V10, $KEYP]}
addi $KEYP, $KEYP, 16
@{[vle32_v $V11, $KEYP]}

# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_128_cipher_body]}

# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}

@{[handle_padding_in_first_round $TMP_REG]}

add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}

# Do aes-192 enc for first round.
sub aes_192_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 3, 5, 6, 7, 9, 10, 11 and 12th keys.
# The following keys will be loaded in the aes body:
# 4, 8 and 13th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 3
addi $TMP_REG, $KEYP, 32
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 6
addi $TMP_REG, $KEYP, 80
@{[vle32_v $V5, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V6, $TMP_REG]}
# key 9
addi $TMP_REG, $KEYP, 128
@{[vle32_v $V7, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V8, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V9, $TMP_REG]}
# key 12
addi $TMP_REG, $KEYP, 176
@{[vle32_v $V10, $TMP_REG]}

# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_192_cipher_body $TMP_REG]}

# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}

@{[handle_padding_in_first_round $TMP_REG]}

add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}

# Do aes-256 enc for first round.
sub aes_256_first_round {
my $PTR_OFFSET_REG = shift;
my $TMP_REG = shift;
my $code=<<___;
# We run out of 32 vector registers, so we just preserve some round keys
# and load the remaining round keys inside the aes body.
# We keep the round keys for:
# 1, 2, 4, 5, 7, 8, 10, 11, 13 and 14th keys.
# The following keys will be loaded in the aes body:
# 3, 6, 9, 12 and 15th keys.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
# key 1
@{[vle32_v $V1, $KEYP]}
# key 2
addi $TMP_REG, $KEYP, 16
@{[vle32_v $V2, $TMP_REG]}
# key 4
addi $TMP_REG, $KEYP, 48
@{[vle32_v $V3, $TMP_REG]}
# key 5
addi $TMP_REG, $KEYP, 64
@{[vle32_v $V4, $TMP_REG]}
# key 7
addi $TMP_REG, $KEYP, 96
@{[vle32_v $V5, $TMP_REG]}
# key 8
addi $TMP_REG, $KEYP, 112
@{[vle32_v $V6, $TMP_REG]}
# key 10
addi $TMP_REG, $KEYP, 144
@{[vle32_v $V7, $TMP_REG]}
# key 11
addi $TMP_REG, $KEYP, 160
@{[vle32_v $V8, $TMP_REG]}
# key 13
addi $TMP_REG, $KEYP, 192
@{[vle32_v $V9, $TMP_REG]}
# key 14
addi $TMP_REG, $KEYP, 208
@{[vle32_v $V10, $TMP_REG]}

# We already have the ciphertext/plaintext and ctr data for the first round.
@{[aes_256_cipher_body $TMP_REG]}

# Compute AES ctr result.
@{[vxor_vv $V28, $V28, $V24]}

@{[handle_padding_in_first_round $TMP_REG]}

add $INP, $INP, $PTR_OFFSET_REG
add $OUTP, $OUTP, $PTR_OFFSET_REG
___
return $code;
}

sub aes_gcm_init {
my $code=<<___;
# Compute the AES-GCM full-block e32 length for `LMUL=4`. We will handle
# multiple AES-GCM blocks at the same time within a `LMUL=4` register group.
# The AES-GCM's SEW is e32 and EGW is 128 bits.
#   FULL_BLOCK_LEN32 = (VLEN*LMUL)/(EGW) * (EGW/SEW) = (VLEN*4)/(32*4) * 4
#                    = (VLEN*4)/32
# We could get the block_num using the VL value of `vsetvli with e32, m4`.
@{[vsetvli $FULL_BLOCK_LEN32, "zero", "e32", "m4", "ta", "ma"]}

# If `LEN32 % FULL_BLOCK_LEN32` is not equal to zero, we fill in zero padding
# data so that we can always handle FULL_BLOCK_LEN32 e32 elements in every
# iteration.

## Prepare the H^n multiplier in v16 for the GCM multiplication. The `n` is
## the gcm block number in a LMUL=4 register group.
##   n = ((VLEN*LMUL)/(32*4)) = ((VLEN*4)/(32*4))
##     = (VLEN/32)
## We could use vsetvli with `e32, m1` to compute the `n` number.
@{[vsetvli $T0, "zero", "e32", "m1", "ta", "ma"]}

# The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
addi $T1, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $T1]}

# Compute the H^n
li $T1, 1
1:
@{[vgmul_vv $V31, $V31]}
slli $T1, $T1, 1
bltu $T1, $T0, 1b

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V16, 0]}
@{[vaesz_vs $V16, $V31]}

#### Load plaintext into v24 and handle padding. We also load the init tag
#### data into v20 and prepare the AES ctr input data into v12 and v28.
@{[vmv_v_i $V20, 0]}

## Prepare the AES ctr input data into v12.
# Setup ctr input mask.
# ctr mask : [000100010001....]
# Note: The actual vl should be `FULL_BLOCK_LEN32/4 * 2`, but we just use
# `FULL_BLOCK_LEN32` here.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e8", "m1", "ta", "ma"]}
li $T0, 0b10001000
@{[vmv_v_x $CTR_MASK, $T0]}

# Load IV.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V31, $IVP]}

# Convert the big-endian counter into little-endian.
@{[vsetivli "zero", 4, "e32", "m1", "ta", "mu"]}
@{[vrev8_v $V31, $V31, $CTR_MASK]}

# Splat the `single block of IV` to v12
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V12, 0]}
@{[vaesz_vs $V12, $V31]}

# Prepare the ctr counter into v8
# v8: [x, x, x, 0, x, x, x, 1, x, x, x, 2, ...]
@{[viota_m $V8, $CTR_MASK, $CTR_MASK]}

# Merge IV and ctr counter into v12.
# v12: [x, x, x, count+0, x, x, x, count+1, ...]
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vadd_vv $V12, $V12, $V8, $CTR_MASK]}

li $PADDING_LEN32, 0
# Get the SEW32 size in the first round.
# If `LEN32&(FULL_BLOCK_LEN32-1)` is non-zero, we will have leading zero
# padding.
addi $T0, $FULL_BLOCK_LEN32, -1
and $T0, $T0, $LEN32
beqz $T0, 1f

## with padding
sub $LEN32, $LEN32, $T0
sub $PADDING_LEN32, $FULL_BLOCK_LEN32, $T0
# padding block size
srli $T1, $PADDING_LEN32, 2
# padding byte size
slli $T2, $PADDING_LEN32, 2

# Adjust the ctr counter to make the counter start from `counter+0` for the
# first non-padding block.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vsub_vx $V12, $V12, $T1, $CTR_MASK]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}

# Prepare the mask for input loading in the first round. We use
# `VL=FULL_BLOCK_LEN32` with the mask in the first round.
# Adjust input ptr.
sub $INP, $INP, $T2
# Adjust output ptr.
sub $OUTP, $OUTP, $T2
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e16", "m2", "ta", "ma"]}
@{[vid_v $V2]}
# We don't use the pseudo instruction `vmsgeu` here. Use `vmsgtu` instead.
# The original code is:
#   vmsgeu.vx $INPUT_PADDING_MASK, $V2, $PADDING_LEN32
addi $T0, $PADDING_LEN32, -1
@{[vmsgtu_vx $INPUT_PADDING_MASK, $V2, $T0]}
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vmv_v_i $V24, 0]}
# Load the input for length FULL_BLOCK_LEN32 with mask.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vle32_v $V24, $INP, $INPUT_PADDING_MASK]}

# Load the init `Xi` data to v20 with preceding zero padding.
# Adjust Xi ptr.
sub $T0, $XIP, $T2
# Load for length `zero-padding-e32-length + 4`.
addi $T1, $PADDING_LEN32, 4
@{[vsetvli "zero", $T1, "e32", "m4", "tu", "mu"]}
@{[vle32_v $V20, $T0, $INPUT_PADDING_MASK]}
j 2f

1:
## without padding
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}

# Load the init Xi data to v20.
@{[vsetivli "zero", 4, "e32", "m1", "tu", "ma"]}
@{[vle32_v $V20, $XIP]}

# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
@{[vmv_v_v $V28, $V12]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
2:
___
return $code;
}
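
# For a concrete example, assume VLEN=128: the `vsetvli e32, m4` above gives
# FULL_BLOCK_LEN32 = (128*4)/32 = 16 (i.e. four 128-bit AES/GCM blocks per
# LMUL=4 register group) and n = VLEN/32 = 4, so v16 ends up holding H^4 in
# every block position, matching the [H^4, H^4, H^4, H^4] multiplier from the
# header comment.
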
sub prepare_input_and_ctr {
my $PTR_OFFSET_REG = shift;
my $code=<<___;
@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "mu"]}
# Increase ctr in v12.
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
sub $LEN32, $LEN32, $FULL_BLOCK_LEN32
# Load plaintext into v24
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "ma"]}
@{[vle32_v $V24, $INP]}
# Prepare the AES ctr input into v28.
# The ctr data uses big-endian form.
@{[vmv_v_v $V28, $V12]}
add $INP, $INP, $PTR_OFFSET_REG
@{[vsetvli "zero", "zero", "e32", "m4", "ta", "mu"]}
@{[vrev8_v $V28, $V28, $CTR_MASK]}
___
return $code;
}
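
# Each call above consumes one full register group: LEN32 drops by
# FULL_BLOCK_LEN32 e32 elements, the input pointer advances by the
# FULL_BLOCK_LEN32*4 bytes passed in as $PTR_OFFSET_REG, and the masked
# counter words in v12 are bumped by $CTR (= FULL_BLOCK_LEN32/4 blocks).
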
# Store the current CTR back to IV buffer.
sub store_current_ctr {
my $code=<<___;
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
# Update current ctr value to v12
@{[vadd_vx $V12, $V12, $CTR, $CTR_MASK]}
# Convert ctr to big-endian counter.
@{[vrev8_v $V12, $V12, $CTR_MASK]}
@{[vse32_v $V12, $IVP, $CTR_MASK]}
___
return $code;
}
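
# Note: with vl=4 and the ctr mask, only element 3 is active here, so the
# sequence above updates and byte-swaps just the counter word and stores it
# back to ivec[12..15]; the rest of the IV buffer is left untouched.
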
# Compute the final tag into v0 from the partial tags in v20.
sub compute_final_tag {
my $TMP_REG = shift;
my $code=<<___;
# The H is at `gcm128_context.Htable[0]` (addr(Xi)+16*2).
# Load H to v1
addi $TMP_REG, $XIP, 32
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vle32_v $V1, $TMP_REG]}

# Multiply H for each partial tag and XOR them together.
# Handle 1st partial tag
@{[vmv_v_v $V0, $V20]}
@{[vgmul_vv $V0, $V1]}
# Handle 2nd to N-th partial tags
li $TMP_REG, 4
1:
@{[vsetivli "zero", 4, "e32", "m4", "ta", "ma"]}
@{[vslidedown_vx $V4, $V20, $TMP_REG]}
@{[vsetivli "zero", 4, "e32", "m1", "ta", "ma"]}
@{[vghsh_vv $V0, $V1, $V4]}
addi $TMP_REG, $TMP_REG, 4
blt $TMP_REG, $FULL_BLOCK_LEN32, 1b
___
return $code;
}
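
# The loop above folds the partial tags serially: starting from v0 = tag0*H,
# each step computes v0 = (v0 + tag_i)*H, so the result is
#   tag0*H^n + tag1*H^(n-1) + ... + tag_{n-1}*H^1,
# which realizes the [H^4, H^3, H^2, H^1] step described in the header
# comment for the n=4 case.
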
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt:
srli $T0, $LEN, 4
beqz $T0, .Lenc_end
slli $LEN32, $T0, 2

mv $ORIGINAL_LEN32, $LEN32

@{[aes_gcm_init]}

# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10

beq $T0, $T1, aes_gcm_enc_blocks_256
beq $T0, $T2, aes_gcm_enc_blocks_192
beq $T0, $T3, aes_gcm_enc_blocks_128

.Lenc_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_encrypt
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_128_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_128:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_128_end
@{[vghsh_vv $V20, $V16, $V28]}

@{[prepare_input_and_ctr $T0]}

@{[aes_128_cipher_body]}

# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Lenc_blocks_128

.Lenc_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_128,.-aes_gcm_enc_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_192_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_192:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_192_end
@{[vghsh_vv $V20, $V16, $V28]}

@{[prepare_input_and_ctr $T0]}

@{[aes_192_cipher_body $T1]}

# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Lenc_blocks_192

.Lenc_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_192,.-aes_gcm_enc_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_enc_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_256_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Lenc_blocks_256:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Lenc_blocks_256_end
@{[vghsh_vv $V20, $V16, $V28]}

@{[prepare_input_and_ctr $T0]}

@{[aes_256_cipher_body $T1]}

# Compute AES ctr ciphertext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store ciphertext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Lenc_blocks_256

.Lenc_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V28]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_enc_blocks_256,.-aes_gcm_enc_blocks_256
___
}
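
# The decrypt path below mirrors the encrypt path above; the main difference
# is which vector feeds GHASH: encryption hashes the ciphertext it just
# produced (v28), while decryption hashes the ciphertext loaded from the
# input (v24).
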
################################################################################
# size_t rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt(const unsigned char *in,
#                                               unsigned char *out, size_t len,
#                                               const void *key,
#                                               unsigned char ivec[16], u64 *Xi);
{
$code .= <<___;
.p2align 3
.globl rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
.type rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,\@function
rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt:
srli $T0, $LEN, 4
beqz $T0, .Ldec_end
slli $LEN32, $T0, 2

mv $ORIGINAL_LEN32, $LEN32

@{[aes_gcm_init]}

# Load number of rounds
lwu $T0, 240($KEYP)
li $T1, 14
li $T2, 12
li $T3, 10

beq $T0, $T1, aes_gcm_dec_blocks_256
beq $T0, $T2, aes_gcm_dec_blocks_192
beq $T0, $T3, aes_gcm_dec_blocks_128

.Ldec_end:
li $PROCESSED_LEN, 0
ret
.size rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt,.-rv64i_zvkb_zvkg_zvkned_aes_gcm_decrypt
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_128:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_128_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_128:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_128_end
@{[vghsh_vv $V20, $V16, $V24]}

@{[prepare_input_and_ctr $T0]}

@{[aes_128_cipher_body]}

# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Ldec_blocks_128

.Ldec_blocks_128_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_128,.-aes_gcm_dec_blocks_128
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_192:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_192_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_192:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_192_end
@{[vghsh_vv $V20, $V16, $V24]}

@{[prepare_input_and_ctr $T0]}

@{[aes_192_cipher_body $T1]}

# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Ldec_blocks_192

.Ldec_blocks_192_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_192,.-aes_gcm_dec_blocks_192
___

$code .= <<___;
.p2align 3
aes_gcm_dec_blocks_256:
srli $CTR, $FULL_BLOCK_LEN32, 2
slli $T0, $FULL_BLOCK_LEN32, 2

@{[aes_256_first_round $T0, $T1]}

@{[vsetvli "zero", $FULL_BLOCK_LEN32, "e32", "m4", "ta", "ma"]}
.Ldec_blocks_256:
# Compute the partial tags.
# The partial tags will be multiplied with [H^n, H^n, ..., H^n]:
#   [tag0, tag1, ...] =
#     ([tag0, tag1, ...] + [ciphertext0, ciphertext1, ...]) * [H^n, H^n, ..., H^n]
# We will skip the [H^n, H^n, ..., H^n] multiplication for the last round.
beqz $LEN32, .Ldec_blocks_256_end
@{[vghsh_vv $V20, $V16, $V24]}

@{[prepare_input_and_ctr $T0]}

@{[aes_256_cipher_body $T1]}

# Compute AES ctr plaintext result.
@{[vxor_vv $V28, $V28, $V24]}

# Store plaintext
@{[vse32_v $V28, $OUTP]}
add $OUTP, $OUTP, $T0

j .Ldec_blocks_256

.Ldec_blocks_256_end:
# Add ciphertext into partial tag
@{[vxor_vv $V20, $V20, $V24]}

@{[store_current_ctr]}

@{[compute_final_tag $T1]}

# Save the final tag
@{[vse32_v $V0, $XIP]}

# Return the processed size.
slli $PROCESSED_LEN, $ORIGINAL_LEN32, 2
ret
.size aes_gcm_dec_blocks_256,.-aes_gcm_dec_blocks_256
___
}
}

print $code;

close STDOUT or die "error closing STDOUT: $!";