ghash-riscv64.pl 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298
#! /usr/bin/env perl
# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
  8. # $output is the last argument if it looks like a file (it has an extension)
  9. # $flavour is the first argument if it doesn't look like a file
  10. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  11. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  12. $output and open STDOUT,">$output";
  13. my @regs = map("x$_",(0..31));
  14. my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
  15. map("a$_",(0..7)),
  16. map("s$_",(2..11)),
  17. map("t$_",(3..6))
  18. );
  19. my %reglookup;
  20. @reglookup{@regs} = @regs;
  21. @reglookup{@regaliases} = @regs;
  22. # Takes a register name, possibly an alias, and converts it to a register index
  23. # from 0 to 31
  24. sub read_reg {
  25. my $reg = lc shift;
  26. if (!exists($reglookup{$reg})) {
  27. die("Unknown register ".$reg);
  28. }
  29. my $regstr = $reglookup{$reg};
  30. if (!($regstr =~ /^x([0-9]+)$/)) {
  31. die("Could not process register ".$reg);
  32. }
  33. return $1;
  34. }
  35. sub rv64_rev8 {
  36. # Encoding for rev8 rd, rs instruction on RV64
  37. # XXXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
  38. my $template = 0b011010111000_00000_101_00000_0010011;
  39. my $rd = read_reg shift;
  40. my $rs = read_reg shift;
  41. return ".word ".($template | ($rs << 15) | ($rd << 7));
  42. }
  43. sub rv64_clmul {
  44. # Encoding for clmul rd, rs1, rs2 instruction on RV64
  45. # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
  46. my $template = 0b0000101_00000_00000_001_00000_0110011;
  47. my $rd = read_reg shift;
  48. my $rs1 = read_reg shift;
  49. my $rs2 = read_reg shift;
  50. return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
  51. }
  52. sub rv64_clmulh {
  53. # Encoding for clmulh rd, rs1, rs2 instruction on RV64
  54. # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
  55. my $template = 0b0000101_00000_00000_011_00000_0110011;
  56. my $rd = read_reg shift;
  57. my $rs1 = read_reg shift;
  58. my $rs2 = read_reg shift;
  59. return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
  60. }
  61. ################################################################################
  62. # gcm_init_clmul_rv64i_zbb_zbc(u128 Htable[16], const u64 Xi[2])
  63. # Initialization function for clmul-based implementation of GMULT
  64. # This function is used in tandem with gcm_gmult_clmul_rv64i_zbb_zbc
  65. ################################################################################
  66. {
  67. my ($Haddr,$Xi,$TEMP) = ("a0","a1","a2");
  68. $code .= <<___;
  69. .text
  70. .balign 16
  71. .globl gcm_init_clmul_rv64i_zbb_zbc
  72. .type gcm_init_clmul_rv64i_zbb_zbc,\@function
  73. # Initialize clmul-based implementation of galois field multiplication routine.
  74. # gcm_init_clmul_rv64i_zbb_zbc(ctx->Htable, ctx->H.u)
  75. gcm_init_clmul_rv64i_zbb_zbc:
  76. # argument 0 = ctx->Htable (store H here)
  77. # argument 1 = H.u[] (2x 64-bit words) [H_high64, H_low64]
  78. # Simply store [H_high64, H_low64] for later
  79. ld $TEMP,0($Xi)
  80. sd $TEMP,0($Haddr)
  81. ld $TEMP,8($Xi)
  82. sd $TEMP,8($Haddr)
  83. ret
  84. ___
  85. }
  86. ################################################################################
  87. # gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
  88. # Compute GMULT (X*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
  89. # extensions, and the Modified Barrett Reduction technique
  90. ################################################################################
  91. {
  92. my ($Xi,$Haddr,$A1,$A0,$B1,$B0,$C1,$C0,$D1,$D0,$E1,$E0,$TEMP,$TEMP2,$qp_low) =
  93. ("a0","a1","a2","a3","a4","a5","a6","a7","t0","t1","t2","t3","t4","t5","t6");
  94. $code .= <<___;
  95. .text
  96. .balign 16
  97. .globl gcm_gmult_clmul_rv64i_zbb_zbc
  98. .type gcm_gmult_clmul_rv64i_zbb_zbc,\@function
  99. # static void gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
  100. # Computes product of X*H mod f
  101. gcm_gmult_clmul_rv64i_zbb_zbc:
  102. # Load X and H (H is saved previously in gcm_init_clmul_rv64i_zbb_zbc)
  103. ld $A1,0($Xi)
  104. ld $A0,8($Xi)
  105. ld $B1,0($Haddr)
  106. ld $B0,8($Haddr)
  107. li $qp_low,0xe100000000000000
  108. # Perform Katratsuba Multiplication to generate a 255-bit intermediate
  109. # A = [A1:A0]
  110. # B = [B1:B0]
  111. # Let:
  112. # [C1:C0] = A1*B1
  113. # [D1:D0] = A0*B0
  114. # [E1:E0] = (A0+A1)*(B0+B1)
  115. # Then:
  116. # A*B = [C1:C0+C1+D1+E1:D1+C0+D0+E0:D0]
  117. @{[rv64_rev8 $A1, $A1]}
  118. @{[rv64_clmul $C0,$A1,$B1]}
  119. @{[rv64_clmulh $C1,$A1,$B1]}
  120. @{[rv64_rev8 $A0,$A0]}
  121. @{[rv64_clmul $D0,$A0,$B0]}
  122. @{[rv64_clmulh $D1,$A0,$B0]}
  123. xor $TEMP,$A0,$A1
  124. xor $TEMP2,$B0,$B1
  125. @{[rv64_clmul $E0,$TEMP,$TEMP2]}
  126. @{[rv64_clmulh $E1,$TEMP,$TEMP2]}
  127. # 0th term is just C1
  128. # Construct term 1 in E1 (E1 only appears in dword 1)
  129. xor $E1,$E1,$D1
  130. xor $E1,$E1,$C1
  131. xor $E1,$E1,$C0
  132. # Term 1 is E1
  133. # Construct term 2 in E0 (E0 only appears in dword 2)
  134. xor $E0,$E0,$D0
  135. xor $E0,$E0,$C0
  136. xor $E0,$E0,$D1
  137. # Term 2 is E0
  138. # final term is just D0
  139. # X*H is now stored in [C1,E1,E0,D0]
  140. # Left-justify
  141. slli $C1,$C1,1
  142. # Or in the high bit of E1
  143. srli $TEMP,$E1,63
  144. or $C1,$C1,$TEMP
  145. slli $E1,$E1,1
  146. # Or in the high bit of E0
  147. srli $TEMP2,$E0,63
  148. or $E1,$E1,$TEMP2
  149. slli $E0,$E0,1
  150. # Or in the high bit of D0
  151. srli $TEMP,$D0,63
  152. or $E0,$E0,$TEMP
  153. slli $D0,$D0,1
  154. # Barrett Reduction
  155. # c = [E0, D0]
  156. # We want the top 128 bits of the result of c*f
  157. # We'll get this by computing the low-half (most significant 128 bits in
  158. # the reflected domain) of clmul(c,fs)<<1 first, then
  159. # xor in c to complete the calculation
  160. # AA = [AA1:AA0] = [E0,D0] = c
  161. # BB = [BB1:BB0] = [qp_low,0]
  162. # [CC1:CC0] = AA1*BB1
  163. # [DD1:DD0] = AA0*BB0
  164. # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
  165. # Then:
  166. # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
  167. # We only need CC0,DD1,DD0,EE0 to compute the low 128 bits of c * qp_low
  168. ___
  169. my ($CC0,$EE0,$AA1,$AA0,$BB1) = ($A0,$B1,$E0,$D0,$qp_low);
  170. $code .= <<___;
  171. @{[rv64_clmul $CC0,$AA1,$BB1]}
  172. #clmul DD0,AA0,BB0 # BB0 is 0, so DD0 = 0
  173. #clmulh DD1,AA0,BB0 # BB0 is 0, so DD1 = 0
  174. xor $TEMP,$AA0,$AA1
  175. #xor TEMP2,BB0,BB1 # TEMP2 = BB1 = qp_low
  176. @{[rv64_clmul $EE0,$TEMP,$BB1]}
  177. # Result is [N/A:N/A:DD1+CC0+DD0+EE0:DD0]
  178. # Simplifying: [CC0+EE0:0]
  179. xor $TEMP2,$CC0,$EE0
  180. # Shift left by 1 to correct for bit reflection
  181. slli $TEMP2,$TEMP2,1
  182. # xor into c = [E0,D0]
  183. # Note that only E0 is affected
  184. xor $E0,$E0,$TEMP2
  185. # Now, q = [E0,D0]
  186. # The final step is to compute clmul(q,[qp_low:0])<<1
  187. # The leftmost 128 bits are the reduced result.
  188. # Once again, we use Karatsuba multiplication, but many of the terms
  189. # simplify or cancel out.
  190. # AA = [AA1:AA0] = [E0,D0] = c
  191. # BB = [BB1:BB0] = [qp_low,0]
  192. # [CC1:CC0] = AA1*BB1
  193. # [DD1:DD0] = AA0*BB0
  194. # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
  195. # Then:
  196. # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
  197. # We need CC1,CC0,DD0,DD1,EE1,EE0 to compute the leftmost 128 bits of AA*BB
  198. ___
  199. my ($AA1,$AA0,$BB1,$CC1,$CC0,$EE1,$EE0) = ($E0,$D0,$qp_low,$A0,$A1,$C0,$B0);
  200. $code .= <<___;
  201. @{[rv64_clmul $CC0,$AA1,$BB1]}
  202. @{[rv64_clmulh $CC1,$AA1,$BB1]}
  203. #clmul DD0,AA0,BB0 # BB0 = 0 so DD0 = 0
  204. #clmulh DD1,AA0,BB0 # BB0 = 0 so DD1 = 0
  205. xor $TEMP,$AA0,$AA1
  206. #xor TEMP2,BB0,BB1 # BB0 = 0 to TEMP2 == BB1 == qp_low
  207. @{[rv64_clmul $EE0,$TEMP,$BB1]}
  208. @{[rv64_clmulh $EE1,$TEMP,$BB1]}
  209. # Need the DD1+CC0+DD0+EE0 term to shift its leftmost bit into the
  210. # intermediate result.
  211. # This is just CC0+EE0, store it in TEMP
  212. xor $TEMP,$CC0,$EE0
  213. # Result is [CC1:CC0+CC1+EE1:(a single bit)]<<1
  214. # Combine into [CC1:CC0]
  215. xor $CC0,$CC0,$CC1
  216. xor $CC0,$CC0,$EE1
  217. # Shift 128-bit quantity, xor in [C1,E1] and store
  218. slli $CC1,$CC1,1
  219. srli $TEMP2,$CC0,63
  220. or $CC1,$CC1,$TEMP2
  221. # xor in C1
  222. xor $CC1,$CC1,$C1
  223. @{[rv64_rev8 $CC1,$CC1]}
  224. slli $CC0,$CC0,1
  225. srli $TEMP,$TEMP,63
  226. or $CC0,$CC0,$TEMP
  227. # xor in E1
  228. xor $CC0,$CC0,$E1
  229. @{[rv64_rev8 $CC0,$CC0]}
  230. sd $CC1,0(a0)
  231. sd $CC0,8(a0)
  232. ret
  233. ___
  234. }
  235. print $code;
  236. close STDOUT or die "error closing STDOUT: $!";