rsaz-3k-avx512.pl

# Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2021, Intel Corporation. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
#
# Originally written by Sergey Kirillov and Andrey Matyukov
# Intel Corporation
#
# March 2021
#
# Initial release.
#
# Implementation utilizes 256-bit (ymm) registers to avoid frequency scaling issues.
#
# IceLake-Client @ 1.3GHz
# |---------+-----------------------+---------------+-------------|
# |         | OpenSSL 3.0.0-alpha15 | this          | Unit        |
# |---------+-----------------------+---------------+-------------|
# | rsa3072 | 6 397 637             | 2 866 593     | cycles/sign |
# |         | 203.2                 | 453.5 / +123% | sign/s      |
# |---------+-----------------------+---------------+-------------|
#
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512ifma=0;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
        =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
    $avx512ifma = ($1>=2.26);
}

if (!$avx512ifma && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) {
    $avx512ifma = ($1==2.11 && $2>=8) + ($1>=2.12);
}

if (!$avx512ifma && `$ENV{CC} -v 2>&1`
    =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
    my $ver = $3 + $4/100.0 + $5/10000.0;  # 3.1.0->3.01, 3.10.1->3.1001
    if ($1) {
        # Apple conditions, they use a different version series, see
        # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
        # clang 7.0.0 is Apple clang 10.0.1
        $avx512ifma = ($ver>=10.0001);
    } else {
        $avx512ifma = ($ver>=7.0);
    }
}
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction
# step specified in the original algorithm, because according to Lemma 1 from
# the paper [2] the result will always be < 2*m and can be used as a direct
# input to the next AMM iteration. This post-condition holds provided the
# correct parameter |s| (in the notation of Lemma 1 from [2]) is chosen, i.e.
# s >= n + 2*k, which matches our case: 1560 >= 1536 + 2*1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
#     DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
#     DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
#                                    const BN_ULONG *a,
#                                    const BN_ULONG *b,
#                                    const BN_ULONG *m,
#                                    BN_ULONG k0);
###############################################################################
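#
# A minimal scalar model of the digit-serial AMM implemented below (an
# illustrative, hedged sketch only, not part of the emitted assembly;
# R stands for the redundant accumulator held in the ymm registers):
#
#   for i in 0..29:                        # one iteration per digit of b
#       acc = R + a * b[i]                 # multiply-accumulate one digit
#       yi  = (acc * k0) mod 2^52          # Montgomery quotient digit
#       acc = acc + m * yi                 # zeroes the lowest 52 bits
#       R   = acc / 2^52                   # exact division, drop low digit
#   # no final conditional subtraction: R < 2*m by Lemma 1 of [2]
#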
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52     = "%rax";
my $acc0_0     = "%r9";
my $acc0_0_low = "%r9d";
my $acc0_1     = "%r15";
my $acc0_1_low = "%r15d";
my $b_ptr      = "%r11";

my $iter = "%ebx";

my $zero = "%ymm0";
my $Bi   = "%ymm1";
my $Yi   = "%ymm2";
my ($R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h) = map("%ymm$_",(3..10));
my ($R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h) = map("%ymm$_",(11..18));

# Registers mapping for normalization
my ($T0,$T0h,$T1,$T1h,$T2,$T2h,$T3,$T3h) = ("$zero", "$Bi", "$Yi", map("%ymm$_", (19..23)));

sub amm52x30_x1() {
# _data_offset - offset in the |a| or |m| arrays pointing to the beginning
#                of data for the corresponding AMM operation;
# _b_offset    - offset in the |b| array pointing to the next qword digit;
my ($_data_offset,$_b_offset,$_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h,$_k0) = @_;
my $_R0_xmm = $_R0;
$_R0_xmm =~ s/%y/%x/;
$code.=<<___;
    movq    $_b_offset($b_ptr), %r13        # b[i]

    vpbroadcastq    %r13, $Bi               # broadcast b[i]
    movq    $_data_offset($a), %rdx
    mulx    %r13, %r13, %r12                # a[0]*b[i] = (t0,t2)
    addq    %r13, $_acc                     # acc += t0
    movq    %r12, %r10
    adcq    \$0, %r10                       # t2 += CF

    movq    $_k0, %r13
    imulq   $_acc, %r13                     # acc * k0
    andq    $mask52, %r13                   # yi = (acc * k0) & mask52

    vpbroadcastq    %r13, $Yi               # broadcast y[i]
    movq    $_data_offset($m), %rdx
    mulx    %r13, %r13, %r12                # yi * m[0] = (t0,t1)
    addq    %r13, $_acc                     # acc += t0
    adcq    %r12, %r10                      # t2 += (t1 + CF)

    shrq    \$52, $_acc
    salq    \$12, %r10
    or      %r10, $_acc                     # acc = ((acc >> 52) | (t2 << 12))

    vpmadd52luq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52luq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52luq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52luq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52luq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52luq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52luq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52luq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52luq `$_data_offset+64*3+32`($m), $Yi, $_R3h

    # Shift accumulators right by 1 qword, zero extending the highest one
    valignq \$1, $_R0, $_R0h, $_R0
    valignq \$1, $_R0h, $_R1, $_R0h
    valignq \$1, $_R1, $_R1h, $_R1
    valignq \$1, $_R1h, $_R2, $_R1h
    valignq \$1, $_R2, $_R2h, $_R2
    valignq \$1, $_R2h, $_R3, $_R2h
    valignq \$1, $_R3, $_R3h, $_R3
    valignq \$1, $_R3h, $zero, $_R3h

    vmovq   $_R0_xmm, %r13
    addq    %r13, $_acc                     # acc += R0[0]

    vpmadd52huq `$_data_offset+64*0`($a), $Bi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($a), $Bi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($a), $Bi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($a), $Bi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($a), $Bi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($a), $Bi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($a), $Bi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($a), $Bi, $_R3h

    vpmadd52huq `$_data_offset+64*0`($m), $Yi, $_R0
    vpmadd52huq `$_data_offset+64*0+32`($m), $Yi, $_R0h
    vpmadd52huq `$_data_offset+64*1`($m), $Yi, $_R1
    vpmadd52huq `$_data_offset+64*1+32`($m), $Yi, $_R1h
    vpmadd52huq `$_data_offset+64*2`($m), $Yi, $_R2
    vpmadd52huq `$_data_offset+64*2+32`($m), $Yi, $_R2h
    vpmadd52huq `$_data_offset+64*3`($m), $Yi, $_R3
    vpmadd52huq `$_data_offset+64*3+32`($m), $Yi, $_R3h
___
}
# Normalization routine: handles carry bits and puts the bignum qwords into
# normalized 2^52 representation.
#
# Uses %r8-14,%e[abcd]x
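#
# A rough scalar model of what the vector code below computes (a hedged
# sketch only; d[0..29] stand for the 52-bit digits kept in R0..R3h):
#
#   carry = 0
#   for i in 0..29:
#       d[i] += carry            # add carry from the lower digit
#       carry = d[i] >> 52       # extract the new carry
#       d[i] &= 2^52 - 1         # keep the normalized 52-bit digit
#
# The code below avoids this serial ripple: it extracts all per-digit
# carries at once (vpsrlq/valignq), adds them in parallel (vpaddq), and
# resolves the remaining one-bit ripple via gt/eq compare masks that are
# propagated with add/adc over GPR bytes.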
sub amm52x30_x1_norm {
my ($_acc,$_R0,$_R0h,$_R1,$_R1h,$_R2,$_R2h,$_R3,$_R3h) = @_;
$code.=<<___;
    # Put accumulator to low qword in R0
    vpbroadcastq    $_acc, $T0
    vpblendd \$3, $T0, $_R0, $_R0

    # Extract "carries" (12 high bits) from each QW of the bignum
    # Save them to LSB of QWs in T0..Tn
    vpsrlq  \$52, $_R0,  $T0
    vpsrlq  \$52, $_R0h, $T0h
    vpsrlq  \$52, $_R1,  $T1
    vpsrlq  \$52, $_R1h, $T1h
    vpsrlq  \$52, $_R2,  $T2
    vpsrlq  \$52, $_R2h, $T2h
    vpsrlq  \$52, $_R3,  $T3
    vpsrlq  \$52, $_R3h, $T3h

    # "Shift left" T0..Tn by 1 QW
    valignq \$3, $T3,  $T3h, $T3h
    valignq \$3, $T2h, $T3,  $T3
    valignq \$3, $T2,  $T2h, $T2h
    valignq \$3, $T1h, $T2,  $T2
    valignq \$3, $T1,  $T1h, $T1h
    valignq \$3, $T0h, $T1,  $T1
    valignq \$3, $T0,  $T0h, $T0h
    valignq \$3, .Lzeros(%rip), $T0, $T0

    # Drop "carries" from R0..Rn QWs
    vpandq  .Lmask52x4(%rip), $_R0,  $_R0
    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq  .Lmask52x4(%rip), $_R1,  $_R1
    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq  .Lmask52x4(%rip), $_R2,  $_R2
    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq  .Lmask52x4(%rip), $_R3,  $_R3
    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h

    # Sum R0..Rn with corresponding adjusted carries
    vpaddq  $T0,  $_R0,  $_R0
    vpaddq  $T0h, $_R0h, $_R0h
    vpaddq  $T1,  $_R1,  $_R1
    vpaddq  $T1h, $_R1h, $_R1h
    vpaddq  $T2,  $_R2,  $_R2
    vpaddq  $T2h, $_R2h, $_R2h
    vpaddq  $T3,  $_R3,  $_R3
    vpaddq  $T3h, $_R3h, $_R3h

    # Now handle carry bits from this addition
    # Get mask of QWs whose 52-bit parts overflow
    vpcmpuq \$6,.Lmask52x4(%rip),${_R0},%k1     # OP=nle (i.e. gt)
    vpcmpuq \$6,.Lmask52x4(%rip),${_R0h},%k2
    kmovb   %k1,%r14d
    kmovb   %k2,%r13d
    shl     \$4,%r13b
    or      %r13b,%r14b

    vpcmpuq \$6,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq \$6,.Lmask52x4(%rip),${_R1h},%k2
    kmovb   %k1,%r13d
    kmovb   %k2,%r12d
    shl     \$4,%r12b
    or      %r12b,%r13b

    vpcmpuq \$6,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq \$6,.Lmask52x4(%rip),${_R2h},%k2
    kmovb   %k1,%r12d
    kmovb   %k2,%r11d
    shl     \$4,%r11b
    or      %r11b,%r12b

    vpcmpuq \$6,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq \$6,.Lmask52x4(%rip),${_R3h},%k2
    kmovb   %k1,%r11d
    kmovb   %k2,%r10d
    shl     \$4,%r10b
    or      %r10b,%r11b

    addb    %r14b,%r14b
    adcb    %r13b,%r13b
    adcb    %r12b,%r12b
    adcb    %r11b,%r11b

    # Get mask of QWs whose 52-bit parts saturated
    vpcmpuq \$0,.Lmask52x4(%rip),${_R0},%k1     # OP=eq
    vpcmpuq \$0,.Lmask52x4(%rip),${_R0h},%k2
    kmovb   %k1,%r9d
    kmovb   %k2,%r8d
    shl     \$4,%r8b
    or      %r8b,%r9b

    vpcmpuq \$0,.Lmask52x4(%rip),${_R1},%k1
    vpcmpuq \$0,.Lmask52x4(%rip),${_R1h},%k2
    kmovb   %k1,%r8d
    kmovb   %k2,%edx
    shl     \$4,%dl
    or      %dl,%r8b

    vpcmpuq \$0,.Lmask52x4(%rip),${_R2},%k1
    vpcmpuq \$0,.Lmask52x4(%rip),${_R2h},%k2
    kmovb   %k1,%edx
    kmovb   %k2,%ecx
    shl     \$4,%cl
    or      %cl,%dl

    vpcmpuq \$0,.Lmask52x4(%rip),${_R3},%k1
    vpcmpuq \$0,.Lmask52x4(%rip),${_R3h},%k2
    kmovb   %k1,%ecx
    kmovb   %k2,%ebx
    shl     \$4,%bl
    or      %bl,%cl

    addb    %r9b,%r14b
    adcb    %r8b,%r13b
    adcb    %dl,%r12b
    adcb    %cl,%r11b

    xor     %r9b,%r14b
    xor     %r8b,%r13b
    xor     %dl,%r12b
    xor     %cl,%r11b

    kmovb   %r14d,%k1
    shr     \$4,%r14b
    kmovb   %r14d,%k2
    kmovb   %r13d,%k3
    shr     \$4,%r13b
    kmovb   %r13d,%k4
    kmovb   %r12d,%k5
    shr     \$4,%r12b
    kmovb   %r12d,%k6
    kmovb   %r11d,%k7

    vpsubq  .Lmask52x4(%rip), $_R0,  ${_R0}{%k1}
    vpsubq  .Lmask52x4(%rip), $_R0h, ${_R0h}{%k2}
    vpsubq  .Lmask52x4(%rip), $_R1,  ${_R1}{%k3}
    vpsubq  .Lmask52x4(%rip), $_R1h, ${_R1h}{%k4}
    vpsubq  .Lmask52x4(%rip), $_R2,  ${_R2}{%k5}
    vpsubq  .Lmask52x4(%rip), $_R2h, ${_R2h}{%k6}
    vpsubq  .Lmask52x4(%rip), $_R3,  ${_R3}{%k7}

    vpandq  .Lmask52x4(%rip), $_R0,  $_R0
    vpandq  .Lmask52x4(%rip), $_R0h, $_R0h
    vpandq  .Lmask52x4(%rip), $_R1,  $_R1
    vpandq  .Lmask52x4(%rip), $_R1h, $_R1h
    vpandq  .Lmask52x4(%rip), $_R2,  $_R2
    vpandq  .Lmask52x4(%rip), $_R2h, $_R2h
    vpandq  .Lmask52x4(%rip), $_R3,  $_R3

    shr     \$4,%r11b
    kmovb   %r11d,%k1

    vpsubq  .Lmask52x4(%rip), $_R3h, ${_R3h}{%k1}

    vpandq  .Lmask52x4(%rip), $_R3h, $_R3h
___
}
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.type   ossl_rsaz_amm52x30_x1_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x1_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp                 # 16*10 + (8 bytes to get correct 16-byte SIMD alignment)
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x1_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord  $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    xorl    $acc0_0_low, $acc0_0_low

    movq    $b, $b_ptr                      # backup address of b
    movq    \$0xfffffffffffff, $mask52      # 52-bit mask

    # Loop over 30 digits unrolled by 4
    mov     \$7, $iter

.align 32
.Lloop7:
___
foreach my $idx (0..3) {
    &amm52x30_x1(0,8*$idx,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
}
$code.=<<___;
    lea     `4*8`($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop7
___
# 7 iterations x 4 digits = 28 digits done in the loop; handle the last 2
&amm52x30_x1(0,8*0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);
&amm52x30_x1(0,8*1,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,$k0);

&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov     0(%rax),%r15
.cfi_restore    %r15
    mov     8(%rax),%r14
.cfi_restore    %r14
    mov     16(%rax),%r13
.cfi_restore    %r13
    mov     24(%rax),%r12
.cfi_restore    %r12
    mov     32(%rax),%rbp
.cfi_restore    %rbp
    mov     40(%rax),%rbx
.cfi_restore    %rbx
    lea     48(%rax),%rsp       # restore rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x1_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
$code.=<<___;
.data
.align 32
.Lmask52x4:
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
    .quad   0xfffffffffffff
___
###############################################################################
# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x30_x1_ifma256() above for details
# about the Almost Montgomery Multiplication algorithm and the function
# input parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
#                                    const BN_ULONG a[2][32],
#                                    const BN_ULONG b[2][32],
#                                    const BN_ULONG m[2][32],
#                                    const BN_ULONG k0[2]);
###############################################################################
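#
# Layout sketch (an illustrative restatement of the prototype above): the
# two independent instances are the two rows of each [2][32] array, so
# instance 0 starts at byte offset 0 and instance 1 at 32*8 = 256 bytes:
#
#   a[0][0..31] then a[1][0..31]       # likewise for b, m and out
#   k0[0]       then k0[1]
#
# The .Lloop30 below interleaves one digit-iteration of each instance per
# pass over b.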
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x2_ifma256
.type   ossl_rsaz_amm52x30_x2_ifma256,\@function,5
.align 32
ossl_rsaz_amm52x30_x2_ifma256:
.cfi_startproc
    endbranch
    push    %rbx
.cfi_push   %rbx
    push    %rbp
.cfi_push   %rbp
    push    %r12
.cfi_push   %r12
    push    %r13
.cfi_push   %r13
    push    %r14
.cfi_push   %r14
    push    %r15
.cfi_push   %r15
___
$code.=<<___ if ($win64);
    lea     -168(%rsp),%rsp
    vmovdqa64   %xmm6, `0*16`(%rsp)         # save non-volatile registers
    vmovdqa64   %xmm7, `1*16`(%rsp)
    vmovdqa64   %xmm8, `2*16`(%rsp)
    vmovdqa64   %xmm9, `3*16`(%rsp)
    vmovdqa64   %xmm10,`4*16`(%rsp)
    vmovdqa64   %xmm11,`5*16`(%rsp)
    vmovdqa64   %xmm12,`6*16`(%rsp)
    vmovdqa64   %xmm13,`7*16`(%rsp)
    vmovdqa64   %xmm14,`8*16`(%rsp)
    vmovdqa64   %xmm15,`9*16`(%rsp)
.Lossl_rsaz_amm52x30_x2_ifma256_body:
___
$code.=<<___;
    # Zeroing accumulators
    vpxord  $zero, $zero, $zero
    vmovdqa64   $zero, $R0_0
    vmovdqa64   $zero, $R0_0h
    vmovdqa64   $zero, $R1_0
    vmovdqa64   $zero, $R1_0h
    vmovdqa64   $zero, $R2_0
    vmovdqa64   $zero, $R2_0h
    vmovdqa64   $zero, $R3_0
    vmovdqa64   $zero, $R3_0h

    vmovdqa64   $zero, $R0_1
    vmovdqa64   $zero, $R0_1h
    vmovdqa64   $zero, $R1_1
    vmovdqa64   $zero, $R1_1h
    vmovdqa64   $zero, $R2_1
    vmovdqa64   $zero, $R2_1h
    vmovdqa64   $zero, $R3_1
    vmovdqa64   $zero, $R3_1h

    xorl    $acc0_0_low, $acc0_0_low
    xorl    $acc0_1_low, $acc0_1_low

    movq    $b, $b_ptr                      # backup address of b
    movq    \$0xfffffffffffff, $mask52      # 52-bit mask

    mov     \$30, $iter

.align 32
.Lloop30:
___
&amm52x30_x1(   0,   0,$acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h,"($k0)");
# 32*8 = offset of the next dimension in the two-dimensional array
&amm52x30_x1(32*8,32*8,$acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h,"8($k0)");
$code.=<<___;
    lea     8($b_ptr), $b_ptr
    dec     $iter
    jne     .Lloop30
___
&amm52x30_x1_norm($acc0_0,$R0_0,$R0_0h,$R1_0,$R1_0h,$R2_0,$R2_0h,$R3_0,$R3_0h);
&amm52x30_x1_norm($acc0_1,$R0_1,$R0_1h,$R1_1,$R1_1h,$R2_1,$R2_1h,$R3_1,$R3_1h);
$code.=<<___;

    vmovdqu64   $R0_0,  `0*32`($res)
    vmovdqu64   $R0_0h, `1*32`($res)
    vmovdqu64   $R1_0,  `2*32`($res)
    vmovdqu64   $R1_0h, `3*32`($res)
    vmovdqu64   $R2_0,  `4*32`($res)
    vmovdqu64   $R2_0h, `5*32`($res)
    vmovdqu64   $R3_0,  `6*32`($res)
    vmovdqu64   $R3_0h, `7*32`($res)

    vmovdqu64   $R0_1,  `8*32`($res)
    vmovdqu64   $R0_1h, `9*32`($res)
    vmovdqu64   $R1_1,  `10*32`($res)
    vmovdqu64   $R1_1h, `11*32`($res)
    vmovdqu64   $R2_1,  `12*32`($res)
    vmovdqu64   $R2_1h, `13*32`($res)
    vmovdqu64   $R3_1,  `14*32`($res)
    vmovdqu64   $R3_1h, `15*32`($res)

    vzeroupper
    lea     (%rsp),%rax
.cfi_def_cfa_register   %rax
___
$code.=<<___ if ($win64);
    vmovdqa64   `0*16`(%rax),%xmm6
    vmovdqa64   `1*16`(%rax),%xmm7
    vmovdqa64   `2*16`(%rax),%xmm8
    vmovdqa64   `3*16`(%rax),%xmm9
    vmovdqa64   `4*16`(%rax),%xmm10
    vmovdqa64   `5*16`(%rax),%xmm11
    vmovdqa64   `6*16`(%rax),%xmm12
    vmovdqa64   `7*16`(%rax),%xmm13
    vmovdqa64   `8*16`(%rax),%xmm14
    vmovdqa64   `9*16`(%rax),%xmm15
    lea     168(%rsp),%rax
___
$code.=<<___;
    mov     0(%rax),%r15
.cfi_restore    %r15
    mov     8(%rax),%r14
.cfi_restore    %r14
    mov     16(%rax),%r13
.cfi_restore    %r13
    mov     24(%rax),%r12
.cfi_restore    %r12
    mov     32(%rax),%rbp
.cfi_restore    %rbp
    mov     40(%rax),%rbx
.cfi_restore    %rbx
    lea     48(%rax),%rsp
.cfi_def_cfa    %rsp,8
.Lossl_rsaz_amm52x30_x2_ifma256_epilogue:
    ret
.cfi_endproc
.size   ossl_rsaz_amm52x30_x2_ifma256, .-ossl_rsaz_amm52x30_x2_ifma256
___
}
###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
#    i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two (30 + 2)-digit numbers in 2^52 radix.
# (the 2 high QWs are zero padding)
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
#                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
#                                        int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
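#
# A scalar model of the constant-time scan implemented below (a hedged
# sketch; the generated code performs the same selection with vpcmpq
# masks and vpblendmq):
#
#   for cur_idx in 0..(1 << EXP_WIN_SIZE)-1:
#       if cur_idx == red_table_idx1: red_Y[0] = red_table[cur_idx][0]
#       if cur_idx == red_table_idx2: red_Y[1] = red_table[cur_idx][1]
#
# Every table entry is read on every iteration and each "if" is a
# branchless masked blend, so the memory access pattern and timing are
# independent of the secret indexes.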
{
# input parameters
my ($out,$red_tbl,$red_tbl_idx1,$red_tbl_idx2)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
                                                        ("%rdi","%rsi","%rdx","%rcx");  # Unix order

my ($t0,$t1,$t2,$t3,$t4,$t5) = map("%ymm$_", (0..5));
my ($t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15) = map("%ymm$_", (16..25));
my ($tmp,$cur_idx,$idx1,$idx2,$ones) = map("%ymm$_", (26..30));
my @t = ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7,$t8,$t9,$t10,$t11,$t12,$t13,$t14,$t15);
my $t0xmm = $t0;
$t0xmm =~ s/%y/%x/;

$code.=<<___;
.text

.align 32
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_extract_multiplier_2x30_win5,\@abi-omnipotent
ossl_extract_multiplier_2x30_win5:
.cfi_startproc
    endbranch
    vmovdqa64   .Lones(%rip), $ones         # broadcast ones
    vpbroadcastq    $red_tbl_idx1, $idx1
    vpbroadcastq    $red_tbl_idx2, $idx2
    leaq    `(1<<5)*2*32*8`($red_tbl), %rax # holds end of the tbl

    # zeroing t0..n, cur_idx
    vpxor   $t0xmm, $t0xmm, $t0xmm
    vmovdqa64   $t0, $cur_idx
___
foreach (1..15) {
    $code.="vmovdqa64   $t0, $t[$_] \n";
}
$code.=<<___;

.align 32
.Lloop:
    vpcmpq  \$0, $cur_idx, $idx1, %k1       # mask of (idx1 == cur_idx)
    vpcmpq  \$0, $cur_idx, $idx2, %k2       # mask of (idx2 == cur_idx)
___
foreach (0..15) {
    my $mask = $_<8?"%k1":"%k2";
$code.=<<___;
    vmovdqu64   `${_}*32`($red_tbl), $tmp           # load data from red_tbl
    vpblendmq   $tmp, $t[$_], ${t[$_]}{$mask}       # extract data when mask is not zero
___
}
$code.=<<___;
    vpaddq  $ones, $cur_idx, $cur_idx       # increment cur_idx
    addq    \$`2*32*8`, $red_tbl
    cmpq    $red_tbl, %rax
    jne     .Lloop
___
# store t0..n
foreach (0..15) {
    $code.="vmovdqu64   $t[$_], `${_}*32`($out) \n";
}
$code.=<<___;
    ret
.cfi_endproc
.size   ossl_extract_multiplier_2x30_win5, .-ossl_extract_multiplier_2x30_win5
___
$code.=<<___;
.data
.align 32
.Lones:
    .quad   1,1,1,1
.Lzeros:
    .quad   0,0,0,0
___
}
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   rsaz_avx_handler,\@abi-omnipotent
.align  16
rsaz_avx_handler:
    push    %rsi
    push    %rdi
    push    %rbx
    push    %rbp
    push    %r12
    push    %r13
    push    %r14
    push    %r15
    pushfq
    sub     \$64,%rsp

    mov     120($context),%rax      # pull context->Rax
    mov     248($context),%rbx      # pull context->Rip

    mov     8($disp),%rsi           # disp->ImageBase
    mov     56($disp),%r11          # disp->HandlerData

    mov     0(%r11),%r10d           # HandlerData[0]
    lea     (%rsi,%r10),%r10        # prologue label
    cmp     %r10,%rbx               # context->Rip<.Lprologue
    jb      .Lcommon_seh_tail

    mov     4(%r11),%r10d           # HandlerData[1]
    lea     (%rsi,%r10),%r10        # epilogue label
    cmp     %r10,%rbx               # context->Rip>=.Lepilogue
    jae     .Lcommon_seh_tail

    mov     152($context),%rax      # pull context->Rsp

    lea     (%rax),%rsi             # %xmm save area
    lea     512($context),%rdi      # & context.Xmm6
    mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
    .long   0xa548f3fc              # cld; rep movsq

    lea     `48+168`(%rax),%rax

    mov     -8(%rax),%rbx
    mov     -16(%rax),%rbp
    mov     -24(%rax),%r12
    mov     -32(%rax),%r13
    mov     -40(%rax),%r14
    mov     -48(%rax),%r15
    mov     %rbx,144($context)      # restore context->Rbx
    mov     %rbp,160($context)      # restore context->Rbp
    mov     %r12,216($context)      # restore context->R12
    mov     %r13,224($context)      # restore context->R13
    mov     %r14,232($context)      # restore context->R14
    mov     %r15,240($context)      # restore context->R15
.Lcommon_seh_tail:
    mov     8(%rax),%rdi
    mov     16(%rax),%rsi
    mov     %rax,152($context)      # restore context->Rsp
    mov     %rsi,168($context)      # restore context->Rsi
    mov     %rdi,176($context)      # restore context->Rdi

    mov     40($disp),%rdi          # disp->ContextRecord
    mov     $context,%rsi           # context
    mov     \$154,%ecx              # sizeof(CONTEXT)
    .long   0xa548f3fc              # cld; rep movsq

    mov     $disp,%rsi
    xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
    mov     8(%rsi),%rdx            # arg2, disp->ImageBase
    mov     0(%rsi),%r8             # arg3, disp->ControlPc
    mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
    mov     40(%rsi),%r10           # disp->ContextRecord
    lea     56(%rsi),%r11           # &disp->HandlerData
    lea     24(%rsi),%r12           # &disp->EstablisherFrame
    mov     %r10,32(%rsp)           # arg5
    mov     %r11,40(%rsp)           # arg6
    mov     %r12,48(%rsp)           # arg7
    mov     %rcx,56(%rsp)           # arg8, (NULL)
    call    *__imp_RtlVirtualUnwind(%rip)

    mov     \$1,%eax                # ExceptionContinueSearch
    add     \$64,%rsp
    popfq
    pop     %r15
    pop     %r14
    pop     %r13
    pop     %r12
    pop     %rbp
    pop     %rbx
    pop     %rdi
    pop     %rsi
    ret
.size   rsaz_avx_handler,.-rsaz_avx_handler

.section    .pdata
.align  4
    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x1_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x1_ifma256

    .rva    .LSEH_begin_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_end_ossl_rsaz_amm52x30_x2_ifma256
    .rva    .LSEH_info_ossl_rsaz_amm52x30_x2_ifma256

.section    .xdata
.align  8
.LSEH_info_ossl_rsaz_amm52x30_x1_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x1_ifma256_body,.Lossl_rsaz_amm52x30_x1_ifma256_epilogue
.LSEH_info_ossl_rsaz_amm52x30_x2_ifma256:
    .byte   9,0,0,0
    .rva    rsaz_avx_handler
    .rva    .Lossl_rsaz_amm52x30_x2_ifma256_body,.Lossl_rsaz_amm52x30_x2_ifma256_epilogue
___
}
}}} else {{{                            # fallback for old assembler
$code.=<<___;
.text

.globl  ossl_rsaz_amm52x30_x1_ifma256
.globl  ossl_rsaz_amm52x30_x2_ifma256
.globl  ossl_extract_multiplier_2x30_win5
.type   ossl_rsaz_amm52x30_x1_ifma256,\@abi-omnipotent
ossl_rsaz_amm52x30_x1_ifma256:
ossl_rsaz_amm52x30_x2_ifma256:
ossl_extract_multiplier_2x30_win5:
    .byte   0x0f,0x0b           # ud2
    ret
.size   ossl_rsaz_amm52x30_x1_ifma256, .-ossl_rsaz_amm52x30_x1_ifma256
___
}}}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";