ppc64-mont-fixed.pl 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583
  1. #! /usr/bin/env perl
  2. # Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. # ====================================================================
  9. # Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
  10. # <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
  11. # the OpenSSL project.
  12. # ====================================================================
  13. #
  14. # Fixed length (n=6), unrolled PPC Montgomery Multiplication
  15. #
  16. # 2021
  17. #
  18. # Although this is a generic implementation for unrolling Montgomery
  19. # Multiplication for arbitrary values of n, this is currently only
  20. # used for n = 6 to improve the performance of ECC p384.
  21. #
  22. # Unrolling allows intermediate results to be stored in registers,
  23. # rather than on the stack, improving performance by ~7% compared to
  24. # the existing PPC assembly code.
  25. #
  26. # The ISA 3.0 implementation uses combination multiply/add
  27. # instructions (maddld, maddhdu) to improve performance by an
  28. # additional ~10% on Power 9.
  29. #
  30. # Finally, saving non-volatile registers into volatile vector
  31. # registers instead of onto the stack saves a little more.
  32. #
  33. # On a Power 9 machine we see an overall improvement of ~18%.
  34. #
  35. use strict;
  36. use warnings;
  37. my ($flavour, $output, $dir, $xlate);
  38. # $output is the last argument if it looks like a file (it has an extension)
  39. # $flavour is the first argument if it doesn't look like a file
  40. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  41. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  42. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  43. ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  44. ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  45. die "can't locate ppc-xlate.pl";
  46. open STDOUT,"| $^X $xlate $flavour \"$output\""
  47. or die "can't call $xlate: $!";
  48. if ($flavour !~ /64/) {
  49. die "bad flavour ($flavour) - only ppc64 permitted";
  50. }
  51. my $SIZE_T= 8;
  52. # Registers are global so the code is remotely readable
  53. # Parameters for Montgomery multiplication
  54. my $ze = "r0";
  55. my $sp = "r1";
  56. my $toc = "r2";
  57. my $rp = "r3";
  58. my $ap = "r4";
  59. my $bp = "r5";
  60. my $np = "r6";
  61. my $n0 = "r7";
  62. my $num = "r8";
  63. my $i = "r9";
  64. my $c0 = "r10";
  65. my $bp0 = "r11";
  66. my $bpi = "r11";
  67. my $bpj = "r11";
  68. my $tj = "r12";
  69. my $apj = "r12";
  70. my $npj = "r12";
  71. my $lo = "r14";
  72. my $c1 = "r14";
  73. # Non-volatile registers used for tp[i]
  74. #
  75. # 12 registers are available but the limit on unrolling is 10,
  76. # since registers from $tp[0] to $tp[$n+1] are used.
  77. my @tp = ("r20" .. "r31");
  78. # volatile VSRs for saving non-volatile GPRs - faster than stack
  79. my @vsrs = ("v32" .. "v46");
  80. package Mont;
  81. sub new($$)
  82. {
  83. my ($class, $n) = @_;
  84. if ($n > 10) {
  85. die "Can't unroll for BN length ${n} (maximum 10)"
  86. }
  87. my $self = {
  88. code => "",
  89. n => $n,
  90. };
  91. bless $self, $class;
  92. return $self;
  93. }
  94. sub add_code($$)
  95. {
  96. my ($self, $c) = @_;
  97. $self->{code} .= $c;
  98. }
  99. sub get_code($)
  100. {
  101. my ($self) = @_;
  102. return $self->{code};
  103. }
  104. sub get_function_name($)
  105. {
  106. my ($self) = @_;
  107. return "bn_mul_mont_fixed_n" . $self->{n};
  108. }
  109. sub get_label($$)
  110. {
  111. my ($self, $l) = @_;
  112. return "L" . $l . "_" . $self->{n};
  113. }
  114. sub get_labels($@)
  115. {
  116. my ($self, @labels) = @_;
  117. my %out = ();
  118. foreach my $l (@labels) {
  119. $out{"$l"} = $self->get_label("$l");
  120. }
  121. return \%out;
  122. }
  123. sub nl($)
  124. {
  125. my ($self) = @_;
  126. $self->add_code("\n");
  127. }
  128. sub copy_result($)
  129. {
  130. my ($self) = @_;
  131. my ($n) = $self->{n};
  132. for (my $j = 0; $j < $n; $j++) {
  133. $self->add_code(<<___);
  134. std $tp[$j],`$j*$SIZE_T`($rp)
  135. ___
  136. }
  137. }
  138. sub mul_mont_fixed($)
  139. {
  140. my ($self) = @_;
  141. my ($n) = $self->{n};
  142. my $fname = $self->get_function_name();
  143. my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
  144. $self->add_code(<<___);
  145. .globl .${fname}
  146. .align 5
  147. .${fname}:
  148. ___
  149. $self->save_registers();
  150. $self->add_code(<<___);
  151. li $ze,0
  152. ld $n0,0($n0)
  153. ld $bp0,0($bp)
  154. ld $apj,0($ap)
  155. ___
  156. $self->mul_c_0($tp[0], $apj, $bp0, $c0);
  157. for (my $j = 1; $j < $n - 1; $j++) {
  158. $self->add_code(<<___);
  159. ld $apj,`$j*$SIZE_T`($ap)
  160. ___
  161. $self->mul($tp[$j], $apj, $bp0, $c0);
  162. }
  163. $self->add_code(<<___);
  164. ld $apj,`($n-1)*$SIZE_T`($ap)
  165. ___
  166. $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
  167. $self->add_code(<<___);
  168. li $tp[$n+1],0
  169. ___
  170. $self->add_code(<<___);
  171. li $i,0
  172. mtctr $num
  173. b $label->{"enter"}
  174. .align 4
  175. $label->{"outer"}:
  176. ldx $bpi,$bp,$i
  177. ld $apj,0($ap)
  178. ___
  179. $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
  180. for (my $j = 1; $j < $n; $j++) {
  181. $self->add_code(<<___);
  182. ld $apj,`$j*$SIZE_T`($ap)
  183. ___
  184. $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
  185. }
  186. $self->add_code(<<___);
  187. addc $tp[$n],$tp[$n],$c0
  188. addze $tp[$n+1],$ze
  189. ___
  190. $self->add_code(<<___);
  191. .align 4
  192. $label->{"enter"}:
  193. mulld $bpi,$tp[0],$n0
  194. ld $npj,0($np)
  195. ___
  196. $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
  197. for (my $j = 1; $j < $n; $j++) {
  198. $self->add_code(<<___);
  199. ld $npj,`$j*$SIZE_T`($np)
  200. ___
  201. $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
  202. }
  203. $self->add_code(<<___);
  204. addc $tp[$n-1],$tp[$n],$c0
  205. addze $tp[$n],$tp[$n+1]
  206. addi $i,$i,$SIZE_T
  207. bdnz $label->{"outer"}
  208. and. $tp[$n],$tp[$n],$tp[$n]
  209. bne $label->{"sub"}
  210. cmpld $tp[$n-1],$npj
  211. blt $label->{"copy"}
  212. $label->{"sub"}:
  213. ___
  214. #
  215. # Reduction
  216. #
  217. $self->add_code(<<___);
  218. ld $bpj,`0*$SIZE_T`($np)
  219. subfc $c1,$bpj,$tp[0]
  220. std $c1,`0*$SIZE_T`($rp)
  221. ___
  222. for (my $j = 1; $j < $n - 1; $j++) {
  223. $self->add_code(<<___);
  224. ld $bpj,`$j*$SIZE_T`($np)
  225. subfe $c1,$bpj,$tp[$j]
  226. std $c1,`$j*$SIZE_T`($rp)
  227. ___
  228. }
  229. $self->add_code(<<___);
  230. subfe $c1,$npj,$tp[$n-1]
  231. std $c1,`($n-1)*$SIZE_T`($rp)
  232. ___
  233. $self->add_code(<<___);
  234. addme. $tp[$n],$tp[$n]
  235. beq $label->{"end"}
  236. $label->{"copy"}:
  237. ___
  238. $self->copy_result();
  239. $self->add_code(<<___);
  240. $label->{"end"}:
  241. ___
  242. $self->restore_registers();
  243. $self->add_code(<<___);
  244. li r3,1
  245. blr
  246. .size .${fname},.-.${fname}
  247. ___
  248. }
  249. package Mont::GPR;
  250. our @ISA = ('Mont');
  251. sub new($$)
  252. {
  253. my ($class, $n) = @_;
  254. return $class->SUPER::new($n);
  255. }
  256. sub save_registers($)
  257. {
  258. my ($self) = @_;
  259. my $n = $self->{n};
  260. $self->add_code(<<___);
  261. std $lo,-8($sp)
  262. ___
  263. for (my $j = 0; $j <= $n+1; $j++) {
  264. $self->{code}.=<<___;
  265. std $tp[$j],-`($j+2)*8`($sp)
  266. ___
  267. }
  268. $self->add_code(<<___);
  269. ___
  270. }
  271. sub restore_registers($)
  272. {
  273. my ($self) = @_;
  274. my $n = $self->{n};
  275. $self->add_code(<<___);
  276. ld $lo,-8($sp)
  277. ___
  278. for (my $j = 0; $j <= $n+1; $j++) {
  279. $self->{code}.=<<___;
  280. ld $tp[$j],-`($j+2)*8`($sp)
  281. ___
  282. }
  283. $self->{code} .=<<___;
  284. ___
  285. }
  286. # Direct translation of C mul()
  287. sub mul($$$$$)
  288. {
  289. my ($self, $r, $a, $w, $c) = @_;
  290. $self->add_code(<<___);
  291. mulld $lo,$a,$w
  292. addc $r,$lo,$c
  293. mulhdu $c,$a,$w
  294. addze $c,$c
  295. ___
  296. }
  297. # Like mul() but $c is ignored as an input - an optimisation to save a
  298. # preliminary instruction that would set input $c to 0
  299. sub mul_c_0($$$$$)
  300. {
  301. my ($self, $r, $a, $w, $c) = @_;
  302. $self->add_code(<<___);
  303. mulld $r,$a,$w
  304. mulhdu $c,$a,$w
  305. ___
  306. }
  307. # Like mul() but does not to the final addition of CA into $c - an
  308. # optimisation to save an instruction
  309. sub mul_last($$$$$$)
  310. {
  311. my ($self, $r1, $r2, $a, $w, $c) = @_;
  312. $self->add_code(<<___);
  313. mulld $lo,$a,$w
  314. addc $r1,$lo,$c
  315. mulhdu $c,$a,$w
  316. addze $r2,$c
  317. ___
  318. }
  319. # Like C mul_add() but allow $r_out and $r_in to be different
  320. sub mul_add($$$$$$)
  321. {
  322. my ($self, $r_out, $r_in, $a, $w, $c) = @_;
  323. $self->add_code(<<___);
  324. mulld $lo,$a,$w
  325. addc $lo,$lo,$c
  326. mulhdu $c,$a,$w
  327. addze $c,$c
  328. addc $r_out,$r_in,$lo
  329. addze $c,$c
  330. ___
  331. }
  332. # Like mul_add() but $c is ignored as an input - an optimisation to save a
  333. # preliminary instruction that would set input $c to 0
  334. sub mul_add_c_0($$$$$$)
  335. {
  336. my ($self, $r_out, $r_in, $a, $w, $c) = @_;
  337. $self->add_code(<<___);
  338. mulld $lo,$a,$w
  339. addc $r_out,$r_in,$lo
  340. mulhdu $c,$a,$w
  341. addze $c,$c
  342. ___
  343. }
  344. package Mont::GPR_300;
  345. our @ISA = ('Mont::GPR');
  346. sub new($$)
  347. {
  348. my ($class, $n) = @_;
  349. my $mont = $class->SUPER::new($n);
  350. return $mont;
  351. }
  352. sub get_function_name($)
  353. {
  354. my ($self) = @_;
  355. return "bn_mul_mont_300_fixed_n" . $self->{n};
  356. }
  357. sub get_label($$)
  358. {
  359. my ($self, $l) = @_;
  360. return "L" . $l . "_300_" . $self->{n};
  361. }
  362. # Direct translation of C mul()
  363. sub mul($$$$$)
  364. {
  365. my ($self, $r, $a, $w, $c, $last) = @_;
  366. $self->add_code(<<___);
  367. maddld $r,$a,$w,$c
  368. maddhdu $c,$a,$w,$c
  369. ___
  370. }
  371. # Save the last carry as the final entry
  372. sub mul_last($$$$$)
  373. {
  374. my ($self, $r1, $r2, $a, $w, $c) = @_;
  375. $self->add_code(<<___);
  376. maddld $r1,$a,$w,$c
  377. maddhdu $r2,$a,$w,$c
  378. ___
  379. }
  380. # Like mul() but $c is ignored as an input - an optimisation to save a
  381. # preliminary instruction that would set input $c to 0
  382. sub mul_c_0($$$$$)
  383. {
  384. my ($self, $r, $a, $w, $c) = @_;
  385. $self->add_code(<<___);
  386. mulld $r,$a,$w
  387. mulhdu $c,$a,$w
  388. ___
  389. }
  390. # Like C mul_add() but allow $r_out and $r_in to be different
  391. sub mul_add($$$$$$)
  392. {
  393. my ($self, $r_out, $r_in, $a, $w, $c) = @_;
  394. $self->add_code(<<___);
  395. maddld $lo,$a,$w,$c
  396. maddhdu $c,$a,$w,$c
  397. addc $r_out,$r_in,$lo
  398. addze $c,$c
  399. ___
  400. }
  401. # Like mul_add() but $c is ignored as an input - an optimisation to save a
  402. # preliminary instruction that would set input $c to 0
  403. sub mul_add_c_0($$$$$$)
  404. {
  405. my ($self, $r_out, $r_in, $a, $w, $c) = @_;
  406. $self->add_code(<<___);
  407. maddld $lo,$a,$w,$r_in
  408. maddhdu $c,$a,$w,$r_in
  409. ___
  410. if ($r_out ne $lo) {
  411. $self->add_code(<<___);
  412. mr $r_out,$lo
  413. ___
  414. }
  415. $self->nl();
  416. }
  417. package main;
  418. my $code;
  419. $code.=<<___;
  420. .machine "any"
  421. .text
  422. ___
  423. my $mont;
  424. $mont = new Mont::GPR(6);
  425. $mont->mul_mont_fixed();
  426. $code .= $mont->get_code();
  427. $mont = new Mont::GPR_300(6);
  428. $mont->mul_mont_fixed();
  429. $code .= $mont->get_code();
  430. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  431. $code.=<<___;
  432. .asciz "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
  433. ___
  434. print $code;
  435. close STDOUT or die "error closing STDOUT: $!";