ecp_nistp521-ppc64.pl 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436
  1. #! /usr/bin/env perl
  2. # Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Amitay Isaacs <amitay@ozlabs.org> and Martin Schwenke
  11. # <martin@meltin.net> for the OpenSSL project.
  12. # ====================================================================
  13. #
  14. # p521 lower-level primitives for PPC64 using vector instructions.
  15. #
  16. use strict;
  17. use warnings;
  # Command line: <flavour> [other arguments ...] [<output file>].
  # The first argument is handed to ppc-xlate.pl as the flavour.  The
  # remaining arguments are consumed until one looks like a file name
  # ("name.ext"); if none does, "-" is passed as the output argument.
  my $flavour = shift;
  my $output = "";
  while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  if (!$output) {
      $output = "-";
  }

  # Locate ppc-xlate.pl either next to this script or in ../../perlasm.
  # When the script is started without a directory component in $0 the
  # match fails, so fall back to an empty prefix rather than interpolating
  # an undefined value (which warns under "use warnings").
  my ($xlate, $dir);
  $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
  ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
  die "can't locate ppc-xlate.pl";

  # Route everything printed on STDOUT through the translator, and fail
  # loudly if the pipe cannot be started instead of silently producing
  # no output.
  open OUT,"| \"$^X\" $xlate $flavour $output"
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;

  # Accumulates the generated assembler text; printed at the end of the script.
  my $code = "";

  # Register names shared by the generator subs below.
  # NOTE(review): $savelr is not referenced anywhere in the visible file.
  my ($sp, $outp, $savelr, $savesp) = ("r1", "r3", "r10", "r12");
  my $vzero = "v32";
  # startproc(NAME)
  #
  # Appends the opening lines of a routine to $code: a .globl directive for
  # NAME, the NAME: label and a .cfi_startproc marker, one per line.
  sub startproc($)
  {
      my $name = shift;

      $code .= join("\n",
          ".globl ${name}",
          "${name}:",
          ".cfi_startproc",
          "");    # trailing empty element terminates the last line
  }
  # endproc(NAME)
  #
  # Appends the closing lines of a routine to $code: the blr return, the
  # .cfi_endproc marker and a .size directive computed from the NAME label.
  sub endproc($)
  {
      my $name = shift;

      $code .= "blr\n"
             . ".cfi_endproc\n"
             . ".size ${name},.-${name}\n";
  }
  # push_vrs(MIN, MAX)
  #
  # Appends code to $code that saves the registers numbered MIN..MAX on the
  # stack:
  #   - the current stack pointer is copied into $savesp,
  #   - stdu moves the stack pointer down by 16 bytes per register plus one
  #     extra 16-byte slot (the size is left as a `...` expression that is
  #     evaluated when the script post-processes $code),
  #   - each register is stored with stxv at a negative offset from the old
  #     stack pointer; MAX lands at -16, MIN at the lowest address.
  sub push_vrs($$)
  {
      my ($min, $max) = @_;
      my $count = $max - $min + 1;

      my @lines = (
          "mr $savesp,$sp",
          "stdu $sp,-16*`$count+1`($sp)",
      );

      for my $reg ($min .. $max) {
          my $slot = $max - $reg + 1;
          push @lines, "stxv $reg,-16*$slot($savesp)";
      }

      $code .= join("\n", @lines, "");
  }
  # pop_vrs(MIN, MAX)
  #
  # Appends code to $code that undoes push_vrs(MIN, MAX): the doubleword at
  # 0($sp) is loaded into $savesp, the registers numbered MIN..MAX are
  # reloaded with lxv from the same negative offsets push_vrs stored them
  # at, and finally $savesp is copied back into the stack pointer.
  sub pop_vrs($$)
  {
      my ($min, $max) = @_;

      my @lines = ("ld $savesp,0($sp)");

      push @lines,
          map { "lxv $_,-16*" . ($max - $_ + 1) . "($savesp)" } $min .. $max;

      push @lines, "mr $sp,$savesp";

      $code .= join("\n", @lines, "");
  }
  # load_vrs(POINTER, REG_LIST)
  #
  # Appends nine lxsd instructions to $code.  Element i (0..8) of the array
  # referenced by REG_LIST is loaded from byte offset 8*i relative to the
  # register named by POINTER.
  sub load_vrs($$)
  {
      my ($pointer, $reg_list) = @_;

      $code .= join("",
          map { "lxsd $reg_list->[$_]," . (8 * $_) . "($pointer)\n" } 0 .. 8);
  }
  # store_vrs(POINTER, REG_LIST)
  #
  # Appends nine stxv instructions to $code.  Element i (0..8) of the array
  # referenced by REG_LIST is stored at byte offset 16*i relative to the
  # register named by POINTER.
  sub store_vrs($$)
  {
      my ($pointer, $reg_list) = @_;

      $code .= join("",
          map { "stxv $reg_list->[$_]," . (16 * $_) . "($pointer)\n" } 0 .. 8);
  }
  # Start the generated output with a .text directive; the routines
  # appended to $code afterwards follow it.
  $code.=<<___;
.text
___
  112. {
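  # Registers shared by the mul and square generators: vector temporaries ($t1..$t4), GPR scratch ($zero, $one) and the nine result registers (@out)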
  114. my ($t1, $t2, $t3, $t4) = ("v33", "v34", "v44", "v54");
  115. my ($zero, $one) = ("r8", "r9");
  116. my @out = map("v$_",(55..63));
  {
      #
      # p521_felem_mul
      #
      # Generates the routine p521_felem_mul.  As set up here, the two input
      # pointers are r4 and r5 and the output pointer is $outp.  Nine
      # doublewords are loaded from each input (8-byte stride) and nine
      # 16-byte results are stored through the output pointer (16-byte
      # stride).
      #
      # Reading the instruction sequence (assuming vmsumudm adds the two
      # doubleword products of its first two operands to the third, and
      # xxpermdi ...,0b00 pairs the first doubleword of each source):
      #   out[k] = sum of in1[i]*in2[j]     for i+j == k
      #          + sum of in1[i]*(2*in2[j]) for i+j == k+9
      # NOTE(review): the factor 2 on the wrapped terms presumably comes from
      # the limb size and the P-521 modulus; confirm against the C reference
      # implementation.
      #
      my ($in1p, $in2p) = ("r4", "r5");
      my @in1 = map("v$_",(45..53));
      my @in2 = map("v$_",(35..43));
      startproc("p521_felem_mul");
      # Registers numbered 52..63 cover in1[7], in1[8], $t4 and all of @out.
      # NOTE(review): presumably these are the callee-saved ones - confirm
      # against the ABI.
      push_vrs(52, 63);
      # Zero accumulator used for the first product summed into each out[k].
      $code.=<<___;
vspltisw $vzero,0
___
      load_vrs($in1p, \@in1);
      load_vrs($in2p, \@in2);
      # Part 1: products with i+j <= 8, accumulated into out[i+j].  The final
      # three instructions build $t1 with 1 in one doubleword and 0 in the
      # other, for use as a per-doubleword shift count.
      $code.=<<___;
vmsumudm $out[0],$in1[0],$in2[0],$vzero
xxpermdi $t1,$in1[0],$in1[1],0b00
xxpermdi $t2,$in2[1],$in2[0],0b00
vmsumudm $out[1],$t1,$t2,$vzero
xxpermdi $t2,$in2[2],$in2[1],0b00
vmsumudm $out[2],$t1,$t2,$vzero
vmsumudm $out[2],$in1[2],$in2[0],$out[2]
xxpermdi $t2,$in2[3],$in2[2],0b00
vmsumudm $out[3],$t1,$t2,$vzero
xxpermdi $t3,$in1[2],$in1[3],0b00
xxpermdi $t4,$in2[1],$in2[0],0b00
vmsumudm $out[3],$t3,$t4,$out[3]
xxpermdi $t2,$in2[4],$in2[3],0b00
vmsumudm $out[4],$t1,$t2,$vzero
xxpermdi $t4,$in2[2],$in2[1],0b00
vmsumudm $out[4],$t3,$t4,$out[4]
vmsumudm $out[4],$in1[4],$in2[0],$out[4]
xxpermdi $t2,$in2[5],$in2[4],0b00
vmsumudm $out[5],$t1,$t2,$vzero
xxpermdi $t4,$in2[3],$in2[2],0b00
vmsumudm $out[5],$t3,$t4,$out[5]
xxpermdi $t2,$in2[6],$in2[5],0b00
vmsumudm $out[6],$t1,$t2,$vzero
xxpermdi $t4,$in2[4],$in2[3],0b00
vmsumudm $out[6],$t3,$t4,$out[6]
xxpermdi $t2,$in2[7],$in2[6],0b00
vmsumudm $out[7],$t1,$t2,$vzero
xxpermdi $t4,$in2[5],$in2[4],0b00
vmsumudm $out[7],$t3,$t4,$out[7]
xxpermdi $t2,$in2[8],$in2[7],0b00
vmsumudm $out[8],$t1,$t2,$vzero
xxpermdi $t4,$in2[6],$in2[5],0b00
vmsumudm $out[8],$t3,$t4,$out[8]
xxpermdi $t1,$in1[4],$in1[5],0b00
xxpermdi $t2,$in2[1],$in2[0],0b00
vmsumudm $out[5],$t1,$t2,$out[5]
xxpermdi $t2,$in2[2],$in2[1],0b00
vmsumudm $out[6],$t1,$t2,$out[6]
vmsumudm $out[6],$in1[6],$in2[0],$out[6]
xxpermdi $t2,$in2[3],$in2[2],0b00
vmsumudm $out[7],$t1,$t2,$out[7]
xxpermdi $t3,$in1[6],$in1[7],0b00
xxpermdi $t4,$in2[1],$in2[0],0b00
vmsumudm $out[7],$t3,$t4,$out[7]
xxpermdi $t2,$in2[4],$in2[3],0b00
vmsumudm $out[8],$t1,$t2,$out[8]
xxpermdi $t4,$in2[2],$in2[1],0b00
vmsumudm $out[8],$t3,$t4,$out[8]
vmsumudm $out[8],$in1[8],$in2[0],$out[8]
li $zero,0
li $one,1
mtvsrdd $t1,$one,$zero
___
      # Shift every in2 limb left by the count held in $t1, i.e. double the
      # loaded value in place; in2 is only used in doubled form from here on.
      for (my $i = 0; $i <= 8; $i++) {
          $code.=<<___;
vsld $in2[$i],$in2[$i],$t1
___
      }
      # Part 2: products with i+j >= 9, using the doubled in2 limbs, are
      # accumulated into out[i+j-9].
      $code.=<<___;
vmsumudm $out[7],$in1[8],$in2[8],$out[7]
xxpermdi $t2,$in2[8],$in2[7],0b00
xxpermdi $t1,$in1[7],$in1[8],0b00
vmsumudm $out[6],$t1,$t2,$out[6]
xxpermdi $t1,$in1[6],$in1[7],0b00
vmsumudm $out[5],$t1,$t2,$out[5]
vmsumudm $out[5],$in1[8],$in2[6],$out[5]
xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out[4],$t1,$t2,$out[4]
xxpermdi $t4,$in2[6],$in2[5],0b00
xxpermdi $t3,$in1[7],$in1[8],0b00
vmsumudm $out[4],$t3,$t4,$out[4]
xxpermdi $t1,$in1[4],$in1[5],0b00
vmsumudm $out[3],$t1,$t2,$out[3]
xxpermdi $t3,$in1[6],$in1[7],0b00
vmsumudm $out[3],$t3,$t4,$out[3]
vmsumudm $out[3],$in1[8],$in2[4],$out[3]
xxpermdi $t1,$in1[3],$in1[4],0b00
vmsumudm $out[2],$t1,$t2,$out[2]
xxpermdi $t3,$in1[5],$in1[6],0b00
vmsumudm $out[2],$t3,$t4,$out[2]
xxpermdi $t1,$in1[2],$in1[3],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
xxpermdi $t3,$in1[4],$in1[5],0b00
vmsumudm $out[1],$t3,$t4,$out[1]
xxpermdi $t1,$in1[1],$in1[2],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t3,$in1[3],$in1[4],0b00
vmsumudm $out[0],$t3,$t4,$out[0]
xxpermdi $t2,$in2[4],$in2[3],0b00
xxpermdi $t1,$in1[7],$in1[8],0b00
vmsumudm $out[2],$t1,$t2,$out[2]
xxpermdi $t1,$in1[6],$in1[7],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
vmsumudm $out[1],$in1[8],$in2[2],$out[1]
xxpermdi $t1,$in1[5],$in1[6],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t4,$in2[2],$in2[1],0b00
xxpermdi $t3,$in1[7],$in1[8],0b00
vmsumudm $out[0],$t3,$t4,$out[0]
___
      # Write the nine accumulators out, then restore the saved registers and
      # the stack pointer before the return emitted by endproc.
      store_vrs($outp, \@out);
      pop_vrs(52, 63);
      endproc("p521_felem_mul");
  }
  {
      #
      # p521_felem_square
      #
      # Generates the routine p521_felem_square.  As set up here, the input
      # pointer is r4 and the output pointer is $outp.  Nine doublewords are
      # loaded from the input (8-byte stride) and nine 16-byte results are
      # stored through the output pointer (16-byte stride).
      #
      # @inx2 holds shifted copies of the input limbs so that each cross
      # product in[i]*in[j] (i != j) is computed once with one factor
      # doubled.  Reading the instruction sequence (same vmsumudm/xxpermdi
      # assumptions as for the multiply routine), products with i+j <= 8 go
      # to out[i+j]; products with i+j >= 9 go to out[i+j-9] with an extra
      # factor of 2.
      # NOTE(review): the extra factor presumably comes from the limb size
      # and the P-521 modulus; confirm against the C reference implementation.
      #
      my ($inp) = ("r4");
      my @in = map("v$_",(45..53));
      my @inx2 = map("v$_",(35..43));
      startproc("p521_felem_square");
      # Registers numbered 52..63 cover in[7], in[8], $t4 and all of @out.
      # NOTE(review): presumably these are the callee-saved ones - confirm
      # against the ABI.
      push_vrs(52, 63);
      # Zero accumulator used for the first product summed into each out[k].
      $code.=<<___;
vspltisw $vzero,0
___
      load_vrs($inp, \@in);
      # Build $t1 with 1 in one doubleword and 0 in the other, for use as a
      # per-doubleword shift count.
      $code.=<<___;
li $zero,0
li $one,1
mtvsrdd $t1,$one,$zero
___
      # inx2[i] = in[i] shifted left by the count in $t1 (the loaded limb
      # doubled).
      for (my $i = 0; $i <= 8; $i++) {
          $code.=<<___;
vsld $inx2[$i],$in[$i],$t1
___
      }
      # Part 1: all products with i+j <= 8, followed by the four wrapped
      # square terms in[5]^2 .. in[8]^2, which use inx2 for their single
      # extra factor of 2.  $t1 was overwritten by the xxpermdi pairs, so
      # the shift count is rebuilt at the end for the loop that follows.
      $code.=<<___;
vmsumudm $out[0],$in[0],$in[0],$vzero
vmsumudm $out[1],$in[0],$inx2[1],$vzero
xxpermdi $t1,$in[0],$in[1],0b00
xxpermdi $t2,$inx2[2],$in[1],0b00
vmsumudm $out[2],$t1,$t2,$vzero
xxpermdi $t2,$inx2[3],$inx2[2],0b00
vmsumudm $out[3],$t1,$t2,$vzero
xxpermdi $t2,$inx2[4],$inx2[3],0b00
vmsumudm $out[4],$t1,$t2,$vzero
vmsumudm $out[4],$in[2],$in[2],$out[4]
xxpermdi $t2,$inx2[5],$inx2[4],0b00
vmsumudm $out[5],$t1,$t2,$vzero
vmsumudm $out[5],$in[2],$inx2[3],$out[5]
xxpermdi $t2,$inx2[6],$inx2[5],0b00
vmsumudm $out[6],$t1,$t2,$vzero
xxpermdi $t3,$in[2],$in[3],0b00
xxpermdi $t4,$inx2[4],$in[3],0b00
vmsumudm $out[6],$t3,$t4,$out[6]
xxpermdi $t2,$inx2[7],$inx2[6],0b00
vmsumudm $out[7],$t1,$t2,$vzero
xxpermdi $t4,$inx2[5],$inx2[4],0b00
vmsumudm $out[7],$t3,$t4,$out[7]
xxpermdi $t2,$inx2[8],$inx2[7],0b00
vmsumudm $out[8],$t1,$t2,$vzero
xxpermdi $t4,$inx2[6],$inx2[5],0b00
vmsumudm $out[8],$t3,$t4,$out[8]
vmsumudm $out[8],$in[4],$in[4],$out[8]
vmsumudm $out[1],$in[5],$inx2[5],$out[1]
vmsumudm $out[3],$in[6],$inx2[6],$out[3]
vmsumudm $out[5],$in[7],$inx2[7],$out[5]
vmsumudm $out[7],$in[8],$inx2[8],$out[7]
mtvsrdd $t1,$one,$zero
___
      # Shift inx2[5..8] once more (now four times the loaded limb): the
      # wrapped cross terms need one doubling for the cross product and one
      # for the wrap.
      for (my $i = 5; $i <= 8; $i++) {
          $code.=<<___;
vsld $inx2[$i],$inx2[$i],$t1
___
      }
      # Part 2: wrapped cross products (i != j, i+j >= 9) accumulated into
      # out[i+j-9].
      $code.=<<___;
vmsumudm $out[6],$in[7],$inx2[8],$out[6]
vmsumudm $out[5],$in[6],$inx2[8],$out[5]
xxpermdi $t2,$inx2[8],$inx2[7],0b00
xxpermdi $t1,$in[5],$in[6],0b00
vmsumudm $out[4],$t1,$t2,$out[4]
xxpermdi $t1,$in[4],$in[5],0b00
vmsumudm $out[3],$t1,$t2,$out[3]
xxpermdi $t1,$in[3],$in[4],0b00
vmsumudm $out[2],$t1,$t2,$out[2]
vmsumudm $out[2],$in[5],$inx2[6],$out[2]
xxpermdi $t1,$in[2],$in[3],0b00
vmsumudm $out[1],$t1,$t2,$out[1]
vmsumudm $out[1],$in[4],$inx2[6],$out[1]
xxpermdi $t1,$in[1],$in[2],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
xxpermdi $t2,$inx2[6],$inx2[5],0b00
xxpermdi $t1,$in[3],$in[4],0b00
vmsumudm $out[0],$t1,$t2,$out[0]
___
      # Write the nine accumulators out, then restore the saved registers and
      # the stack pointer before the return emitted by endproc.
      store_vrs($outp, \@out);
      pop_vrs(52, 63);
      endproc("p521_felem_square");
  }
  322. }
  # Replace every `...` span in the generated text with the result of
  # evaluating its contents as Perl (push_vrs emits its frame size this way).
  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  # Write the finished text to STDOUT, which this script aliased to the
  # ppc-xlate.pl pipe, and die if closing it reports an error.
  print $code;
  close STDOUT or die "error closing STDOUT: $!";