#! /usr/bin/env perl
# Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# X25519 lower-level primitives for PPC64.
#
# July 2018.
#
# Base 2^64 is faster than base 2^51 on pre-POWER8, most notably ~15%
# faster on PPC970/G5. POWER8, on the other hand, seems to trip on its
# own shoelaces when handling longer carry chains. As base 2^51 has only
# single-carry pairs, it's 25% faster than base 2^64 there. Since PPC970
# is pretty old, the base 2^64 implementation is not engaged. Comparison
# to compiler-generated code is complicated by the fact that not all
# compilers support 128-bit integers. When the compiler doesn't, as with
# xlc, this module delivers more than a 2x improvement, and when it does,
# improvements from 12% to 30% were measured...
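#
# Both implementations work in GF(2^255-19). The base 2^64 code keeps a
# field element in four 64-bit limbs and folds product bits at 2^256 and
# above back in multiplied by 38, since 2^256 == 38 (mod 2^255-19). The
# base 2^51 code keeps five 51-bit limbs, so each limb product fits in
# 128 bits, and folds bits at 2^255 and above back in multiplied by 19,
# since 2^255 == 19 (mod 2^255-19).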
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
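
# Typical perlasm invocation passes the flavour first and the output
# file last, e.g. (assuming the usual OpenSSL build layout):
#
#	perl x25519-ppc64.pl linux64le x25519-ppc64.s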
my $sp = "r1";
my ($rp,$ap,$bp) = map("r$_",3..5);

####################################################### base 2^64
if (0) {
my ($bi,$a0,$a1,$a2,$a3,$t0,$t1, $t2,$t3,
    $acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) =
    map("r$_",(6..12,22..31));
my $zero = "r0";
my $FRAME = 16*8;

$code.=<<___;
.text

.globl	x25519_fe64_mul
.type	x25519_fe64_mul,\@function
.align	5
x25519_fe64_mul:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	mulld	$acc0,$a0,$bi		# a[0]*b[0]
	mulhdu	$t0,$a0,$bi
	mulld	$acc1,$a1,$bi		# a[1]*b[0]
	mulhdu	$t1,$a1,$bi
	mulld	$acc2,$a2,$bi		# a[2]*b[0]
	mulhdu	$t2,$a2,$bi
	mulld	$acc3,$a3,$bi		# a[3]*b[0]
	mulhdu	$t3,$a3,$bi
___
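
# Three more passes over b[1..3]: each pass first folds the previous
# pass's high parts into a sliding window of accumulators, then adds in
# the low and high halves of a[0..3]*b[i]; the full 512-bit product
# accumulates in acc0..acc7 (schoolbook multiplication).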
for(my @acc=($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7),
    my $i=1; $i<4; shift(@acc), $i++) {
my $acc4 = $i==1? $zero : @acc[4];

$code.=<<___;
	ld	$bi,`8*$i`($bp)
	addc	@acc[1],@acc[1],$t0	# accumulate high parts
	mulld	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulld	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulld	$t2,$a2,$bi
	adde	@acc[4],$acc4,$t3
	mulld	$t3,$a3,$bi
	addc	@acc[1],@acc[1],$t0	# accumulate low parts
	mulhdu	$t0,$a0,$bi
	adde	@acc[2],@acc[2],$t1
	mulhdu	$t1,$a1,$bi
	adde	@acc[3],@acc[3],$t2
	mulhdu	$t2,$a2,$bi
	adde	@acc[4],@acc[4],$t3
	mulhdu	$t3,$a3,$bi
	adde	@acc[5],$zero,$zero
___
}

$code.=<<___;
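	# Reduction modulo 2^255-19: fold the upper four limbs back into
	# the lower four multiplied by 38, since 2^256 == 38 (mod p);
	# a final single-limb fold and a conditional add of 38 leave a
	# 256-bit result that is congruent, though not fully reduced.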
	li	$bi,38
	addc	$acc4,$acc4,$t0
	mulld	$t0,$acc4,$bi
	adde	$acc5,$acc5,$t1
	mulld	$t1,$acc5,$bi
	adde	$acc6,$acc6,$t2
	mulld	$t2,$acc6,$bi
	adde	$acc7,$acc7,$t3
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	adde	$acc4,$zero,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,3,0
	.long	0
.size	x25519_fe64_mul,.-x25519_fe64_mul

.globl	x25519_fe64_sqr
.type	x25519_fe64_sqr,\@function
.align	5
x25519_fe64_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	xor	$zero,$zero,$zero
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)

	################################
	# |  |  |  |  |  |a1*a0|  |
	# |  |  |  |  |a2*a0|  |  |
	# |  |a3*a2|a3*a0|  |  |  |
	# |  |  |  |a2*a1|  |  |  |
	# |  |  |a3*a1|  |  |  |  |
	# *|  |  |  |  |  |  |  | 2|
	# +|a3*a3|a2*a2|a1*a1|a0*a0|
	#  |--+--+--+--+--+--+--+--|
	#  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
	#
  187. # "can't overflow" below mark carrying into high part of
  188. # multiplication result, which can't overflow, because it
  189. # can never be all ones.
	mulld	$acc1,$a1,$a0		# a[1]*a[0]
	mulhdu	$t1,$a1,$a0
	mulld	$acc2,$a2,$a0		# a[2]*a[0]
	mulhdu	$t2,$a2,$a0
	mulld	$acc3,$a3,$a0		# a[3]*a[0]
	mulhdu	$acc4,$a3,$a0

	addc	$acc2,$acc2,$t1		# accumulate high parts of multiplication
	mulld	$t0,$a2,$a1		# a[2]*a[1]
	mulhdu	$t1,$a2,$a1
	adde	$acc3,$acc3,$t2
	mulld	$t2,$a3,$a1		# a[3]*a[1]
	mulhdu	$t3,$a3,$a1
	addze	$acc4,$acc4		# can't overflow

	mulld	$acc5,$a3,$a2		# a[3]*a[2]
	mulhdu	$acc6,$a3,$a2

	addc	$t1,$t1,$t2		# accumulate high parts of multiplication
	mulld	$acc0,$a0,$a0		# a[0]*a[0]
	addze	$t2,$t3			# can't overflow

	addc	$acc3,$acc3,$t0		# accumulate low parts of multiplication
	mulhdu	$a0,$a0,$a0
	adde	$acc4,$acc4,$t1
	mulld	$t1,$a1,$a1		# a[1]*a[1]
	adde	$acc5,$acc5,$t2
	mulhdu	$a1,$a1,$a1
	addze	$acc6,$acc6		# can't overflow

	addc	$acc1,$acc1,$acc1	# acc[1-6]*=2
	mulld	$t2,$a2,$a2		# a[2]*a[2]
	adde	$acc2,$acc2,$acc2
	mulhdu	$a2,$a2,$a2
	adde	$acc3,$acc3,$acc3
	mulld	$t3,$a3,$a3		# a[3]*a[3]
	adde	$acc4,$acc4,$acc4
	mulhdu	$a3,$a3,$a3
	adde	$acc5,$acc5,$acc5
	adde	$acc6,$acc6,$acc6
	addze	$acc7,$zero

	addc	$acc1,$acc1,$a0		# +a[i]*a[i]
	li	$bi,38
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$a1
	adde	$acc4,$acc4,$t2
	adde	$acc5,$acc5,$a2
	adde	$acc6,$acc6,$t3
	adde	$acc7,$acc7,$a3

	mulld	$t0,$acc4,$bi
	mulld	$t1,$acc5,$bi
	mulld	$t2,$acc6,$bi
	mulld	$t3,$acc7,$bi

	addc	$acc0,$acc0,$t0
	mulhdu	$t0,$acc4,$bi
	adde	$acc1,$acc1,$t1
	mulhdu	$t1,$acc5,$bi
	adde	$acc2,$acc2,$t2
	mulhdu	$t2,$acc6,$bi
	adde	$acc3,$acc3,$t3
	mulhdu	$t3,$acc7,$bi
	addze	$acc4,$zero

	addc	$acc1,$acc1,$t0
	adde	$acc2,$acc2,$t1
	adde	$acc3,$acc3,$t2
	adde	$acc4,$acc4,$t3

	mulld	$acc4,$acc4,$bi

	addc	$acc0,$acc0,$acc4
	addze	$acc1,$acc1
	addze	$acc2,$acc2
	addze	$acc3,$acc3

	subfe	$acc4,$acc4,$acc4	# carry -> ~mask
	std	$acc1,8($rp)
	andc	$acc4,$bi,$acc4
	std	$acc2,16($rp)
	add	$acc0,$acc0,$acc4
	std	$acc3,24($rp)
	std	$acc0,0($rp)

	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,10,2,0
	.long	0
.size	x25519_fe64_sqr,.-x25519_fe64_sqr

.globl	x25519_fe64_mul121666
.type	x25519_fe64_mul121666,\@function
.align	5
x25519_fe64_mul121666:
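	# $bi = 121666 = 0x1db42: lis sets the high halfword (65536),
	# ori adds the low halfword (121666-65536 = 56130)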
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$t0,0($ap)
	ld	$t1,8($ap)
	ld	$bp,16($ap)
	ld	$ap,24($ap)

	mulld	$a0,$t0,$bi
	mulhdu	$t0,$t0,$bi
	mulld	$a1,$t1,$bi
	mulhdu	$t1,$t1,$bi
	mulld	$a2,$bp,$bi
	mulhdu	$bp,$bp,$bi
	mulld	$a3,$ap,$bi
	mulhdu	$ap,$ap,$bi

	addc	$a1,$a1,$t0
	adde	$a2,$a2,$t1
	adde	$a3,$a3,$bp
	addze	$ap,$ap

	mulli	$ap,$ap,38

	addc	$a0,$a0,$ap
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_mul121666,.-x25519_fe64_mul121666

.globl	x25519_fe64_add
.type	x25519_fe64_add,\@function
.align	5
x25519_fe64_add:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	addc	$a0,$a0,$t0
	adde	$a1,$a1,$t1
	adde	$a2,$a2,$bi
	adde	$a3,$a3,$bp
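
	# on carry out of the 256-bit sum add 38 once more, since
	# 2^256 == 38 (mod 2^255-19)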
	li	$t0,38
	subfe	$t1,$t1,$t1		# carry -> ~mask
	andc	$t1,$t0,$t1
	addc	$a0,$a0,$t1
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	subfe	$t1,$t1,$t1		# carry -> ~mask
	std	$a1,8($rp)
	andc	$t0,$t0,$t1
	std	$a2,16($rp)
	add	$a0,$a0,$t0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_add,.-x25519_fe64_add

.globl	x25519_fe64_sub
.type	x25519_fe64_sub,\@function
.align	5
x25519_fe64_sub:
	ld	$a0,0($ap)
	ld	$t0,0($bp)
	ld	$a1,8($ap)
	ld	$t1,8($bp)
	ld	$a2,16($ap)
	ld	$bi,16($bp)
	ld	$a3,24($ap)
	ld	$bp,24($bp)

	subfc	$a0,$t0,$a0
	subfe	$a1,$t1,$a1
	subfe	$a2,$bi,$a2
	subfe	$a3,$bp,$a3
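
	# on borrow out of the 256-bit difference subtract 38, since
	# the wrap-around added 2^256 == 38 (mod 2^255-19)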
	li	$t0,38
	subfe	$t1,$t1,$t1		# borrow -> mask
	xor	$zero,$zero,$zero
	and	$t1,$t0,$t1
	subfc	$a0,$t1,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3

	subfe	$t1,$t1,$t1		# borrow -> mask
	std	$a1,8($rp)
	and	$t0,$t0,$t1
	std	$a2,16($rp)
	subf	$a0,$t0,$a0
	std	$a3,24($rp)
	std	$a0,0($rp)

	blr
	.long	0
	.byte	0,12,0x14,0,0,0,3,0
	.long	0
.size	x25519_fe64_sub,.-x25519_fe64_sub

.globl	x25519_fe64_tobytes
.type	x25519_fe64_tobytes,\@function
.align	5
x25519_fe64_tobytes:
	ld	$a3,24($ap)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
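
	# fully reduce modulo p = 2^255-19: add 19 (38 if bit 255 was
	# set) and clear bit 255, then take the 19 back unless the
	# addition carried into bit 255, i.e. unless the value was >= p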
	sradi	$t0,$a3,63		# most significant bit -> mask
	li	$t1,19
	and	$t0,$t0,$t1
	sldi	$a3,$a3,1
	add	$t0,$t0,$t1		# compare to modulus in the same go
	srdi	$a3,$a3,1		# most significant bit cleared
	addc	$a0,$a0,$t0
	addze	$a1,$a1
	addze	$a2,$a2
	addze	$a3,$a3

	xor	$zero,$zero,$zero
	sradi	$t0,$a3,63		# most significant bit -> mask
	sldi	$a3,$a3,1
	andc	$t0,$t1,$t0
	srdi	$a3,$a3,1		# most significant bit cleared

	subi	$rp,$rp,1
	subfc	$a0,$t0,$a0
	subfe	$a1,$zero,$a1
	subfe	$a2,$zero,$a2
	subfe	$a3,$zero,$a3
___
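
# Store the 256-bit result least significant byte first, one byte at a
# time ($rp was pre-decremented above to compensate for stbu's
# pre-increment addressing).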
for (my @a=($a0,$a1,$a2,$a3), my $i=0; $i<4; shift(@a), $i++) {
$code.=<<___;
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	srdi	@a[0],@a[0],16
	stbu	$t0,1($rp)
	srdi	$t0,@a[0],8
	stbu	@a[0],1($rp)
	stbu	$t0,1($rp)
___
}
$code.=<<___;
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,2,0
	.long	0
.size	x25519_fe64_tobytes,.-x25519_fe64_tobytes
___
}

####################################################### base 2^51
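# Elements are kept in five 51-bit limbs, but during multiplication each
# intermediate limb accumulates as a full 128-bit quantity in a (lo,hi)
# register pair, e.g. $h0lo:$h0hi. Limb products that would land at
# bit 255 or above are folded back pre-multiplied by 19, since
# 2^255 == 19 (mod p).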
{
my ($bi,$a0,$a1,$a2,$a3,$a4,$t0, $t1,
    $h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,$h4lo,$h4hi) =
    map("r$_",(6..12,21..31));
my $mask = "r0";
my $FRAME = 18*8;

$code.=<<___;
.text

.globl	x25519_fe51_mul
.type	x25519_fe51_mul,\@function
.align	5
x25519_fe51_mul:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$bi,0($bp)
	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*b[0]
	mulhdu	$h0hi,$a0,$bi

	mulld	$h1lo,$a1,$bi		# a[1]*b[0]
	mulhdu	$h1hi,$a1,$bi

	mulld	$h4lo,$a4,$bi		# a[4]*b[0]
	mulhdu	$h4hi,$a4,$bi
	ld	$ap,8($bp)
	mulli	$a4,$a4,19

	mulld	$h2lo,$a2,$bi		# a[2]*b[0]
	mulhdu	$h2hi,$a2,$bi

	mulld	$h3lo,$a3,$bi		# a[3]*b[0]
	mulhdu	$h3hi,$a3,$bi
___
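
# Passes over b[1..3], followed by a straight-line pass over b[4]: each
# pass loads the next b[i] ahead of time ($ap and $bi alternate as the
# current multiplier) and pre-multiplies by 19 the a[] limb whose
# products will wrap past bit 255 on the following pass.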
for(my @a=($a0,$a1,$a2,$a3,$a4),
    my $i=1; $i<4; $i++) {
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,@a[4],$bi
	mulhdu	$t1,@a[4],$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,@a[0],$bi
	mulhdu	$t1,@a[0],$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,@a[3],$bi
	mulhdu	$t1,@a[3],$bi
	ld	$ap,`8*($i+1)`($bp)
	mulli	@a[3],@a[3],19
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,@a[1],$bi
	mulhdu	$t1,@a[1],$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,@a[2],$bi
	mulhdu	$t1,@a[2],$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1
___
	unshift(@a,pop(@a));
}
	($ap,$bi) = ($bi,$ap);
$code.=<<___;
	mulld	$t0,$a1,$bi
	mulhdu	$t1,$a1,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a0,$bi
	mulhdu	$t1,$a0,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

.Lfe51_reduce:
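	# Carry propagation: extract everything above bit 51 of each
	# 128-bit h[i] pair with srdi+insrdi, add it into h[i+1], and
	# multiply the bits that spill out of h[4] by 19 before folding
	# them back into the bottom limb, since 2^255 == 19 (mod p).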
	li	$mask,-1
	srdi	$mask,$mask,13		# 0x7ffffffffffff

	srdi	$t0,$h2lo,51
	and	$a2,$h2lo,$mask
	insrdi	$t0,$h2hi,51,0		# h2>>51
	srdi	$t1,$h0lo,51
	and	$a0,$h0lo,$mask
	insrdi	$t1,$h0hi,51,0		# h0>>51
	addc	$h3lo,$h3lo,$t0
	addze	$h3hi,$h3hi
	addc	$h1lo,$h1lo,$t1
	addze	$h1hi,$h1hi

	srdi	$t0,$h3lo,51
	and	$a3,$h3lo,$mask
	insrdi	$t0,$h3hi,51,0		# h3>>51
	srdi	$t1,$h1lo,51
	and	$a1,$h1lo,$mask
	insrdi	$t1,$h1hi,51,0		# h1>>51
	addc	$h4lo,$h4lo,$t0
	addze	$h4hi,$h4hi
	add	$a2,$a2,$t1

	srdi	$t0,$h4lo,51
	and	$a4,$h4lo,$mask
	insrdi	$t0,$h4hi,51,0
	mulli	$t0,$t0,19		# (h4 >> 51) * 19

	add	$a0,$a0,$t0

	srdi	$t1,$a2,51
	and	$a2,$a2,$mask
	add	$a3,$a3,$t1

	srdi	$t0,$a0,51
	and	$a0,$a0,$mask
	add	$a1,$a1,$t0

	std	$a2,16($rp)
	std	$a3,24($rp)
	std	$a4,32($rp)
	std	$a0,0($rp)
	std	$a1,8($rp)

	ld	r21,`$FRAME-8*11`($sp)
	ld	r22,`$FRAME-8*10`($sp)
	ld	r23,`$FRAME-8*9`($sp)
	ld	r24,`$FRAME-8*8`($sp)
	ld	r25,`$FRAME-8*7`($sp)
	ld	r26,`$FRAME-8*6`($sp)
	ld	r27,`$FRAME-8*5`($sp)
	ld	r28,`$FRAME-8*4`($sp)
	ld	r29,`$FRAME-8*3`($sp)
	ld	r30,`$FRAME-8*2`($sp)
	ld	r31,`$FRAME-8*1`($sp)
	addi	$sp,$sp,$FRAME
	blr
	.long	0
	.byte	0,12,4,0,0x80,11,3,0
	.long	0
.size	x25519_fe51_mul,.-x25519_fe51_mul
___
{
my ($a0,$a1,$a2,$a3,$a4,$t0,$t1) = ($a0,$a1,$a2,$a3,$a4,$t0,$t1);
$code.=<<___;
.globl	x25519_fe51_sqr
.type	x25519_fe51_sqr,\@function
.align	5
x25519_fe51_sqr:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)
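
	# Cross products a[i]*a[j], i!=j, appear twice in the square, so
	# one operand of each pair is doubled up front; limbs whose
	# products wrap past bit 255 go in pre-multiplied by 19.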
	add	$bi,$a0,$a0		# a[0]*2
	mulli	$t1,$a4,19		# a[4]*19

	mulld	$h0lo,$a0,$a0
	mulhdu	$h0hi,$a0,$a0
	mulld	$h1lo,$a1,$bi
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi
	mulhdu	$h4hi,$a4,$bi
	add	$bi,$a1,$a1		# a[1]*2
___
	($a4,$t1) = ($t1,$a4);
$code.=<<___;
	mulld	$t0,$t1,$a4
	mulhdu	$t1,$t1,$a4
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulli	$bp,$a3,19		# a[3]*19

	mulld	$t0,$a1,$a1
	mulhdu	$t1,$a1,$a1
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$bi
	mulhdu	$t1,$a2,$bi
	addc	$h3lo,$h3lo,$t0
	adde	$h3hi,$h3hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	add	$bi,$a3,$a3		# a[3]*2
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1
___
	($a3,$t1) = ($bp,$a3);
$code.=<<___;
	mulld	$t0,$t1,$a3
	mulhdu	$t1,$t1,$a3
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	mulld	$t0,$bi,$a4
	mulhdu	$t1,$bi,$a4
	add	$bi,$a2,$a2		# a[2]*2
	addc	$h2lo,$h2lo,$t0
	adde	$h2hi,$h2hi,$t1

	mulld	$t0,$a2,$a2
	mulhdu	$t1,$a2,$a2
	addc	$h4lo,$h4lo,$t0
	adde	$h4hi,$h4hi,$t1

	mulld	$t0,$a3,$bi
	mulhdu	$t1,$a3,$bi
	addc	$h0lo,$h0lo,$t0
	adde	$h0hi,$h0hi,$t1

	mulld	$t0,$a4,$bi
	mulhdu	$t1,$a4,$bi
	addc	$h1lo,$h1lo,$t0
	adde	$h1hi,$h1hi,$t1

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_sqr,.-x25519_fe51_sqr
___
}

$code.=<<___;
.globl	x25519_fe51_mul121666
.type	x25519_fe51_mul121666,\@function
.align	5
x25519_fe51_mul121666:
	stdu	$sp,-$FRAME($sp)
	std	r21,`$FRAME-8*11`($sp)
	std	r22,`$FRAME-8*10`($sp)
	std	r23,`$FRAME-8*9`($sp)
	std	r24,`$FRAME-8*8`($sp)
	std	r25,`$FRAME-8*7`($sp)
	std	r26,`$FRAME-8*6`($sp)
	std	r27,`$FRAME-8*5`($sp)
	std	r28,`$FRAME-8*4`($sp)
	std	r29,`$FRAME-8*3`($sp)
	std	r30,`$FRAME-8*2`($sp)
	std	r31,`$FRAME-8*1`($sp)
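
	# load 121666 = 65536 + 56130 via lis/ori, as in the fe64 case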
	lis	$bi,`65536>>16`
	ori	$bi,$bi,`121666-65536`

	ld	$a0,0($ap)
	ld	$a1,8($ap)
	ld	$a2,16($ap)
	ld	$a3,24($ap)
	ld	$a4,32($ap)

	mulld	$h0lo,$a0,$bi		# a[0]*121666
	mulhdu	$h0hi,$a0,$bi
	mulld	$h1lo,$a1,$bi		# a[1]*121666
	mulhdu	$h1hi,$a1,$bi
	mulld	$h2lo,$a2,$bi		# a[2]*121666
	mulhdu	$h2hi,$a2,$bi
	mulld	$h3lo,$a3,$bi		# a[3]*121666
	mulhdu	$h3hi,$a3,$bi
	mulld	$h4lo,$a4,$bi		# a[4]*121666
	mulhdu	$h4hi,$a4,$bi

	b	.Lfe51_reduce
	.long	0
	.byte	0,12,4,0,0x80,11,2,0
	.long	0
.size	x25519_fe51_mul121666,.-x25519_fe51_mul121666
___
}

$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";