2
0

mips-mont.pl 9.5 KB


#! /usr/bin/env perl
# Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# This module doesn't present direct interest for OpenSSL, because it
# doesn't provide better performance for longer keys, at least not on
# in-order-execution cores. While 512-bit RSA sign operations can be
# 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
# 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
# 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
# verify:-( All comparisons are against bn_mul_mont-free assembler.
# The module might be of interest to embedded system developers, as
# the code is smaller than 1KB, yet offers >3x improvement on MIPS64
# and 75-30% [less for longer keys] on MIPS32 over compiler-generated
# code.
  26. ######################################################################
  27. # There is a number of MIPS ABI in use, O32 and N32/64 are most
  28. # widely used. Then there is a new contender: NUBI. It appears that if
  29. # one picks the latter, it's possible to arrange code in ABI neutral
  30. # manner. Therefore let's stick to NUBI register layout:
  31. #
  32. ($zero,$at,$t0,$t1,$t2)=map("\$$_",(0..2,24,25));
  33. ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  34. ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7,$s8,$s9,$s10,$s11)=map("\$$_",(12..23));
  35. ($gp,$tp,$sp,$fp,$ra)=map("\$$_",(3,28..31));
  36. #
  37. # The return value is placed in $a0. Following coding rules facilitate
  38. # interoperability:
  39. #
  40. # - never ever touch $tp, "thread pointer", former $gp;
  41. # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  42. # old code];
  43. # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  44. #
  45. # For reference here is register layout for N32/64 MIPS ABIs:
  46. #
  47. # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  48. # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  49. # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  50. # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  51. # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  52. # $output is the last argument if it looks like a file (it has an extension)
  53. # $flavour is the first argument if it doesn't look like a file
  54. $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  55. # supported flavours are o32,n32,64,nubi32,nubi64, default is o32
  56. $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : "o32";
  57. if ($flavour =~ /64|n32/i) {
  58. $PTR_ADD="daddu"; # incidentally works even on n32
  59. $PTR_SUB="dsubu"; # incidentally works even on n32
  60. $REG_S="sd";
  61. $REG_L="ld";
  62. $SZREG=8;
  63. } else {
  64. $PTR_ADD="addu";
  65. $PTR_SUB="subu";
  66. $REG_S="sw";
  67. $REG_L="lw";
  68. $SZREG=4;
  69. }
  70. $SAVED_REGS_MASK = ($flavour =~ /nubi/i) ? 0x00fff000 : 0x00ff0000;
  71. #
  72. # <appro@openssl.org>
  73. #
  74. ######################################################################
  75. $output and open STDOUT,">$output";
  76. if ($flavour =~ /64|n32/i) {
  77. $LD="ld";
  78. $ST="sd";
  79. $MULTU="dmultu";
  80. $ADDU="daddu";
  81. $SUBU="dsubu";
  82. $BNSZ=8;
  83. } else {
  84. $LD="lw";
  85. $ST="sw";
  86. $MULTU="multu";
  87. $ADDU="addu";
  88. $SUBU="subu";
  89. $BNSZ=4;
  90. }
  91. # int bn_mul_mont(
  92. $rp=$a0; # BN_ULONG *rp,
  93. $ap=$a1; # const BN_ULONG *ap,
  94. $bp=$a2; # const BN_ULONG *bp,
  95. $np=$a3; # const BN_ULONG *np,
  96. $n0=$a4; # const BN_ULONG *n0,
  97. $num=$a5; # int num);
  98. $lo0=$a6;
  99. $hi0=$a7;
  100. $lo1=$t1;
  101. $hi1=$t2;
  102. $aj=$s0;
  103. $bi=$s1;
  104. $nj=$s2;
  105. $tp=$s3;
  106. $alo=$s4;
  107. $ahi=$s5;
  108. $nlo=$s6;
  109. $nhi=$s7;
  110. $tj=$s8;
  111. $i=$s9;
  112. $j=$s10;
  113. $m1=$s11;
  114. $FRAMESIZE=14;
  115. $code=<<___;
  116. #include "mips_arch.h"
  117. .text
  118. .set noat
  119. .set noreorder
  120. .align 5
  121. .globl bn_mul_mont
  122. .ent bn_mul_mont
  123. bn_mul_mont:
  124. ___
  125. $code.=<<___ if ($flavour =~ /o32/i);
  126. lw $n0,16($sp)
  127. lw $num,20($sp)
  128. ___
  129. $code.=<<___;
  130. slt $at,$num,4
  131. bnez $at,1f
  132. li $t0,0
  133. slt $at,$num,17 # on in-order CPU
  134. bnez $at,bn_mul_mont_internal
  135. nop
  136. 1: jr $ra
  137. li $a0,0
  138. .end bn_mul_mont
  139. .align 5
  140. .ent bn_mul_mont_internal
  141. bn_mul_mont_internal:
  142. .frame $fp,$FRAMESIZE*$SZREG,$ra
  143. .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
  144. $PTR_SUB $sp,$FRAMESIZE*$SZREG
  145. $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
  146. $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
  147. $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
  148. $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
  149. $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
  150. $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
  151. $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
  152. $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
  153. $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
  154. ___
  155. $code.=<<___ if ($flavour =~ /nubi/i);
  156. $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
  157. $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
  158. $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
  159. $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
  160. ___
  161. $code.=<<___;
  162. move $fp,$sp
  163. .set reorder
  164. $LD $n0,0($n0)
  165. $LD $bi,0($bp) # bp[0]
  166. $LD $aj,0($ap) # ap[0]
  167. $LD $nj,0($np) # np[0]
  168. $PTR_SUB $sp,2*$BNSZ # place for two extra words
  169. sll $num,`log($BNSZ)/log(2)`
  170. li $at,-4096
  171. $PTR_SUB $sp,$num
  172. and $sp,$at
  173. $MULTU ($aj,$bi)
  174. $LD $ahi,$BNSZ($ap)
  175. $LD $nhi,$BNSZ($np)
  176. mflo ($lo0,$aj,$bi)
  177. mfhi ($hi0,$aj,$bi)
  178. $MULTU ($lo0,$n0)
  179. mflo ($m1,$lo0,$n0)
  180. $MULTU ($ahi,$bi)
  181. mflo ($alo,$ahi,$bi)
  182. mfhi ($ahi,$ahi,$bi)
  183. $MULTU ($nj,$m1)
  184. mflo ($lo1,$nj,$m1)
  185. mfhi ($hi1,$nj,$m1)
  186. $MULTU ($nhi,$m1)
  187. $ADDU $lo1,$lo0
  188. sltu $at,$lo1,$lo0
  189. $ADDU $hi1,$at
  190. mflo ($nlo,$nhi,$m1)
  191. mfhi ($nhi,$nhi,$m1)
  192. move $tp,$sp
  193. li $j,2*$BNSZ
  194. .align 4
  195. .L1st:
  196. .set noreorder
  197. $PTR_ADD $aj,$ap,$j
  198. $PTR_ADD $nj,$np,$j
  199. $LD $aj,($aj)
  200. $LD $nj,($nj)
  201. $MULTU ($aj,$bi)
  202. $ADDU $lo0,$alo,$hi0
  203. $ADDU $lo1,$nlo,$hi1
  204. sltu $at,$lo0,$hi0
  205. sltu $t0,$lo1,$hi1
  206. $ADDU $hi0,$ahi,$at
  207. $ADDU $hi1,$nhi,$t0
  208. mflo ($alo,$aj,$bi)
  209. mfhi ($ahi,$aj,$bi)
  210. $ADDU $lo1,$lo0
  211. sltu $at,$lo1,$lo0
  212. $MULTU ($nj,$m1)
  213. $ADDU $hi1,$at
  214. addu $j,$BNSZ
  215. $ST $lo1,($tp)
  216. sltu $t0,$j,$num
  217. mflo ($nlo,$nj,$m1)
  218. mfhi ($nhi,$nj,$m1)
  219. bnez $t0,.L1st
  220. $PTR_ADD $tp,$BNSZ
  221. .set reorder
  222. $ADDU $lo0,$alo,$hi0
  223. sltu $at,$lo0,$hi0
  224. $ADDU $hi0,$ahi,$at
  225. $ADDU $lo1,$nlo,$hi1
  226. sltu $t0,$lo1,$hi1
  227. $ADDU $hi1,$nhi,$t0
  228. $ADDU $lo1,$lo0
  229. sltu $at,$lo1,$lo0
  230. $ADDU $hi1,$at
  231. $ST $lo1,($tp)
  232. $ADDU $hi1,$hi0
  233. sltu $at,$hi1,$hi0
  234. $ST $hi1,$BNSZ($tp)
  235. $ST $at,2*$BNSZ($tp)
  236. li $i,$BNSZ
  237. .align 4
  238. .Louter:
  239. $PTR_ADD $bi,$bp,$i
  240. $LD $bi,($bi)
  241. $LD $aj,($ap)
  242. $LD $ahi,$BNSZ($ap)
  243. $LD $tj,($sp)
  244. $MULTU ($aj,$bi)
  245. $LD $nj,($np)
  246. $LD $nhi,$BNSZ($np)
  247. mflo ($lo0,$aj,$bi)
  248. mfhi ($hi0,$aj,$bi)
  249. $ADDU $lo0,$tj
  250. $MULTU ($lo0,$n0)
  251. sltu $at,$lo0,$tj
  252. $ADDU $hi0,$at
  253. mflo ($m1,$lo0,$n0)
  254. $MULTU ($ahi,$bi)
  255. mflo ($alo,$ahi,$bi)
  256. mfhi ($ahi,$ahi,$bi)
  257. $MULTU ($nj,$m1)
  258. mflo ($lo1,$nj,$m1)
  259. mfhi ($hi1,$nj,$m1)
  260. $MULTU ($nhi,$m1)
  261. $ADDU $lo1,$lo0
  262. sltu $at,$lo1,$lo0
  263. $ADDU $hi1,$at
  264. mflo ($nlo,$nhi,$m1)
  265. mfhi ($nhi,$nhi,$m1)
  266. move $tp,$sp
  267. li $j,2*$BNSZ
  268. $LD $tj,$BNSZ($tp)
  269. .align 4
  270. .Linner:
  271. .set noreorder
  272. $PTR_ADD $aj,$ap,$j
  273. $PTR_ADD $nj,$np,$j
  274. $LD $aj,($aj)
  275. $LD $nj,($nj)
  276. $MULTU ($aj,$bi)
  277. $ADDU $lo0,$alo,$hi0
  278. $ADDU $lo1,$nlo,$hi1
  279. sltu $at,$lo0,$hi0
  280. sltu $t0,$lo1,$hi1
  281. $ADDU $hi0,$ahi,$at
  282. $ADDU $hi1,$nhi,$t0
  283. mflo ($alo,$aj,$bi)
  284. mfhi ($ahi,$aj,$bi)
  285. $ADDU $lo0,$tj
  286. addu $j,$BNSZ
  287. $MULTU ($nj,$m1)
  288. sltu $at,$lo0,$tj
  289. $ADDU $lo1,$lo0
  290. $ADDU $hi0,$at
  291. sltu $t0,$lo1,$lo0
  292. $LD $tj,2*$BNSZ($tp)
  293. $ADDU $hi1,$t0
  294. sltu $at,$j,$num
  295. mflo ($nlo,$nj,$m1)
  296. mfhi ($nhi,$nj,$m1)
  297. $ST $lo1,($tp)
  298. bnez $at,.Linner
  299. $PTR_ADD $tp,$BNSZ
  300. .set reorder
  301. $ADDU $lo0,$alo,$hi0
  302. sltu $at,$lo0,$hi0
  303. $ADDU $hi0,$ahi,$at
  304. $ADDU $lo0,$tj
  305. sltu $t0,$lo0,$tj
  306. $ADDU $hi0,$t0
  307. $LD $tj,2*$BNSZ($tp)
  308. $ADDU $lo1,$nlo,$hi1
  309. sltu $at,$lo1,$hi1
  310. $ADDU $hi1,$nhi,$at
  311. $ADDU $lo1,$lo0
  312. sltu $t0,$lo1,$lo0
  313. $ADDU $hi1,$t0
  314. $ST $lo1,($tp)
  315. $ADDU $lo1,$hi1,$hi0
  316. sltu $hi1,$lo1,$hi0
  317. $ADDU $lo1,$tj
  318. sltu $at,$lo1,$tj
  319. $ADDU $hi1,$at
  320. $ST $lo1,$BNSZ($tp)
  321. $ST $hi1,2*$BNSZ($tp)
  322. addu $i,$BNSZ
  323. sltu $t0,$i,$num
  324. bnez $t0,.Louter
  325. .set noreorder
  326. $PTR_ADD $tj,$sp,$num # &tp[num]
  327. move $tp,$sp
  328. move $ap,$sp
  329. li $hi0,0 # clear borrow bit
  330. .align 4
  331. .Lsub: $LD $lo0,($tp)
  332. $LD $lo1,($np)
  333. $PTR_ADD $tp,$BNSZ
  334. $PTR_ADD $np,$BNSZ
  335. $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
  336. sgtu $at,$lo1,$lo0
  337. $SUBU $lo0,$lo1,$hi0
  338. sgtu $hi0,$lo0,$lo1
  339. $ST $lo0,($rp)
  340. or $hi0,$at
  341. sltu $at,$tp,$tj
  342. bnez $at,.Lsub
  343. $PTR_ADD $rp,$BNSZ
  344. $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
  345. move $tp,$sp
  346. $PTR_SUB $rp,$num # restore rp
  347. not $hi1,$hi0
  348. .Lcopy: $LD $nj,($tp) # conditional move
  349. $LD $aj,($rp)
  350. $ST $zero,($tp)
  351. $PTR_ADD $tp,$BNSZ
  352. and $nj,$hi0
  353. and $aj,$hi1
  354. or $aj,$nj
  355. sltu $at,$tp,$tj
  356. $ST $aj,($rp)
  357. bnez $at,.Lcopy
  358. $PTR_ADD $rp,$BNSZ
  359. li $a0,1
  360. li $t0,1
  361. .set noreorder
  362. move $sp,$fp
  363. $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
  364. $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
  365. $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
  366. $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
  367. $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
  368. $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
  369. $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
  370. $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
  371. $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
  372. ___
  373. $code.=<<___ if ($flavour =~ /nubi/i);
  374. $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
  375. $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
  376. $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
  377. $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
  378. ___
  379. $code.=<<___;
  380. jr $ra
  381. $PTR_ADD $sp,$FRAMESIZE*$SZREG
  382. .end bn_mul_mont_internal
  383. .rdata
  384. .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
  385. ___
  386. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  387. print $code;
  388. close STDOUT or die "error closing STDOUT: $!";