mips-mont.pl 9.3 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. # This module doesn't present direct interest for OpenSSL, because it
  16. # doesn't provide better performance for longer keys, at least not on
  17. # in-order-execution cores. While 512-bit RSA sign operations can be
  18. # 65% faster in 64-bit mode, 1024-bit ones are only 15% faster, and
  19. # 4096-bit ones are up to 15% slower. In 32-bit mode it varies from
  20. # 16% improvement for 512-bit RSA sign to -33% for 4096-bit RSA
  21. # verify:-( All comparisons are against bn_mul_mont-free assembler.
  22. # The module might be of interest to embedded system developers, as
  23. # the code is smaller than 1KB, yet offers >3x improvement on MIPS64
  24. # and 75-30% [less for longer keys] on MIPS32 over compiler-generated
  25. # code.
  26. ######################################################################
  27. # There are a number of MIPS ABIs in use; O32 and N32/64 are the most
  28. # widely used. Then there is a new contender: NUBI. It appears that if
  29. # one picks the latter, it's possible to arrange code in ABI neutral
  30. # manner. Therefore let's stick to NUBI register layout:
  31. #
  # NUBI register layout. Each variable holds the assembler spelling
  # "$<n>" of general-purpose register number <n>.
  ($zero, $at, $t0, $t1, $t2) =
      map { sprintf('$%d', $_) } (0 .. 2, 24, 25);
  ($a0, $a1, $a2, $a3, $a4, $a5, $a6, $a7) =
      map { sprintf('$%d', $_) } (4 .. 11);
  ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11) =
      map { sprintf('$%d', $_) } (12 .. 23);
  ($gp, $tp, $sp, $fp, $ra) =
      map { sprintf('$%d', $_) } (3, 28 .. 31);
  36. #
  37. # The return value is placed in $a0. Following coding rules facilitate
  38. # interoperability:
  39. #
  40. # - never ever touch $tp, "thread pointer", former $gp;
  41. # - copy return value to $t0, former $v0 [or to $a0 if you're adapting
  42. # old code];
  43. # - on O32 populate $a4-$a7 with 'lw $aN,4*N($sp)' if necessary;
  44. #
  45. # For reference here is register layout for N32/64 MIPS ABIs:
  46. #
  47. # ($zero,$at,$v0,$v1)=map("\$$_",(0..3));
  48. # ($a0,$a1,$a2,$a3,$a4,$a5,$a6,$a7)=map("\$$_",(4..11));
  49. # ($t0,$t1,$t2,$t3,$t8,$t9)=map("\$$_",(12..15,24,25));
  50. # ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("\$$_",(16..23));
  51. # ($gp,$sp,$fp,$ra)=map("\$$_",(28..31));
  52. #
  # Target ABI flavour: first command-line argument, "o32" when absent.
  # Supported flavours are o32, n32, 64, nubi32 and nubi64.
  $flavour = shift || "o32";

  # Mnemonics for pointer arithmetic and for saving/restoring registers,
  # plus the size in bytes of one saved register. The 64-bit forms
  # (daddu/dsubu) incidentally work even on n32.
  ($PTR_ADD, $PTR_SUB, $REG_S, $REG_L, $SZREG) =
      ($flavour =~ /64|n32/i)
          ? ("daddu", "dsubu", "sd", "ld", 8)
          : ("addu",  "subu",  "sw", "lw", 4);

  # Register mask later interpolated into the .mask directive; the NUBI
  # flavours use the wider value.
  $SAVED_REGS_MASK = 0x00ff0000;
  $SAVED_REGS_MASK = 0x00fff000 if $flavour =~ /nubi/i;
  68. #
  69. # <appro@openssl.org>
  70. #
  71. ######################################################################
  # Consume the remaining command-line arguments until one looks like a
  # file name ("name.ext"); that argument becomes $output. If none
  # matches, $output ends up false.
  while (($output = shift) && ($output !~ /\w[\w\-]*\.\w+$/)) {}

  # Redirect STDOUT to the output file only when a name was found;
  # otherwise the generated code goes to the inherited STDOUT. The
  # three-argument form keeps the file name from being parsed as part of
  # the open mode, and a failed open is reported instead of being ignored.
  if ($output) {
      open(STDOUT, '>', $output)
          or die "can't open $output: $!";
  }
  # Mnemonics used for BN_ULONG-sized loads, stores, multiplies, adds and
  # subtracts, and the BN_ULONG size in bytes. The 64-bit and n32
  # flavours use the doubleword forms; everything else uses word forms.
  my $is_wide = ($flavour =~ /64|n32/i);

  $LD    = $is_wide ? "ld"     : "lw";
  $ST    = $is_wide ? "sd"     : "sw";
  $MULTU = $is_wide ? "dmultu" : "multu";
  $ADDU  = $is_wide ? "daddu"  : "addu";
  $SUBU  = $is_wide ? "dsubu"  : "subu";
  $BNSZ  = $is_wide ? 8        : 4;
  # Argument registers, in the order of the C prototype
  #   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
  #                   const BN_ULONG *bp, const BN_ULONG *np,
  #                   const BN_ULONG *n0, int num);
  ($rp, $ap, $bp, $np, $n0, $num) = ($a0, $a1, $a2, $a3, $a4, $a5);

  # Low/high halves of the two running products.
  ($lo0, $hi0, $lo1, $hi1) = ($a6, $a7, $t1, $t2);

  # Working registers taken from the $s0..$s11 bank. Note that $tp is
  # rebound here: from this point on it names $s3, not the register it
  # was given in the register-layout table.
  ($aj, $bi, $nj, $tp, $alo, $ahi, $nlo, $nhi, $tj, $i, $j, $m1) =
      ($s0, $s1, $s2, $s3, $s4, $s5, $s6, $s7, $s8, $s9, $s10, $s11);

  # Stack frame size, counted in $SZREG-sized slots.
  $FRAMESIZE = 14;
  # Start of the generated assembly: include of mips_arch.h, section and
  # assembler-mode directives, and the public bn_mul_mont label.
  113. $code=<<___;
  114. #include "mips_arch.h"
  115. .text
  116. .set noat
  117. .set noreorder
  118. .align 5
  119. .globl bn_mul_mont
  120. .ent bn_mul_mont
  121. bn_mul_mont:
  122. ___
  # O32 flavour only: n0 and num are fetched from the stack, at offsets
  # 16 and 20 from $sp.
  123. $code.=<<___ if ($flavour =~ /o32/i);
  124. lw $n0,16($sp)
  125. lw $num,20($sp)
  126. ___
  # Dispatch on num: when num < 4, or when num is not below 17, control
  # reaches label 1 and the routine returns with 0 loaded into $t0/$a0;
  # otherwise it branches to bn_mul_mont_internal. The text is emitted
  # under ".set noreorder", so the instruction after each branch/jump is
  # written out explicitly.
  # bn_mul_mont_internal then lowers $sp by $FRAMESIZE*$SZREG bytes and
  # stores $fp and $s11..$s4 in the top slots of that frame.
  127. $code.=<<___;
  128. slt $at,$num,4
  129. bnez $at,1f
  130. li $t0,0
  131. slt $at,$num,17 # on in-order CPU
  132. bnez $at,bn_mul_mont_internal
  133. nop
  134. 1: jr $ra
  135. li $a0,0
  136. .end bn_mul_mont
  137. .align 5
  138. .ent bn_mul_mont_internal
  139. bn_mul_mont_internal:
  140. .frame $fp,$FRAMESIZE*$SZREG,$ra
  141. .mask 0x40000000|$SAVED_REGS_MASK,-$SZREG
  142. $PTR_SUB $sp,$FRAMESIZE*$SZREG
  143. $REG_S $fp,($FRAMESIZE-1)*$SZREG($sp)
  144. $REG_S $s11,($FRAMESIZE-2)*$SZREG($sp)
  145. $REG_S $s10,($FRAMESIZE-3)*$SZREG($sp)
  146. $REG_S $s9,($FRAMESIZE-4)*$SZREG($sp)
  147. $REG_S $s8,($FRAMESIZE-5)*$SZREG($sp)
  148. $REG_S $s7,($FRAMESIZE-6)*$SZREG($sp)
  149. $REG_S $s6,($FRAMESIZE-7)*$SZREG($sp)
  150. $REG_S $s5,($FRAMESIZE-8)*$SZREG($sp)
  151. $REG_S $s4,($FRAMESIZE-9)*$SZREG($sp)
  152. ___
  # NUBI flavours additionally store $s3..$s0 in the remaining frame slots.
  153. $code.=<<___ if ($flavour =~ /nubi/i);
  154. $REG_S $s3,($FRAMESIZE-10)*$SZREG($sp)
  155. $REG_S $s2,($FRAMESIZE-11)*$SZREG($sp)
  156. $REG_S $s1,($FRAMESIZE-12)*$SZREG($sp)
  157. $REG_S $s0,($FRAMESIZE-13)*$SZREG($sp)
  158. ___
  # Main body of bn_mul_mont_internal. In the order the text is emitted:
  #  - $fp is set to the current $sp; $n0 is replaced by the word it
  #    points to; bp[0], ap[0] and np[0] are loaded.
  #  - Scratch space is carved from the stack: $sp is lowered by 2*$BNSZ
  #    ("two extra words"), $num is shifted left by log2($BNSZ) so that it
  #    holds a byte count, $sp is lowered by that count and then masked
  #    with -4096. The backtick expression is expanded by the s///e
  #    substitution that runs just before the code is printed.
  #  - $m1 is the low word of lo0*n0, where lo0 is the low word of
  #    ap[0]*bp[0]. The .L1st loop walks j over ap[]/np[], combining
  #    ap[j]*bp[0] with np[j]*m1 and storing one word per step at $tp
  #    (which starts at $sp).
  #  - .Louter repeats the same scheme for each further bp[i] (i advances
  #    by $BNSZ while i < num); .Linner also adds in the word previously
  #    stored in the scratch vector (loaded into $tj).
  #  - .Lsub stores tp[i]-np[i] into rp[], tracking the borrow in $hi0.
  #    After folding in the topmost overflow word, $ap is pointed at the
  #    scratch vector when there was a borrow and at rp otherwise
  #    ("ap=borrow?tp:rp"); .Lcopy copies from $ap into rp and writes
  #    zeros over the scratch vector as it goes.
  #  - 1 is loaded into both $a0 and $t0, $sp is restored from $fp, and
  #    $fp and $s11..$s4 are reloaded from the frame.
  # The instruction order is significant (several branches are emitted
  # under ".set noreorder"), so the text must not be rearranged.
  159. $code.=<<___;
  160. move $fp,$sp
  161. .set reorder
  162. $LD $n0,0($n0)
  163. $LD $bi,0($bp) # bp[0]
  164. $LD $aj,0($ap) # ap[0]
  165. $LD $nj,0($np) # np[0]
  166. $PTR_SUB $sp,2*$BNSZ # place for two extra words
  167. sll $num,`log($BNSZ)/log(2)`
  168. li $at,-4096
  169. $PTR_SUB $sp,$num
  170. and $sp,$at
  171. $MULTU ($aj,$bi)
  172. $LD $ahi,$BNSZ($ap)
  173. $LD $nhi,$BNSZ($np)
  174. mflo ($lo0,$aj,$bi)
  175. mfhi ($hi0,$aj,$bi)
  176. $MULTU ($lo0,$n0)
  177. mflo ($m1,$lo0,$n0)
  178. $MULTU ($ahi,$bi)
  179. mflo ($alo,$ahi,$bi)
  180. mfhi ($ahi,$ahi,$bi)
  181. $MULTU ($nj,$m1)
  182. mflo ($lo1,$nj,$m1)
  183. mfhi ($hi1,$nj,$m1)
  184. $MULTU ($nhi,$m1)
  185. $ADDU $lo1,$lo0
  186. sltu $at,$lo1,$lo0
  187. $ADDU $hi1,$at
  188. mflo ($nlo,$nhi,$m1)
  189. mfhi ($nhi,$nhi,$m1)
  190. move $tp,$sp
  191. li $j,2*$BNSZ
  192. .align 4
  193. .L1st:
  194. .set noreorder
  195. $PTR_ADD $aj,$ap,$j
  196. $PTR_ADD $nj,$np,$j
  197. $LD $aj,($aj)
  198. $LD $nj,($nj)
  199. $MULTU ($aj,$bi)
  200. $ADDU $lo0,$alo,$hi0
  201. $ADDU $lo1,$nlo,$hi1
  202. sltu $at,$lo0,$hi0
  203. sltu $t0,$lo1,$hi1
  204. $ADDU $hi0,$ahi,$at
  205. $ADDU $hi1,$nhi,$t0
  206. mflo ($alo,$aj,$bi)
  207. mfhi ($ahi,$aj,$bi)
  208. $ADDU $lo1,$lo0
  209. sltu $at,$lo1,$lo0
  210. $MULTU ($nj,$m1)
  211. $ADDU $hi1,$at
  212. addu $j,$BNSZ
  213. $ST $lo1,($tp)
  214. sltu $t0,$j,$num
  215. mflo ($nlo,$nj,$m1)
  216. mfhi ($nhi,$nj,$m1)
  217. bnez $t0,.L1st
  218. $PTR_ADD $tp,$BNSZ
  219. .set reorder
  220. $ADDU $lo0,$alo,$hi0
  221. sltu $at,$lo0,$hi0
  222. $ADDU $hi0,$ahi,$at
  223. $ADDU $lo1,$nlo,$hi1
  224. sltu $t0,$lo1,$hi1
  225. $ADDU $hi1,$nhi,$t0
  226. $ADDU $lo1,$lo0
  227. sltu $at,$lo1,$lo0
  228. $ADDU $hi1,$at
  229. $ST $lo1,($tp)
  230. $ADDU $hi1,$hi0
  231. sltu $at,$hi1,$hi0
  232. $ST $hi1,$BNSZ($tp)
  233. $ST $at,2*$BNSZ($tp)
  234. li $i,$BNSZ
  235. .align 4
  236. .Louter:
  237. $PTR_ADD $bi,$bp,$i
  238. $LD $bi,($bi)
  239. $LD $aj,($ap)
  240. $LD $ahi,$BNSZ($ap)
  241. $LD $tj,($sp)
  242. $MULTU ($aj,$bi)
  243. $LD $nj,($np)
  244. $LD $nhi,$BNSZ($np)
  245. mflo ($lo0,$aj,$bi)
  246. mfhi ($hi0,$aj,$bi)
  247. $ADDU $lo0,$tj
  248. $MULTU ($lo0,$n0)
  249. sltu $at,$lo0,$tj
  250. $ADDU $hi0,$at
  251. mflo ($m1,$lo0,$n0)
  252. $MULTU ($ahi,$bi)
  253. mflo ($alo,$ahi,$bi)
  254. mfhi ($ahi,$ahi,$bi)
  255. $MULTU ($nj,$m1)
  256. mflo ($lo1,$nj,$m1)
  257. mfhi ($hi1,$nj,$m1)
  258. $MULTU ($nhi,$m1)
  259. $ADDU $lo1,$lo0
  260. sltu $at,$lo1,$lo0
  261. $ADDU $hi1,$at
  262. mflo ($nlo,$nhi,$m1)
  263. mfhi ($nhi,$nhi,$m1)
  264. move $tp,$sp
  265. li $j,2*$BNSZ
  266. $LD $tj,$BNSZ($tp)
  267. .align 4
  268. .Linner:
  269. .set noreorder
  270. $PTR_ADD $aj,$ap,$j
  271. $PTR_ADD $nj,$np,$j
  272. $LD $aj,($aj)
  273. $LD $nj,($nj)
  274. $MULTU ($aj,$bi)
  275. $ADDU $lo0,$alo,$hi0
  276. $ADDU $lo1,$nlo,$hi1
  277. sltu $at,$lo0,$hi0
  278. sltu $t0,$lo1,$hi1
  279. $ADDU $hi0,$ahi,$at
  280. $ADDU $hi1,$nhi,$t0
  281. mflo ($alo,$aj,$bi)
  282. mfhi ($ahi,$aj,$bi)
  283. $ADDU $lo0,$tj
  284. addu $j,$BNSZ
  285. $MULTU ($nj,$m1)
  286. sltu $at,$lo0,$tj
  287. $ADDU $lo1,$lo0
  288. $ADDU $hi0,$at
  289. sltu $t0,$lo1,$lo0
  290. $LD $tj,2*$BNSZ($tp)
  291. $ADDU $hi1,$t0
  292. sltu $at,$j,$num
  293. mflo ($nlo,$nj,$m1)
  294. mfhi ($nhi,$nj,$m1)
  295. $ST $lo1,($tp)
  296. bnez $at,.Linner
  297. $PTR_ADD $tp,$BNSZ
  298. .set reorder
  299. $ADDU $lo0,$alo,$hi0
  300. sltu $at,$lo0,$hi0
  301. $ADDU $hi0,$ahi,$at
  302. $ADDU $lo0,$tj
  303. sltu $t0,$lo0,$tj
  304. $ADDU $hi0,$t0
  305. $LD $tj,2*$BNSZ($tp)
  306. $ADDU $lo1,$nlo,$hi1
  307. sltu $at,$lo1,$hi1
  308. $ADDU $hi1,$nhi,$at
  309. $ADDU $lo1,$lo0
  310. sltu $t0,$lo1,$lo0
  311. $ADDU $hi1,$t0
  312. $ST $lo1,($tp)
  313. $ADDU $lo1,$hi1,$hi0
  314. sltu $hi1,$lo1,$hi0
  315. $ADDU $lo1,$tj
  316. sltu $at,$lo1,$tj
  317. $ADDU $hi1,$at
  318. $ST $lo1,$BNSZ($tp)
  319. $ST $hi1,2*$BNSZ($tp)
  320. addu $i,$BNSZ
  321. sltu $t0,$i,$num
  322. bnez $t0,.Louter
  323. .set noreorder
  324. $PTR_ADD $tj,$sp,$num # &tp[num]
  325. move $tp,$sp
  326. move $ap,$sp
  327. li $hi0,0 # clear borrow bit
  328. .align 4
  329. .Lsub: $LD $lo0,($tp)
  330. $LD $lo1,($np)
  331. $PTR_ADD $tp,$BNSZ
  332. $PTR_ADD $np,$BNSZ
  333. $SUBU $lo1,$lo0,$lo1 # tp[i]-np[i]
  334. sgtu $at,$lo1,$lo0
  335. $SUBU $lo0,$lo1,$hi0
  336. sgtu $hi0,$lo0,$lo1
  337. $ST $lo0,($rp)
  338. or $hi0,$at
  339. sltu $at,$tp,$tj
  340. bnez $at,.Lsub
  341. $PTR_ADD $rp,$BNSZ
  342. $SUBU $hi0,$hi1,$hi0 # handle upmost overflow bit
  343. move $tp,$sp
  344. $PTR_SUB $rp,$num # restore rp
  345. not $hi1,$hi0
  346. and $ap,$hi0,$sp
  347. and $bp,$hi1,$rp
  348. or $ap,$ap,$bp # ap=borrow?tp:rp
  349. .align 4
  350. .Lcopy: $LD $aj,($ap)
  351. $PTR_ADD $ap,$BNSZ
  352. $ST $zero,($tp)
  353. $PTR_ADD $tp,$BNSZ
  354. sltu $at,$tp,$tj
  355. $ST $aj,($rp)
  356. bnez $at,.Lcopy
  357. $PTR_ADD $rp,$BNSZ
  358. li $a0,1
  359. li $t0,1
  360. .set noreorder
  361. move $sp,$fp
  362. $REG_L $fp,($FRAMESIZE-1)*$SZREG($sp)
  363. $REG_L $s11,($FRAMESIZE-2)*$SZREG($sp)
  364. $REG_L $s10,($FRAMESIZE-3)*$SZREG($sp)
  365. $REG_L $s9,($FRAMESIZE-4)*$SZREG($sp)
  366. $REG_L $s8,($FRAMESIZE-5)*$SZREG($sp)
  367. $REG_L $s7,($FRAMESIZE-6)*$SZREG($sp)
  368. $REG_L $s6,($FRAMESIZE-7)*$SZREG($sp)
  369. $REG_L $s5,($FRAMESIZE-8)*$SZREG($sp)
  370. $REG_L $s4,($FRAMESIZE-9)*$SZREG($sp)
  371. ___
  # NUBI flavours reload $s3..$s0 from the same frame slots they were
  # stored in on entry.
  372. $code.=<<___ if ($flavour =~ /nubi/i);
  373. $REG_L $s3,($FRAMESIZE-10)*$SZREG($sp)
  374. $REG_L $s2,($FRAMESIZE-11)*$SZREG($sp)
  375. $REG_L $s1,($FRAMESIZE-12)*$SZREG($sp)
  376. $REG_L $s0,($FRAMESIZE-13)*$SZREG($sp)
  377. ___
  # Return to the caller; the instruction emitted right after the jump
  # releases the $FRAMESIZE*$SZREG-byte frame. An identification string
  # is then placed in .rdata (the "\@" keeps Perl from interpolating an
  # array inside the heredoc).
  378. $code.=<<___;
  379. jr $ra
  380. $PTR_ADD $sp,$FRAMESIZE*$SZREG
  381. .end bn_mul_mont_internal
  382. .rdata
  383. .asciiz "Montgomery Multiplication for MIPS, CRYPTOGAMS by <appro\@openssl.org>"
  384. ___
  # Replace every `...` span in the generated text with the result of
  # evaluating its contents as Perl (for example the shift count written
  # as log($BNSZ)/log(2)).
  $code =~ s/\`([^\`]*)\`/eval $1/gem;

  print $code;

  # Output is buffered, so a failed write (full disk, closed pipe, ...)
  # may only be reported when the handle is closed. Die in that case
  # rather than exit successfully with a truncated assembly file.
  close STDOUT or die "error closing STDOUT: $!";