#!/usr/bin/env perl
#
# mips3-mont.pl
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# This module is not of direct interest to OpenSSL, because it does
# not provide better performance for longer keys. While 512-bit RSA
# private key operations are 40% faster, 1024-bit ones are hardly
# faster at all, and longer key operations are slower by up to 20%.
# It might be of interest to embedded system developers though, as
# it is smaller than 1KB, yet offers ~3x improvement over
# compiler-generated code.
#
# The module targets the N32 and N64 MIPS ABIs and is currently a bit
# IRIX-centric, i.e. it is likely to require adaptation for other OSes.
  19. # int bn_mul_mont(
  20. $rp="a0"; # BN_ULONG *rp,
  21. $ap="a1"; # const BN_ULONG *ap,
  22. $bp="a2"; # const BN_ULONG *bp,
  23. $np="a3"; # const BN_ULONG *np,
  24. $n0="a4"; # const BN_ULONG *n0,
  25. $num="a5"; # int num);
  26. $lo0="a6";
  27. $hi0="a7";
  28. $lo1="v0";
  29. $hi1="v1";
  30. $aj="t0";
  31. $bi="t1";
  32. $nj="t2";
  33. $tp="t3";
  34. $alo="s0";
  35. $ahi="s1";
  36. $nlo="s2";
  37. $nhi="s3";
  38. $tj="s4";
  39. $i="s5";
  40. $j="s6";
  41. $fp="t8";
  42. $m1="t9";
  43. $FRAME=8*(2+8);
  44. $code=<<___;
  45. #include <asm.h>
  46. #include <regdef.h>
  47. .text
  48. .set noat
  49. .set reorder
  50. .align 5
  51. .globl bn_mul_mont
  52. .ent bn_mul_mont
  53. bn_mul_mont:
  54. .set noreorder
  55. PTR_SUB sp,64
  56. move $fp,sp
  57. .frame $fp,64,ra
  58. slt AT,$num,4
  59. li v0,0
  60. beqzl AT,.Lproceed
  61. nop
  62. jr ra
  63. PTR_ADD sp,$fp,64
  64. .set reorder
  65. .align 5
  66. .Lproceed:
  67. ld $n0,0($n0)
  68. ld $bi,0($bp) # bp[0]
  69. ld $aj,0($ap) # ap[0]
  70. ld $nj,0($np) # np[0]
  71. PTR_SUB sp,16 # place for two extra words
  72. sll $num,3
  73. li AT,-4096
  74. PTR_SUB sp,$num
  75. and sp,AT
  76. sd s0,0($fp)
  77. sd s1,8($fp)
  78. sd s2,16($fp)
  79. sd s3,24($fp)
  80. sd s4,32($fp)
  81. sd s5,40($fp)
  82. sd s6,48($fp)
  83. sd s7,56($fp)
  84. dmultu $aj,$bi
  85. ld $alo,8($ap)
  86. ld $nlo,8($np)
  87. mflo $lo0
  88. mfhi $hi0
  89. dmultu $lo0,$n0
  90. mflo $m1
  91. dmultu $alo,$bi
  92. mflo $alo
  93. mfhi $ahi
  94. dmultu $nj,$m1
  95. mflo $lo1
  96. mfhi $hi1
  97. dmultu $nlo,$m1
  98. daddu $lo1,$lo0
  99. sltu AT,$lo1,$lo0
  100. daddu $hi1,AT
  101. mflo $nlo
  102. mfhi $nhi
  103. move $tp,sp
  104. li $j,16
  105. .align 4
  106. .L1st:
  107. .set noreorder
  108. PTR_ADD $aj,$ap,$j
  109. ld $aj,($aj)
  110. PTR_ADD $nj,$np,$j
  111. ld $nj,($nj)
  112. dmultu $aj,$bi
  113. daddu $lo0,$alo,$hi0
  114. daddu $lo1,$nlo,$hi1
  115. sltu AT,$lo0,$hi0
  116. sltu s7,$lo1,$hi1
  117. daddu $hi0,$ahi,AT
  118. daddu $hi1,$nhi,s7
  119. mflo $alo
  120. mfhi $ahi
  121. daddu $lo1,$lo0
  122. sltu AT,$lo1,$lo0
  123. dmultu $nj,$m1
  124. daddu $hi1,AT
  125. addu $j,8
  126. sd $lo1,($tp)
  127. sltu s7,$j,$num
  128. mflo $nlo
  129. mfhi $nhi
  130. bnez s7,.L1st
  131. PTR_ADD $tp,8
  132. .set reorder
  133. daddu $lo0,$alo,$hi0
  134. sltu AT,$lo0,$hi0
  135. daddu $hi0,$ahi,AT
  136. daddu $lo1,$nlo,$hi1
  137. sltu s7,$lo1,$hi1
  138. daddu $hi1,$nhi,s7
  139. daddu $lo1,$lo0
  140. sltu AT,$lo1,$lo0
  141. daddu $hi1,AT
  142. sd $lo1,($tp)
  143. daddu $hi1,$hi0
  144. sltu AT,$hi1,$hi0
  145. sd $hi1,8($tp)
  146. sd AT,16($tp)
  147. li $i,8
  148. .align 4
  149. .Louter:
  150. PTR_ADD $bi,$bp,$i
  151. ld $bi,($bi)
  152. ld $aj,($ap)
  153. ld $alo,8($ap)
  154. ld $tj,(sp)
  155. dmultu $aj,$bi
  156. ld $nj,($np)
  157. ld $nlo,8($np)
  158. mflo $lo0
  159. mfhi $hi0
  160. daddu $lo0,$tj
  161. dmultu $lo0,$n0
  162. sltu AT,$lo0,$tj
  163. daddu $hi0,AT
  164. mflo $m1
  165. dmultu $alo,$bi
  166. mflo $alo
  167. mfhi $ahi
  168. dmultu $nj,$m1
  169. mflo $lo1
  170. mfhi $hi1
  171. dmultu $nlo,$m1
  172. daddu $lo1,$lo0
  173. sltu AT,$lo1,$lo0
  174. daddu $hi1,AT
  175. mflo $nlo
  176. mfhi $nhi
  177. move $tp,sp
  178. li $j,16
  179. ld $tj,8($tp)
  180. .align 4
  181. .Linner:
  182. .set noreorder
  183. PTR_ADD $aj,$ap,$j
  184. ld $aj,($aj)
  185. PTR_ADD $nj,$np,$j
  186. ld $nj,($nj)
  187. dmultu $aj,$bi
  188. daddu $lo0,$alo,$hi0
  189. daddu $lo1,$nlo,$hi1
  190. sltu AT,$lo0,$hi0
  191. sltu s7,$lo1,$hi1
  192. daddu $hi0,$ahi,AT
  193. daddu $hi1,$nhi,s7
  194. mflo $alo
  195. mfhi $ahi
  196. daddu $lo0,$tj
  197. addu $j,8
  198. dmultu $nj,$m1
  199. sltu AT,$lo0,$tj
  200. daddu $lo1,$lo0
  201. daddu $hi0,AT
  202. sltu s7,$lo1,$lo0
  203. ld $tj,16($tp)
  204. daddu $hi1,s7
  205. sltu AT,$j,$num
  206. mflo $nlo
  207. mfhi $nhi
  208. sd $lo1,($tp)
  209. bnez AT,.Linner
  210. PTR_ADD $tp,8
  211. .set reorder
  212. daddu $lo0,$alo,$hi0
  213. sltu AT,$lo0,$hi0
  214. daddu $hi0,$ahi,AT
  215. daddu $lo0,$tj
  216. sltu s7,$lo0,$tj
  217. daddu $hi0,s7
  218. ld $tj,16($tp)
  219. daddu $lo1,$nlo,$hi1
  220. sltu AT,$lo1,$hi1
  221. daddu $hi1,$nhi,AT
  222. daddu $lo1,$lo0
  223. sltu s7,$lo1,$lo0
  224. daddu $hi1,s7
  225. sd $lo1,($tp)
  226. daddu $lo1,$hi1,$hi0
  227. sltu $hi1,$lo1,$hi0
  228. daddu $lo1,$tj
  229. sltu AT,$lo1,$tj
  230. daddu $hi1,AT
  231. sd $lo1,8($tp)
  232. sd $hi1,16($tp)
  233. addu $i,8
  234. sltu s7,$i,$num
  235. bnez s7,.Louter
  236. .set noreorder
  237. PTR_ADD $tj,sp,$num # &tp[num]
  238. move $tp,sp
  239. move $ap,sp
  240. li $hi0,0 # clear borrow bit
  241. .align 4
  242. .Lsub: ld $lo0,($tp)
  243. ld $lo1,($np)
  244. PTR_ADD $tp,8
  245. PTR_ADD $np,8
  246. dsubu $lo1,$lo0,$lo1 # tp[i]-np[i]
  247. sgtu AT,$lo1,$lo0
  248. dsubu $lo0,$lo1,$hi0
  249. sgtu $hi0,$lo0,$lo1
  250. sd $lo0,($rp)
  251. or $hi0,AT
  252. sltu AT,$tp,$tj
  253. bnez AT,.Lsub
  254. PTR_ADD $rp,8
  255. dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit
  256. move $tp,sp
  257. PTR_SUB $rp,$num # restore rp
  258. not $hi1,$hi0
  259. and $ap,$hi0,sp
  260. and $bp,$hi1,$rp
  261. or $ap,$ap,$bp # ap=borrow?tp:rp
  262. .align 4
  263. .Lcopy: ld $aj,($ap)
  264. PTR_ADD $ap,8
  265. PTR_ADD $tp,8
  266. sd zero,-8($tp)
  267. sltu AT,$tp,$tj
  268. sd $aj,($rp)
  269. bnez AT,.Lcopy
  270. PTR_ADD $rp,8
  271. ld s0,0($fp)
  272. ld s1,8($fp)
  273. ld s2,16($fp)
  274. ld s3,24($fp)
  275. ld s4,32($fp)
  276. ld s5,40($fp)
  277. ld s6,48($fp)
  278. ld s7,56($fp)
  279. li v0,1
  280. jr ra
  281. PTR_ADD sp,$fp,64
  282. .set reorder
  283. END(bn_mul_mont)
  284. .rdata
  285. .asciiz "Montgomery Multiplication for MIPS III/IV, CRYPTOGAMS by <appro\@openssl.org>"
  286. ___
  287. print $code;
  288. close STDOUT;