alpha-mont.pl 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  5. # project. The module is, however, dual licensed under OpenSSL and
  6. # CRYPTOGAMS licenses depending on where you obtain it. For further
  7. # details see http://www.openssl.org/~appro/cryptogams/.
  8. # ====================================================================
  9. #
  10. # On 21264 RSA sign performance improves by 70/35/20/15 percent for
  11. # 512/1024/2048/4096 bit key lengths. This is relative to code with
  12. # in-line assembler built by the vendor compiler with '-tune host'. Other
  13. # benchmarks improve by 15-20%. To anchor it to something else, the
  14. # code provides approximately the same performance per GHz as AMD64.
  15. # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
  16. # difference.
  # Register allocation for the generated code.
  #
  # Arguments, in C prototype order:
  #   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  #                   const BN_ULONG *np, const BN_ULONG *n0, int num);
  ($rp, $ap, $bp, $np, $n0, $num) = map { "a$_" } 0 .. 5;

  # Scratch registers t0..t12, in order.
  ($lo0, $hi0, $lo1, $hi1, $aj, $bi, $nj, $tp, $alo, $ahi, $nlo, $nhi, $tj) =
      map { "t$_" } 0 .. 12;

  # s3..s5: the generated prologue saves these and the epilogue restores them.
  ($i, $j, $m1) = map { "s$_" } 3 .. 5;
  # Assembly text for bn_mul_mont (Montgomery multiplication for Alpha).
  #
  # Returns 0 without touching rp when num < 4; otherwise writes num 64-bit
  # words to rp and returns 1.
  #
  # Stack use: a 48-byte frame holding ra, s3-s5 and fp, and below it a
  # scratch vector tp of num*8+16 bytes. sp is rounded down to a 4096-byte
  # boundary after tp is carved out and is restored from fp on exit. tp is
  # zeroed before returning.
  #
  # The last phase stores tp-np into rp and then decides whether rp keeps
  # that difference or gets tp back. The decision is applied word by word
  # with cmoveq, so the same addresses are read and written either way.
  # Choosing the source pointer from the borrow instead would make the
  # memory access pattern depend on secret data.
  #
  # Comments added inside the heredoc avoid the dollar and at signs (Perl
  # would interpolate them) and never start a line with a hash (the output
  # is run through the C preprocessor).
  $code=<<___;
#ifdef __linux__
#include <asm/regdef.h>
#else
#include <asm.h>
#include <regdef.h>
#endif

.text

.set	noat
.set	noreorder

.globl	bn_mul_mont
.align	5
.ent	bn_mul_mont
bn_mul_mont:
	lda	sp,-48(sp)		# 48-byte frame for saved registers
	stq	ra,0(sp)
	stq	s3,8(sp)
	stq	s4,16(sp)
	stq	s5,24(sp)
	stq	fp,32(sp)
	mov	sp,fp			# fp keeps the frame address while sp moves
	.mask	0x0400f000,-48
	.frame	fp,48,ra
	.prologue 0

	.align	4
	.set	reorder
	sextl	$num,$num		# num arrives as a 32-bit int
	mov	0,v0			# result is 0 unless the work is done
	cmplt	$num,4,AT
	bne	AT,.Lexit		# num < 4 is not handled here

	ldq	$hi0,0($ap)	# ap[0]
	s8addq	$num,16,AT		# num*8+16 bytes for tp
	ldq	$aj,8($ap)		# ap[1]
	subq	sp,AT,sp
	ldq	$bi,0($bp)	# bp[0]
	lda	AT,-4096(zero)	# mov	-4096,AT
	ldq	$n0,0($n0)		# replace the n0 pointer by the word it points at
	and	sp,AT,sp		# round sp down to a 4096-byte boundary

	/* first pass: tp = (ap*bp[0] + np*m1) shifted down by one word */
	mulq	$hi0,$bi,$lo0		# low half of ap[0]*bp[0]
	ldq	$hi1,0($np)	# np[0]
	umulh	$hi0,$bi,$hi0		# high half of ap[0]*bp[0]
	ldq	$nj,8($np)		# np[1]

	mulq	$lo0,$n0,$m1		# m1 = low word times n0

	mulq	$hi1,$m1,$lo1		# low half of np[0]*m1
	umulh	$hi1,$m1,$hi1		# high half of np[0]*m1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT		# carry out of the low word
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo		# ap[1]*bp[0]
	mov	2,$j
	umulh	$aj,$bi,$ahi
	mov	sp,$tp

	mulq	$nj,$m1,$nlo		# np[1]*m1
	s8addq	$j,$ap,$aj		# address of ap[j]
	umulh	$nj,$m1,$nhi
	s8addq	$j,$np,$nj		# address of np[j]
.align	4
.L1st:
	.set	noreorder
	ldq	$aj,0($aj)		# ap[j]
	addl	$j,1,$j
	ldq	$nj,0($nj)		# np[j]
	lda	$tp,8($tp)

	addq	$alo,$hi0,$lo0
	mulq	$aj,$bi,$alo
	cmpult	$lo0,$hi0,AT
	addq	$nlo,$hi1,$lo1

	mulq	$nj,$m1,$nlo
	addq	$ahi,AT,$hi0
	cmpult	$lo1,$hi1,v0
	cmplt	$j,$num,$tj		# loop while j < num

	umulh	$aj,$bi,$ahi
	addq	$nhi,v0,$hi1
	addq	$lo1,$lo0,$lo1
	s8addq	$j,$ap,$aj

	umulh	$nj,$m1,$nhi
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1
	s8addq	$j,$np,$nj

	stq	$lo1,-8($tp)		# result word goes one slot behind tp
	nop
	unop
	bne	$tj,.L1st
	.set	reorder

	addq	$alo,$hi0,$lo0		# fold in the last pair of products
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,v0
	addq	$hi1,v0,$hi1

	stq	$lo1,0($tp)
	addq	$hi1,$hi0,$hi1
	cmpult	$hi1,$hi0,AT
	stq	$hi1,8($tp)
	stq	AT,16($tp)		# carry out of the top word

	mov	1,$i
	/* remaining passes: same as above with tp accumulated in */
.align	4
.Louter:
	s8addq	$i,$bp,$bi		# address of bp[i]
	ldq	$hi0,0($ap)		# ap[0]
	ldq	$aj,8($ap)		# ap[1]
	ldq	$bi,0($bi)		# bp[i]
	ldq	$hi1,0($np)		# np[0]
	ldq	$nj,8($np)		# np[1]
	ldq	$tj,0(sp)		# tp[0]

	mulq	$hi0,$bi,$lo0
	umulh	$hi0,$bi,$hi0

	addq	$lo0,$tj,$lo0		# add tp[0] to the low product
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	mulq	$lo0,$n0,$m1		# m1 for this pass

	mulq	$hi1,$m1,$lo1
	umulh	$hi1,$m1,$hi1

	addq	$lo1,$lo0,$lo1
	cmpult	$lo1,$lo0,AT
	mov	2,$j
	addq	$hi1,AT,$hi1

	mulq	$aj,$bi,$alo
	mov	sp,$tp
	umulh	$aj,$bi,$ahi

	mulq	$nj,$m1,$nlo
	s8addq	$j,$ap,$aj
	umulh	$nj,$m1,$nhi
.align	4
.Linner:
	.set	noreorder
	ldq	$tj,8($tp)	#L0
	nop			#U1
	ldq	$aj,0($aj)	#L1
	s8addq	$j,$np,$nj	#U0

	ldq	$nj,0($nj)	#L0
	nop			#U1
	addq	$alo,$hi0,$lo0	#L1
	lda	$tp,8($tp)

	mulq	$aj,$bi,$alo	#U1
	cmpult	$lo0,$hi0,AT	#L0
	addq	$nlo,$hi1,$lo1	#L1
	addl	$j,1,$j

	mulq	$nj,$m1,$nlo	#U1
	addq	$ahi,AT,$hi0	#L0
	addq	$lo0,$tj,$lo0	#L1
	cmpult	$lo1,$hi1,v0	#U0

	umulh	$aj,$bi,$ahi	#U1
	cmpult	$lo0,$tj,AT	#L0
	addq	$lo1,$lo0,$lo1	#L1
	addq	$nhi,v0,$hi1	#U0

	umulh	$nj,$m1,$nhi	#U1
	s8addq	$j,$ap,$aj	#L0
	cmpult	$lo1,$lo0,v0	#L1
	cmplt	$j,$num,$tj	#U0	# borrow $tj

	addq	$hi0,AT,$hi0	#L0
	addq	$hi1,v0,$hi1	#U1
	stq	$lo1,-8($tp)	#L1
	bne	$tj,.Linner	#U0
	.set	reorder

	ldq	$tj,8($tp)
	addq	$alo,$hi0,$lo0
	addq	$nlo,$hi1,$lo1
	cmpult	$lo0,$hi0,AT
	cmpult	$lo1,$hi1,v0
	addq	$ahi,AT,$hi0
	addq	$nhi,v0,$hi1

	addq	$lo0,$tj,$lo0
	cmpult	$lo0,$tj,AT
	addq	$hi0,AT,$hi0

	ldq	$tj,16($tp)		# carry word left by the previous pass
	addq	$lo1,$lo0,$j		# the j counter is free here, used as scratch
	cmpult	$j,$lo0,v0
	addq	$hi1,v0,$hi1

	addq	$hi1,$hi0,$lo1
	stq	$j,0($tp)
	cmpult	$lo1,$hi0,$hi1
	addq	$lo1,$tj,$lo1
	cmpult	$lo1,$tj,AT
	addl	$i,1,$i
	addq	$hi1,AT,$hi1
	stq	$lo1,8($tp)
	cmplt	$i,$num,$tj	# borrow $tj
	stq	$hi1,16($tp)		# this register also feeds the overflow test below
	bne	$tj,.Louter

	/* final subtraction: rp = tp - np, borrow tracked in a register */
	s8addq	$num,sp,$tj	# &tp[num]
	mov	$rp,$bp		# put rp aside
	mov	sp,$tp
	mov	0,$hi0		# clear borrow bit

.align	4
.Lsub:	ldq	$lo0,0($tp)
	ldq	$lo1,0($np)
	lda	$tp,8($tp)
	lda	$np,8($np)
	subq	$lo0,$lo1,$lo1	# tp[i]-np[i]
	cmpult	$lo0,$lo1,AT		# borrow from the subtraction itself
	subq	$lo1,$hi0,$lo0		# minus the incoming borrow
	cmpult	$lo1,$lo0,$hi0		# borrow from that second step
	or	$hi0,AT,$hi0		# outgoing borrow
	stq	$lo0,0($rp)
	cmpult	$tp,$tj,v0
	lda	$rp,8($rp)
	bne	v0,.Lsub

	subq	$hi1,$hi0,$hi0	# handle upmost overflow bit
	mov	sp,$tp
	mov	$bp,$rp		# restore rp

	/* both tp and rp are read for every word; only the value is selected */
.align	4
.Lcopy:	ldq	$aj,0($tp)	# conditional copy
	ldq	$nj,0($rp)
	lda	$tp,8($tp)
	lda	$rp,8($rp)
	cmoveq	$hi0,$nj,$aj	# selector is zero: keep the word already in rp
	stq	zero,-8($tp)	# zap tp
	cmpult	$tp,$tj,AT
	stq	$aj,-8($rp)
	bne	AT,.Lcopy
	mov	1,v0

.Lexit:
	.set	noreorder
	mov	fp,sp
	/*ldq	ra,0(sp)*/
	ldq	s3,8(sp)
	ldq	s4,16(sp)
	ldq	s5,24(sp)
	ldq	fp,32(sp)
	lda	sp,48(sp)
	ret	(ra)
.end	bn_mul_mont
.ascii	"Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
  # Emit the generated assembly. Output is buffered, so a write failure
  # (full disk, closed pipe) may only surface when the handle is closed;
  # die on it instead of leaving a truncated file behind.
  print $code;
  close STDOUT or die "error closing STDOUT: $!";