alpha-mont.pl 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. #! /usr/bin/env perl
  2. # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the OpenSSL license (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # On 21264 RSA sign performance improves by 70/35/20/15 percent for
  17. # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
  18. # instructed to '-tune host' code with in-line assembler. Other
  19. # benchmarks improve by 15-20%. To anchor it to something else, the
  20. # code provides approximately the same performance per GHz as AMD64.
  21. # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
  22. # difference.
  23. $output=pop; # the last command-line argument names the output file
  24. open STDOUT,">$output" or die "can't open $output: $!"; # redirect STDOUT there; abort instead of silently discarding the generated code
  25. # int bn_mul_mont( -- C prototype; each argument is bound to an Alpha register name below
  26. $rp="a0"; # BN_ULONG *rp,
  27. $ap="a1"; # const BN_ULONG *ap,
  28. $bp="a2"; # const BN_ULONG *bp,
  29. $np="a3"; # const BN_ULONG *np,
  30. $n0="a4"; # const BN_ULONG *n0,
  31. $num="a5"; # int num);
  32. $lo0="t0"; # low word of the ap[]*bi accumulation
  33. $hi0="t1"; # carry (high) word of the ap[]*bi accumulation; loaded with ap[0] at the start of each pass
  34. $lo1="t2"; # low word of the np[]*m1 accumulation (the word written to the temporary vector)
  35. $hi1="t3"; # carry (high) word of the np[]*m1 accumulation; loaded with np[0] at the start of each pass
  36. $aj="t4"; # address of ap[j], then the ap[j] value loaded through it
  37. $bi="t5"; # address of bp[i], then the bp[i] value loaded through it
  38. $nj="t6"; # address of np[j], then the np[j] value loaded through it
  39. $tp="t7"; # pointer walking the temporary vector on the stack
  40. $alo="t8"; # low 64 bits of aj*bi (mulq)
  41. $ahi="t9"; # high 64 bits of aj*bi (umulh)
  42. $nlo="t10"; # low 64 bits of nj*m1 (mulq)
  43. $nhi="t11"; # high 64 bits of nj*m1 (umulh)
  44. $tj="t12"; # word loaded from the temporary vector; also borrowed for loop conditions and as &tp[num]
  45. $i="s3"; # outer loop index (selects the bp word)
  46. $j="s4"; # inner loop index (selects the ap/np word)
  47. $m1="s5"; # lo0*n0, the per-pass multiplier applied to np[]
  # Code template for bn_mul_mont; the Perl variables assigned before this
  # heredoc are interpolated as Alpha register names. Emitted phases, in order:
  #   prologue - 48-byte frame saving ra, s3-s5 and fp; fp records the frame base.
  #   guard    - sign-extend num, set v0=0 and branch to .Lexit when num < 4.
  #   setup    - lower sp by num*8+16 bytes, then mask it with -4096; the
  #              resulting area is the temporary vector addressed through tp.
  #   .L1st    - first pass, using bp[0], fills the temporary vector.
  #   .Louter/.Linner - passes for i=1..num-1, each adding the previous
  #              temporary vector back into the new sums.
  #   .Lsub    - rp[] = tp[] - np[] with borrow propagation.
  #   .Lcopy   - mask-select tp or rp as the source, copy it to rp and zero
  #              the temporary vector; then v0=1.
  #   .Lexit   - restore sp from fp, reload s3-s5 and fp, return; ra is not
  #              reloaded (that load is commented out in the template).
  48. $code=<<___;
  49. #ifdef __linux__
  50. #include <asm/regdef.h>
  51. #else
  52. #include <asm.h>
  53. #include <regdef.h>
  54. #endif
  55. .text
  56. .set noat
  57. .set noreorder
  58. .globl bn_mul_mont
  59. .align 5
  60. .ent bn_mul_mont
  61. bn_mul_mont:
  62. lda sp,-48(sp)
  63. stq ra,0(sp)
  64. stq s3,8(sp)
  65. stq s4,16(sp)
  66. stq s5,24(sp)
  67. stq fp,32(sp)
  68. mov sp,fp
  69. .mask 0x0400f000,-48
  70. .frame fp,48,ra
  71. .prologue 0
  72. .align 4
  73. .set reorder
  74. sextl $num,$num
  75. mov 0,v0
  76. cmplt $num,4,AT
  77. bne AT,.Lexit
  78. ldq $hi0,0($ap) # ap[0]
  79. s8addq $num,16,AT
  80. ldq $aj,8($ap)
  81. subq sp,AT,sp
  82. ldq $bi,0($bp) # bp[0]
  83. lda AT,-4096(zero) # mov -4096,AT
  84. ldq $n0,0($n0)
  85. and sp,AT,sp
  86. mulq $hi0,$bi,$lo0
  87. ldq $hi1,0($np) # np[0]
  88. umulh $hi0,$bi,$hi0
  89. ldq $nj,8($np)
  90. mulq $lo0,$n0,$m1
  91. mulq $hi1,$m1,$lo1
  92. umulh $hi1,$m1,$hi1
  93. addq $lo1,$lo0,$lo1
  94. cmpult $lo1,$lo0,AT
  95. addq $hi1,AT,$hi1
  96. mulq $aj,$bi,$alo
  97. mov 2,$j
  98. umulh $aj,$bi,$ahi
  99. mov sp,$tp
  100. mulq $nj,$m1,$nlo
  101. s8addq $j,$ap,$aj
  102. umulh $nj,$m1,$nhi
  103. s8addq $j,$np,$nj
  104. .align 4
  105. .L1st:
  106. .set noreorder
  107. ldq $aj,0($aj)
  108. addl $j,1,$j
  109. ldq $nj,0($nj)
  110. lda $tp,8($tp)
  111. addq $alo,$hi0,$lo0
  112. mulq $aj,$bi,$alo
  113. cmpult $lo0,$hi0,AT
  114. addq $nlo,$hi1,$lo1
  115. mulq $nj,$m1,$nlo
  116. addq $ahi,AT,$hi0
  117. cmpult $lo1,$hi1,v0
  118. cmplt $j,$num,$tj
  119. umulh $aj,$bi,$ahi
  120. addq $nhi,v0,$hi1
  121. addq $lo1,$lo0,$lo1
  122. s8addq $j,$ap,$aj
  123. umulh $nj,$m1,$nhi
  124. cmpult $lo1,$lo0,v0
  125. addq $hi1,v0,$hi1
  126. s8addq $j,$np,$nj
  127. stq $lo1,-8($tp)
  128. nop
  129. unop
  130. bne $tj,.L1st
  131. .set reorder
  132. addq $alo,$hi0,$lo0
  133. addq $nlo,$hi1,$lo1
  134. cmpult $lo0,$hi0,AT
  135. cmpult $lo1,$hi1,v0
  136. addq $ahi,AT,$hi0
  137. addq $nhi,v0,$hi1
  138. addq $lo1,$lo0,$lo1
  139. cmpult $lo1,$lo0,v0
  140. addq $hi1,v0,$hi1
  141. stq $lo1,0($tp)
  142. addq $hi1,$hi0,$hi1
  143. cmpult $hi1,$hi0,AT
  144. stq $hi1,8($tp)
  145. stq AT,16($tp)
  146. mov 1,$i
  147. .align 4
  148. .Louter:
  149. s8addq $i,$bp,$bi
  150. ldq $hi0,0($ap)
  151. ldq $aj,8($ap)
  152. ldq $bi,0($bi)
  153. ldq $hi1,0($np)
  154. ldq $nj,8($np)
  155. ldq $tj,0(sp)
  156. mulq $hi0,$bi,$lo0
  157. umulh $hi0,$bi,$hi0
  158. addq $lo0,$tj,$lo0
  159. cmpult $lo0,$tj,AT
  160. addq $hi0,AT,$hi0
  161. mulq $lo0,$n0,$m1
  162. mulq $hi1,$m1,$lo1
  163. umulh $hi1,$m1,$hi1
  164. addq $lo1,$lo0,$lo1
  165. cmpult $lo1,$lo0,AT
  166. mov 2,$j
  167. addq $hi1,AT,$hi1
  168. mulq $aj,$bi,$alo
  169. mov sp,$tp
  170. umulh $aj,$bi,$ahi
  171. mulq $nj,$m1,$nlo
  172. s8addq $j,$ap,$aj
  173. umulh $nj,$m1,$nhi
  174. .align 4
  175. .Linner:
  176. .set noreorder
  177. ldq $tj,8($tp) #L0
  178. nop #U1
  179. ldq $aj,0($aj) #L1
  180. s8addq $j,$np,$nj #U0
  181. ldq $nj,0($nj) #L0
  182. nop #U1
  183. addq $alo,$hi0,$lo0 #L1
  184. lda $tp,8($tp)
  185. mulq $aj,$bi,$alo #U1
  186. cmpult $lo0,$hi0,AT #L0
  187. addq $nlo,$hi1,$lo1 #L1
  188. addl $j,1,$j
  189. mulq $nj,$m1,$nlo #U1
  190. addq $ahi,AT,$hi0 #L0
  191. addq $lo0,$tj,$lo0 #L1
  192. cmpult $lo1,$hi1,v0 #U0
  193. umulh $aj,$bi,$ahi #U1
  194. cmpult $lo0,$tj,AT #L0
  195. addq $lo1,$lo0,$lo1 #L1
  196. addq $nhi,v0,$hi1 #U0
  197. umulh $nj,$m1,$nhi #U1
  198. s8addq $j,$ap,$aj #L0
  199. cmpult $lo1,$lo0,v0 #L1
  200. cmplt $j,$num,$tj #U0 # borrow $tj
  201. addq $hi0,AT,$hi0 #L0
  202. addq $hi1,v0,$hi1 #U1
  203. stq $lo1,-8($tp) #L1
  204. bne $tj,.Linner #U0
  205. .set reorder
  206. ldq $tj,8($tp)
  207. addq $alo,$hi0,$lo0
  208. addq $nlo,$hi1,$lo1
  209. cmpult $lo0,$hi0,AT
  210. cmpult $lo1,$hi1,v0
  211. addq $ahi,AT,$hi0
  212. addq $nhi,v0,$hi1
  213. addq $lo0,$tj,$lo0
  214. cmpult $lo0,$tj,AT
  215. addq $hi0,AT,$hi0
  216. ldq $tj,16($tp)
  217. addq $lo1,$lo0,$j
  218. cmpult $j,$lo0,v0
  219. addq $hi1,v0,$hi1
  220. addq $hi1,$hi0,$lo1
  221. stq $j,0($tp)
  222. cmpult $lo1,$hi0,$hi1
  223. addq $lo1,$tj,$lo1
  224. cmpult $lo1,$tj,AT
  225. addl $i,1,$i
  226. addq $hi1,AT,$hi1
  227. stq $lo1,8($tp)
  228. cmplt $i,$num,$tj # borrow $tj
  229. stq $hi1,16($tp)
  230. bne $tj,.Louter
  231. s8addq $num,sp,$tj # &tp[num]
  232. mov $rp,$bp # put rp aside
  233. mov sp,$tp
  234. mov sp,$ap
  235. mov 0,$hi0 # clear borrow bit
  236. .align 4
  237. .Lsub: ldq $lo0,0($tp)
  238. ldq $lo1,0($np)
  239. lda $tp,8($tp)
  240. lda $np,8($np)
  241. subq $lo0,$lo1,$lo1 # tp[i]-np[i]
  242. cmpult $lo0,$lo1,AT
  243. subq $lo1,$hi0,$lo0
  244. cmpult $lo1,$lo0,$hi0
  245. or $hi0,AT,$hi0
  246. stq $lo0,0($rp)
  247. cmpult $tp,$tj,v0
  248. lda $rp,8($rp)
  249. bne v0,.Lsub
  250. subq $hi1,$hi0,$hi0 # handle upmost overflow bit
  251. mov sp,$tp
  252. mov $bp,$rp # restore rp
  253. and sp,$hi0,$ap
  254. bic $bp,$hi0,$bp
  255. bis $bp,$ap,$ap # ap=borrow?tp:rp
  256. .align 4
  257. .Lcopy: ldq $aj,0($ap) # copy or in-place refresh
  258. lda $tp,8($tp)
  259. lda $rp,8($rp)
  260. lda $ap,8($ap)
  261. stq zero,-8($tp) # zap tp
  262. cmpult $tp,$tj,AT
  263. stq $aj,-8($rp)
  264. bne AT,.Lcopy
  265. mov 1,v0
  266. .Lexit:
  267. .set noreorder
  268. mov fp,sp
  269. /*ldq ra,0(sp)*/
  270. ldq s3,8(sp)
  271. ldq s4,16(sp)
  272. ldq s5,24(sp)
  273. ldq fp,32(sp)
  274. lda sp,48(sp)
  275. ret (ra)
  276. .end bn_mul_mont
  277. .ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
  278. .align 2
  279. ___
  280. print $code; # write the generated assembly to STDOUT (redirected to the output file)
  281. close STDOUT or die "error closing STDOUT: $!"; # close flushes buffered output; fail loudly rather than leave a truncated file