alpha-mont.pl 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327
  1. #! /usr/bin/env perl
  2. # Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  8. #
  9. # ====================================================================
  10. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11. # project. The module is, however, dual licensed under OpenSSL and
  12. # CRYPTOGAMS licenses depending on where you obtain it. For further
  13. # details see http://www.openssl.org/~appro/cryptogams/.
  14. # ====================================================================
  15. #
  16. # On 21264 RSA sign performance improves by 70/35/20/15 percent for
  17. # 512/1024/2048/4096 bit key lengths. This is against vendor compiler
  18. # instructed to '-tune host' code with in-line assembler. Other
  19. # benchmarks improve by 15-20%. To anchor it to something else, the
  20. # code provides approximately the same performance per GHz as AMD64.
  21. # I.e. if you compare 1GHz 21264 and 2GHz Opteron, you'll observe ~2x
  22. # difference.
  23. ($output = pop @ARGV) && open(STDOUT, ">$output"); # a trailing command-line argument, if present, becomes the output file
  24. # Register assignment for: int bn_mul_mont(
  25. $rp = 'a0'; # BN_ULONG *rp,
  26. $ap = 'a1'; # const BN_ULONG *ap,
  27. $bp = 'a2'; # const BN_ULONG *bp,
  28. $np = 'a3'; # const BN_ULONG *np,
  29. $n0 = 'a4'; # const BN_ULONG *n0,
  30. $num = 'a5'; # int num);
  31. $lo0 = 't0'; # low word of the ap[]*bp[i] column sum
  32. $hi0 = 't1'; # high word / running carry of the ap[]*bp[i] chain
  33. $lo1 = 't2'; # low word of the np[]*m1 column sum
  34. $hi1 = 't3'; # high word / running carry of the np[]*m1 chain
  35. $aj = 't4'; # address of ap[j], then the loaded ap[j]
  36. $bi = 't5'; # address of bp[i], then the loaded bp[i]
  37. $nj = 't6'; # address of np[j], then the loaded np[j]
  38. $tp = 't7'; # cursor into the on-stack temporary vector
  39. $alo = 't8'; # low half of ap[j]*bi
  40. $ahi = 't9'; # high half of ap[j]*bi
  41. $nlo = 't10'; # low half of np[j]*m1
  42. $nhi = 't11'; # high half of np[j]*m1
  43. $tj = 't12'; # loaded tp word; also reused for loop conditions
  44. $i = 's3'; # outer loop index
  45. $j = 's4'; # inner loop index
  46. $m1 = 's5'; # per-pass multiplier applied to np[]
  47. $code=<<___; # Alpha assembly text for bn_mul_mont; register names are interpolated from the Perl variables assigned above
  48. #ifdef __linux__
  49. #include <asm/regdef.h>
  50. #else
  51. #include <asm.h>
  52. #include <regdef.h>
  53. #endif
  54. .text
  55. .set noat # AT is used explicitly as a scratch register below
  56. .set noreorder
  57. .globl bn_mul_mont
  58. .align 5
  59. .ent bn_mul_mont
  60. bn_mul_mont: # Montgomery multiplication; v0 = 0 if num < 4, otherwise v0 = 1 after num result words are stored at rp
  61. lda sp,-48(sp) # allocate a 48-byte save area
  62. stq ra,0(sp) # save ra, s3-s5 and fp; s3-s5 and fp are reloaded at .Lexit
  63. stq s3,8(sp)
  64. stq s4,16(sp)
  65. stq s5,24(sp)
  66. stq fp,32(sp)
  67. mov sp,fp # fp = base of the save area; .Lexit restores sp from it
  68. .mask 0x0400f000,-48
  69. .frame fp,48,ra
  70. .prologue 0
  71. .align 4
  72. .set reorder
  73. sextl $num,$num # sign-extend the 32-bit int num
  74. mov 0,v0 # return value 0 for the early exit
  75. cmplt $num,4,AT
  76. bne AT,.Lexit # num < 4: return 0 without storing anything at rp
  77. ldq $hi0,0($ap) # ap[0]
  78. s8addq $num,16,AT # AT = 8*num + 16 bytes
  79. ldq $aj,8($ap) # ap[1]
  80. subq sp,AT,sp # reserve that many bytes for the tp vector below the save area
  81. ldq $bi,0($bp) # bp[0]
  82. lda AT,-4096(zero) # mov -4096,AT
  83. ldq $n0,0($n0) # replace the n0 pointer with the word it points to
  84. and sp,AT,sp # round sp down to a multiple of 4096
  85. mulq $hi0,$bi,$lo0 # lo0 = low 64 bits of ap[0]*bp[0]
  86. ldq $hi1,0($np) # np[0]
  87. umulh $hi0,$bi,$hi0 # hi0 = high 64 bits of ap[0]*bp[0]
  88. ldq $nj,8($np) # np[1]
  89. mulq $lo0,$n0,$m1 # m1 = low 64 bits of lo0*n0
  90. mulq $hi1,$m1,$lo1 # lo1 = low 64 bits of np[0]*m1
  91. umulh $hi1,$m1,$hi1 # hi1 = high 64 bits of np[0]*m1
  92. addq $lo1,$lo0,$lo1 # low-word sum; it is never stored, only its carry is used
  93. cmpult $lo1,$lo0,AT # AT = carry out of that sum
  94. addq $hi1,AT,$hi1
  95. mulq $aj,$bi,$alo # alo = low half of ap[1]*bp[0]
  96. mov 2,$j # j = 2: index of the next ap/np word to fetch
  97. umulh $aj,$bi,$ahi # ahi = high half of ap[1]*bp[0]
  98. mov sp,$tp # tp = start of the temporary vector
  99. mulq $nj,$m1,$nlo # nlo = low half of np[1]*m1
  100. s8addq $j,$ap,$aj # aj = address of ap[j]
  101. umulh $nj,$m1,$nhi # nhi = high half of np[1]*m1
  102. s8addq $j,$np,$nj # nj = address of np[j]
  103. .align 4
  104. .L1st: # first pass (bp[0]): writes tp without reading any earlier tp contents
  105. .set noreorder
  106. ldq $aj,0($aj) # aj = ap[j]
  107. addl $j,1,$j
  108. ldq $nj,0($nj) # nj = np[j]
  109. lda $tp,8($tp) # advance tp by one word
  110. addq $alo,$hi0,$lo0 # lo0 = alo + hi0
  111. mulq $aj,$bi,$alo # start the next ap[j]*bi product
  112. cmpult $lo0,$hi0,AT # AT = carry out of alo + hi0
  113. addq $nlo,$hi1,$lo1 # lo1 = nlo + hi1
  114. mulq $nj,$m1,$nlo # start the next np[j]*m1 product
  115. addq $ahi,AT,$hi0 # hi0 = ahi + carry
  116. cmpult $lo1,$hi1,v0 # v0 = carry out of nlo + hi1
  117. cmplt $j,$num,$tj # tj = (j < num), the loop condition
  118. umulh $aj,$bi,$ahi
  119. addq $nhi,v0,$hi1 # hi1 = nhi + carry
  120. addq $lo1,$lo0,$lo1 # combine the two column sums
  121. s8addq $j,$ap,$aj # aj = address of the next ap word
  122. umulh $nj,$m1,$nhi
  123. cmpult $lo1,$lo0,v0 # v0 = carry out of the combined sum
  124. addq $hi1,v0,$hi1
  125. s8addq $j,$np,$nj # nj = address of the next np word
  126. stq $lo1,-8($tp) # store the finished word one slot behind tp
  127. nop
  128. unop
  129. bne $tj,.L1st
  130. .set reorder
  131. addq $alo,$hi0,$lo0 # last column: same sums as in the loop, without further loads
  132. addq $nlo,$hi1,$lo1
  133. cmpult $lo0,$hi0,AT
  134. cmpult $lo1,$hi1,v0
  135. addq $ahi,AT,$hi0
  136. addq $nhi,v0,$hi1
  137. addq $lo1,$lo0,$lo1
  138. cmpult $lo1,$lo0,v0
  139. addq $hi1,v0,$hi1
  140. stq $lo1,0($tp)
  141. addq $hi1,$hi0,$hi1 # add the two running high words
  142. cmpult $hi1,$hi0,AT # AT = carry out of that addition
  143. stq $hi1,8($tp)
  144. stq AT,16($tp) # topmost word holds the final carry
  145. mov 1,$i # i = 1: the outer loop continues with bp[1]
  146. .align 4
  147. .Louter: # one pass per remaining bp word, accumulating into tp
  148. s8addq $i,$bp,$bi # bi = address of bp[i]
  149. ldq $hi0,0($ap) # ap[0]
  150. ldq $aj,8($ap) # ap[1]
  151. ldq $bi,0($bi) # bi = bp[i]
  152. ldq $hi1,0($np) # np[0]
  153. ldq $nj,8($np) # np[1]
  154. ldq $tj,0(sp) # tj = tp[0]
  155. mulq $hi0,$bi,$lo0
  156. umulh $hi0,$bi,$hi0
  157. addq $lo0,$tj,$lo0 # lo0 = low half of ap[0]*bp[i] plus tp[0]
  158. cmpult $lo0,$tj,AT
  159. addq $hi0,AT,$hi0 # propagate the carry into hi0
  160. mulq $lo0,$n0,$m1 # m1 = low 64 bits of lo0*n0
  161. mulq $hi1,$m1,$lo1
  162. umulh $hi1,$m1,$hi1
  163. addq $lo1,$lo0,$lo1 # low-word sum; only its carry is used
  164. cmpult $lo1,$lo0,AT
  165. mov 2,$j # restart the inner index at 2
  166. addq $hi1,AT,$hi1
  167. mulq $aj,$bi,$alo
  168. mov sp,$tp # rewind tp to the start of the temporary vector
  169. umulh $aj,$bi,$ahi
  170. mulq $nj,$m1,$nlo
  171. s8addq $j,$ap,$aj # aj = address of ap[j]
  172. umulh $nj,$m1,$nhi
  173. .align 4
  174. .Linner: # same column step as .L1st, with the existing tp word (tj) added in
  175. .set noreorder # NOTE(review): the L0/L1/U0/U1 tags below presumably name the intended issue pipe per instruction - confirm
  176. ldq $tj,8($tp) #L0 tj = tp word to add into this column
  177. nop #U1
  178. ldq $aj,0($aj) #L1 aj = ap[j]
  179. s8addq $j,$np,$nj #U0 nj = address of np[j]
  180. ldq $nj,0($nj) #L0 nj = np[j]
  181. nop #U1
  182. addq $alo,$hi0,$lo0 #L1 lo0 = alo + hi0
  183. lda $tp,8($tp) # advance tp by one word
  184. mulq $aj,$bi,$alo #U1
  185. cmpult $lo0,$hi0,AT #L0 AT = carry out of alo + hi0
  186. addq $nlo,$hi1,$lo1 #L1 lo1 = nlo + hi1
  187. addl $j,1,$j
  188. mulq $nj,$m1,$nlo #U1
  189. addq $ahi,AT,$hi0 #L0 hi0 = ahi + carry
  190. addq $lo0,$tj,$lo0 #L1 lo0 += tj
  191. cmpult $lo1,$hi1,v0 #U0
  192. umulh $aj,$bi,$ahi #U1
  193. cmpult $lo0,$tj,AT #L0 AT = carry out of adding tj
  194. addq $lo1,$lo0,$lo1 #L1 combine the two column sums
  195. addq $nhi,v0,$hi1 #U0
  196. umulh $nj,$m1,$nhi #U1
  197. s8addq $j,$ap,$aj #L0 aj = address of the next ap word
  198. cmpult $lo1,$lo0,v0 #L1
  199. cmplt $j,$num,$tj #U0 # borrow $tj
  200. addq $hi0,AT,$hi0 #L0
  201. addq $hi1,v0,$hi1 #U1
  202. stq $lo1,-8($tp) #L1 store the finished word one slot behind tp
  203. bne $tj,.Linner #U0
  204. .set reorder
  205. ldq $tj,8($tp) # last column of this pass
  206. addq $alo,$hi0,$lo0
  207. addq $nlo,$hi1,$lo1
  208. cmpult $lo0,$hi0,AT
  209. cmpult $lo1,$hi1,v0
  210. addq $ahi,AT,$hi0
  211. addq $nhi,v0,$hi1
  212. addq $lo0,$tj,$lo0
  213. cmpult $lo0,$tj,AT
  214. addq $hi0,AT,$hi0
  215. ldq $tj,16($tp) # tj = word stored at 16(tp) by the previous pass
  216. addq $lo1,$lo0,$j # the j register is reused as scratch here; it is reset to 2 at the next pass
  217. cmpult $j,$lo0,v0
  218. addq $hi1,v0,$hi1
  219. addq $hi1,$hi0,$lo1 # lo1 = hi1 + hi0
  220. stq $j,0($tp)
  221. cmpult $lo1,$hi0,$hi1 # hi1 = carry out of hi1 + hi0
  222. addq $lo1,$tj,$lo1 # add the previous topmost word
  223. cmpult $lo1,$tj,AT
  224. addl $i,1,$i
  225. addq $hi1,AT,$hi1 # hi1 = new topmost word
  226. stq $lo1,8($tp)
  227. cmplt $i,$num,$tj # borrow $tj
  228. stq $hi1,16($tp)
  229. bne $tj,.Louter # repeat while i < num
  230. s8addq $num,sp,$tj # &tp[num]
  231. mov $rp,$bp # put rp aside
  232. mov sp,$tp
  233. mov sp,$ap # ap is overwritten here; it is not read again below
  234. mov 0,$hi0 # clear borrow bit
  235. .align 4
  236. .Lsub: ldq $lo0,0($tp) # loop: rp[] = tp[] - np[], word by word, borrow carried in hi0
  237. ldq $lo1,0($np)
  238. lda $tp,8($tp)
  239. lda $np,8($np)
  240. subq $lo0,$lo1,$lo1 # tp[i]-np[i]
  241. cmpult $lo0,$lo1,AT # AT = 1 if that subtraction wrapped
  242. subq $lo1,$hi0,$lo0 # subtract the incoming borrow
  243. cmpult $lo1,$lo0,$hi0 # hi0 = 1 if that wrapped
  244. or $hi0,AT,$hi0 # outgoing borrow
  245. stq $lo0,0($rp)
  246. cmpult $tp,$tj,v0 # continue while tp is below &tp[num]
  247. lda $rp,8($rp)
  248. bne v0,.Lsub
  249. subq $hi1,$hi0,$hi0 # handle upmost overflow bit
  250. mov sp,$tp
  251. mov $bp,$rp # restore rp
  252. .align 4
  253. .Lcopy: ldq $aj,0($tp) # conditional copy
  254. ldq $nj,0($rp)
  255. lda $tp,8($tp)
  256. lda $rp,8($rp)
  257. cmoveq $hi0,$nj,$aj # hi0 == 0: keep the difference already in rp; otherwise take the tp word
  258. stq zero,-8($tp) # zap tp
  259. cmpult $tp,$tj,AT
  260. stq $aj,-8($rp)
  261. bne AT,.Lcopy
  262. mov 1,v0 # return value 1
  263. .Lexit: # common exit; ra is not reloaded and no instruction above writes it
  264. .set noreorder
  265. mov fp,sp # sp = base of the save area again, dropping the tp vector
  266. /*ldq ra,0(sp)*/
  267. ldq s3,8(sp)
  268. ldq s4,16(sp)
  269. ldq s5,24(sp)
  270. ldq fp,32(sp)
  271. lda sp,48(sp) # release the save area
  272. ret (ra)
  273. .end bn_mul_mont
  274. .ascii "Montgomery Multiplication for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
  275. .align 2
  276. ___
  277. print($code); # write the generated assembly text to the current output
  278. close(STDOUT) or die("error closing STDOUT: $!"); # abort if STDOUT cannot be closed