#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. Rights for redistribution and usage in source and binary
# forms are granted according to the OpenSSL license.
# ====================================================================

# April 2006

# "Teaser" Montgomery multiplication module for PowerPC. It's possible
# to gain a bit more by modulo-scheduling the outer loop, and a
# dedicated squaring procedure should give a further 20%; the code can
# also be adapted for a 32-bit application running on a 64-bit CPU. As
# for the latter, it won't be able to achieve "native" 64-bit
# performance, because in a 32-bit application context every addc
# instruction would have to be expanded into addc, two right shifts by
# 32 and finally adde, etc. So far the RSA *sign* performance
# improvement over pre-bn_mul_mont assembler for a 64-bit application
# running on a PPC970/G5 is:
#
#       512-bit         +65%
#       1024-bit        +35%
#       2048-bit        +18%
#       4096-bit        +4%
$output = shift;

if ($output =~ /32\-mont\.s/) {
        $BITS=  32;
        $BNSZ=  $BITS/8;
        $SIZE_T=4;
        $RZONE= 224;
        $FRAME= $SIZE_T*16;

        $LD=    "lwz";          # load
        $LDU=   "lwzu";         # load and update
        $LDX=   "lwzx";         # load indexed
        $ST=    "stw";          # store
        $STU=   "stwu";         # store and update
        $STX=   "stwx";         # store indexed
        $STUX=  "stwux";        # store indexed and update
        $UMULL= "mullw";        # unsigned multiply low
        $UMULH= "mulhwu";       # unsigned multiply high
        $UCMP=  "cmplw";        # unsigned compare
        $PUSH=  $ST;
        $POP=   $LD;
} elsif ($output =~ /64\-mont\.s/) {
        $BITS=  64;
        $BNSZ=  $BITS/8;
        $SIZE_T=8;
        $RZONE= 288;
        $FRAME= $SIZE_T*16;

        # same as above, but 64-bit mnemonics...
        $LD=    "ld";           # load
        $LDU=   "ldu";          # load and update
        $LDX=   "ldx";          # load indexed
        $ST=    "std";          # store
        $STU=   "stdu";         # store and update
        $STX=   "stdx";         # store indexed
        $STUX=  "stdux";        # store indexed and update
        $UMULL= "mulld";        # unsigned multiply low
        $UMULH= "mulhdu";       # unsigned multiply high
        $UCMP=  "cmpld";        # unsigned compare
        $PUSH=  $ST;
        $POP=   $LD;
} else { die "nonsense $output"; }
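
# For reference, the flavour is selected by the requested output file
# name (the names below are only examples; anything matching the
# patterns above works):
#
#       perl ppc-mont.pl ppc32-mont.s   # 32-bit mnemonics
#       perl ppc-mont.pl ppc64-mont.s   # 64-bit mnemonics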
( defined shift || open STDOUT,"| $^X ../perlasm/ppc-xlate.pl $output" ) ||
        die "can't call ../perlasm/ppc-xlate.pl: $!";

$sp="r1";
$toc="r2";
$rp="r3";       $ovf="r3";
$ap="r4";
$bp="r5";
$np="r6";
$n0="r7";
$num="r8";
$rp="r9";       # $rp is reassigned
$aj="r10";
$nj="r11";
$tj="r12";
# non-volatile registers
$i="r14";
$j="r15";
$tp="r16";
$m0="r17";
$m1="r18";
$lo0="r19";
$hi0="r20";
$lo1="r21";
$hi1="r22";
$alo="r23";
$ahi="r24";
$nlo="r25";
#
$nhi="r0";
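
# Argument registers r3..r8 above correspond, per the PowerPC calling
# convention, to the parameters of OpenSSL's bn_mul_mont:
#
#       int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap,
#                       const BN_ULONG *bp, const BN_ULONG *np,
#                       const BN_ULONG *n0, int num);
#
# i.e. rp[]=ap[]*bp[]*2^(-num*BITS) mod np[], with n0 pointing at the
# precomputed -np^(-1) mod 2^BITS word; a return value of 0 (num<4
# below) tells the caller to fall back to the C path.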
$code=<<___;
.machine "any"
.text

.globl  .bn_mul_mont
.align  4
.bn_mul_mont:
        cmpwi   $num,4
        mr      $rp,r3          ; $rp is reassigned
        li      r3,0
        bltlr
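
; Allocate the frame: FRAME+RZONE bytes plus num words for the
; temporary vector tp[], carved out below the current stack pointer.
; The new sp is rounded down to a 4KB boundary (the and with -4096),
; so the scratch area straddles as few pages, and therefore TLB
; entries, as possible.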
        slwi    $num,$num,`log($BNSZ)/log(2)`
        li      $tj,-4096
        addi    $ovf,$num,`$FRAME+$RZONE`
        subf    $ovf,$ovf,$sp   ; $sp-$ovf
        and     $ovf,$ovf,$tj   ; minimize TLB usage
        subf    $ovf,$sp,$ovf   ; $ovf-$sp
        srwi    $num,$num,`log($BNSZ)/log(2)`
        $STUX   $sp,$sp,$ovf

        $PUSH   r14,`4*$SIZE_T`($sp)
        $PUSH   r15,`5*$SIZE_T`($sp)
        $PUSH   r16,`6*$SIZE_T`($sp)
        $PUSH   r17,`7*$SIZE_T`($sp)
        $PUSH   r18,`8*$SIZE_T`($sp)
        $PUSH   r19,`9*$SIZE_T`($sp)
        $PUSH   r20,`10*$SIZE_T`($sp)
        $PUSH   r21,`11*$SIZE_T`($sp)
        $PUSH   r22,`12*$SIZE_T`($sp)
        $PUSH   r23,`13*$SIZE_T`($sp)
        $PUSH   r24,`14*$SIZE_T`($sp)
        $PUSH   r25,`15*$SIZE_T`($sp)
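
; First outer-loop iteration (i=0): compute ap[]*bp[0] and fold in
; m1*np[], where m1="tp[0]"*n0 mod 2^BITS is chosen so the lowest
; word of the sum vanishes; each word is stored one slot down, i.e.
; the sum is already divided by 2^BITS. ap[0]/ap[1] are handled in
; this prologue, words 2..num-1 in L1st (hence ctr=num-2), and the
; tail below L1st flushes the last carries.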
        $LD     $n0,0($n0)      ; pull n0[0] value
        addi    $num,$num,-2    ; adjust $num for counter register

        $LD     $m0,0($bp)      ; m0=bp[0]
        $LD     $aj,0($ap)      ; ap[0]
        addi    $tp,$sp,$FRAME
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[0]
        $UMULH  $hi0,$aj,$m0

        $LD     $aj,$BNSZ($ap)  ; ap[1]
        $LD     $nj,0($np)      ; np[0]

        $UMULL  $m1,$lo0,$n0    ; "tp[0]"*n0

        $UMULL  $alo,$aj,$m0    ; ap[1]*bp[0]
        $UMULH  $ahi,$aj,$m0

        $UMULL  $lo1,$nj,$m1    ; np[0]*m1
        $UMULH  $hi1,$nj,$m1
        $LD     $nj,$BNSZ($np)  ; np[1]
        addc    $lo1,$lo1,$lo0
        addze   $hi1,$hi1

        $UMULL  $nlo,$nj,$m1    ; np[1]*m1
        $UMULH  $nhi,$nj,$m1

        mtctr   $num
        li      $j,`2*$BNSZ`
.align  4
L1st:
        $LDX    $aj,$ap,$j      ; ap[j]
        $LDX    $nj,$np,$j      ; np[j]
        addc    $lo0,$alo,$hi0
        addze   $hi0,$ahi
        $UMULL  $alo,$aj,$m0    ; ap[j]*bp[0]
        $UMULH  $ahi,$aj,$m0
        addc    $lo1,$nlo,$hi1
        addze   $hi1,$nhi
        $UMULL  $nlo,$nj,$m1    ; np[j]*m1
        $UMULH  $nhi,$nj,$m1
        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
        addze   $hi1,$hi1
        $ST     $lo1,0($tp)     ; tp[j-1]

        addi    $j,$j,$BNSZ     ; j++
        addi    $tp,$tp,$BNSZ   ; tp++
        bdnz-   L1st
;L1st
        addc    $lo0,$alo,$hi0
        addze   $hi0,$ahi

        addc    $lo1,$nlo,$hi1
        addze   $hi1,$nhi
        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[0]
        addze   $hi1,$hi1
        $ST     $lo1,0($tp)     ; tp[j-1]

        li      $ovf,0
        addc    $hi1,$hi1,$hi0
        addze   $ovf,$ovf       ; upmost overflow bit
        $ST     $hi1,$BNSZ($tp)

        li      $i,$BNSZ
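
; Remaining outer-loop iterations, i=1..num-1: tp[]+=ap[]*bp[i], then
; m1=tp[0]*n0 mod 2^BITS and tp[]+=m1*np[], again storing each word
; one slot down. Linner mirrors L1st, with the previous tp[j] folded
; into the running sum.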
.align  4
Louter:
        $LDX    $m0,$bp,$i      ; m0=bp[i]
        $LD     $aj,0($ap)      ; ap[0]
        addi    $tp,$sp,$FRAME
        $LD     $tj,$FRAME($sp) ; tp[0]
        $UMULL  $lo0,$aj,$m0    ; ap[0]*bp[i]
        $UMULH  $hi0,$aj,$m0
        $LD     $aj,$BNSZ($ap)  ; ap[1]
        $LD     $nj,0($np)      ; np[0]
        addc    $lo0,$lo0,$tj   ; ap[0]*bp[i]+tp[0]
        addze   $hi0,$hi0
        $UMULL  $m1,$lo0,$n0    ; tp[0]*n0
        $UMULL  $alo,$aj,$m0    ; ap[1]*bp[i]
        $UMULH  $ahi,$aj,$m0
        $UMULL  $lo1,$nj,$m1    ; np[0]*m1
        $UMULH  $hi1,$nj,$m1
        $LD     $nj,$BNSZ($np)  ; np[1]
        addc    $lo1,$lo1,$lo0
        addze   $hi1,$hi1
        $UMULL  $nlo,$nj,$m1    ; np[1]*m1
        $UMULH  $nhi,$nj,$m1
        mtctr   $num
        li      $j,`2*$BNSZ`
.align  4
Linner:
        $LDX    $aj,$ap,$j      ; ap[j]
        $LD     $tj,$BNSZ($tp)  ; tp[j]
        addc    $lo0,$alo,$hi0
        addze   $hi0,$ahi
        $LDX    $nj,$np,$j      ; np[j]
        addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
        addze   $hi0,$hi0
        $UMULL  $alo,$aj,$m0    ; ap[j]*bp[i]
        $UMULH  $ahi,$aj,$m0
        addc    $lo1,$nlo,$hi1
        addze   $hi1,$nhi
        $UMULL  $nlo,$nj,$m1    ; np[j]*m1
        $UMULH  $nhi,$nj,$m1
        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
        addze   $hi1,$hi1
        $ST     $lo1,0($tp)     ; tp[j-1]

        addi    $j,$j,$BNSZ     ; j++
        addi    $tp,$tp,$BNSZ   ; tp++
        bdnz-   Linner
;Linner
        $LD     $tj,$BNSZ($tp)  ; tp[j]
        addc    $lo0,$alo,$hi0
        addze   $hi0,$ahi
        addc    $lo0,$lo0,$tj   ; ap[j]*bp[i]+tp[j]
        addze   $hi0,$hi0

        addc    $lo1,$nlo,$hi1
        addze   $hi1,$nhi
        addc    $lo1,$lo1,$lo0  ; np[j]*m1+ap[j]*bp[i]+tp[j]
        addze   $hi1,$hi1
        $ST     $lo1,0($tp)     ; tp[j-1]

        addic   $ovf,$ovf,-1    ; move upmost overflow to XER[CA]
        li      $ovf,0
        adde    $hi1,$hi1,$hi0
        addze   $ovf,$ovf
        $ST     $hi1,$BNSZ($tp)
;
        slwi    $tj,$num,`log($BNSZ)/log(2)`
        $UCMP   $i,$tj
        addi    $i,$i,$BNSZ
        ble-    Louter
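
; Post-processing: tp[] (with the top carry in ovf) may still be
; greater than or equal to np[]. Branch to Lsub when ovf is set or
; the top words suggest so, writing tp[]-np[] to rp[]; if that
; subtraction borrows with no overflow pending, Lcopy writes the
; original tp[] instead. Either path zaps the scratch tp[] so no
; intermediate values linger on the stack.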
        addi    $num,$num,2     ; restore $num
        addi    $tp,$sp,$FRAME
        mtctr   $num
        li      $j,0

        subfc.  $ovf,$j,$ovf    ; sets XER[CA]
        bne     Lsub
        $UCMP   $hi1,$nj
        bge     Lsub
.align  4
Lcopy:
        $LDX    $tj,$tp,$j
        $STX    $tj,$rp,$j
        $STX    $j,$tp,$j       ; zap at once
        addi    $j,$j,$BNSZ
        bdnz-   Lcopy

Lexit:
        $POP    r14,`4*$SIZE_T`($sp)
        $POP    r15,`5*$SIZE_T`($sp)
        $POP    r16,`6*$SIZE_T`($sp)
        $POP    r17,`7*$SIZE_T`($sp)
        $POP    r18,`8*$SIZE_T`($sp)
        $POP    r19,`9*$SIZE_T`($sp)
        $POP    r20,`10*$SIZE_T`($sp)
        $POP    r21,`11*$SIZE_T`($sp)
        $POP    r22,`12*$SIZE_T`($sp)
        $POP    r23,`13*$SIZE_T`($sp)
        $POP    r24,`14*$SIZE_T`($sp)
        $POP    r25,`15*$SIZE_T`($sp)
        $POP    $sp,0($sp)
        li      r3,1
        blr
        .long   0

.align  4
Lsub:   $LDX    $tj,$tp,$j
        $LDX    $nj,$np,$j
        subfe   $tj,$nj,$tj     ; tp[j]-np[j]
        $STX    $tj,$rp,$j
        addi    $j,$j,$BNSZ
        bdnz-   Lsub

        li      $j,0
        subfe.  $ovf,$j,$ovf
        mtctr   $num
        bne     Lcopy
.align  4
Lzap:   $STX    $j,$tp,$j
        addi    $j,$j,$BNSZ
        bdnz-   Lzap
        b       Lexit
___
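
# interpolate the constant expressions quoted in backticks above
# (register-save offsets, shift amounts and the like), then emit the
# result, possibly through the ppc-xlate.pl pipe opened earlier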
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;