bn-c64xplus.asm 8.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333
  1. ;;====================================================================
  2. ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  3. ;; project.
  4. ;;
  5. ;; Rights for redistribution and usage in source and binary forms are
  6. ;; granted according to the OpenSSL license. Warranty of any kind is
  7. ;; disclaimed.
  8. ;;====================================================================
  9. ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  10. ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  11. ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  12. ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  13. ;;====================================================================
  14. .text
      ;; Symbolic names for the registers used by the routines below.
      ;; RA is the register every routine here branches to in order to
      ;; return to its caller.
  15. .asg B3,RA
      ;; ARG0..ARG5 name the argument registers, alternating between the
      ;; A and B register files. ARG4 and ARG5 are defined but are not
      ;; referenced by the visible code.
  16. .asg A4,ARG0
  17. .asg B4,ARG1
  18. .asg A6,ARG2
  19. .asg B6,ARG3
  20. .asg A8,ARG4
  21. .asg B8,ARG5
      ;; RET is the same physical register (A4) as ARG0, so writing RET
      ;; overwrites ARG0; routines that need both work on a copy of ARG0.
  22. .asg A4,RET
      ;; FP, DP and SP are defined but are not referenced by the visible
      ;; code.
  23. .asg A15,FP
  24. .asg B14,DP
  25. .asg B15,SP
  26. .global _bn_mul_add_words
      ;;----------------------------------------------------------------
      ;; _bn_mul_add_words: rp[i] += ap[i]*w for i = 0..n-1, carrying
      ;; from each word into the next.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap,
      ;;        ARG2 (A6) = n (number of 32-bit words),
      ;;        ARG3 (B6) = w (32-bit multiplier)
      ;;   Out: RET (A4) = carry word left in A19 after the last word;
      ;;        0 when n == 0
      ;;   Writes: A2, A3, A7, A16-A21, A23, B0, B7, ILC; ARG0 and ARG1
      ;;        are advanced by the loads.
      ;;----------------------------------------------------------------
  27. _bn_mul_add_words:
  28. .asmfunc
  29. MV ARG2,B0 ; B0 = n, used as predicate and loop count
  30. [!B0] BNOP RA ; n == 0: return immediately ...
  31. ||[!B0] MVK 0,RET ; ... with result 0
  32. [B0] MVC B0,ILC ; inner loop count = n
  33. [B0] ZERO A19 ; high part of accumulator
  34. || [B0] MV ARG0,A2 ; A2 = store pointer into rp (ARG0 is the load pointer)
  35. || [B0] MV ARG3,A3 ; A3 = w, so the multiply has one operand in each register file
  36. NOP 3 ; together with the two [B0] packets above: the 5 delay slots of the n == 0 return
  37. SPLOOP 2 ; 2*n+10
  38. ;;====================================================================
  39. LDW *ARG1++,B7 ; ap[i]
  40. NOP 3
  41. LDW *ARG0++,A7 ; rp[i]
  42. MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w, 64-bit unsigned product
  43. NOP 3 ; [2,0] in epilogue
  44. ADDU A16,A7,A21:A20 ; low product word + rp[i]
  45. ADDU A19,A21:A20,A19:A18 ; + carry from previous word; A18 = new rp[i]
  46. || MV.S A17,A23 ; keep high product word for the ADD below
  47. SPKERNEL 2,1 ; leave slot for "return value"
  48. || STW A18,*A2++ ; rp[i]
  49. || ADD A19,A23,A19 ; carry for next word = carry-out + high product word
  50. ;;====================================================================
  51. BNOP RA,4 ; return; the MV below runs in the remaining delay slot
  52. MV A19,RET ; return value
  53. .endasmfunc
  54. .global _bn_mul_words
      ;;----------------------------------------------------------------
      ;; _bn_mul_words: rp[i] = low word of (ap[i]*w + carry) for
      ;; i = 0..n-1; the high part becomes the carry into the next word.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap,
      ;;        ARG2 (A6) = n (number of 32-bit words),
      ;;        ARG3 (B6) = w (32-bit multiplier)
      ;;   Out: RET (A4) = carry word left in A19 after the last word;
      ;;        0 when n == 0
      ;;   Writes: A7, A16-A19, A21, B0, ILC; ARG0 and ARG1 are advanced.
      ;;----------------------------------------------------------------
  55. _bn_mul_words:
  56. .asmfunc
  57. MV ARG2,B0 ; B0 = n, used as predicate and loop count
  58. [!B0] BNOP RA ; n == 0: return immediately ...
  59. ||[!B0] MVK 0,RET ; ... with result 0
  60. [B0] MVC B0,ILC ; inner loop count = n
  61. [B0] ZERO A19 ; high part of accumulator
  62. NOP 3 ; together with the two [B0] packets above: the 5 delay slots of the n == 0 return
  63. SPLOOP 2 ; 2*n+10
  64. ;;====================================================================
  65. LDW *ARG1++,A7 ; ap[i]
  66. NOP 4
  67. MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w; w is read straight from ARG3 (B6)
  68. NOP 4 ; [2,0] in epilogue
  69. ADDU A19,A16,A19:A18 ; carry + low product word; A18 = rp[i]
  70. || MV.S A17,A21 ; keep high product word for the ADD below
  71. SPKERNEL 2,1 ; leave slot for "return value"
  72. || STW A18,*ARG0++ ; rp[i]
  73. || ADD.L A19,A21,A19 ; carry for next word = carry-out + high product word
  74. ;;====================================================================
  75. BNOP RA,4 ; return; the MV below runs in the remaining delay slot
  76. MV A19,RET ; return value
  77. .endasmfunc
  78. .global _bn_sqr_words
      ;;----------------------------------------------------------------
      ;; _bn_sqr_words: for i = 0..n-1 store the 64-bit square of ap[i]
      ;; as rp[2*i] (low word) and rp[2*i+1] (high word). No carry runs
      ;; between words.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap,
      ;;        ARG2 (A6) = n (number of 32-bit input words)
      ;;   Out: RET is written (with 0) only on the n == 0 path;
      ;;        otherwise A4 is left holding the advanced store pointer.
      ;;   Writes: A1, B0, B1, B2, B7, ILC; ARG0 and ARG1 are advanced.
      ;;----------------------------------------------------------------
  79. _bn_sqr_words:
  80. .asmfunc
  81. MV ARG2,B0 ; B0 = n, used as predicate and loop count
  82. [!B0] BNOP RA ; n == 0: return immediately ...
  83. ||[!B0] MVK 0,RET ; ... with RET = 0
  84. [B0] MVC B0,ILC ; inner loop count = n
  85. [B0] MV ARG0,B2 ; B2 = pointer to the even result words, rp[2*i]
  86. || [B0] ADD 4,ARG0,ARG0 ; ARG0 = rp + 4 bytes, pointer to the odd words rp[2*i+1]
  87. NOP 3 ; together with the two [B0] packets above: the 5 delay slots of the n == 0 return
  88. SPLOOP 2 ; 2*n+10
  89. ;;====================================================================
  90. LDW *ARG1++,B7 ; ap[i]
  91. NOP 4
  92. MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]*ap[i]; B0 is free for reuse, n now lives in ILC
  93. NOP 3 ; [2,0] in epilogue
  94. STW B0,*B2++(8) ; rp[2*i]
  95. MV B1,A1 ; high word moved to the A file for the store through ARG0
  96. SPKERNEL 2,0 ; fully overlap BNOP RA,5
  97. || STW A1,*ARG0++(8) ; rp[2*i+1]
  98. ;;====================================================================
  99. BNOP RA,5
  100. .endasmfunc
  101. .global _bn_add_words
      ;;----------------------------------------------------------------
      ;; _bn_add_words: rp[i] = ap[i] + bp[i] + carry for i = 0..n-1.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap, ARG2 (A6) = bp,
      ;;        ARG3 (B6) = n (number of 32-bit words)
      ;;   Out: RET (A4) = carry out of the last word; 0 when n == 0
      ;;   Writes: A0, A1, A3, A7-A9, B0, B7, ILC; ARG1 and ARG2 are
      ;;        advanced.
      ;;----------------------------------------------------------------
  102. _bn_add_words:
  103. .asmfunc
  104. MV ARG3,B0 ; B0 = n, used as predicate and loop count
  105. [!B0] BNOP RA ; n == 0: return immediately ...
  106. ||[!B0] MVK 0,RET ; ... with result 0
  107. [B0] MVC B0,ILC ; inner loop count = n
  108. [B0] ZERO A1 ; carry flag
  109. || [B0] MV ARG0,A3 ; A3 = rp; ARG0 is the same register as RET, rewritten every iteration
  110. NOP 3 ; together with the two [B0] packets above: the 5 delay slots of the n == 0 return
  111. SPLOOP 2 ; 2*n+6
  112. ;;====================================================================
  113. LDW *ARG2++,A7 ; bp[i]
  114. || LDW *ARG1++,B7 ; ap[i]
  115. NOP 4
  116. ADDU A7,B7,A9:A8 ; A9:A8 = bp[i] + ap[i], carry-out lands in A9
  117. ADDU A1,A9:A8,A1:A0 ; + incoming carry; A0 = result word, A1 = outgoing carry
  118. SPKERNEL 0,0 ; fully overlap BNOP RA,5
  119. || STW A0,*A3++ ; write result
  120. || MV A1,RET ; keep carry flag in RET
  121. ;;====================================================================
  122. BNOP RA,5
  123. .endasmfunc
  124. .global _bn_sub_words
      ;;----------------------------------------------------------------
      ;; _bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for i = 0..n-1.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap, ARG2 (A6) = bp,
      ;;        ARG3 (B6) = n (number of 32-bit words)
      ;;   Out: RET (A4) = borrow out of the last word (bit 0 of A1);
      ;;        0 when n == 0
      ;;   Writes: A0-A3, A7, B0, B7, ILC; ARG1 and ARG2 are advanced.
      ;;----------------------------------------------------------------
  125. _bn_sub_words:
  126. .asmfunc
  127. MV ARG3,B0 ; B0 = n, used as predicate and loop count
  128. [!B0] BNOP RA ; n == 0: return immediately ...
  129. ||[!B0] MVK 0,RET ; ... with result 0
  130. [B0] MVC B0,ILC ; inner loop count = n
  131. [B0] ZERO A2 ; borrow flag
  132. || [B0] MV ARG0,A3 ; A3 = rp store pointer; ARG0 (A4) doubles as RET
  133. NOP 3 ; together with the two [B0] packets above: the 5 delay slots of the n == 0 return
  134. SPLOOP 2 ; 2*n+6
  135. ;;====================================================================
  136. LDW *ARG2++,A7 ; bp[i]
  137. || LDW *ARG1++,B7 ; ap[i]
  138. NOP 4
  139. SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i] as a long; A1 is nonzero on borrow
  140. [A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
  141. SPKERNEL 0,1 ; leave slot for "return borrow flag"
  142. || STW A0,*A3++ ; write result
  143. || AND 1,A1,A2 ; pass on borrow flag
  144. ;;====================================================================
  145. BNOP RA,4 ; return; the AND below runs in the remaining delay slot
  146. AND 1,A1,RET ; return borrow flag
  147. .endasmfunc
  148. .global _bn_div_words
  149. .global __divull
      ;;----------------------------------------------------------------
      ;; _bn_div_words: rearranges its three arguments into the register
      ;; pairs A5:A4 = ARG0:ARG1 and B5:B4 = 0:ARG2, then transfers to
      ;; the external routine __divull. All moves share one execute
      ;; packet, so each reads the incoming value of its source even
      ;; though ARG0 and ARG1 are also destinations.
      ;;   In:  ARG0 (A4), ARG1 (B4), ARG2 (A6)
      ;; NOTE(review): presumably A5:A4 is the 64-bit dividend and B5:B4
      ;; the 64-bit divisor expected by __divull, whose result is handed
      ;; back to our caller unchanged -- confirm against the run-time
      ;; library's calling convention.
      ;; NOTE(review): CALLP names A3, not RA (B3), as its link register,
      ;; so RA is left untouched; this looks like a deliberate tail call
      ;; (__divull returns straight to our caller) -- confirm against
      ;; the CALLP definition.
      ;;----------------------------------------------------------------
  150. _bn_div_words:
  151. .asmfunc
  152. CALLP __divull,A3 ; jump to rts64plus.lib
  153. || MV ARG0,A5 ; A5 = first argument (upper word of the A5:A4 pair)
  154. || MV ARG1,ARG0 ; A4 = second argument (lower word of the A5:A4 pair)
  155. || MV ARG2,ARG1 ; B4 = third argument (lower word of the B5:B4 pair)
  156. || ZERO B5 ; upper word of the B5:B4 pair = 0
  157. .endasmfunc
  158. ;;====================================================================
  159. ;; Not really Comba algorithm, just straightforward NxM... Dedicated
  160. ;; fully unrolled real Comba implementations are asymptotically 2x
  161. ;; faster, but naturally larger undertaking. Purpose of this exercise
  162. ;; was rather to learn to master nested SPLOOPs...
  163. ;;====================================================================
  164. .global _bn_sqr_comba8
  165. .global _bn_mul_comba8
      ;;----------------------------------------------------------------
      ;; _bn_mul_comba8: rp[0..15] = ap[0..7] * bp[0..7], computed as M
      ;; passes of an N-word multiply-accumulate row (N = M = 8).
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap, ARG2 (A6) = bp
      ;; _bn_sqr_comba8: same routine entered with bp set to ap.
      ;;   In:  ARG0 (A4) = rp, ARG1 (B4) = ap
      ;;
      ;; Register roles, as set up below:
      ;;   A0 = outer (row) counter      A3 = M, used to rewind bp
      ;;   A5 = ap read pointer          A9 = pre-fetched ap[i+1]
      ;;   B6 = ap[i] for the current row
      ;;   B4 = rp store pointer         B5 = rp load pointer
      ;;   B19 = high part of accumulator (carry between words)
      ;;   A1 = 0 during the first row, 1 afterwards; predicates the
      ;;        rp[i] load, so the first row adds B7 = 0 instead of
      ;;        whatever rp held on entry
      ;;   B0 = N (-> RILC), B1 = N-2 (initial ILC), B2 = N-1
      ;;----------------------------------------------------------------
  166. _bn_sqr_comba8:
  167. MV ARG1,ARG2 ; bp = ap, then fall through into the multiply
  168. _bn_mul_comba8:
  169. .asmfunc
  170. MVK 8,B0 ; N, RILC
  171. || MVK 8,A0 ; M, outer loop counter
  172. || MV ARG1,A5 ; copy ap
  173. || MV ARG0,B4 ; copy rp (B4 is ARG1; the parallel MV above still reads the old value)
  174. || ZERO B19 ; high part of accumulator
  175. MVC B0,RILC
  176. || SUB B0,2,B1 ; N-2, initial ILC
  177. || SUB B0,1,B2 ; const B2=N-1
  178. || LDW *A5++,B6 ; ap[0]
  179. || MV A0,A3 ; const A3=M
  180. sploopNxM?: ; for best performance arrange M<=N
  181. [A0] SPLOOPD 2 ; 2*n+10
  182. || MVC B1,ILC
  183. || ADDAW B4,B0,B5 ; B5 = rp + N words; rewound to rp[1] at the end of the first row
  184. || ZERO B7 ; first row: stands in for the rp[i] that is not loaded
  185. || LDW *A5++,A9 ; pre-fetch ap[1]
  186. || ZERO A1 ; first row: suppress the rp[i] load below
  187. || SUB A0,1,A0
  188. ;;====================================================================
  189. ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
  190. ;; This is because of Advisory 15 from TI publication SPRZ247I.
  191. LDW *ARG2++,A7 ; bp[i]
  192. NOP 3
  193. [A1] LDW *B5++,B7 ; rp[i]
  194. MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * ap[row]
  195. NOP 3
  196. ADDU B16,B7,B21:B20 ; low product word + rp[i]
  197. ADDU B19,B21:B20,B19:B18 ; + carry from previous word; B18 = new rp[i]
  198. || MV.S B17,B23 ; keep high product word for the ADD below
  199. SPKERNEL
  200. || STW B18,*B4++ ; rp[i]
  201. || ADD.S B19,B23,B19 ; carry for next word = carry-out + high product word
  202. ;;====================================================================
  203. outer?: ; m*2*(n+1)+10
  204. SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
  205. SPMASKR ; NOTE(review): presumably restarts the buffered loop with ILC reloaded from RILC -- confirm
  206. || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
  207. MVD A9,B6 ; move through .M unit(*)
  208. [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
  209. SUBAW B5,B2,B5 ; rewind rp to rp[1]
  210. MVK 1,A1 ; from the second row on, rp[i] is loaded and accumulated
  211. [A0] BNOP.S1 outer?,4
  212. || [A0] SUB.L A0,1,A0
  213. STW B19,*B4--[B2] ; rewind rp to rp[1]
  214. || ZERO.S B19 ; high part of accumulator
  215. ;; end of outer?
  216. BNOP RA,5 ; return
  217. .endasmfunc
  218. ;; (*) It should be noted that B6 is used as input to MPY32U in
  219. ;; chronologically next cycle in *preceding* SPLOOP iteration.
  220. ;; Normally such arrangement would require DINT, but at this
  221. ;; point SPLOOP is draining and interrupts are disabled
  222. ;; implicitly.
  223. .global _bn_sqr_comba4
  224. .global _bn_mul_comba4
      ;;----------------------------------------------------------------
      ;; _bn_mul_comba4: r[0..7] = a[0..3] * b[0..3].
      ;;   In:  ARG0 (A4) = r, ARG1 (B4) = a, ARG2 (A6) = b
      ;; _bn_sqr_comba4: same routine entered with b set to a.
      ;;   In:  ARG0 (A4) = r, ARG1 (B4) = a
      ;;
      ;; The ".if 0" branch is excluded from assembly; the ".else"
      ;; branch is the code that is built. It is fully unrolled:
      ;;   - a[0..3] are held in B16..B19 and b[0..3] in A16..A19;
      ;;   - each MPY32U leaves a 64-bit product in an A register pair;
      ;;   - the A-side ADDU chain sums, for one result column, the low
      ;;     product words plus what is carried in, and the low word of
      ;;     that sum is stored as r[k];
      ;;   - the B-side ADDU chain sums the high product words of the
      ;;     same column; that sum is added into the next column.
      ;; The order of the packets is part of the algorithm: product
      ;; register pairs are reused as soon as they have been consumed.
      ;;----------------------------------------------------------------
  225. _bn_sqr_comba4:
  226. MV ARG1,ARG2 ; b = a, then fall through into the multiply
  227. _bn_mul_comba4:
  228. .asmfunc
  229. .if 0
  230. BNOP sploopNxM?,3
  231. ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
  232. ;; because of read-after-write penalties, it's rather
  233. ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
  234. MVK 4,B0 ; N, RILC
  235. || MVK 4,A0 ; M, outer loop counter
  236. || MV ARG1,A5 ; copy ap
  237. || MV ARG0,B4 ; copy rp
  238. || ZERO B19 ; high part of accumulator
  239. MVC B0,RILC
  240. || SUB B0,2,B1 ; first ILC
  241. || SUB B0,1,B2 ; const B2=N-1
  242. || LDW *A5++,B6 ; ap[0]
  243. || MV A0,A3 ; const A3=M
  244. .else
  245. ;; This alternative is exercise in fully unrolled Comba
  246. ;; algorithm implementation that operates at n*(n+1)+12, or
  247. ;; as little as 32 cycles...
  248. LDW *ARG1[0],B16 ; a[0]
  249. || LDW *ARG2[0],A16 ; b[0]
  250. LDW *ARG1[1],B17 ; a[1]
  251. || LDW *ARG2[1],A17 ; b[1]
  252. LDW *ARG1[2],B18 ; a[2]
  253. || LDW *ARG2[2],A18 ; b[2]
  254. LDW *ARG1[3],B19 ; a[3]
  255. || LDW *ARG2[3],A19 ; b[3]
  256. NOP
  257. MPY32U A16,B16,A1:A0 ; a[0]*b[0]
  258. MPY32U A17,B16,A23:A22 ; a[0]*b[1]
  259. MPY32U A16,B17,A25:A24 ; a[1]*b[0]
  260. MPY32U A16,B18,A27:A26 ; a[2]*b[0]
  261. STW A0,*ARG0[0] ; r[0] = low word of a[0]*b[0]
  262. || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
  263. MPY32U A18,B16,A31:A30 ; a[0]*b[2]
  264. || ADDU A22,A1,A1:A0 ; column 1: high word of a[0]*b[0] + low words
  265. MV A23,B0 ; start the B-side sum of column 1 high words
  266. || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
  267. || ADDU A24,A1:A0,A1:A0
  268. ADDU A25,B0,B1:B0
  269. || STW A0,*ARG0[1] ; r[1]
  270. || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
  271. || ADDU A26,A1,A9:A8 ; column 2: carry from column 1 + low words
  272. ADDU A27,B1,B9:B8
  273. || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
  274. || ADDU A28,A9:A8,A9:A8
  275. ADDU A29,B9:B8,B9:B8
  276. || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
  277. || ADDU A30,A9:A8,A9:A8
  278. ADDU A31,B9:B8,B9:B8
  279. || ADDU B0,A9:A8,A9:A8 ; fold in the column 1 high-word sum
  280. STW A8,*ARG0[2] ; r[2]
  281. || ADDU A20,A9,A1:A0 ; column 3: carry from column 2 + low words
  282. ADDU A21,B9,B1:B0
  283. || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
  284. || ADDU A22,A1:A0,A1:A0
  285. ADDU A23,B1:B0,B1:B0
  286. || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
  287. || ADDU A24,A1:A0,A1:A0
  288. ADDU A25,B1:B0,B1:B0
  289. || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
  290. || ADDU A26,A1:A0,A1:A0
  291. ADDU A27,B1:B0,B1:B0
  292. || ADDU B8,A1:A0,A1:A0 ; fold in the column 2 high-word sum
  293. STW A0,*ARG0[3] ; r[3]
  294. || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
  295. || ADDU A20,A1,A9:A8 ; column 4: carry from column 3 + low words
  296. ADDU A21,B1,B9:B8
  297. || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
  298. || ADDU A22,A9:A8,A9:A8
  299. ADDU A23,B9:B8,B9:B8
  300. || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
  301. || ADDU A24,A9:A8,A9:A8
  302. ADDU A25,B9:B8,B9:B8
  303. || ADDU B0,A9:A8,A9:A8 ; fold in the column 3 high-word sum
  304. STW A8,*ARG0[4] ; r[4]
  305. || ADDU A26,A9,A1:A0 ; column 5: carry from column 4 + low words
  306. ADDU A27,B9,B1:B0
  307. || ADDU A28,A1:A0,A1:A0
  308. ADDU A29,B1:B0,B1:B0
  309. || BNOP RA ; return branch issued here; the five packets below run in its delay slots
  310. || ADDU B8,A1:A0,A1:A0 ; fold in the column 4 high-word sum
  311. STW A0,*ARG0[5] ; r[5]
  312. || ADDU A30,A1,A9:A8 ; column 6: carry from column 5 + low word of a[3]*b[3]
  313. ADD A31,B1,B8 ; high word of a[3]*b[3] + carry of the column 5 high-word sum
  314. ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
  315. ADD B8,A9,A9 ; r[7] = B8 + carry out of column 6
  316. || STW A8,*ARG0[6] ; r[6]
  317. STW A9,*ARG0[7] ; r[7]
  318. .endif
  319. .endasmfunc