bn-c64xplus.asm 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375
  1. ;;====================================================================
  2. ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  3. ;; project.
  4. ;;
  5. ;; Rights for redistribution and usage in source and binary forms are
  6. ;; granted according to the OpenSSL license. Warranty of any kind is
  7. ;; disclaimed.
  8. ;;====================================================================
  9. ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  10. ;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
  11. ;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
  12. ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
  13. ;;====================================================================
  14. .text
  ;; For assembler versions below 7000000, define __TI_EABI__ as 0
  ;; (presumably because those versions do not predefine it -- confirm
  ;; against the TI assembler documentation).
  15. .if .ASSEMBLER_VERSION<7000000
  16. .asg 0,__TI_EABI__
  17. .endif
  ;; When __TI_EABI__ is non-zero, every underscore-prefixed routine
  ;; name used below is substituted with its unprefixed form.
  18. .if __TI_EABI__
  19. .asg bn_mul_add_words,_bn_mul_add_words
  20. .asg bn_mul_words,_bn_mul_words
  21. .asg bn_sqr_words,_bn_sqr_words
  22. .asg bn_add_words,_bn_add_words
  23. .asg bn_sub_words,_bn_sub_words
  24. .asg bn_div_words,_bn_div_words
  25. .asg bn_sqr_comba8,_bn_sqr_comba8
  26. .asg bn_mul_comba8,_bn_mul_comba8
  27. .asg bn_sqr_comba4,_bn_sqr_comba4
  28. .asg bn_mul_comba4,_bn_mul_comba4
  29. .endif
  ;; Symbolic register names used by the routines below:
  ;;   RA         (B3)  - branch target used to return to the caller
  ;;   ARG0..ARG3 (A4,B4,A6,B6) - incoming arguments, alternating
  ;;                      between the A and B register files
  ;;   RET        (A4)  - return value; same register as ARG0
  ;; ARG4, ARG5, FP, DP and SP are defined here but are not referenced
  ;; by the code shown in this file.
  30. .asg B3,RA
  31. .asg A4,ARG0
  32. .asg B4,ARG1
  33. .asg A6,ARG2
  34. .asg B6,ARG3
  35. .asg A8,ARG4
  36. .asg B8,ARG5
  37. .asg A4,RET
  38. .asg A15,FP
  39. .asg B14,DP
  40. .asg B15,SP
  41. .global _bn_mul_add_words
  ;;--------------------------------------------------------------------
  ;; bn_mul_add_words: for each word i, rp[i] = low32(rp[i] + ap[i]*w +
  ;; carry), where carry is the high part carried over from word i-1.
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = number of words, ARG3 = w
  ;;   Out: RET  = high part of the accumulator after the last word,
  ;;               or 0 when the number of words is 0
  ;; NOTE(review): presumably matches the C-level OpenSSL primitive of
  ;; the same name -- confirm the prototype against the bn headers.
  ;;--------------------------------------------------------------------
  42. _bn_mul_add_words:
  43. .asmfunc
  44. MV ARG2,B0 ; B0 = word count: predicate and loop count
  45. [!B0] BNOP RA ; zero words: return early...
  46. ||[!B0] MVK 0,RET ; ...with RET = 0
  47. [B0] MVC B0,ILC
  48. [B0] ZERO A19 ; high part of accumulator
  49. || [B0] MV ARG0,A2 ; A2 = rp store pointer (ARG0 stays the rp load pointer)
  50. || [B0] MV ARG3,A3 ; A3 = multiplier w
  51. NOP 3 ; with the two packets above, fills the 5 delay slots of the early return
  52. SPLOOP 2 ; 2*n+10
  53. ;;====================================================================
  54. LDW *ARG1++,B7 ; ap[i]
  55. NOP 3
  56. LDW *ARG0++,A7 ; rp[i]
  57. MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
  58. NOP 3 ; [2,0] in epilogue
  59. ADDU A16,A7,A21:A20 ; low product word + rp[i]
  60. ADDU A19,A21:A20,A19:A18 ; + high part carried in; A18 = result word
  61. || MV.S A17,A23 ; copy of high product word
  62. SPKERNEL 2,1 ; leave slot for "return value"
  63. || STW A18,*A2++ ; rp[i]
  64. || ADD A19,A23,A19 ; new high part = carry out + high product word
  65. ;;====================================================================
  66. BNOP RA,4
  67. MV A19,RET ; return value
  68. .endasmfunc
  69. .global _bn_mul_words
  ;;--------------------------------------------------------------------
  ;; bn_mul_words: for each word i, rp[i] = low32(ap[i]*w + carry),
  ;; where carry is the high part carried over from word i-1.
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = number of words, ARG3 = w
  ;;   Out: RET  = high part of the accumulator after the last word,
  ;;               or 0 when the number of words is 0
  ;; NOTE(review): presumably matches the C-level OpenSSL primitive of
  ;; the same name -- confirm the prototype against the bn headers.
  ;;--------------------------------------------------------------------
  70. _bn_mul_words:
  71. .asmfunc
  72. MV ARG2,B0 ; B0 = word count: predicate and loop count
  73. [!B0] BNOP RA ; zero words: return early...
  74. ||[!B0] MVK 0,RET ; ...with RET = 0
  75. [B0] MVC B0,ILC
  76. [B0] ZERO A19 ; high part of accumulator
  77. NOP 3 ; with the two packets above, fills the 5 delay slots of the early return
  78. SPLOOP 2 ; 2*n+10
  79. ;;====================================================================
  80. LDW *ARG1++,A7 ; ap[i]
  81. NOP 4
  82. MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  83. NOP 4 ; [2,0] in epilogue
  84. ADDU A19,A16,A19:A18 ; low product word + high part carried in
  85. || MV.S A17,A21 ; copy of high product word
  86. SPKERNEL 2,1 ; leave slot for "return value"
  87. || STW A18,*ARG0++ ; rp[i]
  88. || ADD.L A19,A21,A19 ; new high part = carry out + high product word
  89. ;;====================================================================
  90. BNOP RA,4
  91. MV A19,RET ; return value
  92. .endasmfunc
  93. .global _bn_sqr_words
  ;;--------------------------------------------------------------------
  ;; bn_sqr_words: for each word i, store the 64-bit square of ap[i] as
  ;; rp[2*i] (low word) and rp[2*i+1] (high word).
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = number of input words
  ;;   Out: nothing is computed for RET on the normal path; the
  ;;        zero-count path sets RET to 0
  ;; Two store pointers are used, one per register file, each stepping
  ;; by 8 bytes: B2 for the even words and ARG0 for the odd words.
  ;;--------------------------------------------------------------------
  94. _bn_sqr_words:
  95. .asmfunc
  96. MV ARG2,B0 ; B0 = word count: predicate and loop count
  97. [!B0] BNOP RA ; zero words: return early
  98. ||[!B0] MVK 0,RET
  99. [B0] MVC B0,ILC
  100. [B0] MV ARG0,B2 ; B2 -> rp[0]
  101. || [B0] ADD 4,ARG0,ARG0 ; ARG0 -> rp[1]
  102. NOP 3 ; with the two packets above, fills the 5 delay slots of the early return
  103. SPLOOP 2 ; 2*n+10
  104. ;;====================================================================
  105. LDW *ARG1++,B7 ; ap[i]
  106. NOP 4
  107. MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]*ap[i]
  108. NOP 3 ; [2,0] in epilogue
  109. STW B0,*B2++(8) ; rp[2*i]
  110. MV B1,A1 ; high word to the A side for the second store
  111. SPKERNEL 2,0 ; fully overlap BNOP RA,5
  112. || STW A1,*ARG0++(8) ; rp[2*i+1]
  113. ;;====================================================================
  114. BNOP RA,5
  115. .endasmfunc
  116. .global _bn_add_words
  ;;--------------------------------------------------------------------
  ;; bn_add_words: for each word i, rp[i] = low32(ap[i] + bp[i] + carry),
  ;; with the carry propagated from word to word.
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = number of words
  ;;   Out: RET  = carry out of the last word, or 0 when the number of
  ;;               words is 0
  ;; NOTE(review): presumably matches the C-level OpenSSL primitive of
  ;; the same name -- confirm the prototype against the bn headers.
  ;;--------------------------------------------------------------------
  117. _bn_add_words:
  118. .asmfunc
  119. MV ARG3,B0 ; B0 = word count: predicate and loop count
  120. [!B0] BNOP RA ; zero words: return early...
  121. ||[!B0] MVK 0,RET ; ...with RET = 0
  122. [B0] MVC B0,ILC
  123. [B0] ZERO A1 ; carry flag
  124. || [B0] MV ARG0,A3 ; A3 = rp store pointer (RET aliases ARG0 and is overwritten in the loop)
  125. NOP 3 ; with the two packets above, fills the 5 delay slots of the early return
  126. SPLOOP 2 ; 2*n+6
  127. ;;====================================================================
  128. LDW *ARG2++,A7 ; bp[i]
  129. || LDW *ARG1++,B7 ; ap[i]
  130. NOP 4
  131. ADDU A7,B7,A9:A8 ; A9:A8 = ap[i] + bp[i]
  132. ADDU A1,A9:A8,A1:A0 ; + carry in; A0 = result word, A1 = carry out
  133. SPKERNEL 0,0 ; fully overlap BNOP RA,5
  134. || STW A0,*A3++ ; write result
  135. || MV A1,RET ; keep carry flag in RET
  136. ;;====================================================================
  137. BNOP RA,5
  138. .endasmfunc
  139. .global _bn_sub_words
  ;;--------------------------------------------------------------------
  ;; bn_sub_words: for each word i, rp[i] = low32(ap[i] - bp[i] - borrow),
  ;; with the borrow propagated from word to word.
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = number of words
  ;;   Out: RET  = borrow out of the last word (0 or 1), or 0 when the
  ;;               number of words is 0
  ;; NOTE(review): presumably matches the C-level OpenSSL primitive of
  ;; the same name -- confirm the prototype against the bn headers.
  ;;--------------------------------------------------------------------
  140. _bn_sub_words:
  141. .asmfunc
  142. MV ARG3,B0 ; B0 = word count: predicate and loop count
  143. [!B0] BNOP RA ; zero words: return early...
  144. ||[!B0] MVK 0,RET ; ...with RET = 0
  145. [B0] MVC B0,ILC
  146. [B0] ZERO A2 ; borrow flag
  147. || [B0] MV ARG0,A3 ; A3 = rp store pointer
  148. NOP 3 ; with the two packets above, fills the 5 delay slots of the early return
  149. SPLOOP 2 ; 2*n+6
  150. ;;====================================================================
  151. LDW *ARG2++,A7 ; bp[i]
  152. || LDW *ARG1++,B7 ; ap[i]
  153. NOP 4
  154. SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i]
  155. [A2] SUB A1:A0,1,A1:A0 ; subtract the incoming borrow, if any
  156. SPKERNEL 0,1 ; leave slot for "return borrow flag"
  157. || STW A0,*A3++ ; write result
  158. || AND 1,A1,A2 ; pass on borrow flag
  159. ;;====================================================================
  160. BNOP RA,4
  161. AND 1,A1,RET ; return borrow flag
  162. .endasmfunc
  163. .global _bn_div_words
  ;;--------------------------------------------------------------------
  ;; bn_div_words: bit-serial (shift-and-subtract) division of the
  ;; two-word value hi:lo by dv.
  ;;   In:  A4 = hi, B4 = lo, A6 = dv
  ;;   Out: A4 = quotient, or -1 ("overflow") when hi has fewer leading
  ;;        zero bits than dv
  ;; dv is shifted left by its leading-zero count and the loop counter
  ;; is set to 32 plus that count. The first step is written out ahead
  ;; of the SPLOOP; on the overflow path it sits in the delay slots of
  ;; the early return with every instruction predicated off.
  ;;--------------------------------------------------------------------
  164. _bn_div_words:
  165. .asmfunc
  166. LMBD 1,A6,A0 ; leading zero bits in dv
  167. LMBD 1,A4,A1 ; leading zero bits in hi
  168. || MVK 32,B0
  169. CMPLTU A1,A0,A2 ; A2 = overflow condition
  170. || ADD A0,B0,B0 ; B0 = 32 + leading zero bits in dv
  171. [ A2] BNOP RA
  172. ||[ A2] MVK -1,A4 ; return overflow
  173. ||[!A2] MV A4,A3 ; reassign hi
  174. [!A2] MV B4,A4 ; reassign lo, will be quotient
  175. ||[!A2] MVC B0,ILC
  176. [!A2] SHL A6,A0,A6 ; normalize dv
  177. || MVK 1,A1 ; on overflow the compare below is skipped, so A1 stays 1 and disables the [!A1] pair
  178. [!A2] CMPLTU A3,A6,A1 ; hi<dv?
  179. ||[!A2] SHL A4,1,A5:A4 ; lo<<1
  180. [!A1] SUB A3,A6,A3 ; hi-=dv
  181. ||[!A1] OR 1,A4,A4 ; set quotient bit
  182. [!A2] SHRU A3,31,A1 ; upper bit
  183. ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
  184. SPLOOP 3
  ;; A1 enters each step holding the bit shifted out of hi: if set, the
  ;; compare is skipped and A1 is cleared so the subtract always runs.
  185. [!A1] CMPLTU A3,A6,A1 ; hi<dv?
  186. ||[ A1] ZERO A1
  187. || SHL A4,1,A5:A4 ; lo<<1
  188. [!A1] SUB A3,A6,A3 ; hi-=dv
  189. ||[!A1] OR 1,A4,A4 ; quotient
  190. SHRU A3,31,A1 ; upper bit
  191. || ADDAH A5,A3,A3 ; hi<<1|lo>>31
  192. SPKERNEL
  193. BNOP RA,5
  194. .endasmfunc
  195. ;;====================================================================
  196. ;; Not really the Comba algorithm, just a straightforward NxM... Dedicated
  197. ;; fully unrolled real Comba implementations are asymptotically 2x
  198. ;; faster, but are naturally a larger undertaking. The purpose of this
  199. ;; exercise was rather to learn to master nested SPLOOPs...
  200. ;;====================================================================
  201. .global _bn_sqr_comba8
  202. .global _bn_mul_comba8
  ;;--------------------------------------------------------------------
  ;; bn_sqr_comba8 copies ARG1 into ARG2 and falls through into
  ;; bn_mul_comba8, i.e. it squares by multiplying the input by itself.
  ;; bn_mul_comba8: N x M word multiplication with N = M = 8.
  ;;   In:  ARG0 = rp, ARG1 = ap, ARG2 = bp
  ;; Structure: the inner SPLOOP runs over bp[] for one ap[] word (a
  ;; multiply-accumulate row) and the "outer?" code reloads it for the
  ;; next ap[] word. The rp[] load is predicated on A1, which is 0 for
  ;; the first row and 1 afterwards, so the first row accumulates onto
  ;; zero (B7) instead of the previous contents of rp[].
  ;; NOTE(review): rp presumably receives N+M = 16 words -- confirm
  ;; against the callers.
  ;;--------------------------------------------------------------------
  203. _bn_sqr_comba8:
  204. MV ARG1,ARG2
  205. _bn_mul_comba8:
  206. .asmfunc
  207. MVK 8,B0 ; N, RILC
  208. || MVK 8,A0 ; M, outer loop counter
  209. || MV ARG1,A5 ; copy ap
  210. || MV ARG0,B4 ; copy rp
  211. || ZERO B19 ; high part of accumulator
  212. MVC B0,RILC
  213. || SUB B0,2,B1 ; N-2, initial ILC
  214. || SUB B0,1,B2 ; const B2=N-1
  215. || LDW *A5++,B6 ; ap[0]
  216. || MV A0,A3 ; const A3=M
  217. sploopNxM?: ; for best performance arrange M<=N
  218. [A0] SPLOOPD 2 ; 2*n+10
  219. || MVC B1,ILC
  220. || ADDAW B4,B0,B5 ; B5 = B4 + N words
  221. || ZERO B7 ; rp[i] stand-in while the predicated load below is off
  222. || LDW *A5++,A9 ; pre-fetch ap[1]
  223. || ZERO A1 ; A1 = 0: skip the rp[i] load during the first row
  224. || SUB A0,1,A0
  225. ;;====================================================================
  226. ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
  227. ;; This is because of Advisory 15 from TI publication SPRZ247I.
  228. LDW *ARG2++,A7 ; bp[i]
  229. NOP 3
  230. [A1] LDW *B5++,B7 ; rp[i]
  231. MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word
  232. NOP 3
  233. ADDU B16,B7,B21:B20 ; low product word + rp[i]
  234. ADDU B19,B21:B20,B19:B18 ; + high part carried in
  235. || MV.S B17,B23 ; copy of high product word
  236. SPKERNEL
  237. || STW B18,*B4++ ; rp[i]
  238. || ADD.S B19,B23,B19 ; new high part = carry out + high product word
  239. ;;====================================================================
  240. outer?: ; m*2*(n+1)+10
  241. SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
  242. SPMASKR
  243. || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
  244. MVD A9,B6 ; move through .M unit(*)
  245. [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
  246. SUBAW B5,B2,B5 ; rewind rp to rp[1]
  247. MVK 1,A1 ; enable the rp[i] load for all following rows
  248. [A0] BNOP.S1 outer?,4
  249. || [A0] SUB.L A0,1,A0
  250. STW B19,*B4--[B2] ; store high word, rewind rp to rp[1]
  251. || ZERO.S B19 ; high part of accumulator
  252. ;; end of outer?
  253. BNOP RA,5 ; return
  254. .endasmfunc
  255. ;; (*) It should be noted that B6 is used as input to MPY32U in the
  256. ;; chronologically next cycle by the *preceding* SPLOOP iteration.
  257. ;; Normally such an arrangement would require DINT, but at this
  258. ;; point the SPLOOP is draining and interrupts are disabled
  259. ;; implicitly.
  260. .global _bn_sqr_comba4
  261. .global _bn_mul_comba4
  ;;--------------------------------------------------------------------
  ;; bn_sqr_comba4 copies ARG1 into ARG2 and falls through into
  ;; bn_mul_comba4, i.e. it squares by multiplying the input by itself.
  ;; bn_mul_comba4: 4x4 word multiplication, r[0..7] = a[0..3]*b[0..3].
  ;;   In:  ARG0 = r, ARG1 = a, ARG2 = b
  ;; The ".if 0" branch (disabled) would reuse the nested-SPLOOP code at
  ;; sploopNxM? with N = M = 4. The active ".else" branch is fully
  ;; unrolled: products are summed column by column, low words on the
  ;; A side (A1:A0 / A9:A8) and high words on the B side (B1:B0 /
  ;; B9:B8), with each B-side sum folded into the following A-side
  ;; column.
  ;;--------------------------------------------------------------------
  262. _bn_sqr_comba4:
  263. MV ARG1,ARG2
  264. _bn_mul_comba4:
  265. .asmfunc
  266. .if 0
  267. BNOP sploopNxM?,3
  268. ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
  269. ;; because of low-counter effect, when prologue phase finishes
  270. ;; before SPKERNEL instruction is reached. As result it's 25%
  271. ;; slower than expected...
  272. MVK 4,B0 ; N, RILC
  273. || MVK 4,A0 ; M, outer loop counter
  274. || MV ARG1,A5 ; copy ap
  275. || MV ARG0,B4 ; copy rp
  276. || ZERO B19 ; high part of accumulator
  277. MVC B0,RILC
  278. || SUB B0,2,B1 ; first ILC
  279. || SUB B0,1,B2 ; const B2=N-1
  280. || LDW *A5++,B6 ; ap[0]
  281. || MV A0,A3 ; const A3=M
  282. .else
  283. ;; This alternative is an exercise in fully unrolled Comba
  284. ;; algorithm implementation that operates at n*(n+1)+12, or
  285. ;; as little as 32 cycles...
  286. LDW *ARG1[0],B16 ; a[0]
  287. || LDW *ARG2[0],A16 ; b[0]
  288. LDW *ARG1[1],B17 ; a[1]
  289. || LDW *ARG2[1],A17 ; b[1]
  290. LDW *ARG1[2],B18 ; a[2]
  291. || LDW *ARG2[2],A18 ; b[2]
  292. LDW *ARG1[3],B19 ; a[3]
  293. || LDW *ARG2[3],A19 ; b[3]
  294. NOP
  295. MPY32U A16,B16,A1:A0 ; a[0]*b[0]
  296. MPY32U A17,B16,A23:A22 ; a[0]*b[1]
  297. MPY32U A16,B17,A25:A24 ; a[1]*b[0]
  298. MPY32U A16,B18,A27:A26 ; a[2]*b[0]
  299. STW A0,*ARG0[0] ; r[0]
  300. || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
  301. MPY32U A18,B16,A31:A30 ; a[0]*b[2]
  302. || ADDU A22,A1,A1:A0
  303. MV A23,B0
  304. || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
  305. || ADDU A24,A1:A0,A1:A0
  306. ADDU A25,B0,B1:B0
  307. || STW A0,*ARG0[1] ; r[1]
  308. || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
  309. || ADDU A26,A1,A9:A8
  310. ADDU A27,B1,B9:B8
  311. || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
  312. || ADDU A28,A9:A8,A9:A8
  313. ADDU A29,B9:B8,B9:B8
  314. || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
  315. || ADDU A30,A9:A8,A9:A8
  316. ADDU A31,B9:B8,B9:B8
  317. || ADDU B0,A9:A8,A9:A8
  318. STW A8,*ARG0[2] ; r[2]
  319. || ADDU A20,A9,A1:A0
  320. ADDU A21,B9,B1:B0
  321. || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
  322. || ADDU A22,A1:A0,A1:A0
  323. ADDU A23,B1:B0,B1:B0
  324. || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
  325. || ADDU A24,A1:A0,A1:A0
  326. ADDU A25,B1:B0,B1:B0
  327. || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
  328. || ADDU A26,A1:A0,A1:A0
  329. ADDU A27,B1:B0,B1:B0
  330. || ADDU B8,A1:A0,A1:A0
  331. STW A0,*ARG0[3] ; r[3]
  332. || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
  333. || ADDU A20,A1,A9:A8
  334. ADDU A21,B1,B9:B8
  335. || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
  336. || ADDU A22,A9:A8,A9:A8
  337. ADDU A23,B9:B8,B9:B8
  338. || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
  339. || ADDU A24,A9:A8,A9:A8
  340. ADDU A25,B9:B8,B9:B8
  341. || ADDU B0,A9:A8,A9:A8
  342. STW A8,*ARG0[4] ; r[4]
  343. || ADDU A26,A9,A1:A0
  344. ADDU A27,B9,B1:B0
  345. || ADDU A28,A1:A0,A1:A0
  346. ADDU A29,B1:B0,B1:B0
  347. || BNOP RA ; return; the five packets that follow run in its delay slots
  348. || ADDU B8,A1:A0,A1:A0
  349. STW A0,*ARG0[5] ; r[5]
  350. || ADDU A30,A1,A9:A8
  351. ADD A31,B1,B8
  352. ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
  353. ADD B8,A9,A9
  354. || STW A8,*ARG0[6] ; r[6]
  355. STW A9,*ARG0[7] ; r[7]
  356. .endif
  357. .endasmfunc