2
0

bn-c64xplus.asm 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  2. ;;
  3. ;; Licensed under the OpenSSL license (the "License"). You may not use
  4. ;; this file except in compliance with the License. You can obtain a copy
  5. ;; in the file LICENSE in the source distribution or at
  6. ;; https://www.openssl.org/source/license.html
  7. ;;
  8. ;;====================================================================
  9. ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. ;; project.
  11. ;;
  12. ;; Rights for redistribution and usage in source and binary forms are
  13. ;; granted according to the OpenSSL license. Warranty of any kind is
  14. ;; disclaimed.
  15. ;;====================================================================
  16. ;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
  17. ;; being the number of 32-bit words, and addition at 8*n. The
  18. ;; corresponding 4x unrolled SPLOOP-free loops run at ~8*n and ~5*n.
  19. ;; The assembler SPLOOPs below spin at ... 2*n cycles [plus epilogue].
  20. ;;====================================================================
  ;; Code section, toolchain/ABI adaptation and symbolic register names
  ;; shared by every routine in this file.
  21. .text
  ;; Assemblers older than 7.0.0 get __TI_EABI__ forced to 0 here, so the
  ;; test that follows always sees a defined value (presumably those
  ;; versions do not predefine the symbol -- confirm against the toolchain
  ;; documentation).
  22. .if .ASSEMBLER_VERSION<7000000
  23. .asg 0,__TI_EABI__
  24. .endif
  ;; When __TI_EABI__ is non-zero, each underscore-prefixed entry point
  ;; name used below is substituted with its unprefixed spelling.
  25. .if __TI_EABI__
  26. .asg bn_mul_add_words,_bn_mul_add_words
  27. .asg bn_mul_words,_bn_mul_words
  28. .asg bn_sqr_words,_bn_sqr_words
  29. .asg bn_add_words,_bn_add_words
  30. .asg bn_sub_words,_bn_sub_words
  31. .asg bn_div_words,_bn_div_words
  32. .asg bn_sqr_comba8,_bn_sqr_comba8
  33. .asg bn_mul_comba8,_bn_mul_comba8
  34. .asg bn_sqr_comba4,_bn_sqr_comba4
  35. .asg bn_mul_comba4,_bn_mul_comba4
  36. .endif
  ;; Symbolic register names:
  ;;   RA         - B3, the register every routine below branches to in
  ;;                order to return
  ;;   ARG0..ARG5 - A4, B4, A6, B6, A8, B8, the argument registers in order
  ;;   RET        - A4, the return value; note it is the same register as
  ;;                ARG0
  ;;   FP, DP, SP - A15, B14, B15; like ARG4/ARG5 they are not referenced
  ;;                by the code visible in this file
  37. .asg B3,RA
  38. .asg A4,ARG0
  39. .asg B4,ARG1
  40. .asg A6,ARG2
  41. .asg B6,ARG3
  42. .asg A8,ARG4
  43. .asg B8,ARG5
  44. .asg A4,RET
  45. .asg A15,FP
  46. .asg B14,DP
  47. .asg B15,SP
  ;; bn_mul_add_words: rp[i] += ap[i]*w for each of num 32-bit words, with
  ;; the carry propagated from word to word.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
  ;; Out: RET = carry word left after the last element, or 0 if num == 0
  48. .global _bn_mul_add_words
  49. _bn_mul_add_words:
  50. .asmfunc
  51. MV ARG2,B0 ; B0 = num, used as predicate and loop count
  52. [!B0] BNOP RA ; num == 0: return right away...
  53. ||[!B0] MVK 0,RET ; ...with 0
  54. [B0] MVC B0,ILC ; inner loop count = num
  55. [B0] ZERO A19 ; high part of accumulator
  56. || [B0] MV ARG0,A2 ; A2 = rp store pointer (ARG0 itself feeds the loads)
  57. || [B0] MV ARG3,A3 ; A3 = w, the multiplier
  58. NOP 3 ; together with the two packets above: 5 delay slots of the early return
  59. SPLOOP 2 ; 2*n+10
  60. ;;====================================================================
  61. LDW *ARG1++,B7 ; ap[i]
  62. NOP 3
  63. LDW *ARG0++,A7 ; rp[i]
  64. MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
  65. NOP 3 ; [2,0] in epilogue
  66. ADDU A16,A7,A21:A20 ; lo(product) + rp[i]
  67. ADDU A19,A21:A20,A19:A18 ; + incoming carry; A18 = new rp[i], A19 = carries
  68. || MV.S A17,A23 ; A23 = hi(product)
  69. SPKERNEL 2,1 ; leave slot for "return value"
  70. || STW A18,*A2++ ; rp[i]
  71. || ADD A19,A23,A19 ; carry for next word = carries + hi(product)
  72. ;;====================================================================
  73. BNOP RA,4
  74. MV A19,RET ; return value
  75. .endasmfunc
  ;; bn_mul_words: rp[i] = low word of (ap[i]*w + carry) for each of num
  ;; 32-bit words; the high word becomes the carry into the next element.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count), ARG3 = w
  ;; Out: RET = carry word left after the last element, or 0 if num == 0
  76. .global _bn_mul_words
  77. _bn_mul_words:
  78. .asmfunc
  79. MV ARG2,B0 ; B0 = num, used as predicate and loop count
  80. [!B0] BNOP RA ; num == 0: return right away...
  81. ||[!B0] MVK 0,RET ; ...with 0
  82. [B0] MVC B0,ILC ; inner loop count = num
  83. [B0] ZERO A19 ; high part of accumulator
  84. NOP 3 ; together with the two packets above: 5 delay slots of the early return
  85. SPLOOP 2 ; 2*n+10
  86. ;;====================================================================
  87. LDW *ARG1++,A7 ; ap[i]
  88. NOP 4
  89. MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  90. NOP 4 ; [2,0] in epilogue
  91. ADDU A19,A16,A19:A18 ; carry + lo(product); A18 = rp[i], A19 = carry bit
  92. || MV.S A17,A21 ; A21 = hi(product)
  93. SPKERNEL 2,1 ; leave slot for "return value"
  94. || STW A18,*ARG0++ ; rp[i]
  95. || ADD.L A19,A21,A19 ; carry for next word = carry bit + hi(product)
  96. ;;====================================================================
  97. BNOP RA,4
  98. MV A19,RET ; return value
  99. .endasmfunc
  ;; bn_sqr_words: rp[2*i], rp[2*i+1] = low and high word of ap[i]*ap[i]
  ;; for each of num 32-bit words.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = num (word count)
  ;; Out: RET is set to 0 only on the num == 0 path; otherwise no result is
  ;;      produced and A4 (ARG0/RET) is left holding the advanced pointer.
  100. .global _bn_sqr_words
  101. _bn_sqr_words:
  102. .asmfunc
  103. MV ARG2,B0 ; B0 = num, used as predicate and loop count
  104. [!B0] BNOP RA ; num == 0: return right away
  105. ||[!B0] MVK 0,RET
  106. [B0] MVC B0,ILC ; inner loop count = num
  107. [B0] MV ARG0,B2 ; B2 -> rp[0], pointer for the even (low) words
  108. || [B0] ADD 4,ARG0,ARG0 ; ARG0 -> rp[1], pointer for the odd (high) words
  109. NOP 3 ; together with the two packets above: 5 delay slots of the early return
  110. SPLOOP 2 ; 2*n+10
  111. ;;====================================================================
  112. LDW *ARG1++,B7 ; ap[i]
  113. NOP 4
  114. MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]^2 (B0 is free: count already in ILC)
  115. NOP 3 ; [2,0] in epilogue
  116. STW B0,*B2++(8) ; rp[2*i]
  117. MV B1,A1 ; high word moves to the A side for its store
  118. SPKERNEL 2,0 ; fully overlap BNOP RA,5
  119. || STW A1,*ARG0++(8) ; rp[2*i+1]
  120. ;;====================================================================
  121. BNOP RA,5
  122. .endasmfunc
  ;; bn_add_words: rp[i] = ap[i] + bp[i] + carry for each of num 32-bit
  ;; words.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
  ;; Out: RET = carry out of the last word, or 0 if num == 0
  123. .global _bn_add_words
  124. _bn_add_words:
  125. .asmfunc
  126. MV ARG3,B0 ; B0 = num, used as predicate and loop count
  127. [!B0] BNOP RA ; num == 0: return right away...
  128. ||[!B0] MVK 0,RET ; ...with 0
  129. [B0] MVC B0,ILC ; inner loop count = num
  130. [B0] ZERO A1 ; carry flag
  131. || [B0] MV ARG0,A3 ; A3 = rp store pointer (A4 is reused as RET in the loop)
  132. NOP 3 ; together with the two packets above: 5 delay slots of the early return
  133. SPLOOP 2 ; 2*n+6
  134. ;;====================================================================
  135. LDW *ARG2++,A7 ; bp[i]
  136. || LDW *ARG1++,B7 ; ap[i]
  137. NOP 4
  138. ADDU A7,B7,A9:A8 ; A9:A8 = ap[i] + bp[i]
  139. ADDU A1,A9:A8,A1:A0 ; + carry in; A0 = rp[i], A1 = carry out
  140. SPKERNEL 0,0 ; fully overlap BNOP RA,5
  141. || STW A0,*A3++ ; write result
  142. || MV A1,RET ; keep carry flag in RET
  143. ;;====================================================================
  144. BNOP RA,5
  145. .endasmfunc
  ;; bn_sub_words: rp[i] = ap[i] - bp[i] - borrow for each of num 32-bit
  ;; words.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = num (word count)
  ;; Out: RET = borrow out of the last word (0 or 1), or 0 if num == 0
  146. .global _bn_sub_words
  147. _bn_sub_words:
  148. .asmfunc
  149. MV ARG3,B0 ; B0 = num, used as predicate and loop count
  150. [!B0] BNOP RA ; num == 0: return right away...
  151. ||[!B0] MVK 0,RET ; ...with 0
  152. [B0] MVC B0,ILC ; inner loop count = num
  153. [B0] ZERO A2 ; borrow flag
  154. || [B0] MV ARG0,A3 ; A3 = rp store pointer
  155. NOP 3 ; together with the two packets above: 5 delay slots of the early return
  156. SPLOOP 2 ; 2*n+6
  157. ;;====================================================================
  158. LDW *ARG2++,A7 ; bp[i]
  159. || LDW *ARG1++,B7 ; ap[i]
  160. NOP 4
  161. SUBU B7,A7,A1:A0 ; A1:A0 = ap[i] - bp[i]
  162. [A2] SUB A1:A0,1,A1:A0 ; apply the incoming borrow, if any
  163. SPKERNEL 0,1 ; leave slot for "return borrow flag"
  164. || STW A0,*A3++ ; write result
  165. || AND 1,A1,A2 ; pass on borrow flag
  166. ;;====================================================================
  167. BNOP RA,4
  168. AND 1,A1,RET ; return borrow flag
  169. .endasmfunc
  ;; bn_div_words: bit-serial shift-and-subtract division of hi:lo by dv.
  ;; In:  A4 = hi, B4 = lo, A6 = dv
  ;; Out: A4 = quotient bits accumulated in lo, or -1 ("overflow") when hi
  ;;      has fewer leading zero bits than dv
  ;; dv is normalized (shifted left by its leading-zero count) and the loop
  ;; count is 32 plus that shift. The first step is peeled off ahead of the
  ;; SPLOOP; the loop body then repeats the same compare/subtract/shift.
  170. .global _bn_div_words
  171. _bn_div_words:
  172. .asmfunc
  173. LMBD 1,A6,A0 ; leading zero bits in dv
  174. LMBD 1,A4,A1 ; leading zero bits in hi
  175. || MVK 32,B0
  176. CMPLTU A1,A0,A2 ; A2 = hi has fewer leading zeros than dv
  177. || ADD A0,B0,B0 ; B0 = 32 + leading zeros in dv, the loop count
  178. [ A2] BNOP RA
  179. ||[ A2] MVK -1,A4 ; return overflow
  180. ||[!A2] MV A4,A3 ; reassign hi
  181. [!A2] MV B4,A4 ; reassign lo, will be quotient
  182. ||[!A2] MVC B0,ILC
  183. [!A2] SHL A6,A0,A6 ; normalize dv
  184. || MVK 1,A1 ; on the overflow path A1 stays 1, disabling the [!A1] pair below
  185. [!A2] CMPLTU A3,A6,A1 ; hi<dv?
  186. ||[!A2] SHL A4,1,A5:A4 ; lo<<1
  187. [!A1] SUB A3,A6,A3 ; hi-=dv
  188. ||[!A1] OR 1,A4,A4 ; quotient
  189. [!A2] SHRU A3,31,A1 ; upper bit
  190. ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
  191. SPLOOP 3
  192. [!A1] CMPLTU A3,A6,A1 ; hi<dv?
  193. ||[ A1] ZERO A1 ; a set upper bit forces the subtraction
  194. || SHL A4,1,A5:A4 ; lo<<1
  195. [!A1] SUB A3,A6,A3 ; hi-=dv
  196. ||[!A1] OR 1,A4,A4 ; quotient
  197. SHRU A3,31,A1 ; upper bit
  198. || ADDAH A5,A3,A3 ; hi<<1|lo>>31
  199. SPKERNEL
  200. BNOP RA,5
  201. .endasmfunc
  202. ;;====================================================================
  203. ;; Not really the Comba algorithm, just straightforward NxM... Dedicated
  204. ;; fully unrolled real Comba implementations are asymptotically 2x
  205. ;; faster, but are naturally a larger undertaking. The purpose of this
  206. ;; exercise was rather to learn to master nested SPLOOPs...
  207. ;;====================================================================
  ;; bn_sqr_comba8 / bn_mul_comba8: NxM word multiply with N = M = 8.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp
  ;; _bn_sqr_comba8 copies ARG1 into ARG2 (bp = ap) and falls through into
  ;; _bn_mul_comba8. For each ap[i] (outer loop, counter A0) the inner
  ;; SPLOOP multiplies it by every bp[j] and accumulates into rp; the carry
  ;; word left in B19 is stored after each pass.
  208. .global _bn_sqr_comba8
  209. .global _bn_mul_comba8
  210. _bn_sqr_comba8:
  211. MV ARG1,ARG2 ; squaring: second operand = first operand
  212. _bn_mul_comba8:
  213. .asmfunc
  214. MVK 8,B0 ; N, RILC
  215. || MVK 8,A0 ; M, outer loop counter
  216. || MV ARG1,A5 ; copy ap
  217. || MV ARG0,B4 ; copy rp
  218. || ZERO B19 ; high part of accumulator
  219. MVC B0,RILC
  220. || SUB B0,2,B1 ; N-2, initial ILC
  221. || SUB B0,1,B2 ; const B2=N-1
  222. || LDW *A5++,B6 ; ap[0]
  223. || MV A0,A3 ; const A3=M
  224. sploopNxM?: ; for best performance arrange M<=N
  225. [A0] SPLOOPD 2 ; 2*n+10
  226. || MVC B1,ILC
  227. || ADDAW B4,B0,B5 ; B5 = rp + N words, read pointer for rp[i]
  228. || ZERO B7 ; first pass: rp[i] counts as 0 (its load is predicated off)
  229. || LDW *A5++,A9 ; pre-fetch ap[1]
  230. || ZERO A1 ; A1 = 0 disables the rp[i] load during the first pass
  231. || SUB A0,1,A0
  232. ;;====================================================================
  233. ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
  234. ;; This is because of Advisory 15 from TI publication SPRZ247I.
  235. LDW *ARG2++,A7 ; bp[i]
  236. NOP 3
  237. [A1] LDW *B5++,B7 ; rp[i]
  238. MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word
  239. NOP 3
  240. ADDU B16,B7,B21:B20 ; lo(product) + rp[i]
  241. ADDU B19,B21:B20,B19:B18 ; + incoming carry; B18 = new rp[i]
  242. || MV.S B17,B23 ; B23 = hi(product)
  243. SPKERNEL
  244. || STW B18,*B4++ ; rp[i]
  245. || ADD.S B19,B23,B19 ; carry for next word
  246. ;;====================================================================
  247. outer?: ; m*2*(n+1)+10
  248. SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
  249. SPMASKR
  250. || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
  251. MVD A9,B6 ; move through .M unit(*)
  252. [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
  253. SUBAW B5,B2,B5 ; rewind rp to rp[1]
  254. MVK 1,A1 ; from now on the rp[i] load in the loop is enabled
  255. [A0] BNOP.S1 outer?,4
  256. || [A0] SUB.L A0,1,A0
  257. STW B19,*B4--[B2] ; store carry word, rewind rp to rp[1]
  258. || ZERO.S B19 ; high part of accumulator
  259. ;; end of outer?
  260. BNOP RA,5 ; return
  261. .endasmfunc
  262. ;; (*) It should be noted that B6 is used as input to MPY32U in the
  263. ;; chronologically next cycle, in the *preceding* SPLOOP iteration.
  264. ;; Normally such an arrangement would require DINT, but at this
  265. ;; point the SPLOOP is draining and interrupts are disabled
  266. ;; implicitly.
  ;; bn_sqr_comba4 / bn_mul_comba4: 4x4 word multiply, r[0..7] = a[0..3] *
  ;; b[0..3].
  ;; In:  ARG0 = r, ARG1 = a, ARG2 = b
  ;; _bn_sqr_comba4 copies ARG1 into ARG2 (b = a) and falls through into
  ;; _bn_mul_comba4. The ".if 0" variant (disabled) would set up N = M = 4
  ;; and branch into the sploopNxM? loop of the 8-word routine; the ".else"
  ;; variant that is actually assembled is fully unrolled.
  ;; In the unrolled code a[0..3] live in B16..B19 and b[0..3] in A16..A19.
  ;; A-side register pairs sum the low product words of the current result
  ;; column, B-side pairs sum the high product words, which are then folded
  ;; into the next column.
  267. .global _bn_sqr_comba4
  268. .global _bn_mul_comba4
  269. _bn_sqr_comba4:
  270. MV ARG1,ARG2 ; squaring: second operand = first operand
  271. _bn_mul_comba4:
  272. .asmfunc
  273. .if 0
  274. BNOP sploopNxM?,3
  275. ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
  276. ;; because of low-counter effect, when prologue phase finishes
  277. ;; before SPKERNEL instruction is reached. As result it's 25%
  278. ;; slower than expected...
  279. MVK 4,B0 ; N, RILC
  280. || MVK 4,A0 ; M, outer loop counter
  281. || MV ARG1,A5 ; copy ap
  282. || MV ARG0,B4 ; copy rp
  283. || ZERO B19 ; high part of accumulator
  284. MVC B0,RILC
  285. || SUB B0,2,B1 ; first ILC
  286. || SUB B0,1,B2 ; const B2=N-1
  287. || LDW *A5++,B6 ; ap[0]
  288. || MV A0,A3 ; const A3=M
  289. .else
  290. ;; This alternative is an exercise in fully unrolled Comba
  291. ;; algorithm implementation that operates at n*(n+1)+12, or
  292. ;; as little as 32 cycles...
  293. LDW *ARG1[0],B16 ; a[0]
  294. || LDW *ARG2[0],A16 ; b[0]
  295. LDW *ARG1[1],B17 ; a[1]
  296. || LDW *ARG2[1],A17 ; b[1]
  297. LDW *ARG1[2],B18 ; a[2]
  298. || LDW *ARG2[2],A18 ; b[2]
  299. LDW *ARG1[3],B19 ; a[3]
  300. || LDW *ARG2[3],A19 ; b[3]
  301. NOP
  302. MPY32U A16,B16,A1:A0 ; a[0]*b[0]
  303. MPY32U A17,B16,A23:A22 ; a[0]*b[1]
  304. MPY32U A16,B17,A25:A24 ; a[1]*b[0]
  305. MPY32U A16,B18,A27:A26 ; a[2]*b[0]
  306. STW A0,*ARG0[0] ; r[0]
  307. || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
  308. MPY32U A18,B16,A31:A30 ; a[0]*b[2]
  309. || ADDU A22,A1,A1:A0 ; column 1 low words start here
  310. MV A23,B0 ; column 1 high words start here
  311. || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
  312. || ADDU A24,A1:A0,A1:A0
  313. ADDU A25,B0,B1:B0
  314. || STW A0,*ARG0[1] ; r[1]
  315. || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
  316. || ADDU A26,A1,A9:A8 ; column 2 low words start here
  317. ADDU A27,B1,B9:B8 ; column 2 high words start here
  318. || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
  319. || ADDU A28,A9:A8,A9:A8
  320. ADDU A29,B9:B8,B9:B8
  321. || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
  322. || ADDU A30,A9:A8,A9:A8
  323. ADDU A31,B9:B8,B9:B8
  324. || ADDU B0,A9:A8,A9:A8 ; fold in column 1 high words
  325. STW A8,*ARG0[2] ; r[2]
  326. || ADDU A20,A9,A1:A0 ; column 3 low words start here
  327. ADDU A21,B9,B1:B0 ; column 3 high words start here
  328. || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
  329. || ADDU A22,A1:A0,A1:A0
  330. ADDU A23,B1:B0,B1:B0
  331. || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
  332. || ADDU A24,A1:A0,A1:A0
  333. ADDU A25,B1:B0,B1:B0
  334. || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
  335. || ADDU A26,A1:A0,A1:A0
  336. ADDU A27,B1:B0,B1:B0
  337. || ADDU B8,A1:A0,A1:A0 ; fold in column 2 high words
  338. STW A0,*ARG0[3] ; r[3]
  339. || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
  340. || ADDU A20,A1,A9:A8 ; column 4 low words start here
  341. ADDU A21,B1,B9:B8 ; column 4 high words start here
  342. || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
  343. || ADDU A22,A9:A8,A9:A8
  344. ADDU A23,B9:B8,B9:B8
  345. || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
  346. || ADDU A24,A9:A8,A9:A8
  347. ADDU A25,B9:B8,B9:B8
  348. || ADDU B0,A9:A8,A9:A8 ; fold in column 3 high words
  349. STW A8,*ARG0[4] ; r[4]
  350. || ADDU A26,A9,A1:A0 ; column 5 low words start here
  351. ADDU A27,B9,B1:B0 ; column 5 high words start here
  352. || ADDU A28,A1:A0,A1:A0
  353. ADDU A29,B1:B0,B1:B0
  354. || BNOP RA ; return; the 5 packets that follow run in its delay slots
  355. || ADDU B8,A1:A0,A1:A0 ; fold in column 4 high words
  356. STW A0,*ARG0[5] ; r[5]
  357. || ADDU A30,A1,A9:A8 ; column 6 low word
  358. ADD A31,B1,B8 ; top word: hi(a[3]*b[3]) + carry
  359. ADDU B0,A9:A8,A9:A8 ; own packet: tail must stay exactly 5 packets after BNOP RA; also keeps the B8 cross-path read below off the cycle right after its write
  360. ADD B8,A9,A9
  361. || STW A8,*ARG0[6] ; r[6]
  362. STW A9,*ARG0[7] ; r[7]
  363. .endif
  364. .endasmfunc