2
0

bn-c64xplus.asm 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381
  1. ;; Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
  2. ;;
  3. ;; Licensed under the Apache License 2.0 (the "License"). You may not use
  4. ;; this file except in compliance with the License. You can obtain a copy
  5. ;; in the file LICENSE in the source distribution or at
  6. ;; https://www.openssl.org/source/license.html
  7. ;;
  8. ;;====================================================================
  9. ;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  10. ;; project.
  11. ;;
  12. ;; Rights for redistribution and usage in source and binary forms are
  13. ;; granted according to the License. Warranty of any kind is disclaimed.
  14. ;;====================================================================
  15. ;; A compiler-generated multiply-and-add SPLOOP runs at 12*n cycles,
  16. ;; n being the number of 32-bit words, and an addition SPLOOP at 8*n.
  17. ;; The corresponding 4x-unrolled SPLOOP-free loops run at ~8*n and
  18. ;; ~5*n. The assembler SPLOOPs below spin at ... 2*n cycles [plus epilogue].
  19. ;;====================================================================
  ;;--------------------------------------------------------------------
  ;; Section selection, symbol-name mapping and register aliases shared
  ;; by every routine in this file.
  ;;--------------------------------------------------------------------
  20. .text
  ;; Assemblers older than version 7.0.0 get __TI_EABI__ forced to 0
  ;; (presumably because they do not predefine it - TODO confirm).
  21. .if .ASSEMBLER_VERSION<7000000
  22. .asg 0,__TI_EABI__
  23. .endif
  ;; Under EABI the underscore-prefixed names used for the labels below
  ;; are substituted by the unprefixed names.
  24. .if __TI_EABI__
  25. .asg bn_mul_add_words,_bn_mul_add_words
  26. .asg bn_mul_words,_bn_mul_words
  27. .asg bn_sqr_words,_bn_sqr_words
  28. .asg bn_add_words,_bn_add_words
  29. .asg bn_sub_words,_bn_sub_words
  30. .asg bn_div_words,_bn_div_words
  31. .asg bn_sqr_comba8,_bn_sqr_comba8
  32. .asg bn_mul_comba8,_bn_mul_comba8
  33. .asg bn_sqr_comba4,_bn_sqr_comba4
  34. .asg bn_mul_comba4,_bn_mul_comba4
  35. .endif
  ;; Register aliases. Note that RET and ARG0 both name A4, so writing
  ;; the return value destroys the first argument.
  36. .asg B3,RA ; branch target used for every return
  37. .asg A4,ARG0
  38. .asg B4,ARG1
  39. .asg A6,ARG2
  40. .asg B6,ARG3
  41. .asg A8,ARG4 ; not referenced in the visible code
  42. .asg B8,ARG5 ; not referenced in the visible code
  43. .asg A4,RET
  44. .asg A15,FP ; not referenced in the visible code
  45. .asg B14,DP ; not referenced in the visible code
  46. .asg B15,SP ; not referenced in the visible code
  ;;--------------------------------------------------------------------
  ;; bn_mul_add_words: for each of the ARG2 32-bit words,
  ;;     rp[i] = low32(rp[i] + ap[i]*w + carry)
  ;; where carry is the running high part kept in A19.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = w
  ;; Out: RET = final high part (A19), or 0 when the count is zero.
  ;; NOTE(review): argument roles are read off the register usage and
  ;; the ap[i]/rp[i] comments below - confirm against the C prototype.
  ;;--------------------------------------------------------------------
  47. .global _bn_mul_add_words
  48. _bn_mul_add_words:
  49. .asmfunc
  50. MV ARG2,B0 ; B0 = word count, also used as predicate
  51. [!B0] BNOP RA ; count == 0: return at once...
  52. ||[!B0] MVK 0,RET ; ...with 0 as result
  53. [B0] MVC B0,ILC ; loop count for the SPLOOP below
  54. [B0] ZERO A19 ; high part of accumulator
  55. || [B0] MV ARG0,A2 ; second rp pointer, used by the stores
  56. || [B0] MV ARG3,A3 ; w copied into the A register file
  57. NOP 3
  58. SPLOOP 2 ; 2*n+10
  59. ;;====================================================================
  60. LDW *ARG1++,B7 ; ap[i]
  61. NOP 3
  62. LDW *ARG0++,A7 ; rp[i]
  63. MPY32U B7,A3,A17:A16 ; A17:A16 = ap[i]*w
  64. NOP 3 ; [2,0] in epilogue
  65. ADDU A16,A7,A21:A20 ; low product word + rp[i]
  66. ADDU A19,A21:A20,A19:A18 ; + incoming carry; A18 = result word
  67. || MV.S A17,A23 ; keep high product word for the ADD below
  68. SPKERNEL 2,1 ; leave slot for "return value"
  69. || STW A18,*A2++ ; rp[i]
  70. || ADD A19,A23,A19 ; next carry = carries + high product word
  71. ;;====================================================================
  72. BNOP RA,4
  73. MV A19,RET ; return value
  74. .endasmfunc
  ;;--------------------------------------------------------------------
  ;; bn_mul_words: for each of the ARG2 32-bit words,
  ;;     rp[i] = low32(ap[i]*w + carry)
  ;; where carry is the running high part kept in A19.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = word count, ARG3 = w
  ;; Out: RET = final high part (A19), or 0 when the count is zero.
  ;; NOTE(review): argument roles are read off the register usage and
  ;; the ap[i]/rp[i] comments below - confirm against the C prototype.
  ;;--------------------------------------------------------------------
  75. .global _bn_mul_words
  76. _bn_mul_words:
  77. .asmfunc
  78. MV ARG2,B0 ; B0 = word count, also used as predicate
  79. [!B0] BNOP RA ; count == 0: return at once...
  80. ||[!B0] MVK 0,RET ; ...with 0 as result
  81. [B0] MVC B0,ILC ; loop count for the SPLOOP below
  82. [B0] ZERO A19 ; high part of accumulator
  83. NOP 3
  84. SPLOOP 2 ; 2*n+10
  85. ;;====================================================================
  86. LDW *ARG1++,A7 ; ap[i]
  87. NOP 4
  88. MPY32U A7,ARG3,A17:A16 ; A17:A16 = ap[i]*w
  89. NOP 4 ; [2,0] in epilogue
  90. ADDU A19,A16,A19:A18 ; carry + low product word; A18 = result word
  91. || MV.S A17,A21 ; keep high product word for the ADD below
  92. SPKERNEL 2,1 ; leave slot for "return value"
  93. || STW A18,*ARG0++ ; rp[i]
  94. || ADD.L A19,A21,A19 ; next carry = carry-out + high product word
  95. ;;====================================================================
  96. BNOP RA,4
  97. MV A19,RET ; return value
  98. .endasmfunc
  ;;--------------------------------------------------------------------
  ;; bn_sqr_words: for each of the ARG2 input words, store the 64-bit
  ;; square of ap[i] as the word pair rp[2*i] (low), rp[2*i+1] (high).
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = word count
  ;; Out: RET is set to 0 only on the zero-count path; the loop path
  ;;      does not write RET.
  ;; NOTE(review): argument roles are read off the register usage and
  ;; the ap[i]/rp[..] comments below - confirm against the C prototype.
  ;;--------------------------------------------------------------------
  99. .global _bn_sqr_words
  100. _bn_sqr_words:
  101. .asmfunc
  102. MV ARG2,B0 ; B0 = word count, also used as predicate
  103. [!B0] BNOP RA ; count == 0: return at once
  104. ||[!B0] MVK 0,RET
  105. [B0] MVC B0,ILC ; loop count for the SPLOOP below
  106. [B0] MV ARG0,B2 ; B2 walks the even result words
  107. || [B0] ADD 4,ARG0,ARG0 ; ARG0 walks the odd result words
  108. NOP 3
  109. SPLOOP 2 ; 2*n+10
  110. ;;====================================================================
  111. LDW *ARG1++,B7 ; ap[i]
  112. NOP 4
  113. MPY32U B7,B7,B1:B0 ; B1:B0 = ap[i]*ap[i]
  114. NOP 3 ; [2,0] in epilogue
  115. STW B0,*B2++(8) ; rp[2*i]
  116. MV B1,A1 ; high word over to the A side for its store
  117. SPKERNEL 2,0 ; fully overlap BNOP RA,5
  118. || STW A1,*ARG0++(8) ; rp[2*i+1]
  119. ;;====================================================================
  120. BNOP RA,5
  121. .endasmfunc
  ;;--------------------------------------------------------------------
  ;; bn_add_words: for each of the ARG3 32-bit words,
  ;;     rp[i] = low32(ap[i] + bp[i] + carry)
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
  ;; Out: RET = carry out of the last word, or 0 when the count is zero.
  ;; NOTE(review): argument roles are read off the register usage and
  ;; the ap[i]/bp[i] comments below - confirm against the C prototype.
  ;;--------------------------------------------------------------------
  122. .global _bn_add_words
  123. _bn_add_words:
  124. .asmfunc
  125. MV ARG3,B0 ; B0 = word count, also used as predicate
  126. [!B0] BNOP RA ; count == 0: return at once...
  127. ||[!B0] MVK 0,RET ; ...with 0 as result
  128. [B0] MVC B0,ILC ; loop count for the SPLOOP below
  129. [B0] ZERO A1 ; carry flag
  130. || [B0] MV ARG0,A3 ; rp moved to A3: RET (same register as ARG0) is written in the loop
  131. NOP 3
  132. SPLOOP 2 ; 2*n+6
  133. ;;====================================================================
  134. LDW *ARG2++,A7 ; bp[i]
  135. || LDW *ARG1++,B7 ; ap[i]
  136. NOP 4
  137. ADDU A7,B7,A9:A8 ; bp[i] + ap[i]
  138. ADDU A1,A9:A8,A1:A0 ; + carry in; A0 = result word, A1 = carry out
  139. SPKERNEL 0,0 ; fully overlap BNOP RA,5
  140. || STW A0,*A3++ ; write result
  141. || MV A1,RET ; keep carry flag in RET
  142. ;;====================================================================
  143. BNOP RA,5
  144. .endasmfunc
  ;;--------------------------------------------------------------------
  ;; bn_sub_words: for each of the ARG3 32-bit words,
  ;;     rp[i] = low32(ap[i] - bp[i] - borrow)
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp, ARG3 = word count
  ;; Out: RET = borrow out of the last word (0 or 1), or 0 when the
  ;;      count is zero.
  ;; NOTE(review): argument roles are read off the register usage and
  ;; the ap[i]/bp[i] comments below - confirm against the C prototype.
  ;;--------------------------------------------------------------------
  145. .global _bn_sub_words
  146. _bn_sub_words:
  147. .asmfunc
  148. MV ARG3,B0 ; B0 = word count, also used as predicate
  149. [!B0] BNOP RA ; count == 0: return at once...
  150. ||[!B0] MVK 0,RET ; ...with 0 as result
  151. [B0] MVC B0,ILC ; loop count for the SPLOOP below
  152. [B0] ZERO A2 ; borrow flag
  153. || [B0] MV ARG0,A3 ; rp moved to A3, used by the stores
  154. NOP 3
  155. SPLOOP 2 ; 2*n+6
  156. ;;====================================================================
  157. LDW *ARG2++,A7 ; bp[i]
  158. || LDW *ARG1++,B7 ; ap[i]
  159. NOP 4
  160. SUBU B7,A7,A1:A0 ; ap[i] - bp[i]
  161. [A2] SUB A1:A0,1,A1:A0 ; subtract incoming borrow, if any
  162. SPKERNEL 0,1 ; leave slot for "return borrow flag"
  163. || STW A0,*A3++ ; write result
  164. || AND 1,A1,A2 ; pass on borrow flag
  165. ;;====================================================================
  166. BNOP RA,4
  167. AND 1,A1,RET ; return borrow flag
  168. .endasmfunc
  ;;--------------------------------------------------------------------
  ;; bn_div_words: shift-and-subtract division of the two-word value
  ;; hi:lo by dv, producing one quotient bit per loop pass.
  ;; In:  A4 = hi, B4 = lo, A6 = dv
  ;; Out: A4 = quotient, or -1 ("overflow") when hi has fewer leading
  ;;      zero bits than dv.
  ;; dv is first shifted left by its leading-zero count; the loop count
  ;; written to ILC is 32 plus that count. The first compare/subtract
  ;; step is issued ahead of the SPLOOP, predicated on !A2.
  ;; NOTE(review): the remainder left in A3 is not returned - confirm
  ;; this matches what callers expect.
  ;;--------------------------------------------------------------------
  169. .global _bn_div_words
  170. _bn_div_words:
  171. .asmfunc
  172. LMBD 1,A6,A0 ; leading zero bits in dv
  173. LMBD 1,A4,A1 ; leading zero bits in hi
  174. || MVK 32,B0
  175. CMPLTU A1,A0,A2 ; A2 = overflow: hi has fewer leading zeros than dv
  176. || ADD A0,B0,B0 ; B0 = 32 + leading zero bits in dv
  177. [ A2] BNOP RA
  178. ||[ A2] MVK -1,A4 ; return overflow
  179. ||[!A2] MV A4,A3 ; reassign hi
  180. [!A2] MV B4,A4 ; reassign lo, will be quotient
  181. ||[!A2] MVC B0,ILC
  182. [!A2] SHL A6,A0,A6 ; normalize dv
  183. || MVK 1,A1 ; preset A1 so the [!A1] pair below is skipped on the overflow path
  184. [!A2] CMPLTU A3,A6,A1 ; hi<dv?
  185. ||[!A2] SHL A4,1,A5:A4 ; lo<<1
  186. [!A1] SUB A3,A6,A3 ; hi-=dv
  187. ||[!A1] OR 1,A4,A4
  188. [!A2] SHRU A3,31,A1 ; upper bit
  189. ||[!A2] ADDAH A5,A3,A3 ; hi<<1|lo>>31
  190. SPLOOP 3
  191. [!A1] CMPLTU A3,A6,A1 ; hi<dv?
  192. ||[ A1] ZERO A1 ; upper bit was set: force the subtract below
  193. || SHL A4,1,A5:A4 ; lo<<1
  194. [!A1] SUB A3,A6,A3 ; hi-=dv
  195. ||[!A1] OR 1,A4,A4 ; quotient
  196. SHRU A3,31,A1 ; upper bit
  197. || ADDAH A5,A3,A3 ; hi<<1|lo>>31
  198. SPKERNEL
  199. BNOP RA,5
  200. .endasmfunc
  201. ;;====================================================================
  202. ;; Not really Comba algorithm, just straightforward NxM... Dedicated
  203. ;; fully unrolled real Comba implementations are asymptotically 2x
  204. ;; faster, but naturally larger undertaking. Purpose of this exercise
  205. ;; was rather to learn to master nested SPLOOPs...
  206. ;;====================================================================
  ;; bn_sqr_comba8 / bn_mul_comba8: 8x8-word multiply into rp.
  ;; In:  ARG0 = rp, ARG1 = ap, ARG2 = bp. The sqr entry point copies
  ;;      ARG1 into ARG2 and falls through into the mul code.
  ;; Structure: an outer loop over ap[] (counter A0) wraps an inner
  ;; SPLOOP over bp[] that accumulates ap[i]*bp[j] into rp[]. On the
  ;; first outer pass the rp[] loads are disabled (A1 = 0) and B7 = 0.
  ;; Register roles: B4 = rp store pointer, B5 = rp load pointer,
  ;; A5 = ap pointer, B6 = current ap word, A9 = pre-fetched next ap
  ;; word, B19 = high part of the accumulator.
  ;; The label sploopNxM? is also the branch target of the disabled
  ;; (".if 0") variant of bn_mul_comba4.
  207. .global _bn_sqr_comba8
  208. .global _bn_mul_comba8
  209. _bn_sqr_comba8:
  210. MV ARG1,ARG2 ; squaring: bp = ap
  211. _bn_mul_comba8:
  212. .asmfunc
  213. MVK 8,B0 ; N, RILC
  214. || MVK 8,A0 ; M, outer loop counter
  215. || MV ARG1,A5 ; copy ap
  216. || MV ARG0,B4 ; copy rp
  217. || ZERO B19 ; high part of accumulator
  218. MVC B0,RILC
  219. || SUB B0,2,B1 ; N-2, initial ILC
  220. || SUB B0,1,B2 ; const B2=N-1
  221. || LDW *A5++,B6 ; ap[0]
  222. || MV A0,A3 ; const A3=M
  223. sploopNxM?: ; for best performance arrange M<=N
  224. [A0] SPLOOPD 2 ; 2*n+10
  225. || MVC B1,ILC
  226. || ADDAW B4,B0,B5 ; B5 = rp + N words, pointer for the rp[i] loads
  227. || ZERO B7 ; rp[i] term is 0 while the loads are disabled
  228. || LDW *A5++,A9 ; pre-fetch ap[1]
  229. || ZERO A1 ; A1 = 0: skip the [A1] rp[i] load on the first pass
  230. || SUB A0,1,A0
  231. ;;====================================================================
  232. ;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
  233. ;; This is because of Advisory 15 from TI publication SPRZ247I.
  234. LDW *ARG2++,A7 ; bp[i]
  235. NOP 3
  236. [A1] LDW *B5++,B7 ; rp[i]
  237. MPY32U A7,B6,B17:B16 ; B17:B16 = bp[i] * current ap word
  238. NOP 3
  239. ADDU B16,B7,B21:B20 ; low product word + rp[i]
  240. ADDU B19,B21:B20,B19:B18 ; + incoming carry; B18 = result word
  241. || MV.S B17,B23 ; keep high product word for the ADD below
  242. SPKERNEL
  243. || STW B18,*B4++ ; rp[i]
  244. || ADD.S B19,B23,B19 ; next carry = carries + high product word
  245. ;;====================================================================
  246. outer?: ; m*2*(n+1)+10
  247. SUBAW ARG2,A3,ARG2 ; rewind bp to bp[0]
  248. SPMASKR
  249. || CMPGT A0,1,A2 ; done pre-fetching ap[i+1]?
  250. MVD A9,B6 ; move through .M unit(*)
  251. [A2] LDW *A5++,A9 ; pre-fetch ap[i+1]
  252. SUBAW B5,B2,B5 ; rewind rp to rp[1]
  253. MVK 1,A1 ; enable the rp[i] loads for the remaining passes
  254. [A0] BNOP.S1 outer?,4
  255. || [A0] SUB.L A0,1,A0
  256. STW B19,*B4--[B2] ; store top word, rewind rp to rp[1]
  257. || ZERO.S B19 ; high part of accumulator
  258. ;; end of outer?
  259. BNOP RA,5 ; return
  260. .endasmfunc
  261. ;; (*) It should be noted that B6 is used as input to MPY32U in the
  262. ;; chronologically next cycle in the *preceding* SPLOOP iteration.
  263. ;; Normally such an arrangement would require DINT, but at this
  264. ;; point the SPLOOP is draining and interrupts are disabled
  265. ;; implicitly.
  ;;--------------------------------------------------------------------
  ;; bn_sqr_comba4 / bn_mul_comba4: 4x4-word multiply writing the eight
  ;; result words ARG0[0..7].
  ;; In:  ARG0 = r, ARG1 = a, ARG2 = b. The sqr entry point copies ARG1
  ;;      into ARG2 and falls through into the mul code.
  ;; The ".if 0" branch (not assembled) would set N=M=4 and branch to
  ;; sploopNxM? in bn_mul_comba8; the assembled ".else" branch is fully
  ;; unrolled.
  ;; In the unrolled code b[0..3] live in A16-A19 and a[0..3] in
  ;; B16-B19. Low product halves are summed per result column on the A
  ;; side (A1:A0 and A9:A8 alternate); high halves are summed on the B
  ;; side (B1:B0 and B9:B8 alternate) and folded into the next column.
  ;;--------------------------------------------------------------------
  266. .global _bn_sqr_comba4
  267. .global _bn_mul_comba4
  268. _bn_sqr_comba4:
  269. MV ARG1,ARG2 ; squaring: b = a
  270. _bn_mul_comba4:
  271. .asmfunc
  272. .if 0
  273. BNOP sploopNxM?,3
  274. ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
  275. ;; because of low-counter effect, when prologue phase finishes
  276. ;; before SPKERNEL instruction is reached. As result it's 25%
  277. ;; slower than expected...
  278. MVK 4,B0 ; N, RILC
  279. || MVK 4,A0 ; M, outer loop counter
  280. || MV ARG1,A5 ; copy ap
  281. || MV ARG0,B4 ; copy rp
  282. || ZERO B19 ; high part of accumulator
  283. MVC B0,RILC
  284. || SUB B0,2,B1 ; first ILC
  285. || SUB B0,1,B2 ; const B2=N-1
  286. || LDW *A5++,B6 ; ap[0]
  287. || MV A0,A3 ; const A3=M
  288. .else
  289. ;; This alternative is an exercise in fully unrolled Comba
  290. ;; algorithm implementation that operates at n*(n+1)+12, or
  291. ;; as little as 32 cycles...
  292. LDW *ARG1[0],B16 ; a[0]
  293. || LDW *ARG2[0],A16 ; b[0]
  294. LDW *ARG1[1],B17 ; a[1]
  295. || LDW *ARG2[1],A17 ; b[1]
  296. LDW *ARG1[2],B18 ; a[2]
  297. || LDW *ARG2[2],A18 ; b[2]
  298. LDW *ARG1[3],B19 ; a[3]
  299. || LDW *ARG2[3],A19 ; b[3]
  300. NOP
  301. MPY32U A16,B16,A1:A0 ; a[0]*b[0]
  302. MPY32U A17,B16,A23:A22 ; a[0]*b[1]
  303. MPY32U A16,B17,A25:A24 ; a[1]*b[0]
  304. MPY32U A16,B18,A27:A26 ; a[2]*b[0]
  305. STW A0,*ARG0[0] ; r[0]
  306. || MPY32U A17,B17,A29:A28 ; a[1]*b[1]
  307. MPY32U A18,B16,A31:A30 ; a[0]*b[2]
  308. || ADDU A22,A1,A1:A0 ; column 1: start low-half sum
  309. MV A23,B0 ; high half of a[0]*b[1] over to the B side
  310. || MPY32U A19,B16,A21:A20 ; a[3]*b[0]
  311. || ADDU A24,A1:A0,A1:A0
  312. ADDU A25,B0,B1:B0 ; column 1: high-half sum
  313. || STW A0,*ARG0[1] ; r[1]
  314. || MPY32U A18,B17,A23:A22 ; a[2]*b[1]
  315. || ADDU A26,A1,A9:A8 ; column 2: start low-half sum
  316. ADDU A27,B1,B9:B8 ; column 2: start high-half sum
  317. || MPY32U A17,B18,A25:A24 ; a[1]*b[2]
  318. || ADDU A28,A9:A8,A9:A8
  319. ADDU A29,B9:B8,B9:B8
  320. || MPY32U A16,B19,A27:A26 ; a[0]*b[3]
  321. || ADDU A30,A9:A8,A9:A8
  322. ADDU A31,B9:B8,B9:B8
  323. || ADDU B0,A9:A8,A9:A8 ; fold in column 1 high-half sum
  324. STW A8,*ARG0[2] ; r[2]
  325. || ADDU A20,A9,A1:A0 ; column 3: start low-half sum
  326. ADDU A21,B9,B1:B0 ; column 3: start high-half sum
  327. || MPY32U A19,B17,A21:A20 ; a[3]*b[1]
  328. || ADDU A22,A1:A0,A1:A0
  329. ADDU A23,B1:B0,B1:B0
  330. || MPY32U A18,B18,A23:A22 ; a[2]*b[2]
  331. || ADDU A24,A1:A0,A1:A0
  332. ADDU A25,B1:B0,B1:B0
  333. || MPY32U A17,B19,A25:A24 ; a[1]*b[3]
  334. || ADDU A26,A1:A0,A1:A0
  335. ADDU A27,B1:B0,B1:B0
  336. || ADDU B8,A1:A0,A1:A0 ; fold in column 2 high-half sum
  337. STW A0,*ARG0[3] ; r[3]
  338. || MPY32U A19,B18,A27:A26 ; a[3]*b[2]
  339. || ADDU A20,A1,A9:A8 ; column 4: start low-half sum
  340. ADDU A21,B1,B9:B8 ; column 4: start high-half sum
  341. || MPY32U A18,B19,A29:A28 ; a[2]*b[3]
  342. || ADDU A22,A9:A8,A9:A8
  343. ADDU A23,B9:B8,B9:B8
  344. || MPY32U A19,B19,A31:A30 ; a[3]*b[3]
  345. || ADDU A24,A9:A8,A9:A8
  346. ADDU A25,B9:B8,B9:B8
  347. || ADDU B0,A9:A8,A9:A8 ; fold in column 3 high-half sum
  348. STW A8,*ARG0[4] ; r[4]
  349. || ADDU A26,A9,A1:A0 ; column 5: start low-half sum
  350. ADDU A27,B9,B1:B0 ; column 5: start high-half sum
  351. || ADDU A28,A1:A0,A1:A0
  352. ADDU A29,B1:B0,B1:B0
  353. || BNOP RA ; return; the packets below still execute before the branch lands
  354. || ADDU B8,A1:A0,A1:A0 ; fold in column 4 high-half sum
  355. STW A0,*ARG0[5] ; r[5]
  356. || ADDU A30,A1,A9:A8 ; column 6: start low-half sum
  357. ADD A31,B1,B8 ; top word: high half of a[3]*b[3] + carry
  358. ADDU B0,A9:A8,A9:A8 ; removed || to avoid cross-path stall below
  359. ADD B8,A9,A9 ; r[7] value
  360. || STW A8,*ARG0[6] ; r[6]
  361. STW A9,*ARG0[7] ; r[7]
  362. .endif
  363. .endasmfunc