s390x.S 13 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713
  // Version tag for this module.
  .ident  "s390x.S, version 1.1"
  2. // ====================================================================
  3. // Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  4. // project.
  5. //
  6. // Rights for redistribution and usage in source and binary forms are
  7. // granted according to the OpenSSL license. Warranty of any kind is
  8. // disclaimed.
  9. // ====================================================================
  // Everything below is emitted into the code section.
  .text

  // Alias for %r0. Each routine that uses it first loads it with 0, so that
  // "alcgr reg,zero" adds nothing but the pending carry bit.
  #define zero    %r0
  // BN_ULONG bn_mul_add_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
  //
  // For i in [0,len): rp[i] += ap[i]*w, handing the carry word from limb to
  // limb. The last carry word is returned.
  //   In:   %r2 = rp, %r3 = ap, %r4 = len (32-bit int), %r5 = w
  //   Out:  %r2 = carry word (0 when len <= 0)
  //   Uses: zero (%r0) = 0; %r1 = rp-ap, so that 0(%r3,%r1) addresses rp[i]
  //         while %r3 walks ap; %r6-%r13 are saved at 48(%r15) and restored.
  // The carry bit travels in the condition code from one add to the next, so
  // only instructions that leave the condition code alone (lg, stg, la, mlgr,
  // brct) sit between the adds. Do not reorder.
  .globl  bn_mul_add_words
  .type   bn_mul_add_words,@function
  .align  4
  bn_mul_add_words:
          lghi    zero,0                  // zero = 0, operand of carry-only adds
          la      %r1,0(%r2)              // keep rp in %r1; %r2 is reused below
          lghi    %r2,0                   // result for the early exit
          ltgfr   %r4,%r4                 // sign-extend len and test it
          bler    %r14                    // len <= 0: return 0

          stmg    %r6,%r13,48(%r15)       // save %r6-%r13
          lghi    %r2,3
          lghi    %r12,0                  // carry word = 0
          slgr    %r1,%r3                 // %r1 = rp - ap
          nr      %r2,%r4                 // %r2 = len % 4
          sra     %r4,2                   // %r4 = len / 4, sets the condition code
          jz      .Lmadd_x1               // no group of 4; the zero result also leaves carry clear
          algr    zero,zero               // 0 + 0: clears the carry

          // Start-up: fetch ap[0], ap[1] and launch the first product.
          lg      %r7,0(%r3)              // ap[0]
          lg      %r9,8(%r3)              // ap[1]
          mlgr    %r6,%r5                 // %r6:%r7 = ap[0] * w
          brct    %r4,.Lmadd_x4           // another group follows: take the loop
          j       .Lmadd_x4_last          // single group: go straight to the drain

          // Steady state: four limbs per pass, each product issued one step
          // ahead of the additions that consume it.
  .Lmadd_x4:
          mlgr    %r8,%r5                 // %r8:%r9 = ap[i+1] * w
          lg      %r11,16(%r3)            // ap[i+2]
          alcgr   %r7,%r12                // low word += carry word
          alcgr   %r6,zero                // high word absorbs the carry out
          alg     %r7,0(%r3,%r1)          // += rp[i]
          stg     %r7,0(%r3,%r1)          // rp[i] =

          mlgr    %r10,%r5                // %r10:%r11 = ap[i+2] * w
          lg      %r13,24(%r3)            // ap[i+3]
          alcgr   %r9,%r6
          alcgr   %r8,zero
          alg     %r9,8(%r3,%r1)
          stg     %r9,8(%r3,%r1)          // rp[i+1] =

          mlgr    %r12,%r5                // %r12:%r13 = ap[i+3] * w
          lg      %r7,32(%r3)             // ap[i+4], first limb of the next group
          alcgr   %r11,%r8
          alcgr   %r10,zero
          alg     %r11,16(%r3,%r1)
          stg     %r11,16(%r3,%r1)        // rp[i+2] =

          mlgr    %r6,%r5                 // %r6:%r7 = ap[i+4] * w
          lg      %r9,40(%r3)             // ap[i+5]
          alcgr   %r13,%r10
          alcgr   %r12,zero               // %r12 = carry word into the next group
          alg     %r13,24(%r3,%r1)
          stg     %r13,24(%r3,%r1)        // rp[i+3] =

          la      %r3,32(%r3)             // i += 4
          brct    %r4,.Lmadd_x4

          // Drain: same four limbs, but nothing is fetched past ap[i+3].
  .Lmadd_x4_last:
          mlgr    %r8,%r5
          lg      %r11,16(%r3)
          alcgr   %r7,%r12                // += carry word
          alcgr   %r6,zero
          alg     %r7,0(%r3,%r1)          // += rp[i]
          stg     %r7,0(%r3,%r1)          // rp[i] =

          mlgr    %r10,%r5
          lg      %r13,24(%r3)
          alcgr   %r9,%r6
          alcgr   %r8,zero
          alg     %r9,8(%r3,%r1)
          stg     %r9,8(%r3,%r1)

          mlgr    %r12,%r5
          alcgr   %r11,%r8
          alcgr   %r10,zero
          alg     %r11,16(%r3,%r1)
          stg     %r11,16(%r3,%r1)

          alcgr   %r13,%r10
          alcgr   %r12,zero
          alg     %r13,24(%r3,%r1)
          stg     %r13,24(%r3,%r1)

          la      %r3,32(%r3)             // i += 4

          // Test len%4 without disturbing the pending carry: bump it by one
          // and let brct fall through only when it was zero.
          la      %r2,1(%r2)
          brct    %r2,.Lmadd_x1

  .Lmadd_ret:
          lgr     %r2,zero
          alcgr   %r2,%r12                // carry word plus the pending carry bit
          lmg     %r6,%r13,48(%r15)       // restore %r6-%r13
          br      %r14

          // One limb per pass, for the len%4 leftovers (or all of len < 4).
  .Lmadd_x1:
          lg      %r7,0(%r3)              // ap[i]
          mlgr    %r6,%r5                 // %r6:%r7 = ap[i] * w
          alcgr   %r7,%r12                // += carry word
          alcgr   %r6,zero
          alg     %r7,0(%r3,%r1)          // += rp[i]
          stg     %r7,0(%r3,%r1)          // rp[i] =
          lgr     %r12,%r6                // high word is the next carry word
          la      %r3,8(%r3)              // i++
          brct    %r2,.Lmadd_x1
          j       .Lmadd_ret
  .size   bn_mul_add_words,.-bn_mul_add_words
  // BN_ULONG bn_mul_words(BN_ULONG *r2,BN_ULONG *r3,int r4,BN_ULONG r5);
  //
  // For i in [0,len): rp[i] = low word of ap[i]*w + carry, where the carry is
  // the high word of the previous product. The last carry word is returned.
  //   In:   %r2 = rp, %r3 = ap, %r4 = len (32-bit int), %r5 = w
  //   Out:  %r2 = carry word (0 when len <= 0)
  //   Uses: zero (%r0) = 0; %r1 = rp; %r2 = byte offset i inside the loops;
  //         %r6-%r10 are saved at 48(%r15) and restored.
  // The carry bit is passed from alcgr to alcgr through the condition code.
  // Nothing in between touches it. Do not reorder.
  .globl  bn_mul_words
  .type   bn_mul_words,@function
  .align  4
  bn_mul_words:
          lghi    zero,0                  // zero = 0
          la      %r1,0(%r2)              // keep rp in %r1
          lghi    %r2,0                   // i = 0; also the early-exit result
          ltgfr   %r4,%r4                 // sign-extend len and test it
          bler    %r14                    // len <= 0: return 0

          stmg    %r6,%r10,48(%r15)       // save %r6-%r10
          lghi    %r10,3
          lghi    %r8,0                   // carry word = 0
          nr      %r10,%r4                // %r10 = len % 4
          sra     %r4,2                   // %r4 = len / 4, sets the condition code
          jz      .Lmul_x1                // no group of 4; the zero result also leaves carry clear
          algr    zero,zero               // 0 + 0: clears the carry

          // Four limbs per pass; %r6:%r7 and %r8:%r9 alternate as product pair,
          // the high word of one product feeding the next addition.
  .Lmul_x4:
          lg      %r7,0(%r2,%r3)          // ap[i]
          mlgr    %r6,%r5                 // %r6:%r7 = ap[i] * w
          alcgr   %r7,%r8                 // += carry word
          stg     %r7,0(%r2,%r1)          // rp[i] =

          lg      %r9,8(%r2,%r3)
          mlgr    %r8,%r5
          alcgr   %r9,%r6
          stg     %r9,8(%r2,%r1)

          lg      %r7,16(%r2,%r3)
          mlgr    %r6,%r5
          alcgr   %r7,%r8
          stg     %r7,16(%r2,%r1)

          lg      %r9,24(%r2,%r3)
          mlgr    %r8,%r5
          alcgr   %r9,%r6
          stg     %r9,24(%r2,%r1)

          la      %r2,32(%r2)             // i += 4
          brct    %r4,.Lmul_x4

          // Test len%4 without disturbing the pending carry.
          la      %r10,1(%r10)
          brct    %r10,.Lmul_x1

  .Lmul_ret:
          alcgr   %r8,zero                // fold the pending carry bit into the carry word
          lgr     %r2,%r8                 // return it
          lmg     %r6,%r10,48(%r15)       // restore %r6-%r10
          br      %r14

          // One limb per pass, for the len%4 leftovers (or all of len < 4).
  .Lmul_x1:
          lg      %r7,0(%r2,%r3)          // ap[i]
          mlgr    %r6,%r5                 // %r6:%r7 = ap[i] * w
          alcgr   %r7,%r8                 // += carry word
          stg     %r7,0(%r2,%r1)          // rp[i] =
          lgr     %r8,%r6                 // high word is the next carry word
          la      %r2,8(%r2)              // i++
          brct    %r10,.Lmul_x1
          j       .Lmul_ret
  .size   bn_mul_words,.-bn_mul_words
  // void bn_sqr_words(BN_ULONG *r2,BN_ULONG *r3,int r4)
  //
  // For i in [0,len): writes the 128-bit square of the word at %r3[i] to
  // %r2[2*i] (low word) and %r2[2*i+1] (high word).
  //   In:   %r2 = result pointer, %r3 = input pointer, %r4 = len (32-bit int)
  //   Uses: %r1 as loop counter; %r6:%r7 as product pair, saved at 48(%r15)
  //         and restored. Returns at once, before saving, when len <= 0.
  .globl  bn_sqr_words
  .type   bn_sqr_words,@function
  .align  4
  bn_sqr_words:
          ltgfr   %r4,%r4                 // sign-extend len and test it
          bler    %r14                    // len <= 0: nothing to do

          stmg    %r6,%r7,48(%r15)        // save %r6-%r7
          srag    %r1,%r4,2               // %r1 = len / 4
          jz      .Lsqr_x1                // fewer than 4 words: %r4 still holds len

          // Four squares per pass.
  .Lsqr_x4:
          lg      %r7,0(%r3)
          mlgr    %r6,%r7                 // %r6:%r7 = word * word
          stg     %r7,0(%r2)              // low word
          stg     %r6,8(%r2)              // high word

          lg      %r7,8(%r3)
          mlgr    %r6,%r7
          stg     %r7,16(%r2)
          stg     %r6,24(%r2)

          lg      %r7,16(%r3)
          mlgr    %r6,%r7
          stg     %r7,32(%r2)
          stg     %r6,40(%r2)

          lg      %r7,24(%r3)
          mlgr    %r6,%r7
          stg     %r7,48(%r2)
          stg     %r6,56(%r2)

          la      %r3,32(%r3)             // 4 input words consumed
          la      %r2,64(%r2)             // 8 output words produced
          brct    %r1,.Lsqr_x4

          lghi    %r1,3
          nr      %r4,%r1                 // %r4 = len % 4
          jz      .Lsqr_ret               // no leftovers

          // One square per pass.
  .Lsqr_x1:
          lg      %r7,0(%r3)
          mlgr    %r6,%r7
          stg     %r7,0(%r2)
          stg     %r6,8(%r2)

          la      %r3,8(%r3)
          la      %r2,16(%r2)
          brct    %r4,.Lsqr_x1

  .Lsqr_ret:
          lmg     %r6,%r7,48(%r15)        // restore %r6-%r7
          br      %r14
  .size   bn_sqr_words,.-bn_sqr_words
  // BN_ULONG bn_div_words(BN_ULONG h,BN_ULONG l,BN_ULONG d);
  //
  // Divides the 128-bit value h:l, held in the pair %r2:%r3, by d (%r4) and
  // returns the quotient in %r2.
  // No check of d, or of h against d, is made before dividing.
  .globl  bn_div_words
  .type   bn_div_words,@function
  .align  4
  bn_div_words:
          dlgr    %r2,%r4                 // %r2 = remainder, %r3 = quotient
          lgr     %r2,%r3                 // move the quotient to the return register
          br      %r14
  .size   bn_div_words,.-bn_div_words
  // BN_ULONG bn_add_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
  //
  // For i in [0,len): rp[i] = word at %r3[i] + word at %r4[i] + carry.
  // Returns the carry out of the last word.
  //   In:   %r2 = rp, %r3 and %r4 = the two inputs, %r5 = len (32-bit int)
  //   Out:  %r2 = 0 or 1 (0 when len <= 0)
  //   Uses: %r0 as scratch word; %r1 = rp; %r2 = byte offset i in the loops;
  //         %r6 = len%4, saved at 48(%r15) and restored.
  // The carry lives in the condition code between the alcg instructions.
  // Nothing in between touches it. Do not reorder.
  .globl  bn_add_words
  .type   bn_add_words,@function
  .align  4
  bn_add_words:
          la      %r1,0(%r2)              // keep rp in %r1
          lghi    %r2,0                   // i = 0; also the early-exit result
          ltgfr   %r5,%r5                 // sign-extend len and test it
          bler    %r14                    // len <= 0: return 0

          stg     %r6,48(%r15)            // save %r6
          lghi    %r6,3
          nr      %r6,%r5                 // %r6 = len % 4
          sra     %r5,2                   // %r5 = len / 4; sra because it sets the condition code
          jz      .Ladd_x1                // no group of 4; the zero result also leaves carry clear
          algr    %r2,%r2                 // 0 + 0: clears the carry, %r2 stays 0

          // Four words per pass.
  .Ladd_x4:
          lg      %r0,0(%r2,%r3)
          alcg    %r0,0(%r2,%r4)
          stg     %r0,0(%r2,%r1)
          lg      %r0,8(%r2,%r3)
          alcg    %r0,8(%r2,%r4)
          stg     %r0,8(%r2,%r1)
          lg      %r0,16(%r2,%r3)
          alcg    %r0,16(%r2,%r4)
          stg     %r0,16(%r2,%r1)
          lg      %r0,24(%r2,%r3)
          alcg    %r0,24(%r2,%r4)
          stg     %r0,24(%r2,%r1)

          la      %r2,32(%r2)             // i += 4
          brct    %r5,.Ladd_x4

          // Test len%4 without disturbing the pending carry.
          la      %r6,1(%r6)
          brct    %r6,.Ladd_x1

  .Ladd_ret:
          lghi    %r2,0
          alcgr   %r2,%r2                 // %r2 = 0 + 0 + carry
          lg      %r6,48(%r15)            // restore %r6
          br      %r14

          // One word per pass, for the len%4 leftovers (or all of len < 4).
  .Ladd_x1:
          lg      %r0,0(%r2,%r3)
          alcg    %r0,0(%r2,%r4)
          stg     %r0,0(%r2,%r1)

          la      %r2,8(%r2)              // i++
          brct    %r6,.Ladd_x1

          j       .Ladd_ret
  .size   bn_add_words,.-bn_add_words
  // BN_ULONG bn_sub_words(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4,int r5);
  //
  // For i in [0,len): rp[i] = word at %r3[i] - word at %r4[i] - borrow.
  // Returns the borrow out of the last word.
  //   In:   %r2 = rp, %r3 = minuend, %r4 = subtrahend, %r5 = len (32-bit int)
  //   Out:  %r2 = 0 or 1 (0 when len <= 0)
  //   Uses: %r0 as scratch word; %r1 = rp; %r2 = byte offset i in the loops;
  //         %r6 = len%4, saved at 48(%r15) and restored.
  // The borrow lives in the condition code between the slbg instructions.
  // Nothing in between touches it. Do not reorder.
  .globl  bn_sub_words
  .type   bn_sub_words,@function
  .align  4
  bn_sub_words:
          la      %r1,0(%r2)              // keep rp in %r1
          lghi    %r2,0                   // i = 0; also the early-exit result
          ltgfr   %r5,%r5                 // sign-extend len and test it
          bler    %r14                    // len <= 0: return 0

          stg     %r6,48(%r15)            // save %r6
          lghi    %r6,3
          nr      %r6,%r5                 // %r6 = len % 4
          sra     %r5,2                   // %r5 = len / 4; sra because it sets the condition code
          jnz     .Lsub_x4                // taken with a non-zero result, which leaves no borrow pending
          slgr    %r2,%r2                 // 0 - 0: clears the borrow, %r2 stays 0

          // One word per pass, for the len%4 leftovers (or all of len < 4).
  .Lsub_x1:
          lg      %r0,0(%r2,%r3)
          slbg    %r0,0(%r2,%r4)
          stg     %r0,0(%r2,%r1)

          la      %r2,8(%r2)              // i++
          brct    %r6,.Lsub_x1
          j       .Lsub_ret

          // Four words per pass.
  .Lsub_x4:
          lg      %r0,0(%r2,%r3)
          slbg    %r0,0(%r2,%r4)
          stg     %r0,0(%r2,%r1)
          lg      %r0,8(%r2,%r3)
          slbg    %r0,8(%r2,%r4)
          stg     %r0,8(%r2,%r1)
          lg      %r0,16(%r2,%r3)
          slbg    %r0,16(%r2,%r4)
          stg     %r0,16(%r2,%r1)
          lg      %r0,24(%r2,%r3)
          slbg    %r0,24(%r2,%r4)
          stg     %r0,24(%r2,%r1)

          la      %r2,32(%r2)             // i += 4
          brct    %r5,.Lsub_x4

          // Test len%4 without disturbing the pending borrow.
          la      %r6,1(%r6)
          brct    %r6,.Lsub_x1

  .Lsub_ret:
          lghi    %r2,0
          slbgr   %r2,%r2                 // %r2 = 0 - 0 - borrow: 0 or -1
          lcgr    %r2,%r2                 // negate: 0 or 1
          lg      %r6,48(%r15)            // restore %r6
          br      %r14
  .size   bn_sub_words,.-bn_sub_words
  // Registers that serve, in rotation, as the three-word column accumulator
  // of the comba routines.
  #define c1      %r1
  #define c2      %r5
  #define c3      %r8

  // mul_add_c(ai,bi,lo,mid,hi): adds the 128-bit product of the word at index
  // ai of %r3 and the word at index bi of %r4 to the three-word accumulator
  // hi:mid:lo. Uses %r6:%r7 as product pair and expects zero (%r0) to hold 0.
  #define mul_add_c(ai,bi,lo,mid,hi)      \
          lg      %r7,ai*8(%r3);          \
          mlg     %r6,bi*8(%r4);          \
          algr    lo,%r7;                 \
          alcgr   mid,%r6;                \
          alcgr   hi,zero
  // void bn_mul_comba8(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
  //
  // 8x8-word multiplication, computed column by column: result word k is the
  // low word of the sum of all products a[i]*b[j] with i+j == k plus what was
  // carried from column k-1.
  //   In:   %r2 = result (16 words written), %r3 = a (8 words), %r4 = b (8 words)
  //   Uses: c1/c2/c3 rotate as low/middle/high accumulator word; after a column
  //         is stored its low register is cleared and becomes the new high
  //         word. %r6-%r8 are saved at 48(%r15) and restored.
  .globl  bn_mul_comba8
  .type   bn_mul_comba8,@function
  .align  4
  bn_mul_comba8:
          stmg    %r6,%r8,48(%r15)        // save %r6-%r8

          lghi    c1,0
          lghi    c2,0
          lghi    c3,0
          lghi    zero,0

          // column 0
          mul_add_c(0,0,c1,c2,c3);
          stg     c1,0*8(%r2)
          lghi    c1,0

          // column 1
          mul_add_c(0,1,c2,c3,c1);
          mul_add_c(1,0,c2,c3,c1);
          stg     c2,1*8(%r2)
          lghi    c2,0

          // column 2
          mul_add_c(2,0,c3,c1,c2);
          mul_add_c(1,1,c3,c1,c2);
          mul_add_c(0,2,c3,c1,c2);
          stg     c3,2*8(%r2)
          lghi    c3,0

          // column 3
          mul_add_c(0,3,c1,c2,c3);
          mul_add_c(1,2,c1,c2,c3);
          mul_add_c(2,1,c1,c2,c3);
          mul_add_c(3,0,c1,c2,c3);
          stg     c1,3*8(%r2)
          lghi    c1,0

          // column 4
          mul_add_c(4,0,c2,c3,c1);
          mul_add_c(3,1,c2,c3,c1);
          mul_add_c(2,2,c2,c3,c1);
          mul_add_c(1,3,c2,c3,c1);
          mul_add_c(0,4,c2,c3,c1);
          stg     c2,4*8(%r2)
          lghi    c2,0

          // column 5
          mul_add_c(0,5,c3,c1,c2);
          mul_add_c(1,4,c3,c1,c2);
          mul_add_c(2,3,c3,c1,c2);
          mul_add_c(3,2,c3,c1,c2);
          mul_add_c(4,1,c3,c1,c2);
          mul_add_c(5,0,c3,c1,c2);
          stg     c3,5*8(%r2)
          lghi    c3,0

          // column 6
          mul_add_c(6,0,c1,c2,c3);
          mul_add_c(5,1,c1,c2,c3);
          mul_add_c(4,2,c1,c2,c3);
          mul_add_c(3,3,c1,c2,c3);
          mul_add_c(2,4,c1,c2,c3);
          mul_add_c(1,5,c1,c2,c3);
          mul_add_c(0,6,c1,c2,c3);
          stg     c1,6*8(%r2)
          lghi    c1,0

          // column 7
          mul_add_c(0,7,c2,c3,c1);
          mul_add_c(1,6,c2,c3,c1);
          mul_add_c(2,5,c2,c3,c1);
          mul_add_c(3,4,c2,c3,c1);
          mul_add_c(4,3,c2,c3,c1);
          mul_add_c(5,2,c2,c3,c1);
          mul_add_c(6,1,c2,c3,c1);
          mul_add_c(7,0,c2,c3,c1);
          stg     c2,7*8(%r2)
          lghi    c2,0

          // column 8
          mul_add_c(7,1,c3,c1,c2);
          mul_add_c(6,2,c3,c1,c2);
          mul_add_c(5,3,c3,c1,c2);
          mul_add_c(4,4,c3,c1,c2);
          mul_add_c(3,5,c3,c1,c2);
          mul_add_c(2,6,c3,c1,c2);
          mul_add_c(1,7,c3,c1,c2);
          stg     c3,8*8(%r2)
          lghi    c3,0

          // column 9
          mul_add_c(2,7,c1,c2,c3);
          mul_add_c(3,6,c1,c2,c3);
          mul_add_c(4,5,c1,c2,c3);
          mul_add_c(5,4,c1,c2,c3);
          mul_add_c(6,3,c1,c2,c3);
          mul_add_c(7,2,c1,c2,c3);
          stg     c1,9*8(%r2)
          lghi    c1,0

          // column 10
          mul_add_c(7,3,c2,c3,c1);
          mul_add_c(6,4,c2,c3,c1);
          mul_add_c(5,5,c2,c3,c1);
          mul_add_c(4,6,c2,c3,c1);
          mul_add_c(3,7,c2,c3,c1);
          stg     c2,10*8(%r2)
          lghi    c2,0

          // column 11
          mul_add_c(4,7,c3,c1,c2);
          mul_add_c(5,6,c3,c1,c2);
          mul_add_c(6,5,c3,c1,c2);
          mul_add_c(7,4,c3,c1,c2);
          stg     c3,11*8(%r2)
          lghi    c3,0

          // column 12
          mul_add_c(7,5,c1,c2,c3);
          mul_add_c(6,6,c1,c2,c3);
          mul_add_c(5,7,c1,c2,c3);
          stg     c1,12*8(%r2)
          lghi    c1,0

          // column 13
          mul_add_c(6,7,c2,c3,c1);
          mul_add_c(7,6,c2,c3,c1);
          stg     c2,13*8(%r2)
          lghi    c2,0

          // column 14, and its carry as word 15
          mul_add_c(7,7,c3,c1,c2);
          stg     c3,14*8(%r2)
          stg     c1,15*8(%r2)

          lmg     %r6,%r8,48(%r15)        // restore %r6-%r8
          br      %r14
  .size   bn_mul_comba8,.-bn_mul_comba8
  // void bn_mul_comba4(BN_ULONG *r2,BN_ULONG *r3,BN_ULONG *r4);
  //
  // 4x4-word multiplication, computed column by column: result word k is the
  // low word of the sum of all products a[i]*b[j] with i+j == k plus what was
  // carried from column k-1.
  //   In:   %r2 = result (8 words written), %r3 = a (4 words), %r4 = b (4 words)
  //   Uses: c1/c2/c3 rotate as low/middle/high accumulator word; after a column
  //         is stored its low register is cleared and becomes the new high
  //         word. %r6-%r8 are saved at 48(%r15) on entry and reloaded from
  //         there before returning.
  // Every result word is stored through %r2; the inputs at %r3 and %r4 are
  // only read.
  .globl  bn_mul_comba4
  .type   bn_mul_comba4,@function
  .align  4
  bn_mul_comba4:
          stmg    %r6,%r8,48(%r15)        // save %r6-%r8

          lghi    c1,0
          lghi    c2,0
          lghi    c3,0
          lghi    zero,0

          // column 0
          mul_add_c(0,0,c1,c2,c3);
          stg     c1,0*8(%r2)             // result word 0 goes to the result array
          lghi    c1,0

          // column 1
          mul_add_c(0,1,c2,c3,c1);
          mul_add_c(1,0,c2,c3,c1);
          stg     c2,1*8(%r2)
          lghi    c2,0

          // column 2
          mul_add_c(2,0,c3,c1,c2);
          mul_add_c(1,1,c3,c1,c2);
          mul_add_c(0,2,c3,c1,c2);
          stg     c3,2*8(%r2)
          lghi    c3,0

          // column 3
          mul_add_c(0,3,c1,c2,c3);
          mul_add_c(1,2,c1,c2,c3);
          mul_add_c(2,1,c1,c2,c3);
          mul_add_c(3,0,c1,c2,c3);
          stg     c1,3*8(%r2)
          lghi    c1,0

          // column 4
          mul_add_c(3,1,c2,c3,c1);
          mul_add_c(2,2,c2,c3,c1);
          mul_add_c(1,3,c2,c3,c1);
          stg     c2,4*8(%r2)
          lghi    c2,0

          // column 5
          mul_add_c(2,3,c3,c1,c2);
          mul_add_c(3,2,c3,c1,c2);
          stg     c3,5*8(%r2)
          lghi    c3,0

          // column 6, and its carry as word 7
          mul_add_c(3,3,c1,c2,c3);
          stg     c1,6*8(%r2)
          stg     c2,7*8(%r2)

          lmg     %r6,%r8,48(%r15)        // restore %r6-%r8 (load, not store)
          br      %r14
  .size   bn_mul_comba4,.-bn_mul_comba4
  // sqr_add_c(ai,lo,mid,hi): adds the 128-bit square of the word at index ai
  // of %r3 to the three-word accumulator hi:mid:lo. Uses %r6:%r7 as product
  // pair and expects zero (%r0) to hold 0.
  #define sqr_add_c(ai,lo,mid,hi)         \
          lg      %r7,ai*8(%r3);          \
          mlgr    %r6,%r7;                \
          algr    lo,%r7;                 \
          alcgr   mid,%r6;                \
          alcgr   hi,zero

  // sqr_add_c2(ai,aj,lo,mid,hi): adds the 128-bit product of the words at
  // indices ai and aj of %r3 to hi:mid:lo twice, i.e. adds twice the product.
  // Uses %r6:%r7 as product pair and expects zero (%r0) to hold 0.
  #define sqr_add_c2(ai,aj,lo,mid,hi)     \
          lg      %r7,ai*8(%r3);          \
          mlg     %r6,aj*8(%r3);          \
          algr    lo,%r7;                 \
          alcgr   mid,%r6;                \
          alcgr   hi,zero;                \
          algr    lo,%r7;                 \
          alcgr   mid,%r6;                \
          alcgr   hi,zero
  // void bn_sqr_comba8(BN_ULONG *r2,BN_ULONG *r3);
  //
  // Squares an 8-word number column by column. Column k sums a[i]*a[j] over
  // i+j == k: each off-diagonal pair is added twice through sqr_add_c2, the
  // diagonal term a[k/2]^2 once through sqr_add_c.
  //   In:   %r2 = result (16 words written), %r3 = a (8 words)
  //   Uses: c1/c2/c3 rotate as low/middle/high accumulator word; after a column
  //         is stored its low register is cleared and becomes the new high
  //         word. %r6-%r8 are saved at 48(%r15) and restored.
  .globl  bn_sqr_comba8
  .type   bn_sqr_comba8,@function
  .align  4
  bn_sqr_comba8:
          stmg    %r6,%r8,48(%r15)        // save %r6-%r8

          lghi    c1,0
          lghi    c2,0
          lghi    c3,0
          lghi    zero,0

          // column 0
          sqr_add_c(0,c1,c2,c3);
          stg     c1,0*8(%r2)
          lghi    c1,0

          // column 1
          sqr_add_c2(1,0,c2,c3,c1);
          stg     c2,1*8(%r2)
          lghi    c2,0

          // column 2
          sqr_add_c(1,c3,c1,c2);
          sqr_add_c2(2,0,c3,c1,c2);
          stg     c3,2*8(%r2)
          lghi    c3,0

          // column 3
          sqr_add_c2(3,0,c1,c2,c3);
          sqr_add_c2(2,1,c1,c2,c3);
          stg     c1,3*8(%r2)
          lghi    c1,0

          // column 4
          sqr_add_c(2,c2,c3,c1);
          sqr_add_c2(3,1,c2,c3,c1);
          sqr_add_c2(4,0,c2,c3,c1);
          stg     c2,4*8(%r2)
          lghi    c2,0

          // column 5
          sqr_add_c2(5,0,c3,c1,c2);
          sqr_add_c2(4,1,c3,c1,c2);
          sqr_add_c2(3,2,c3,c1,c2);
          stg     c3,5*8(%r2)
          lghi    c3,0

          // column 6
          sqr_add_c(3,c1,c2,c3);
          sqr_add_c2(4,2,c1,c2,c3);
          sqr_add_c2(5,1,c1,c2,c3);
          sqr_add_c2(6,0,c1,c2,c3);
          stg     c1,6*8(%r2)
          lghi    c1,0

          // column 7
          sqr_add_c2(7,0,c2,c3,c1);
          sqr_add_c2(6,1,c2,c3,c1);
          sqr_add_c2(5,2,c2,c3,c1);
          sqr_add_c2(4,3,c2,c3,c1);
          stg     c2,7*8(%r2)
          lghi    c2,0

          // column 8
          sqr_add_c(4,c3,c1,c2);
          sqr_add_c2(5,3,c3,c1,c2);
          sqr_add_c2(6,2,c3,c1,c2);
          sqr_add_c2(7,1,c3,c1,c2);
          stg     c3,8*8(%r2)
          lghi    c3,0

          // column 9
          sqr_add_c2(7,2,c1,c2,c3);
          sqr_add_c2(6,3,c1,c2,c3);
          sqr_add_c2(5,4,c1,c2,c3);
          stg     c1,9*8(%r2)
          lghi    c1,0

          // column 10
          sqr_add_c(5,c2,c3,c1);
          sqr_add_c2(6,4,c2,c3,c1);
          sqr_add_c2(7,3,c2,c3,c1);
          stg     c2,10*8(%r2)
          lghi    c2,0

          // column 11
          sqr_add_c2(7,4,c3,c1,c2);
          sqr_add_c2(6,5,c3,c1,c2);
          stg     c3,11*8(%r2)
          lghi    c3,0

          // column 12
          sqr_add_c(6,c1,c2,c3);
          sqr_add_c2(7,5,c1,c2,c3);
          stg     c1,12*8(%r2)
          lghi    c1,0

          // column 13
          sqr_add_c2(7,6,c2,c3,c1);
          stg     c2,13*8(%r2)
          lghi    c2,0

          // column 14, and its carry as word 15
          sqr_add_c(7,c3,c1,c2);
          stg     c3,14*8(%r2)
          stg     c1,15*8(%r2)

          lmg     %r6,%r8,48(%r15)        // restore %r6-%r8
          br      %r14
  .size   bn_sqr_comba8,.-bn_sqr_comba8
  // void bn_sqr_comba4(BN_ULONG *r2,BN_ULONG *r3);
  //
  // Squares a 4-word number column by column. Column k sums a[i]*a[j] over
  // i+j == k: each off-diagonal pair is added twice through sqr_add_c2, the
  // diagonal term a[k/2]^2 once through sqr_add_c.
  //   In:   %r2 = result (8 words written), %r3 = a (4 words)
  //   Uses: c1/c2/c3 rotate as low/middle/high accumulator word; after a column
  //         is stored its low register is cleared and becomes the new high
  //         word. %r6-%r8 are saved at 48(%r15) and restored.
  .globl  bn_sqr_comba4
  .type   bn_sqr_comba4,@function
  .align  4
  bn_sqr_comba4:
          stmg    %r6,%r8,48(%r15)        // save %r6-%r8

          lghi    c1,0
          lghi    c2,0
          lghi    c3,0
          lghi    zero,0

          // column 0
          sqr_add_c(0,c1,c2,c3);
          stg     c1,0*8(%r2)
          lghi    c1,0

          // column 1
          sqr_add_c2(1,0,c2,c3,c1);
          stg     c2,1*8(%r2)
          lghi    c2,0

          // column 2
          sqr_add_c(1,c3,c1,c2);
          sqr_add_c2(2,0,c3,c1,c2);
          stg     c3,2*8(%r2)
          lghi    c3,0

          // column 3
          sqr_add_c2(3,0,c1,c2,c3);
          sqr_add_c2(2,1,c1,c2,c3);
          stg     c1,3*8(%r2)
          lghi    c1,0

          // column 4
          sqr_add_c(2,c2,c3,c1);
          sqr_add_c2(3,1,c2,c3,c1);
          stg     c2,4*8(%r2)
          lghi    c2,0

          // column 5
          sqr_add_c2(3,2,c3,c1,c2);
          stg     c3,5*8(%r2)
          lghi    c3,0

          // column 6, and its carry as word 7
          sqr_add_c(3,c1,c2,c3);
          stg     c1,6*8(%r2)
          stg     c2,7*8(%r2)

          lmg     %r6,%r8,48(%r15)        // restore %r6-%r8
          br      %r14
  .size   bn_sqr_comba4,.-bn_sqr_comba4