ecp_sm2p256-armv8.pl 16 KB


  1. #! /usr/bin/env perl
  2. # Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
  3. #
  4. # Licensed under the Apache License 2.0 (the "License"). You may not use
  5. # this file except in compliance with the License. You can obtain a copy
  6. # in the file LICENSE in the source distribution or at
  7. # https://www.openssl.org/source/license.html
  # Command-line handling:
  # $output is the last argument if it looks like a file (it has an extension)
  # $flavour is the first argument if it doesn't look like a file
  $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop   : undef;
  $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift : undef;

  # Locate arm-xlate.pl: first next to this script, then in ../../perlasm
  # relative to it.  $dir is the script's directory including the trailing
  # path separator (it stays undefined, and interpolates as the empty string,
  # when $0 carries no directory part).
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  die "can't locate arm-xlate.pl";

  # Pipe everything printed to STDOUT through arm-xlate.pl.  The translator
  # path is quoted in the same way as the interpreter and output paths, so a
  # source tree that lives under a directory containing spaces still works.
  open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
      or die "can't call $xlate: $!";
  *STDOUT=*OUT;
  # Symbolic names for the AArch64 registers used by the generated code.
  #
  #   $s0..$s7  x7..x14   eight 64-bit limbs (operands, product, result)
  #   $aN       aliases for the same eight registers, used by the RDC macro
  #             once each register holds a single 32-bit word:
  #             a8=s0 a10=s1 a12=s2 a14=s3 a9=s4 a11=s5 a13=s6 a15=s7
  #   $t0..$t3  x3..x6    temporaries
  #   $t4..$t8  x15, x16, x17, x19, x20   further temporaries
  #
  # x18 is skipped (presumably because some platforms reserve it - confirm).
  my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7) = qw(x7 x8 x9 x10 x11 x12 x13 x14);
  my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15) =
      ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7);
  my ($t0,$t1,$t2,$t3) = qw(x3 x4 x5 x6);
  my ($t4,$t5,$t6,$t7,$t8) = qw(x15 x16 x17 x19 x20);
  # bn_mod_add(LABEL)
  #
  # Append to $code the instruction sequence computing r = (a + b) mod m.
  # The generated code expects x0 = r, x1 = a, x2 = b, each pointing at four
  # 64-bit limbs (least significant first); LABEL is the assembler label of
  # the four-limb modulus m.  Only the body is produced: the caller emits the
  # function label before and the "ret" after.  x2 is reused as the pointer
  # to m once b has been loaded.  A single conditional subtraction is done,
  # so the result is fully reduced only when a + b < 2m (presumably callers
  # pass inputs that are already below m - confirm).
  #
  # Declared without a prototype: the sub takes one argument, and the former
  # empty prototype "()" went unnoticed only because callers use the
  # "&name(...)" form, which bypasses prototype checking.
  sub bn_mod_add {
      my $mod = shift;
      $code.=<<___;
  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Addition, the carry out of the top limb is kept in t4
  adds  $s0,$s0,$s4
  adcs  $s1,$s1,$s5
  adcs  $s2,$s2,$s6
  adcs  $s3,$s3,$s7
  adc   $t4,xzr,xzr
  // Load modulus
  adr   x2,$mod
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Backup Addition
  mov   $t0,$s0
  mov   $t1,$s1
  mov   $t2,$s2
  mov   $t3,$s3
  // Sub modulus from the copy, borrowing through the saved carry
  subs  $t0,$t0,$s4
  sbcs  $t1,$t1,$s5
  sbcs  $t2,$t2,$s6
  sbcs  $t3,$t3,$s7
  sbcs  $t4,$t4,xzr
  // Select based on carry: carry clear means the subtraction borrowed,
  // that is the sum was below the modulus, so the plain sum is kept
  csel  $s0,$s0,$t0,cc
  csel  $s1,$s1,$t1,cc
  csel  $s2,$s2,$t2,cc
  csel  $s3,$s3,$t3,cc
  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
___
  }
  # bn_mod_sub(LABEL)
  #
  # Append to $code the instruction sequence computing r = (a - b) mod m.
  # The generated code expects x0 = r, x1 = a, x2 = b, each pointing at four
  # 64-bit limbs (least significant first); LABEL is the assembler label of
  # the four-limb modulus m.  Only the body is produced: the caller emits the
  # function label before and the "ret" after.  x2 is reused as the pointer
  # to m once b has been loaded.  The modulus is added back at most once, so
  # the result is fully reduced only when a - b > -m (presumably callers pass
  # inputs that are already below m - confirm).
  #
  # Declared without a prototype: the sub takes one argument, and the former
  # empty prototype "()" went unnoticed only because callers use the
  # "&name(...)" form, which bypasses prototype checking.
  sub bn_mod_sub {
      my $mod = shift;
      $code.=<<___;
  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Subtraction, t4 becomes all ones if the top limb borrowed, else zero
  subs  $s0,$s0,$s4
  sbcs  $s1,$s1,$s5
  sbcs  $s2,$s2,$s6
  sbcs  $s3,$s3,$s7
  sbc   $t4,xzr,xzr
  // Load modulus
  adr   x2,$mod
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Backup subtraction
  mov   $t0,$s0
  mov   $t1,$s1
  mov   $t2,$s2
  mov   $t3,$s3
  // Add modulus to the copy
  adds  $t0,$t0,$s4
  adcs  $t1,$t1,$s5
  adcs  $t2,$t2,$s6
  adcs  $t3,$t3,$s7
  tst   $t4,$t4
  // Select based on borrow: t4 equal to zero means no borrow happened,
  // so the plain difference is kept, otherwise the corrected copy is used
  csel  $s0,$s0,$t0,eq
  csel  $s1,$s1,$t1,eq
  csel  $s2,$s2,$t2,eq
  csel  $s3,$s3,$t3,eq
  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
___
  }
  # bn_mod_div_by_2(LABEL)
  #
  # Append to $code the instruction sequence computing r = a / 2 mod m.
  # The generated code expects x0 = r and x1 = a, each pointing at four
  # 64-bit limbs (least significant first).  Note that LABEL must name the
  # constant (m + 1) / 2, not m itself (the callers in this file pass
  # .Lpoly_div_2 and .Lord_div_2): the value is shifted right by one bit and,
  # if it was odd, (m + 1) / 2 is added, which equals (a + m) / 2.
  # Only the body is produced: the caller emits the function label before
  # and the "ret" after.  x2 is clobbered as the pointer to the constant.
  #
  # Declared without a prototype: the sub takes one argument, and the former
  # empty prototype "()" went unnoticed only because callers use the
  # "&name(...)" form, which bypasses prototype checking.
  sub bn_mod_div_by_2 {
      my $mod = shift;
      $code.=<<___;
  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]
  // Save the least significant bit
  mov   $t0,$s0
  // Right shift 1
  extr  $s0,$s1,$s0,#1
  extr  $s1,$s2,$s1,#1
  extr  $s2,$s3,$s2,#1
  lsr   $s3,$s3,#1
  // Load the precomputed (modulus + 1) / 2
  adr   x2,$mod
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Parity check: an even input needs no correction, so add zero instead
  tst   $t0,#1
  csel  $s4,xzr,$s4,eq
  csel  $s5,xzr,$s5,eq
  csel  $s6,xzr,$s6,eq
  csel  $s7,xzr,$s7,eq
  // Add
  adds  $s0,$s0,$s4
  adcs  $s1,$s1,$s5
  adcs  $s2,$s2,$s6
  adc   $s3,$s3,$s7
  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
___
  }
  # Assembly text proper.  Everything up to the matching closing brace is
  # appended to $code, which the loop at the end of the file prints.
  {
  $code.=<<___;
#include "arm_arch.h"
.arch armv8-a
.text

.align 5
  // Constants: four 64-bit limbs each, least significant limb first.

  // The field prime p = 2^256 - 2^224 - 2^96 + 2^64 - 1
.Lpoly:
.quad 0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff
  // The order n
.Lord:
.quad 0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff
  // (p + 1) / 2
.Lpoly_div_2:
.quad 0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff
  // (n + 1) / 2
.Lord_div_2:
.quad 0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff

  // void bn_rshift1(BN_ULONG *a);
  //
  // Shift the 256-bit value at a (four 64-bit limbs, least significant
  // first) right by one bit, in place.  The top bit becomes zero and the
  // bit shifted out at the bottom is discarded.
.globl bn_rshift1
.type bn_rshift1,%function
.align 5
bn_rshift1:
  AARCH64_VALID_CALL_TARGET
  // Load inputs
  ldp   $s0,$s1,[x0]
  ldp   $s2,$s3,[x0,#16]
  // Right shift: each limb takes its new top bit from the limb above it
  extr  $s0,$s1,$s0,#1
  extr  $s1,$s2,$s1,#1
  extr  $s2,$s3,$s2,#1
  lsr   $s3,$s3,#1
  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
  ret
.size bn_rshift1,.-bn_rshift1

  // void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
  //
  // Plain 256-bit subtraction r = a - b on four 64-bit limbs (least
  // significant first).  No modular correction is applied and the borrow
  // out of the top limb is not reported, so the result wraps modulo 2^256.
.globl bn_sub
.type bn_sub,%function
.align 5
bn_sub:
  AARCH64_VALID_CALL_TARGET
  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  // Subtraction, the borrow ripples upwards through the limbs
  subs  $s0,$s0,$s4
  sbcs  $s1,$s1,$s5
  sbcs  $s2,$s2,$s6
  sbc   $s3,$s3,$s7
  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
  ret
.size bn_sub,.-bn_sub

  // void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
  //
  // r = a / 2 mod p
.globl ecp_sm2p256_div_by_2
.type ecp_sm2p256_div_by_2,%function
.align 5
ecp_sm2p256_div_by_2:
  AARCH64_VALID_CALL_TARGET
___
  # The body is generated by the shared halving helper, which is handed the
  # precomputed constant (p + 1) / 2.
  &bn_mod_div_by_2(".Lpoly_div_2");
  $code.=<<___;
  ret
.size ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

  // void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
  //
  // r = a / 2 mod n
.globl ecp_sm2p256_div_by_2_mod_ord
.type ecp_sm2p256_div_by_2_mod_ord,%function
.align 5
ecp_sm2p256_div_by_2_mod_ord:
  AARCH64_VALID_CALL_TARGET
___
  # The body is generated by the shared halving helper, which is handed the
  # precomputed constant (n + 1) / 2.
  &bn_mod_div_by_2(".Lord_div_2");
  $code.=<<___;
  ret
.size ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

  // void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
  //
  // r = 3 * a mod p, computed as ((a + a) mod p + a) mod p.  Each of the
  // two additions is followed by one conditional subtraction of p.
  // x2 is used as a scratch pointer to the prime.
.globl ecp_sm2p256_mul_by_3
.type ecp_sm2p256_mul_by_3,%function
.align 5
ecp_sm2p256_mul_by_3:
  AARCH64_VALID_CALL_TARGET
  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]

  // 2*a, the carry out of the top limb goes to t4
  adds  $s0,$s0,$s0
  adcs  $s1,$s1,$s1
  adcs  $s2,$s2,$s2
  adcs  $s3,$s3,$s3
  adcs  $t4,xzr,xzr
  // Keep the unreduced value
  mov   $t0,$s0
  mov   $t1,$s1
  mov   $t2,$s2
  mov   $t3,$s3
  // Sub polynomial
  adr   x2,.Lpoly
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  subs  $s0,$s0,$s4
  sbcs  $s1,$s1,$s5
  sbcs  $s2,$s2,$s6
  sbcs  $s3,$s3,$s7
  sbcs  $t4,$t4,xzr
  // Carry set means no borrow: keep the difference, else the saved value
  csel  $s0,$s0,$t0,cs
  csel  $s1,$s1,$t1,cs
  csel  $s2,$s2,$t2,cs
  csel  $s3,$s3,$t3,cs
  eor   $t4,$t4,$t4

  // 3*a, add the original input once more
  ldp   $s4,$s5,[x1]
  ldp   $s6,$s7,[x1,#16]
  adds  $s0,$s0,$s4
  adcs  $s1,$s1,$s5
  adcs  $s2,$s2,$s6
  adcs  $s3,$s3,$s7
  adcs  $t4,xzr,xzr
  // Keep the unreduced value
  mov   $t0,$s0
  mov   $t1,$s1
  mov   $t2,$s2
  mov   $t3,$s3
  // Sub polynomial
  adr   x2,.Lpoly
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]
  subs  $s0,$s0,$s4
  sbcs  $s1,$s1,$s5
  sbcs  $s2,$s2,$s6
  sbcs  $s3,$s3,$s7
  sbcs  $t4,$t4,xzr
  // Carry set means no borrow: keep the difference, else the saved value
  csel  $s0,$s0,$t0,cs
  csel  $s1,$s1,$t1,cs
  csel  $s2,$s2,$t2,cs
  csel  $s3,$s3,$t3,cs

  // Store results
  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]
  ret
.size ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

  // void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
  //
  // r = (a + b) mod p
.globl ecp_sm2p256_add
.type ecp_sm2p256_add,%function
.align 5
ecp_sm2p256_add:
  AARCH64_VALID_CALL_TARGET
___
  # The body is generated by the shared modular addition helper, with the
  # field prime as modulus.
  &bn_mod_add(".Lpoly");
  $code.=<<___;
  ret
.size ecp_sm2p256_add,.-ecp_sm2p256_add

  // void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
  //
  // r = (a - b) mod p
.globl ecp_sm2p256_sub
.type ecp_sm2p256_sub,%function
.align 5
ecp_sm2p256_sub:
  AARCH64_VALID_CALL_TARGET
___
  # The body is generated by the shared modular subtraction helper, with the
  # field prime as modulus.
  &bn_mod_sub(".Lpoly");
  $code.=<<___;
  ret
.size ecp_sm2p256_sub,.-ecp_sm2p256_sub

  // void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
  //
  // r = (a - b) mod n
.globl ecp_sm2p256_sub_mod_ord
.type ecp_sm2p256_sub_mod_ord,%function
.align 5
ecp_sm2p256_sub_mod_ord:
  AARCH64_VALID_CALL_TARGET
___
  # The body is generated by the shared modular subtraction helper, with the
  # order n as modulus.
  &bn_mod_sub(".Lord");
  $code.=<<___;
  ret
.size ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

  // RDC: reduce the 512-bit value held in s7..s0 modulo p.
  //
  // In:       s0..s7, eight 64-bit limbs, least significant first
  // Out:      s0..s3, the reduced value
  // Clobbers: s4..s7, t0..t5 and the condition flags
  // Scratch:  the 32 bytes at sp+32 .. sp+63, which the invoking function
  //           has to provide in its stack frame
.macro RDC
  // a = | s7   | ... | s0  |, where si are 64-bit quantities
  //   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
  // |    s7     |    s6     |    s5     |    s4     |
  // | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
  // |    s3     |    s2     |    s1     |    s0     |
  // | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
  // =================================================
  // | a8  | a11 | a10 | a9  | a8  |   0 |    s4     | (+)
  // | a9  | a15 |    s6     | a11 |   0 | a10 | a9  | (+)
  // | a10 |   0 | a14 | a13 | a12 |   0 |    s5     | (+)
  // | a11 |   0 |    s7     | a13 |   0 | a12 | a11 | (+)
  // | a12 |   0 |    s7     | a13 |   0 |    s6     | (+)
  // | a12 |   0 |   0 | a15 | a14 |   0 | a14 | a13 | (+)
  // | a13 |   0 |   0 |   0 | a15 |   0 | a14 | a13 | (+)
  // | a13 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
  // | a14 |   0 |   0 |   0 |   0 |   0 |    s7     | (+)
  // | a14 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
  // | a15 |   0 |   0 |   0 |   0 |   0 |   0 | a15 | (+)
  // | a15 |   0 |   0 |   0 |   0 |   0 |   0 |   0 | (+)
  // |    s7     |   0 |   0 |   0 |   0 |   0 |   0 | (+)
  // |   0 |   0 |   0 |   0 |   0 | a8  |   0 |   0 | (-)
  // |   0 |   0 |   0 |   0 |   0 | a9  |   0 |   0 | (-)
  // |   0 |   0 |   0 |   0 |   0 | a13 |   0 |   0 | (-)
  // |   0 |   0 |   0 |   0 |   0 | a14 |   0 |   0 | (-)
  // | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
  // |    V[3]   |    V[2]   |    V[1]   |    V[0]   |

  // 1. 64-bit addition
  // t2=s6+s7+s7
  adds  $t2,$s6,$s7
  adcs  $t1,xzr,xzr
  adds  $t2,$t2,$s7
  adcs  $t1,$t1,xzr
  // t3=s4+s5+t2
  adds  $t3,$s4,$t2
  adcs  $t4,$t1,xzr
  adds  $t3,$t3,$s5
  adcs  $t4,$t4,xzr
  // sum
  adds  $s0,$s0,$t3
  adcs  $s1,$s1,$t4
  adcs  $s2,$s2,$t2
  adcs  $s3,$s3,$s7
  adcs  $t0,xzr,xzr
  adds  $s3,$s3,$t1
  adcs  $t0,$t0,xzr
  // Park the partial sum, s0..s3 are reused for the 32-bit words below
  stp   $s0,$s1,[sp,#32]
  stp   $s2,$s3,[sp,#48]

  // 2. 64-bit to 32-bit spread
  mov   $t1,#0xffffffff
  mov   $s0,$s4
  mov   $s1,$s5
  mov   $s2,$s6
  mov   $s3,$s7
  and   $s0,$s0,$t1          // a8
  and   $s1,$s1,$t1          // a10
  and   $s2,$s2,$t1          // a12
  and   $s3,$s3,$t1          // a14
  lsr   $s4,$s4,#32          // a9
  lsr   $s5,$s5,#32          // a11
  lsr   $s6,$s6,#32          // a13
  lsr   $s7,$s7,#32          // a15

  // 3. 32-bit addition
  add   $t1,$a14,$a12        // t1 <- a12 + a14
  add   $t2,$a15,$a13        // t2 <- a13 + a15
  add   $t3,$a8,$a9          // t3 <- a8 + a9
  add   $t4,$a14,$a10        // t4 <- a10 + a14
  add   $a15,$a15,$a11       // a15 <- a11 + a15
  add   $a12,$t2,$t1         // a12 <- a12 + a13 + a14 + a15
  add   $a10,$a10,$a12       // a10 <- a10 + a12 + a13 + a14 + a15
  add   $a10,$a10,$a12       // a10 <- a10 + 2*(a12 + a13 + a14 + a15)
  add   $a10,$a10,$t3        // a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
  add   $a10,$a10,$a11       // a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
  add   $a12,$a12,$a13       // a12 <- a12 + 2*a13 + a14 + a15
  add   $a12,$a12,$a11       // a12 <- a11 + a12 + 2*a13 + a14 + a15
  add   $a12,$a12,$a8        // a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
  add   $t3,$t3,$a14         // t3 <- a8 + a9 + a14
  add   $t3,$t3,$a13         // t3 <- a8 + a9 + a13 + a14
  add   $a9,$a9,$t2          // a9 <- a9 + a13 + a15
  add   $a11,$a11,$a9        // a11 <- a9 + a11 + a13 + a15
  add   $a11,$a11,$t2        // a11 <- a9 + a11 + 2*(a13 + a15)
  add   $t1,$t1,$t4          // t1 <- a10 + a12 + 2*a14

  // U[0]  s5    a9 + a11 + 2*(a13 + a15)
  // U[1]  t1    a10 + a12 + 2*a14
  // U[2] -t3    a8 + a9 + a13 + a14
  // U[3]  s2    a8 + a11 + a12 + 2*a13 + a14 + a15
  // U[4]  s4    a9 + a13 + a15
  // U[5]  t4    a10 + a14
  // U[6]  s7    a11 + a15
  // U[7]  s1    a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)

  // 4. 32-bit to 64-bit
  lsl   $s0,$t1,#32
  extr  $t1,$s2,$t1,#32
  extr  $s2,$t4,$s2,#32
  extr  $t4,$s1,$t4,#32
  lsr   $s1,$s1,#32

  // 5. 64-bit addition
  adds  $s5,$s5,$s0
  adcs  $t1,$t1,xzr
  adcs  $s4,$s4,$s2
  adcs  $s7,$s7,$t4
  adcs  $t0,$t0,$s1
  // V[0]  s5
  // V[1]  t1
  // V[2]  s4
  // V[3]  s7
  // carry t0
  // sub   t3

  // 6. Process s0-s3
  ldp   $s0,$s1,[sp,#32]
  ldp   $s2,$s3,[sp,#48]
  // add with V0-V3
  adds  $s0,$s0,$s5
  adcs  $s1,$s1,$t1
  adcs  $s2,$s2,$s4
  adcs  $s3,$s3,$s7
  adcs  $t0,$t0,xzr
  // sub with t3
  subs  $s1,$s1,$t3
  sbcs  $s2,$s2,xzr
  sbcs  $s3,$s3,xzr
  sbcs  $t0,$t0,xzr

  // 7. MOD
  // First Mod
  lsl   $t1,$t0,#32
  subs  $t2,$t1,$t0
  adds  $s0,$s0,$t0
  adcs  $s1,$s1,$t2
  adcs  $s2,$s2,xzr
  adcs  $s3,$s3,$t1

  // Last Mod
  // return y - p if y >= p else y
  mov   $s4,$s0
  mov   $s5,$s1
  mov   $s6,$s2
  mov   $s7,$s3
  adr   $t0,.Lpoly
  ldp   $t1,$t2,[$t0]
  ldp   $t3,$t4,[$t0,#16]
  // The flags still hold the carry of the First Mod addition, since
  // mov, adr and ldp leave them alone. Capture that carry in t5.
  adcs  $t5,xzr,xzr
  subs  $s0,$s0,$t1
  sbcs  $s1,$s1,$t2
  sbcs  $s2,$s2,$t3
  sbcs  $s3,$s3,$t4
  sbcs  $t5,$t5,xzr
  // Carry set means no borrow: keep the difference, else the saved value
  csel  $s0,$s0,$s4,cs
  csel  $s1,$s1,$s5,cs
  csel  $s2,$s2,$s6,cs
  csel  $s3,$s3,$s7,cs
.endm

  // void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
  //
  // r = a * b mod p.  A 4x4 limb schoolbook multiplication builds the
  // 512-bit product in s7..s0, which the RDC macro then reduces.
  // The 80-byte frame holds x29/x30 at sp+0, x16/x17 at sp+16, the RDC
  // scratch area at sp+32 .. sp+63 and x19/x20 at sp+64.
.globl ecp_sm2p256_mul
.type ecp_sm2p256_mul,%function
.align 5
ecp_sm2p256_mul:
  AARCH64_SIGN_LINK_REGISTER
  // Store scalar registers
  stp   x29,x30,[sp,#-80]!
  add   x29,sp,#0
  stp   x16,x17,[sp,#16]
  stp   x19,x20,[sp,#64]

  // Load inputs
  ldp   $s0,$s1,[x1]
  ldp   $s2,$s3,[x1,#16]
  ldp   $s4,$s5,[x2]
  ldp   $s6,$s7,[x2,#16]

  // ### multiplication ###
  // ========================
  //             s3 s2 s1 s0
  // *           s7 s6 s5 s4
  // ------------------------
  // +           s0 s0 s0 s0
  //              *  *  *  *
  //             s7 s6 s5 s4
  //          s1 s1 s1 s1
  //           *  *  *  *
  //          s7 s6 s5 s4
  //       s2 s2 s2 s2
  //        *  *  *  *
  //       s7 s6 s5 s4
  //    s3 s3 s3 s3
  //     *  *  *  *
  //    s7 s6 s5 s4
  // ------------------------
  // s7 s6 s5 s4 s3 s2 s1 s0
  // ========================

  // ### s0*s4 ###
  mul   $t5,$s0,$s4
  umulh $t2,$s0,$s4

  // ### s1*s4 + s0*s5 ###
  mul   $t0,$s1,$s4
  umulh $t1,$s1,$s4
  adds  $t2,$t2,$t0
  adcs  $t3,$t1,xzr

  mul   $t0,$s0,$s5
  umulh $t1,$s0,$s5
  adds  $t2,$t2,$t0
  adcs  $t3,$t3,$t1
  adcs  $t4,xzr,xzr

  // ### s2*s4 + s1*s5 + s0*s6 ###
  mul   $t0,$s2,$s4
  umulh $t1,$s2,$s4
  adds  $t3,$t3,$t0
  adcs  $t4,$t4,$t1

  mul   $t0,$s1,$s5
  umulh $t1,$s1,$s5
  adds  $t3,$t3,$t0
  adcs  $t4,$t4,$t1
  adcs  $t6,xzr,xzr

  mul   $t0,$s0,$s6
  umulh $t1,$s0,$s6
  adds  $t3,$t3,$t0
  adcs  $t4,$t4,$t1
  adcs  $t6,$t6,xzr

  // ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
  mul   $t0,$s3,$s4
  umulh $t1,$s3,$s4
  adds  $t4,$t4,$t0
  adcs  $t6,$t6,$t1
  adcs  $t7,xzr,xzr

  mul   $t0,$s2,$s5
  umulh $t1,$s2,$s5
  adds  $t4,$t4,$t0
  adcs  $t6,$t6,$t1
  adcs  $t7,$t7,xzr

  mul   $t0,$s1,$s6
  umulh $t1,$s1,$s6
  adds  $t4,$t4,$t0
  adcs  $t6,$t6,$t1
  adcs  $t7,$t7,xzr

  mul   $t0,$s0,$s7
  umulh $t1,$s0,$s7
  adds  $t4,$t4,$t0
  adcs  $t6,$t6,$t1
  adcs  $t7,$t7,xzr

  // ### s3*s5 + s2*s6 + s1*s7 ###
  mul   $t0,$s3,$s5
  umulh $t1,$s3,$s5
  adds  $t6,$t6,$t0
  adcs  $t7,$t7,$t1
  adcs  $t8,xzr,xzr

  mul   $t0,$s2,$s6
  umulh $t1,$s2,$s6
  adds  $t6,$t6,$t0
  adcs  $t7,$t7,$t1
  adcs  $t8,$t8,xzr

  // From here on s4 is no longer needed as a multiplier,
  // so it receives the finished product limb
  mul   $t0,$s1,$s7
  umulh $t1,$s1,$s7
  adds  $s4,$t6,$t0
  adcs  $t7,$t7,$t1
  adcs  $t8,$t8,xzr

  // ### s3*s6 + s2*s7 ###
  mul   $t0,$s3,$s6
  umulh $t1,$s3,$s6
  adds  $t7,$t7,$t0
  adcs  $t8,$t8,$t1
  adcs  $t6,xzr,xzr

  mul   $t0,$s2,$s7
  umulh $t1,$s2,$s7
  adds  $s5,$t7,$t0
  adcs  $t8,$t8,$t1
  adcs  $t6,$t6,xzr

  // ### s3*s7 ###
  mul   $t0,$s3,$s7
  umulh $t1,$s3,$s7
  adds  $s6,$t8,$t0
  adcs  $s7,$t6,$t1

  // Move the four low product limbs into place
  mov   $s0,$t5
  mov   $s1,$t2
  mov   $s2,$t3
  mov   $s3,$t4

  // result of mul: s7 s6 s5 s4 s3 s2 s1 s0

  // ### Reduction ###
  RDC

  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]

  // Restore scalar registers
  ldp   x16,x17,[sp,#16]
  ldp   x19,x20,[sp,#64]
  ldp   x29,x30,[sp],#80

  AARCH64_VALIDATE_LINK_REGISTER
  ret
.size ecp_sm2p256_mul,.-ecp_sm2p256_mul

  // void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
  //
  // r = a * a mod p.  The six cross products are summed once and doubled,
  // then the four squares of the individual limbs are added, giving the
  // 512-bit square in s7..s0, which the RDC macro then reduces.
  // The 80-byte frame holds x29/x30 at sp+0, x16/x17 at sp+16, the RDC
  // scratch area at sp+32 .. sp+63 and x19/x20 at sp+64.
.globl ecp_sm2p256_sqr
.type ecp_sm2p256_sqr,%function
.align 5
ecp_sm2p256_sqr:
  AARCH64_SIGN_LINK_REGISTER
  // Store scalar registers
  stp   x29,x30,[sp,#-80]!
  add   x29,sp,#0
  stp   x16,x17,[sp,#16]
  stp   x19,x20,[sp,#64]

  // Load inputs
  ldp   $s4,$s5,[x1]
  ldp   $s6,$s7,[x1,#16]

  // ### square ###
  // ========================
  //             s7 s6 s5 s4
  // *           s7 s6 s5 s4
  // ------------------------
  // +           s4 s4 s4 s4
  //              *  *  *  *
  //             s7 s6 s5 s4
  //          s5 s5 s5 s5
  //           *  *  *  *
  //          s7 s6 s5 s4
  //       s6 s6 s6 s6
  //        *  *  *  *
  //       s7 s6 s5 s4
  //    s7 s7 s7 s7
  //     *  *  *  *
  //    s7 s6 s5 s4
  // ------------------------
  // s7 s6 s5 s4 s3 s2 s1 s0
  // ========================

  // ### s4*s5 ###
  mul   $s1,$s4,$s5
  umulh $s2,$s4,$s5

  // ### s4*s6 ###
  mul   $t0,$s6,$s4
  umulh $s3,$s6,$s4
  adds  $s2,$s2,$t0
  adcs  $s3,$s3,xzr

  // ### s4*s7 + s5*s6 ###
  mul   $t0,$s7,$s4
  umulh $t1,$s7,$s4
  adds  $s3,$s3,$t0
  adcs  $s0,$t1,xzr

  mul   $t0,$s6,$s5
  umulh $t1,$s6,$s5
  adds  $s3,$s3,$t0
  adcs  $s0,$s0,$t1
  adcs  $t2,xzr,xzr

  // ### s5*s7 ###
  mul   $t0,$s7,$s5
  umulh $t1,$s7,$s5
  adds  $s0,$s0,$t0
  adcs  $t2,$t2,$t1

  // ### s6*s7 ###
  mul   $t0,$s7,$s6
  umulh $t1,$s7,$s6
  adds  $t2,$t2,$t0
  adcs  $t3,$t1,xzr

  // ### 2*(t3,t2,s0,s3,s2,s1) ###
  adds  $s1,$s1,$s1
  adcs  $s2,$s2,$s2
  adcs  $s3,$s3,$s3
  adcs  $s0,$s0,$s0
  adcs  $t2,$t2,$t2
  adcs  $t3,$t3,$t3
  adcs  $t4,xzr,xzr

  // ### s4*s4 ###
  mul   $t5,$s4,$s4
  umulh $t6,$s4,$s4

  // ### s5*s5 ###
  // s4 and s5 are overwritten with the low and high half of this square
  mul   $s4,$s5,$s5
  umulh $s5,$s5,$s5

  // ### s6*s6 ###
  mul   $t0,$s6,$s6
  umulh $t1,$s6,$s6

  // ### s7*s7 ###
  mul   $t7,$s7,$s7
  umulh $t8,$s7,$s7

  // Add the squares on top of the doubled cross products
  adds  $s1,$s1,$t6
  adcs  $s2,$s2,$s4
  adcs  $s3,$s3,$s5
  adcs  $s0,$s0,$t0
  adcs  $t2,$t2,$t1
  adcs  $t3,$t3,$t7
  adcs  $t4,$t4,$t8

  // Move the limbs into the s7..s0 layout expected by RDC
  mov   $s4,$s0
  mov   $s0,$t5
  mov   $s5,$t2
  mov   $s6,$t3
  mov   $s7,$t4

  // result of mul: s7 s6 s5 s4 s3 s2 s1 s0

  // ### Reduction ###
  RDC

  stp   $s0,$s1,[x0]
  stp   $s2,$s3,[x0,#16]

  // Restore scalar registers
  ldp   x16,x17,[sp,#16]
  ldp   x19,x20,[sp,#64]
  ldp   x29,x30,[sp],#80

  AARCH64_VALIDATE_LINK_REGISTER
  ret
.size ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
  }
  # Emit the accumulated assembly line by line.  Any span enclosed in
  # backticks is replaced by the result of evaluating its contents as Perl
  # before the line is printed to STDOUT, which was redirected to the
  # arm-xlate.pl pipe earlier in this file.
  foreach (split /\n/, $code) {
      s/\`([^\`]*)\`/eval $1/ge;
      print "$_\n";
  }

  # Closing the handle flushes the pipe, so a failure is reported here
  # instead of being lost silently.
  close STDOUT or die "error closing STDOUT: $!"; # enforce flush