#! /usr/bin/env perl
# Copyright 2023-2024 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my ($s0,$s1,$s2,$s3,$s4,$s5,$s6,$s7)=map("x$_",(7..14));
my ($a8,$a10,$a12,$a14,$a9,$a11,$a13,$a15)=map("x$_",(7..14));
my ($t0,$t1,$t2,$t3)=map("x$_",(3..6));
my ($t4,$t5,$t6,$t7,$t8)=map("x$_",(15..17,19,20));
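# A minimal Perl reference model of the constant-time "add, then reduce
# once" pattern used throughout this file (a sketch for cross-checking
# only; ref_mod_add is illustrative and never called by this script).
# Arguments are assumed to be Math::BigInt values with 0 <= $x,$y < $m.
sub ref_mod_add {
	my ($x, $y, $m) = @_;	# illustrative only, not part of the build
	my $s = $x + $y;	# full-width sum; the asm keeps the carry bit
	my $d = $s - $m;	# speculative subtraction of the modulus
	return $d < 0 ? $s : $d;	# mirrors the asm csel on the borrow
}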
sub bn_mod_add() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Addition
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adc	$t4,xzr,xzr
	// Load the modulus
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Back up the sum
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3
	// Subtract the modulus
	subs	$t0,$t0,$s4
	sbcs	$t1,$t1,$s5
	sbcs	$t2,$t2,$s6
	sbcs	$t3,$t3,$s7
	sbcs	$t4,$t4,xzr
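	// If the final sbcs borrowed (carry clear), the sum was below the
	// modulus: keep the unreduced sum. Otherwise take sum - modulus.
	// Computing both candidates keeps the selection branch-free.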
	// Select based on carry
	csel	$s0,$s0,$t0,cc
	csel	$s1,$s1,$t1,cc
	csel	$s2,$s2,$t2,cc
	csel	$s3,$s3,$t3,cc
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
sub bn_mod_sub() {
	my $mod = shift;
$code.=<<___;
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbc	$t4,xzr,xzr
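	// t4 is now 0 if the subtraction produced no borrow and all-ones
	// otherwise; it is tested below to decide whether the modulus must
	// be added back.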
	// Load the modulus
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Back up the difference
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3
	// Add the modulus
	adds	$t0,$t0,$s4
	adcs	$t1,$t1,$s5
	adcs	$t2,$t2,$s6
	adcs	$t3,$t3,$s7
	tst	$t4,$t4
	// Select based on borrow (t4 is zero iff no borrow)
	csel	$s0,$s0,$t0,eq
	csel	$s1,$s1,$t1,eq
	csel	$s2,$s2,$t2,eq
	csel	$s3,$s3,$t3,eq
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
sub bn_mod_div_by_2() {
	my $mod = shift;
$code.=<<___;
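	// Halve a mod m: shift right one bit and, if the dropped bit was 1,
	// add back (m+1)/2, since for odd a, a/2 = (a>>1) + (m+1)/2 (mod m).
	// The constant loaded below is the precomputed (m+1)/2.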
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	// Save the least significant bit
	mov	$t0,$s0
	// Right shift 1
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1
	// Load mod
	adr	x2,$mod
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Parity check
	tst	$t0,#1
	csel	$s4,xzr,$s4,eq
	csel	$s5,xzr,$s5,eq
	csel	$s6,xzr,$s6,eq
	csel	$s7,xzr,$s7,eq
	// Add
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adc	$s3,$s3,$s7
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
___
}
{
$code.=<<___;
#include "arm_arch.h"
.arch	armv8-a
.text

.align	5
// The SM2 field modulus p
.Lpoly:
.quad	0xffffffffffffffff,0xffffffff00000000,0xffffffffffffffff,0xfffffffeffffffff

// The order n of the SM2 group
.Lord:
.quad	0x53bbf40939d54123,0x7203df6b21c6052b,0xffffffffffffffff,0xfffffffeffffffff

// (p + 1) / 2
.Lpoly_div_2:
.quad	0x8000000000000000,0xffffffff80000000,0xffffffffffffffff,0x7fffffff7fffffff

// (n + 1) / 2
.Lord_div_2:
.quad	0xa9ddfa049ceaa092,0xb901efb590e30295,0xffffffffffffffff,0x7fffffff7fffffff
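// All constants above are stored as four 64-bit limbs, least-significant
// limb first, matching the BN_ULONG order the functions below load with ldp.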
// void bn_rshift1(BN_ULONG *a);
.globl	bn_rshift1
.type	bn_rshift1,%function
.align	5
bn_rshift1:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x0]
	ldp	$s2,$s3,[x0,#16]
	// Right shift
	extr	$s0,$s1,$s0,#1
	extr	$s1,$s2,$s1,#1
	extr	$s2,$s3,$s2,#1
	lsr	$s3,$s3,#1
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
	ret
.size	bn_rshift1,.-bn_rshift1

// void bn_sub(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	bn_sub
.type	bn_sub,%function
.align	5
bn_sub:
	AARCH64_VALID_CALL_TARGET
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// Subtraction
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbc	$s3,$s3,$s7
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
	ret
.size	bn_sub,.-bn_sub

// void ecp_sm2p256_div_by_2(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2
.type	ecp_sm2p256_div_by_2,%function
.align	5
ecp_sm2p256_div_by_2:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_div_by_2(".Lpoly_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2,.-ecp_sm2p256_div_by_2

// void ecp_sm2p256_div_by_2_mod_ord(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_div_by_2_mod_ord
.type	ecp_sm2p256_div_by_2_mod_ord,%function
.align	5
ecp_sm2p256_div_by_2_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_div_by_2(".Lord_div_2");
$code.=<<___;
	ret
.size	ecp_sm2p256_div_by_2_mod_ord,.-ecp_sm2p256_div_by_2_mod_ord

// void ecp_sm2p256_mul_by_3(BN_ULONG *r,const BN_ULONG *a);
.globl	ecp_sm2p256_mul_by_3
.type	ecp_sm2p256_mul_by_3,%function
.align	5
ecp_sm2p256_mul_by_3:
	AARCH64_VALID_CALL_TARGET
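	// 3*a is computed as (2*a mod p) + a (mod p); since a < p, each
	// stage needs at most one conditional subtraction of p.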
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	// 2*a
	adds	$s0,$s0,$s0
	adcs	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$t4,xzr,xzr
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3
	// Subtract the modulus p
	adr	x2,.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr
	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs
	eor	$t4,$t4,$t4
	// 3*a
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]
	adds	$s0,$s0,$s4
	adcs	$s1,$s1,$s5
	adcs	$s2,$s2,$s6
	adcs	$s3,$s3,$s7
	adcs	$t4,xzr,xzr
	mov	$t0,$s0
	mov	$t1,$s1
	mov	$t2,$s2
	mov	$t3,$s3
	// Subtract the modulus p
	adr	x2,.Lpoly
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	subs	$s0,$s0,$s4
	sbcs	$s1,$s1,$s5
	sbcs	$s2,$s2,$s6
	sbcs	$s3,$s3,$s7
	sbcs	$t4,$t4,xzr
	csel	$s0,$s0,$t0,cs
	csel	$s1,$s1,$t1,cs
	csel	$s2,$s2,$t2,cs
	csel	$s3,$s3,$t3,cs
	// Store results
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
	ret
.size	ecp_sm2p256_mul_by_3,.-ecp_sm2p256_mul_by_3

// void ecp_sm2p256_add(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_add
.type	ecp_sm2p256_add,%function
.align	5
ecp_sm2p256_add:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_add(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_add,.-ecp_sm2p256_add

// void ecp_sm2p256_sub(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub
.type	ecp_sm2p256_sub,%function
.align	5
ecp_sm2p256_sub:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_sub(".Lpoly");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub,.-ecp_sm2p256_sub

// void ecp_sm2p256_sub_mod_ord(BN_ULONG *r,const BN_ULONG *a,const BN_ULONG *b);
.globl	ecp_sm2p256_sub_mod_ord
.type	ecp_sm2p256_sub_mod_ord,%function
.align	5
ecp_sm2p256_sub_mod_ord:
	AARCH64_VALID_CALL_TARGET
___
&bn_mod_sub(".Lord");
$code.=<<___;
	ret
.size	ecp_sm2p256_sub_mod_ord,.-ecp_sm2p256_sub_mod_ord

.macro RDC
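	// p = 2^256 - 2^224 - 2^96 + 2^64 - 1, hence
	// 2^256 = 2^224 + 2^96 - 2^64 + 1 (mod p), and the schedule below
	// folds the upper 256 bits back into the lower ones using this
	// congruence on 32-bit limbs. Input: 512-bit value in s0..s7;
	// output: reduced 256-bit value in s0..s3; t0..t5 and [sp,#32..63]
	// are scratch. The a8..a15 aliases name the same x7..x14 registers
	// as s0..s7: after the spread in step 2, the low 32-bit halves sit
	// in s0..s3 and the high halves in s4..s7.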
	// a = | s7 | ... | s0 |, where si are 64-bit quantities
	//   = |a15|a14| ... |a1|a0|, where ai are 32-bit quantities
	// |    s7     |    s6     |    s5     |    s4     |
	// | a15 | a14 | a13 | a12 | a11 | a10 | a9  | a8  |
	// |    s3     |    s2     |    s1     |    s0     |
	// | a7  | a6  | a5  | a4  | a3  | a2  | a1  | a0  |
	// =================================================
	// | a8  | a11 | a10 | a9  | a8  |  0  |    s4     | (+)
	// | a9  | a15 |    s6     | a11 |  0  | a10 | a9  | (+)
	// | a10 |  0  | a14 | a13 | a12 |  0  |    s5     | (+)
	// | a11 |  0  |    s7     | a13 |  0  | a12 | a11 | (+)
	// | a12 |  0  |    s7     | a13 |  0  |    s6     | (+)
	// | a12 |  0  |  0  | a15 | a14 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  | a15 |  0  | a14 | a13 | (+)
	// | a13 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |    s7     | (+)
	// | a14 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  | a15 | (+)
	// | a15 |  0  |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |    s7     |  0  |  0  |  0  |  0  |  0  |  0  | (+)
	// |  0  |  0  |  0  |  0  |  0  | a8  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a9  |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a13 |  0  |  0  | (-)
	// |  0  |  0  |  0  |  0  |  0  | a14 |  0  |  0  | (-)
	// | U[7]| U[6]| U[5]| U[4]| U[3]| U[2]| U[1]| U[0]|
	// |    V[3]   |    V[2]   |    V[1]   |    V[0]   |
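	// U[0..7] are the 32-bit column sums accumulated below; V[0..3] are
	// the 64-bit words they are recombined into before the final carry
	// propagation and reduction.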
	// 1. 64-bit addition
	// t2=s6+s7+s7
	adds	$t2,$s6,$s7
	adcs	$t1,xzr,xzr
	adds	$t2,$t2,$s7
	adcs	$t1,$t1,xzr
	// t3=s4+s5+t2
	adds	$t3,$s4,$t2
	adcs	$t4,$t1,xzr
	adds	$t3,$t3,$s5
	adcs	$t4,$t4,xzr
	// sum
	adds	$s0,$s0,$t3
	adcs	$s1,$s1,$t4
	adcs	$s2,$s2,$t2
	adcs	$s3,$s3,$s7
	adcs	$t0,xzr,xzr
	adds	$s3,$s3,$t1
	adcs	$t0,$t0,xzr
	stp	$s0,$s1,[sp,#32]
	stp	$s2,$s3,[sp,#48]
	// 2. 64-bit to 32-bit spread
	mov	$t1,#0xffffffff
	mov	$s0,$s4
	mov	$s1,$s5
	mov	$s2,$s6
	mov	$s3,$s7
	and	$s0,$s0,$t1	// a8
	and	$s1,$s1,$t1	// a10
	and	$s2,$s2,$t1	// a12
	and	$s3,$s3,$t1	// a14
	lsr	$s4,$s4,#32	// a9
	lsr	$s5,$s5,#32	// a11
	lsr	$s6,$s6,#32	// a13
	lsr	$s7,$s7,#32	// a15
	// 3. 32-bit addition
	add	$t1,$a14,$a12	// t1 <- a12 + a14
	add	$t2,$a15,$a13	// t2 <- a13 + a15
	add	$t3,$a8,$a9	// t3 <- a8 + a9
	add	$t4,$a14,$a10	// t4 <- a10 + a14
	add	$a15,$a15,$a11	// a15 <- a11 + a15
	add	$a12,$t2,$t1	// a12 <- a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + a12 + a13 + a14 + a15
	add	$a10,$a10,$a12	// a10 <- a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$t3	// a10 <- a8 + a9 + a10 + 2*(a12 + a13 + a14 + a15)
	add	$a10,$a10,$a11	// a10 <- a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	add	$a12,$a12,$a13	// a12 <- a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a11	// a12 <- a11 + a12 + 2*a13 + a14 + a15
	add	$a12,$a12,$a8	// a12 <- a8 + a11 + a12 + 2*a13 + a14 + a15
	add	$t3,$t3,$a14	// t3 <- a8 + a9 + a14
	add	$t3,$t3,$a13	// t3 <- a8 + a9 + a13 + a14
	add	$a9,$a9,$t2	// a9 <- a9 + a13 + a15
	add	$a11,$a11,$a9	// a11 <- a9 + a11 + a13 + a15
	add	$a11,$a11,$t2	// a11 <- a9 + a11 + 2*(a13 + a15)
	add	$t1,$t1,$t4	// t1 <- a10 + a12 + 2*a14
	// U[0]  s5  a9 + a11 + 2*(a13 + a15)
	// U[1]  t1  a10 + a12 + 2*a14
	// U[2] -t3  a8 + a9 + a13 + a14
	// U[3]  s2  a8 + a11 + a12 + 2*a13 + a14 + a15
	// U[4]  s4  a9 + a13 + a15
	// U[5]  t4  a10 + a14
	// U[6]  s7  a11 + a15
	// U[7]  s1  a8 + a9 + a10 + a11 + 2*(a12 + a13 + a14 + a15)
	// 4. 32-bit to 64-bit
	lsl	$s0,$t1,#32
	extr	$t1,$s2,$t1,#32
	extr	$s2,$t4,$s2,#32
	extr	$t4,$s1,$t4,#32
	lsr	$s1,$s1,#32
	// 5. 64-bit addition
	adds	$s5,$s5,$s0
	adcs	$t1,$t1,xzr
	adcs	$s4,$s4,$s2
	adcs	$s7,$s7,$t4
	adcs	$t0,$t0,$s1
	// V[0] s5
	// V[1] t1
	// V[2] s4
	// V[3] s7
	// carry t0
	// sub  t3
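	// t3 holds the four (-) rows of the table (a8 + a9 + a13 + a14);
	// it enters with a negative sign at the 2^64 limb just below.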
	// 6. Process s0-s3
	ldp	$s0,$s1,[sp,#32]
	ldp	$s2,$s3,[sp,#48]
	// add with V0-V3
	adds	$s0,$s0,$s5
	adcs	$s1,$s1,$t1
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s7
	adcs	$t0,$t0,xzr
	// sub with t3
	subs	$s1,$s1,$t3
	sbcs	$s2,$s2,xzr
	sbcs	$s3,$s3,xzr
	sbcs	$t0,$t0,xzr
	// 7. MOD
	// First Mod
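	// Fold the 64-bit carry c (in t0) back in, using
	// c*2^256 = c*(2^224 + 2^96 - 2^64 + 1) (mod p):
	// t1 = c*2^32 lands at limb 3 (c*2^224), t2 = c*(2^32 - 1) at
	// limb 1 (c*2^96 - c*2^64), and c itself at limb 0.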
	lsl	$t1,$t0,#32
	subs	$t2,$t1,$t0
	adds	$s0,$s0,$t0
	adcs	$s1,$s1,$t2
	adcs	$s2,$s2,xzr
	adcs	$s3,$s3,$t1
	// Last Mod
	// return y - p if y > p else y
	mov	$s4,$s0
	mov	$s5,$s1
	mov	$s6,$s2
	mov	$s7,$s3
	adr	$t0,.Lpoly
	ldp	$t1,$t2,[$t0]
	ldp	$t3,$t4,[$t0,#16]
	adcs	$t5,xzr,xzr
	subs	$s0,$s0,$t1
	sbcs	$s1,$s1,$t2
	sbcs	$s2,$s2,$t3
	sbcs	$s3,$s3,$t4
	sbcs	$t5,$t5,xzr
	csel	$s0,$s0,$s4,cs
	csel	$s1,$s1,$s5,cs
	csel	$s2,$s2,$s6,cs
	csel	$s3,$s3,$s7,cs
.endm

// void ecp_sm2p256_mul(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b);
.globl	ecp_sm2p256_mul
.type	ecp_sm2p256_mul,%function
.align	5
ecp_sm2p256_mul:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]
	// Load inputs
	ldp	$s0,$s1,[x1]
	ldp	$s2,$s3,[x1,#16]
	ldp	$s4,$s5,[x2]
	ldp	$s6,$s7,[x2,#16]
	// ### multiplication ###
	// ========================
	//             s3 s2 s1 s0
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s0 s0 s0 s0
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s1 s1 s1 s1
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s2 s2 s2 s2
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s3 s3 s3 s3
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================
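	// Product-scanning (Comba) multiply: the mul/umulh partial products
	// of each anti-diagonal are accumulated into one 64-bit column,
	// with adcs propagating carries into the following columns.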
	// ### s0*s4 ###
	mul	$t5,$s0,$s4
	umulh	$t2,$s0,$s4
	// ### s1*s4 + s0*s5 ###
	mul	$t0,$s1,$s4
	umulh	$t1,$s1,$s4
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr
	mul	$t0,$s0,$s5
	umulh	$t1,$s0,$s5
	adds	$t2,$t2,$t0
	adcs	$t3,$t3,$t1
	adcs	$t4,xzr,xzr
	// ### s2*s4 + s1*s5 + s0*s6 ###
	mul	$t0,$s2,$s4
	umulh	$t1,$s2,$s4
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	mul	$t0,$s1,$s5
	umulh	$t1,$s1,$s5
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,xzr,xzr
	mul	$t0,$s0,$s6
	umulh	$t1,$s0,$s6
	adds	$t3,$t3,$t0
	adcs	$t4,$t4,$t1
	adcs	$t6,$t6,xzr
	// ### s3*s4 + s2*s5 + s1*s6 + s0*s7 ###
	mul	$t0,$s3,$s4
	umulh	$t1,$s3,$s4
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,xzr,xzr
	mul	$t0,$s2,$s5
	umulh	$t1,$s2,$s5
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr
	mul	$t0,$s1,$s6
	umulh	$t1,$s1,$s6
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr
	mul	$t0,$s0,$s7
	umulh	$t1,$s0,$s7
	adds	$t4,$t4,$t0
	adcs	$t6,$t6,$t1
	adcs	$t7,$t7,xzr
	// ### s3*s5 + s2*s6 + s1*s7 ###
	mul	$t0,$s3,$s5
	umulh	$t1,$s3,$s5
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,xzr,xzr
	mul	$t0,$s2,$s6
	umulh	$t1,$s2,$s6
	adds	$t6,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr
	mul	$t0,$s1,$s7
	umulh	$t1,$s1,$s7
	adds	$s4,$t6,$t0
	adcs	$t7,$t7,$t1
	adcs	$t8,$t8,xzr
	// ### s3*s6 + s2*s7 ###
	mul	$t0,$s3,$s6
	umulh	$t1,$s3,$s6
	adds	$t7,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,xzr,xzr
	mul	$t0,$s2,$s7
	umulh	$t1,$s2,$s7
	adds	$s5,$t7,$t0
	adcs	$t8,$t8,$t1
	adcs	$t6,$t6,xzr
	// ### s3*s7 ###
	mul	$t0,$s3,$s7
	umulh	$t1,$s3,$s7
	adds	$s6,$t8,$t0
	adcs	$s7,$t6,$t1
	mov	$s0,$t5
	mov	$s1,$t2
	mov	$s2,$t3
	mov	$s3,$t4
	// result of mul: s7 s6 s5 s4 s3 s2 s1 s0
	// ### Reduction ###
	RDC
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_mul,.-ecp_sm2p256_mul

// void ecp_sm2p256_sqr(BN_ULONG *r, const BN_ULONG *a);
.globl	ecp_sm2p256_sqr
.type	ecp_sm2p256_sqr,%function
.align	5
ecp_sm2p256_sqr:
	AARCH64_SIGN_LINK_REGISTER
	// Store scalar registers
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	x16,x17,[sp,#16]
	stp	x19,x20,[sp,#64]
	// Load inputs
	ldp	$s4,$s5,[x1]
	ldp	$s6,$s7,[x1,#16]
	// ### square ###
	// ========================
	//             s7 s6 s5 s4
	// *           s7 s6 s5 s4
	// ------------------------
	// +           s4 s4 s4 s4
	//              *  *  *  *
	//             s7 s6 s5 s4
	//          s5 s5 s5 s5
	//           *  *  *  *
	//          s7 s6 s5 s4
	//       s6 s6 s6 s6
	//        *  *  *  *
	//       s7 s6 s5 s4
	//    s7 s7 s7 s7
	//     *  *  *  *
	//    s7 s6 s5 s4
	// ------------------------
	// s7 s6 s5 s4 s3 s2 s1 s0
	// ========================
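	// Squaring trick: each cross product si*sj (i < j) is computed once
	// and doubled, then the diagonal squares are added, roughly halving
	// the multiplication count versus the general multiply above.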
	// ### s4*s5 ###
	mul	$s1,$s4,$s5
	umulh	$s2,$s4,$s5
	// ### s4*s6 ###
	mul	$t0,$s6,$s4
	umulh	$s3,$s6,$s4
	adds	$s2,$s2,$t0
	adcs	$s3,$s3,xzr
	// ### s4*s7 + s5*s6 ###
	mul	$t0,$s7,$s4
	umulh	$t1,$s7,$s4
	adds	$s3,$s3,$t0
	adcs	$s0,$t1,xzr
	mul	$t0,$s6,$s5
	umulh	$t1,$s6,$s5
	adds	$s3,$s3,$t0
	adcs	$s0,$s0,$t1
	adcs	$t2,xzr,xzr
	// ### s5*s7 ###
	mul	$t0,$s7,$s5
	umulh	$t1,$s7,$s5
	adds	$s0,$s0,$t0
	adcs	$t2,$t2,$t1
	// ### s6*s7 ###
	mul	$t0,$s7,$s6
	umulh	$t1,$s7,$s6
	adds	$t2,$t2,$t0
	adcs	$t3,$t1,xzr
	// ### 2*(t3,t2,s0,s3,s2,s1) ###
	adds	$s1,$s1,$s1
	adcs	$s2,$s2,$s2
	adcs	$s3,$s3,$s3
	adcs	$s0,$s0,$s0
	adcs	$t2,$t2,$t2
	adcs	$t3,$t3,$t3
	adcs	$t4,xzr,xzr
	// ### s4*s4 ###
	mul	$t5,$s4,$s4
	umulh	$t6,$s4,$s4
	// ### s5*s5 ###
	mul	$s4,$s5,$s5
	umulh	$s5,$s5,$s5
	// ### s6*s6 ###
	mul	$t0,$s6,$s6
	umulh	$t1,$s6,$s6
	// ### s7*s7 ###
	mul	$t7,$s7,$s7
	umulh	$t8,$s7,$s7
	adds	$s1,$s1,$t6
	adcs	$s2,$s2,$s4
	adcs	$s3,$s3,$s5
	adcs	$s0,$s0,$t0
	adcs	$t2,$t2,$t1
	adcs	$t3,$t3,$t7
	adcs	$t4,$t4,$t8
	mov	$s4,$s0
	mov	$s0,$t5
	mov	$s5,$t2
	mov	$s6,$t3
	mov	$s7,$t4
	// result of sqr: s7 s6 s5 s4 s3 s2 s1 s0
	// ### Reduction ###
	RDC
	stp	$s0,$s1,[x0]
	stp	$s2,$s3,[x0,#16]
	// Restore scalar registers
	ldp	x16,x17,[sp,#16]
	ldp	x19,x20,[sp,#64]
	ldp	x29,x30,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ecp_sm2p256_sqr,.-ecp_sm2p256_sqr
___
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";	# enforce flush