- #! /usr/bin/env perl
- # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
- # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
- # (1) Intel Corporation, Israel Development Center, Haifa, Israel
- # (2) University of Haifa, Israel
- #
- # References:
- # [1] S. Gueron, "Efficient Software Implementations of Modular
- # Exponentiation", http://eprint.iacr.org/2011/239
- # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring".
- # IEEE Proceedings of 9th International Conference on Information
- # Technology: New Generations (ITNG 2012), 821-823 (2012).
- # [3] S. Gueron, "Efficient Software Implementations of Modular
- # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
- # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
- # resistant 512-bit and 1024-bit modular exponentiation for optimizing
- # RSA1024 and RSA2048 on x86_64 platforms",
- # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest
- #
- # While the original submission covers 512- and 1024-bit exponentiation,
- # this module is limited to the 512-bit version only (and as such
- # accelerates RSA1024 signing). This is because the improvement for
- # longer keys is not high enough to justify the effort; the highest
- # measured gain was ~5% on Westmere. [This is relative to OpenSSL 1.0.2,
- # upcoming at the time of this writing!] Nor does this module implement
- # a "monolithic" all-in-one exponentiation subroutine; it adheres to a
- # more modular mixture of C and assembly. It is also optimized for
- # processors other than the Intel Core family (see the table below for
- # improvement coefficients).
- # <appro@openssl.org>
- #
- # RSA1024 sign/sec  this/original |this/rsax(*)  this/fips(*)
- # ----------------+-------------------------------------------
- # Opteron               +13%      |    +5%           +20%
- # Bulldozer             -0%       |    -1%           +10%
- # P4                    +11%      |    +7%           +8%
- # Westmere              +5%       |    +14%          +17%
- # Sandy Bridge          +2%       |    +12%          +29%
- # Ivy Bridge            +1%       |    +11%          +35%
- # Haswell(**)           -0%       |    +12%          +39%
- # Atom                  +13%      |    +11%          +4%
- # VIA Nano              +70%      |    +9%           +25%
- #
- # (*)  rsax engine and fips numbers are presented for reference
- #      purposes only;
- # (**) MULX was attempted, but was found to give only a marginal
- #      improvement.
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
- or die "can't call $xlate: $!";
- *STDOUT=*OUT;
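- #
- # For reference, the script is driven in the usual perlasm fashion:
- # flavour first, output file last.  An illustrative invocation (the
- # script and output names below are examples only, not mandated by the
- # build system):
- #
- #   perl rsaz-x86_64.pl elf  rsaz-x86_64.s    # GNU as, Linux/ELF
- #   perl rsaz-x86_64.pl nasm rsaz-x86_64.asm  # NASM, Win64
- #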
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $addx = ($1>=2.23);
- }
- if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
- $addx = ($1>=2.10);
- }
- if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $addx = ($1>=12);
- }
- if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) {
- my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
- $addx = ($ver>=3.03);
- }
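- #
- # $addx only decides whether the MULX/ADCX/ADOX code paths are emitted
- # at all (older assemblers cannot encode them); the actual choice at
- # run time is made by the 0x80100 tests against OPENSSL_ia32cap_P+8
- # below.  Assuming the usual OPENSSL_ia32cap_P layout, that word
- # mirrors CPUID.(EAX=7,ECX=0):EBX, where bit 8 (0x100) is BMI2/MULX and
- # bit 19 (0x80000) is ADX, hence the 0x80100 mask.
- #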
- ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
- {
- my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
- $code.=<<___;
- .text
- .extern OPENSSL_ia32cap_P
- .globl rsaz_512_sqr
- .type rsaz_512_sqr,\@function,5
- .align 32
- rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
- .cfi_startproc
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- subq \$128+24, %rsp
- .cfi_adjust_cfa_offset 128+24
- .Lsqr_body:
- movq $mod, %xmm1 # common off-load
- movq ($inp), %rdx
- movq 8($inp), %rax
- movq $n0, 128(%rsp)
- ___
- $code.=<<___ if ($addx);
- movl \$0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
- je .Loop_sqrx
- ___
- $code.=<<___;
- jmp .Loop_sqr
- .align 32
- .Loop_sqr:
- movl $times,128+8(%rsp)
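- # Squaring is organized as 8 schoolbook iterations: iteration i forms
- # the cross products a[i]*a[i+1..7], the accumulated cross products are
- # doubled one limb pair at a time (the "<< 1" shifts annotated below),
- # and the diagonal term a[i]^2 is folded in during the same step.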
- #first iteration
- movq %rdx, %rbx # 0($inp)
- mov %rax, %rbp # 8($inp)
- mulq %rdx
- movq %rax, %r8
- movq 16($inp), %rax
- movq %rdx, %r9
- mulq %rbx
- addq %rax, %r9
- movq 24($inp), %rax
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r10
- movq 32($inp), %rax
- movq %rdx, %r11
- adcq \$0, %r11
- mulq %rbx
- addq %rax, %r11
- movq 40($inp), %rax
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r12
- movq 48($inp), %rax
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r13
- movq 56($inp), %rax
- movq %rdx, %r14
- adcq \$0, %r14
- mulq %rbx
- addq %rax, %r14
- movq %rbx, %rax
- adcq \$0, %rdx
- xorq %rcx,%rcx # rcx:r8 = r8 << 1
- addq %r8, %r8
- movq %rdx, %r15
- adcq \$0, %rcx
- mulq %rax
- addq %r8, %rdx
- adcq \$0, %rcx
- movq %rax, (%rsp)
- movq %rdx, 8(%rsp)
- #second iteration
- movq 16($inp), %rax
- mulq %rbp
- addq %rax, %r10
- movq 24($inp), %rax
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %rbp
- addq %rax, %r11
- movq 32($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r11
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %rbp
- addq %rax, %r12
- movq 40($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r12
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %rbp
- addq %rax, %r13
- movq 48($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r13
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %rbp
- addq %rax, %r14
- movq 56($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r14
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %rbp
- addq %rax, %r15
- movq %rbp, %rax
- adcq \$0, %rdx
- addq %rbx, %r15
- adcq \$0, %rdx
- xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
- addq %r9, %r9
- movq %rdx, %r8
- adcq %r10, %r10
- adcq \$0, %rbx
- mulq %rax
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rcx, %rax
- movq 16($inp), %rbp
- addq %rax, %r9
- movq 24($inp), %rax
- adcq %rdx, %r10
- adcq \$0, %rbx
- movq %r9, 16(%rsp)
- movq %r10, 24(%rsp)
- #third iteration
- mulq %rbp
- addq %rax, %r12
- movq 32($inp), %rax
- movq %rdx, %rcx
- adcq \$0, %rcx
- mulq %rbp
- addq %rax, %r13
- movq 40($inp), %rax
- adcq \$0, %rdx
- addq %rcx, %r13
- movq %rdx, %rcx
- adcq \$0, %rcx
- mulq %rbp
- addq %rax, %r14
- movq 48($inp), %rax
- adcq \$0, %rdx
- addq %rcx, %r14
- movq %rdx, %rcx
- adcq \$0, %rcx
- mulq %rbp
- addq %rax, %r15
- movq 56($inp), %rax
- adcq \$0, %rdx
- addq %rcx, %r15
- movq %rdx, %rcx
- adcq \$0, %rcx
- mulq %rbp
- addq %rax, %r8
- movq %rbp, %rax
- adcq \$0, %rdx
- addq %rcx, %r8
- adcq \$0, %rdx
- xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
- addq %r11, %r11
- movq %rdx, %r9
- adcq %r12, %r12
- adcq \$0, %rcx
- mulq %rax
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rbx, %rax
- movq 24($inp), %r10
- addq %rax, %r11
- movq 32($inp), %rax
- adcq %rdx, %r12
- adcq \$0, %rcx
- movq %r11, 32(%rsp)
- movq %r12, 40(%rsp)
- #fourth iteration
- mov %rax, %r11 # 32($inp)
- mulq %r10
- addq %rax, %r14
- movq 40($inp), %rax
- movq %rdx, %rbx
- adcq \$0, %rbx
- mov %rax, %r12 # 40($inp)
- mulq %r10
- addq %rax, %r15
- movq 48($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r15
- movq %rdx, %rbx
- adcq \$0, %rbx
- mov %rax, %rbp # 48($inp)
- mulq %r10
- addq %rax, %r8
- movq 56($inp), %rax
- adcq \$0, %rdx
- addq %rbx, %r8
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %r10
- addq %rax, %r9
- movq %r10, %rax
- adcq \$0, %rdx
- addq %rbx, %r9
- adcq \$0, %rdx
- xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1
- addq %r13, %r13
- movq %rdx, %r10
- adcq %r14, %r14
- adcq \$0, %rbx
- mulq %rax
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rcx, %rax
- addq %rax, %r13
- movq %r12, %rax # 40($inp)
- adcq %rdx, %r14
- adcq \$0, %rbx
- movq %r13, 48(%rsp)
- movq %r14, 56(%rsp)
- #fifth iteration
- mulq %r11
- addq %rax, %r8
- movq %rbp, %rax # 48($inp)
- movq %rdx, %rcx
- adcq \$0, %rcx
- mulq %r11
- addq %rax, %r9
- movq 56($inp), %rax
- adcq \$0, %rdx
- addq %rcx, %r9
- movq %rdx, %rcx
- adcq \$0, %rcx
- mov %rax, %r14 # 56($inp)
- mulq %r11
- addq %rax, %r10
- movq %r11, %rax
- adcq \$0, %rdx
- addq %rcx, %r10
- adcq \$0, %rdx
- xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1
- addq %r15, %r15
- movq %rdx, %r11
- adcq %r8, %r8
- adcq \$0, %rcx
- mulq %rax
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rbx, %rax
- addq %rax, %r15
- movq %rbp, %rax # 48($inp)
- adcq %rdx, %r8
- adcq \$0, %rcx
- movq %r15, 64(%rsp)
- movq %r8, 72(%rsp)
- #sixth iteration
- mulq %r12
- addq %rax, %r10
- movq %r14, %rax # 56($inp)
- movq %rdx, %rbx
- adcq \$0, %rbx
- mulq %r12
- addq %rax, %r11
- movq %r12, %rax
- adcq \$0, %rdx
- addq %rbx, %r11
- adcq \$0, %rdx
- xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1
- addq %r9, %r9
- movq %rdx, %r12
- adcq %r10, %r10
- adcq \$0, %rbx
- mulq %rax
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rcx, %rax
- addq %rax, %r9
- movq %r14, %rax # 56($inp)
- adcq %rdx, %r10
- adcq \$0, %rbx
- movq %r9, 80(%rsp)
- movq %r10, 88(%rsp)
- #seventh iteration
- mulq %rbp
- addq %rax, %r12
- movq %rbp, %rax
- adcq \$0, %rdx
- xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1
- addq %r11, %r11
- movq %rdx, %r13
- adcq %r12, %r12
- adcq \$0, %rcx
- mulq %rax
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rbx, %rax
- addq %rax, %r11
- movq %r14, %rax # 56($inp)
- adcq %rdx, %r12
- adcq \$0, %rcx
- movq %r11, 96(%rsp)
- movq %r12, 104(%rsp)
- #eighth iteration
- xorq %rbx, %rbx # rbx:r13 = r13 << 1
- addq %r13, %r13
- adcq \$0, %rbx
- mulq %rax
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- addq %rcx, %rax
- addq %r13, %rax
- adcq %rbx, %rdx
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- movq %xmm1, %rbp
- movq %rax, 112(%rsp)
- movq %rdx, 120(%rsp)
- call __rsaz_512_reduce
- addq 64(%rsp), %r8
- adcq 72(%rsp), %r9
- adcq 80(%rsp), %r10
- adcq 88(%rsp), %r11
- adcq 96(%rsp), %r12
- adcq 104(%rsp), %r13
- adcq 112(%rsp), %r14
- adcq 120(%rsp), %r15
- sbbq %rcx, %rcx
- call __rsaz_512_subtract
- movq %r8, %rdx
- movq %r9, %rax
- movl 128+8(%rsp), $times
- movq $out, $inp
- decl $times
- jnz .Loop_sqr
- ___
- if ($addx) {
- $code.=<<___;
- jmp .Lsqr_tail
- .align 32
- .Loop_sqrx:
- movl $times,128+8(%rsp)
- movq $out, %xmm0 # off-load
- #first iteration
- mulx %rax, %r8, %r9
- mov %rax, %rbx
- mulx 16($inp), %rcx, %r10
- xor %rbp, %rbp # cf=0, of=0
- mulx 24($inp), %rax, %r11
- adcx %rcx, %r9
- .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12
- adcx %rax, %r10
- .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13
- adcx %rcx, %r11
- mulx 48($inp), %rcx, %r14
- adcx %rax, %r12
- adcx %rcx, %r13
- mulx 56($inp), %rax, %r15
- adcx %rax, %r14
- adcx %rbp, %r15 # %rbp is 0
- mulx %rdx, %rax, $out
- mov %rbx, %rdx # 8($inp)
- xor %rcx, %rcx
- adox %r8, %r8
- adcx $out, %r8
- adox %rbp, %rcx
- adcx %rbp, %rcx
- mov %rax, (%rsp)
- mov %r8, 8(%rsp)
- #second iteration
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx
- adox %rax, %r10
- adcx %rbx, %r11
- mulx 24($inp), $out, %r8
- adox $out, %r11
- .byte 0x66
- adcx %r8, %r12
- mulx 32($inp), %rax, %rbx
- adox %rax, %r12
- adcx %rbx, %r13
- mulx 40($inp), $out, %r8
- adox $out, %r13
- adcx %r8, %r14
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
- adox %rax, %r14
- adcx %rbx, %r15
- .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
- adox $out, %r15
- adcx %rbp, %r8
- mulx %rdx, %rax, $out
- adox %rbp, %r8
- .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx
- xor %rbx, %rbx
- adox %r9, %r9
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rcx, %rax
- adox %r10, %r10
- adcx %rax, %r9
- adox %rbp, %rbx
- adcx $out, %r10
- adcx %rbp, %rbx
- mov %r9, 16(%rsp)
- .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
- #third iteration
- mulx 24($inp), $out, %r9
- adox $out, %r12
- adcx %r9, %r13
- mulx 32($inp), %rax, %rcx
- adox %rax, %r13
- adcx %rcx, %r14
- .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9
- adox $out, %r14
- adcx %r9, %r15
- .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
- adox %rax, %r15
- adcx %rcx, %r8
- mulx 56($inp), $out, %r9
- adox $out, %r8
- adcx %rbp, %r9
- mulx %rdx, %rax, $out
- adox %rbp, %r9
- mov 24($inp), %rdx
- xor %rcx, %rcx
- adox %r11, %r11
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rbx, %rax
- adox %r12, %r12
- adcx %rax, %r11
- adox %rbp, %rcx
- adcx $out, %r12
- adcx %rbp, %rcx
- mov %r11, 32(%rsp)
- mov %r12, 40(%rsp)
- #fourth iteration
- mulx 32($inp), %rax, %rbx
- adox %rax, %r14
- adcx %rbx, %r15
- mulx 40($inp), $out, %r10
- adox $out, %r15
- adcx %r10, %r8
- mulx 48($inp), %rax, %rbx
- adox %rax, %r8
- adcx %rbx, %r9
- mulx 56($inp), $out, %r10
- adox $out, %r9
- adcx %rbp, %r10
- mulx %rdx, %rax, $out
- adox %rbp, %r10
- mov 32($inp), %rdx
- xor %rbx, %rbx
- adox %r13, %r13
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rcx, %rax
- adox %r14, %r14
- adcx %rax, %r13
- adox %rbp, %rbx
- adcx $out, %r14
- adcx %rbp, %rbx
- mov %r13, 48(%rsp)
- mov %r14, 56(%rsp)
- #fifth iteration
- mulx 40($inp), $out, %r11
- adox $out, %r8
- adcx %r11, %r9
- mulx 48($inp), %rax, %rcx
- adox %rax, %r9
- adcx %rcx, %r10
- mulx 56($inp), $out, %r11
- adox $out, %r10
- adcx %rbp, %r11
- mulx %rdx, %rax, $out
- mov 40($inp), %rdx
- adox %rbp, %r11
- xor %rcx, %rcx
- adox %r15, %r15
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rbx, %rax
- adox %r8, %r8
- adcx %rax, %r15
- adox %rbp, %rcx
- adcx $out, %r8
- adcx %rbp, %rcx
- mov %r15, 64(%rsp)
- mov %r8, 72(%rsp)
- #sixth iteration
- .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
- adox %rax, %r10
- adcx %rbx, %r11
- .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
- adox $out, %r11
- adcx %rbp, %r12
- mulx %rdx, %rax, $out
- adox %rbp, %r12
- mov 48($inp), %rdx
- xor %rbx, %rbx
- adox %r9, %r9
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rcx, %rax
- adox %r10, %r10
- adcx %rax, %r9
- adcx $out, %r10
- adox %rbp, %rbx
- adcx %rbp, %rbx
- mov %r9, 80(%rsp)
- mov %r10, 88(%rsp)
- #seventh iteration
- .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
- adox %rax, %r12
- adox %rbp, %r13
- mulx %rdx, %rax, $out
- xor %rcx, %rcx
- mov 56($inp), %rdx
- adox %r11, %r11
- # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rbx, %rax
- adox %r12, %r12
- adcx %rax, %r11
- adox %rbp, %rcx
- adcx $out, %r12
- adcx %rbp, %rcx
- .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
- .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
- #eighth iteration
- mulx %rdx, %rax, %rdx
- xor %rbx, %rbx
- adox %r13, %r13
- # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here
- adcx %rcx, %rax
- adox %rbp, %rbx
- adcx %r13, %rax
- adcx %rdx, %rbx
- movq %xmm0, $out
- movq %xmm1, %rbp
- movq 128(%rsp), %rdx # pull $n0
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- movq %rax, 112(%rsp)
- movq %rbx, 120(%rsp)
- call __rsaz_512_reducex
- addq 64(%rsp), %r8
- adcq 72(%rsp), %r9
- adcq 80(%rsp), %r10
- adcq 88(%rsp), %r11
- adcq 96(%rsp), %r12
- adcq 104(%rsp), %r13
- adcq 112(%rsp), %r14
- adcq 120(%rsp), %r15
- sbbq %rcx, %rcx
- call __rsaz_512_subtract
- movq %r8, %rdx
- movq %r9, %rax
- movl 128+8(%rsp), $times
- movq $out, $inp
- decl $times
- jnz .Loop_sqrx
- .Lsqr_tail:
- ___
- }
- $code.=<<___;
- leaq 128+24+48(%rsp), %rax
- .cfi_def_cfa %rax,8
- movq -48(%rax), %r15
- .cfi_restore %r15
- movq -40(%rax), %r14
- .cfi_restore %r14
- movq -32(%rax), %r13
- .cfi_restore %r13
- movq -24(%rax), %r12
- .cfi_restore %r12
- movq -16(%rax), %rbp
- .cfi_restore %rbp
- movq -8(%rax), %rbx
- .cfi_restore %rbx
- leaq (%rax), %rsp
- .cfi_def_cfa_register %rsp
- .Lsqr_epilogue:
- ret
- .cfi_endproc
- .size rsaz_512_sqr,.-rsaz_512_sqr
- ___
- }
- {
- my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
- $code.=<<___;
- .globl rsaz_512_mul
- .type rsaz_512_mul,\@function,5
- .align 32
- rsaz_512_mul:
- .cfi_startproc
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- subq \$128+24, %rsp
- .cfi_adjust_cfa_offset 128+24
- .Lmul_body:
- movq $out, %xmm0 # off-load arguments
- movq $mod, %xmm1
- movq $n0, 128(%rsp)
- ___
- $code.=<<___ if ($addx);
- movl \$0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
- je .Lmulx
- ___
- $code.=<<___;
- movq ($bp), %rbx # pass b[0]
- movq $bp, %rbp # pass argument
- call __rsaz_512_mul
- movq %xmm0, $out
- movq %xmm1, %rbp
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- call __rsaz_512_reduce
- ___
- $code.=<<___ if ($addx);
- jmp .Lmul_tail
- .align 32
- .Lmulx:
- movq $bp, %rbp # pass argument
- movq ($bp), %rdx # pass b[0]
- call __rsaz_512_mulx
- movq %xmm0, $out
- movq %xmm1, %rbp
- movq 128(%rsp), %rdx # pull $n0
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- call __rsaz_512_reducex
- .Lmul_tail:
- ___
- $code.=<<___;
- addq 64(%rsp), %r8
- adcq 72(%rsp), %r9
- adcq 80(%rsp), %r10
- adcq 88(%rsp), %r11
- adcq 96(%rsp), %r12
- adcq 104(%rsp), %r13
- adcq 112(%rsp), %r14
- adcq 120(%rsp), %r15
- sbbq %rcx, %rcx
- call __rsaz_512_subtract
- leaq 128+24+48(%rsp), %rax
- .cfi_def_cfa %rax,8
- movq -48(%rax), %r15
- .cfi_restore %r15
- movq -40(%rax), %r14
- .cfi_restore %r14
- movq -32(%rax), %r13
- .cfi_restore %r13
- movq -24(%rax), %r12
- .cfi_restore %r12
- movq -16(%rax), %rbp
- .cfi_restore %rbp
- movq -8(%rax), %rbx
- .cfi_restore %rbx
- leaq (%rax), %rsp
- .cfi_def_cfa_register %rsp
- .Lmul_epilogue:
- ret
- .cfi_endproc
- .size rsaz_512_mul,.-rsaz_512_mul
- ___
- }
- {
- my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
- $code.=<<___;
- .globl rsaz_512_mul_gather4
- .type rsaz_512_mul_gather4,\@function,6
- .align 32
- rsaz_512_mul_gather4:
- .cfi_startproc
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- subq \$`128+24+($win64?0xb0:0)`, %rsp
- .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)`
- ___
- $code.=<<___ if ($win64);
- movaps %xmm6,0xa0(%rsp)
- movaps %xmm7,0xb0(%rsp)
- movaps %xmm8,0xc0(%rsp)
- movaps %xmm9,0xd0(%rsp)
- movaps %xmm10,0xe0(%rsp)
- movaps %xmm11,0xf0(%rsp)
- movaps %xmm12,0x100(%rsp)
- movaps %xmm13,0x110(%rsp)
- movaps %xmm14,0x120(%rsp)
- movaps %xmm15,0x130(%rsp)
- ___
- $code.=<<___;
- .Lmul_gather4_body:
- movd $pwr,%xmm8
- movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
- movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
- pshufd \$0,%xmm8,%xmm8 # broadcast $power
- movdqa %xmm1,%xmm7
- movdqa %xmm1,%xmm2
- ___
- ########################################################################
- # calculate mask by comparing 0..15 to $power
- #
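- # The comparisons leave an all-ones mask in exactly one 64-bit lane of
- # %xmm0-%xmm7 (the lane matching $power), so the gather below reads
- # every one of the 16 table entries and the same cache lines no matter
- # which power is selected.  A rough sketch of the idea:
- #
- #   acc = 0
- #   for (i = 0; i < 16; i++)
- #       acc |= table[i] & mask[i]   # mask[i] = (i == power) ? ~0 : 0
- #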
- for($i=0;$i<4;$i++) {
- $code.=<<___;
- paddd %xmm`$i`,%xmm`$i+1`
- pcmpeqd %xmm8,%xmm`$i`
- movdqa %xmm7,%xmm`$i+3`
- ___
- }
- for(;$i<7;$i++) {
- $code.=<<___;
- paddd %xmm`$i`,%xmm`$i+1`
- pcmpeqd %xmm8,%xmm`$i`
- ___
- }
- $code.=<<___;
- pcmpeqd %xmm8,%xmm7
- movdqa 16*0($bp),%xmm8
- movdqa 16*1($bp),%xmm9
- movdqa 16*2($bp),%xmm10
- movdqa 16*3($bp),%xmm11
- pand %xmm0,%xmm8
- movdqa 16*4($bp),%xmm12
- pand %xmm1,%xmm9
- movdqa 16*5($bp),%xmm13
- pand %xmm2,%xmm10
- movdqa 16*6($bp),%xmm14
- pand %xmm3,%xmm11
- movdqa 16*7($bp),%xmm15
- leaq 128($bp), %rbp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
- por %xmm9,%xmm8
- pshufd \$0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
- ___
- $code.=<<___ if ($addx);
- movl \$0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
- je .Lmulx_gather
- ___
- $code.=<<___;
- movq %xmm8,%rbx
- movq $n0, 128(%rsp) # off-load arguments
- movq $out, 128+8(%rsp)
- movq $mod, 128+16(%rsp)
- movq ($ap), %rax
- movq 8($ap), %rcx
- mulq %rbx # 0 iteration
- movq %rax, (%rsp)
- movq %rcx, %rax
- movq %rdx, %r8
- mulq %rbx
- addq %rax, %r8
- movq 16($ap), %rax
- movq %rdx, %r9
- adcq \$0, %r9
- mulq %rbx
- addq %rax, %r9
- movq 24($ap), %rax
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r10
- movq 32($ap), %rax
- movq %rdx, %r11
- adcq \$0, %r11
- mulq %rbx
- addq %rax, %r11
- movq 40($ap), %rax
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r12
- movq 48($ap), %rax
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r13
- movq 56($ap), %rax
- movq %rdx, %r14
- adcq \$0, %r14
- mulq %rbx
- addq %rax, %r14
- movq ($ap), %rax
- movq %rdx, %r15
- adcq \$0, %r15
- leaq 8(%rsp), %rdi
- movl \$7, %ecx
- jmp .Loop_mul_gather
- .align 32
- .Loop_mul_gather:
- movdqa 16*0(%rbp),%xmm8
- movdqa 16*1(%rbp),%xmm9
- movdqa 16*2(%rbp),%xmm10
- movdqa 16*3(%rbp),%xmm11
- pand %xmm0,%xmm8
- movdqa 16*4(%rbp),%xmm12
- pand %xmm1,%xmm9
- movdqa 16*5(%rbp),%xmm13
- pand %xmm2,%xmm10
- movdqa 16*6(%rbp),%xmm14
- pand %xmm3,%xmm11
- movdqa 16*7(%rbp),%xmm15
- leaq 128(%rbp), %rbp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
- por %xmm9,%xmm8
- pshufd \$0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
- movq %xmm8,%rbx
- mulq %rbx
- addq %rax, %r8
- movq 8($ap), %rax
- movq %r8, (%rdi)
- movq %rdx, %r8
- adcq \$0, %r8
- mulq %rbx
- addq %rax, %r9
- movq 16($ap), %rax
- adcq \$0, %rdx
- addq %r9, %r8
- movq %rdx, %r9
- adcq \$0, %r9
- mulq %rbx
- addq %rax, %r10
- movq 24($ap), %rax
- adcq \$0, %rdx
- addq %r10, %r9
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r11
- movq 32($ap), %rax
- adcq \$0, %rdx
- addq %r11, %r10
- movq %rdx, %r11
- adcq \$0, %r11
- mulq %rbx
- addq %rax, %r12
- movq 40($ap), %rax
- adcq \$0, %rdx
- addq %r12, %r11
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r13
- movq 48($ap), %rax
- adcq \$0, %rdx
- addq %r13, %r12
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r14
- movq 56($ap), %rax
- adcq \$0, %rdx
- addq %r14, %r13
- movq %rdx, %r14
- adcq \$0, %r14
- mulq %rbx
- addq %rax, %r15
- movq ($ap), %rax
- adcq \$0, %rdx
- addq %r15, %r14
- movq %rdx, %r15
- adcq \$0, %r15
- leaq 8(%rdi), %rdi
- decl %ecx
- jnz .Loop_mul_gather
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, 32(%rdi)
- movq %r13, 40(%rdi)
- movq %r14, 48(%rdi)
- movq %r15, 56(%rdi)
- movq 128+8(%rsp), $out
- movq 128+16(%rsp), %rbp
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- call __rsaz_512_reduce
- ___
- $code.=<<___ if ($addx);
- jmp .Lmul_gather_tail
- .align 32
- .Lmulx_gather:
- movq %xmm8,%rdx
- mov $n0, 128(%rsp) # off-load arguments
- mov $out, 128+8(%rsp)
- mov $mod, 128+16(%rsp)
- mulx ($ap), %rbx, %r8 # 0 iteration
- mov %rbx, (%rsp)
- xor %edi, %edi # cf=0, of=0
- mulx 8($ap), %rax, %r9
- mulx 16($ap), %rbx, %r10
- adcx %rax, %r8
- mulx 24($ap), %rax, %r11
- adcx %rbx, %r9
- mulx 32($ap), %rbx, %r12
- adcx %rax, %r10
- mulx 40($ap), %rax, %r13
- adcx %rbx, %r11
- mulx 48($ap), %rbx, %r14
- adcx %rax, %r12
- mulx 56($ap), %rax, %r15
- adcx %rbx, %r13
- adcx %rax, %r14
- .byte 0x67
- mov %r8, %rbx
- adcx %rdi, %r15 # %rdi is 0
- mov \$-7, %rcx
- jmp .Loop_mulx_gather
- .align 32
- .Loop_mulx_gather:
- movdqa 16*0(%rbp),%xmm8
- movdqa 16*1(%rbp),%xmm9
- movdqa 16*2(%rbp),%xmm10
- movdqa 16*3(%rbp),%xmm11
- pand %xmm0,%xmm8
- movdqa 16*4(%rbp),%xmm12
- pand %xmm1,%xmm9
- movdqa 16*5(%rbp),%xmm13
- pand %xmm2,%xmm10
- movdqa 16*6(%rbp),%xmm14
- pand %xmm3,%xmm11
- movdqa 16*7(%rbp),%xmm15
- leaq 128(%rbp), %rbp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
- por %xmm9,%xmm8
- pshufd \$0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
- movq %xmm8,%rdx
- .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
- adcx %rax, %rbx
- adox %r9, %r8
- mulx 8($ap), %rax, %r9
- adcx %rax, %r8
- adox %r10, %r9
- mulx 16($ap), %rax, %r10
- adcx %rax, %r9
- adox %r11, %r10
- .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
- adcx %rax, %r10
- adox %r12, %r11
- mulx 32($ap), %rax, %r12
- adcx %rax, %r11
- adox %r13, %r12
- mulx 40($ap), %rax, %r13
- adcx %rax, %r12
- adox %r14, %r13
- .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
- adcx %rax, %r13
- .byte 0x67
- adox %r15, %r14
- mulx 56($ap), %rax, %r15
- mov %rbx, 64(%rsp,%rcx,8)
- adcx %rax, %r14
- adox %rdi, %r15
- mov %r8, %rbx
- adcx %rdi, %r15 # cf=0
- inc %rcx # of=0
- jnz .Loop_mulx_gather
- mov %r8, 64(%rsp)
- mov %r9, 64+8(%rsp)
- mov %r10, 64+16(%rsp)
- mov %r11, 64+24(%rsp)
- mov %r12, 64+32(%rsp)
- mov %r13, 64+40(%rsp)
- mov %r14, 64+48(%rsp)
- mov %r15, 64+56(%rsp)
- mov 128(%rsp), %rdx # pull arguments
- mov 128+8(%rsp), $out
- mov 128+16(%rsp), %rbp
- mov (%rsp), %r8
- mov 8(%rsp), %r9
- mov 16(%rsp), %r10
- mov 24(%rsp), %r11
- mov 32(%rsp), %r12
- mov 40(%rsp), %r13
- mov 48(%rsp), %r14
- mov 56(%rsp), %r15
- call __rsaz_512_reducex
- .Lmul_gather_tail:
- ___
- $code.=<<___;
- addq 64(%rsp), %r8
- adcq 72(%rsp), %r9
- adcq 80(%rsp), %r10
- adcq 88(%rsp), %r11
- adcq 96(%rsp), %r12
- adcq 104(%rsp), %r13
- adcq 112(%rsp), %r14
- adcq 120(%rsp), %r15
- sbbq %rcx, %rcx
- call __rsaz_512_subtract
- leaq 128+24+48(%rsp), %rax
- ___
- $code.=<<___ if ($win64);
- movaps 0xa0-0xc8(%rax),%xmm6
- movaps 0xb0-0xc8(%rax),%xmm7
- movaps 0xc0-0xc8(%rax),%xmm8
- movaps 0xd0-0xc8(%rax),%xmm9
- movaps 0xe0-0xc8(%rax),%xmm10
- movaps 0xf0-0xc8(%rax),%xmm11
- movaps 0x100-0xc8(%rax),%xmm12
- movaps 0x110-0xc8(%rax),%xmm13
- movaps 0x120-0xc8(%rax),%xmm14
- movaps 0x130-0xc8(%rax),%xmm15
- lea 0xb0(%rax),%rax
- ___
- $code.=<<___;
- .cfi_def_cfa %rax,8
- movq -48(%rax), %r15
- .cfi_restore %r15
- movq -40(%rax), %r14
- .cfi_restore %r14
- movq -32(%rax), %r13
- .cfi_restore %r13
- movq -24(%rax), %r12
- .cfi_restore %r12
- movq -16(%rax), %rbp
- .cfi_restore %rbp
- movq -8(%rax), %rbx
- .cfi_restore %rbx
- leaq (%rax), %rsp
- .cfi_def_cfa_register %rsp
- .Lmul_gather4_epilogue:
- ret
- .cfi_endproc
- .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
- ___
- }
- {
- my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
- $code.=<<___;
- .globl rsaz_512_mul_scatter4
- .type rsaz_512_mul_scatter4,\@function,6
- .align 32
- rsaz_512_mul_scatter4:
- .cfi_startproc
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- mov $pwr, $pwr
- subq \$128+24, %rsp
- .cfi_adjust_cfa_offset 128+24
- .Lmul_scatter4_body:
- leaq ($tbl,$pwr,8), $tbl
- movq $out, %xmm0 # off-load arguments
- movq $mod, %xmm1
- movq $tbl, %xmm2
- movq $n0, 128(%rsp)
- movq $out, %rbp
- ___
- $code.=<<___ if ($addx);
- movl \$0x80100,%r11d
- andl OPENSSL_ia32cap_P+8(%rip),%r11d
- cmpl \$0x80100,%r11d # check for MULX and ADCX/ADOX
- je .Lmulx_scatter
- ___
- $code.=<<___;
- movq ($out),%rbx # pass b[0]
- call __rsaz_512_mul
- movq %xmm0, $out
- movq %xmm1, %rbp
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- call __rsaz_512_reduce
- ___
- $code.=<<___ if ($addx);
- jmp .Lmul_scatter_tail
- .align 32
- .Lmulx_scatter:
- movq ($out), %rdx # pass b[0]
- call __rsaz_512_mulx
- movq %xmm0, $out
- movq %xmm1, %rbp
- movq 128(%rsp), %rdx # pull $n0
- movq (%rsp), %r8
- movq 8(%rsp), %r9
- movq 16(%rsp), %r10
- movq 24(%rsp), %r11
- movq 32(%rsp), %r12
- movq 40(%rsp), %r13
- movq 48(%rsp), %r14
- movq 56(%rsp), %r15
- call __rsaz_512_reducex
- .Lmul_scatter_tail:
- ___
- $code.=<<___;
- addq 64(%rsp), %r8
- adcq 72(%rsp), %r9
- adcq 80(%rsp), %r10
- adcq 88(%rsp), %r11
- adcq 96(%rsp), %r12
- adcq 104(%rsp), %r13
- adcq 112(%rsp), %r14
- adcq 120(%rsp), %r15
- movq %xmm2, $inp
- sbbq %rcx, %rcx
- call __rsaz_512_subtract
- movq %r8, 128*0($inp) # scatter
- movq %r9, 128*1($inp)
- movq %r10, 128*2($inp)
- movq %r11, 128*3($inp)
- movq %r12, 128*4($inp)
- movq %r13, 128*5($inp)
- movq %r14, 128*6($inp)
- movq %r15, 128*7($inp)
- leaq 128+24+48(%rsp), %rax
- .cfi_def_cfa %rax,8
- movq -48(%rax), %r15
- .cfi_restore %r15
- movq -40(%rax), %r14
- .cfi_restore %r14
- movq -32(%rax), %r13
- .cfi_restore %r13
- movq -24(%rax), %r12
- .cfi_restore %r12
- movq -16(%rax), %rbp
- .cfi_restore %rbp
- movq -8(%rax), %rbx
- .cfi_restore %rbx
- leaq (%rax), %rsp
- .cfi_def_cfa_register %rsp
- .Lmul_scatter4_epilogue:
- ret
- .cfi_endproc
- .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
- ___
- }
- {
- my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
- $code.=<<___;
- .globl rsaz_512_mul_by_one
- .type rsaz_512_mul_by_one,\@function,4
- .align 32
- rsaz_512_mul_by_one:
- .cfi_startproc
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- subq \$128+24, %rsp
- .cfi_adjust_cfa_offset 128+24
- .Lmul_by_one_body:
- ___
- $code.=<<___ if ($addx);
- movl OPENSSL_ia32cap_P+8(%rip),%eax
- ___
- $code.=<<___;
- movq $mod, %rbp # reassign argument
- movq $n0, 128(%rsp)
- movq ($inp), %r8
- pxor %xmm0, %xmm0
- movq 8($inp), %r9
- movq 16($inp), %r10
- movq 24($inp), %r11
- movq 32($inp), %r12
- movq 40($inp), %r13
- movq 48($inp), %r14
- movq 56($inp), %r15
- movdqa %xmm0, (%rsp)
- movdqa %xmm0, 16(%rsp)
- movdqa %xmm0, 32(%rsp)
- movdqa %xmm0, 48(%rsp)
- movdqa %xmm0, 64(%rsp)
- movdqa %xmm0, 80(%rsp)
- movdqa %xmm0, 96(%rsp)
- ___
- $code.=<<___ if ($addx);
- andl \$0x80100,%eax
- cmpl \$0x80100,%eax # check for MULX and ADCX/ADOX
- je .Lby_one_callx
- ___
- $code.=<<___;
- call __rsaz_512_reduce
- ___
- $code.=<<___ if ($addx);
- jmp .Lby_one_tail
- .align 32
- .Lby_one_callx:
- movq 128(%rsp), %rdx # pull $n0
- call __rsaz_512_reducex
- .Lby_one_tail:
- ___
- $code.=<<___;
- movq %r8, ($out)
- movq %r9, 8($out)
- movq %r10, 16($out)
- movq %r11, 24($out)
- movq %r12, 32($out)
- movq %r13, 40($out)
- movq %r14, 48($out)
- movq %r15, 56($out)
- leaq 128+24+48(%rsp), %rax
- .cfi_def_cfa %rax,8
- movq -48(%rax), %r15
- .cfi_restore %r15
- movq -40(%rax), %r14
- .cfi_restore %r14
- movq -32(%rax), %r13
- .cfi_restore %r13
- movq -24(%rax), %r12
- .cfi_restore %r12
- movq -16(%rax), %rbp
- .cfi_restore %rbp
- movq -8(%rax), %rbx
- .cfi_restore %rbx
- leaq (%rax), %rsp
- .cfi_def_cfa_register %rsp
- .Lmul_by_one_epilogue:
- ret
- .cfi_endproc
- .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
- ___
- }
- { # __rsaz_512_reduce
- #
- # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
- # output: %r8-%r15
- # clobbers: everything except %rbp and %rdi
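- #
- # Word-serial Montgomery reduction of the low 8 product limbs held in
- # %r8-%r15 (the callers add the stashed upper half of the product
- # afterwards).  Each of the 8 iterations computes m = r[0]*n0 mod 2^64,
- # adds m*mod, which clears the lowest limb, and rotates the window down
- # by one limb.  Roughly, per iteration:
- #
- #   m  = (r0 * n0) mod 2^64
- #   r += m * mod         # lowest limb becomes zero
- #   r  = r >> 64         # drop the zeroed limb
- #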
- $code.=<<___;
- .type __rsaz_512_reduce,\@abi-omnipotent
- .align 32
- __rsaz_512_reduce:
- .cfi_startproc
- movq %r8, %rbx
- imulq 128+8(%rsp), %rbx
- movq 0(%rbp), %rax
- movl \$8, %ecx
- jmp .Lreduction_loop
- .align 32
- .Lreduction_loop:
- mulq %rbx
- movq 8(%rbp), %rax
- negq %r8
- movq %rdx, %r8
- adcq \$0, %r8
- mulq %rbx
- addq %rax, %r9
- movq 16(%rbp), %rax
- adcq \$0, %rdx
- addq %r9, %r8
- movq %rdx, %r9
- adcq \$0, %r9
- mulq %rbx
- addq %rax, %r10
- movq 24(%rbp), %rax
- adcq \$0, %rdx
- addq %r10, %r9
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r11
- movq 32(%rbp), %rax
- adcq \$0, %rdx
- addq %r11, %r10
- movq 128+8(%rsp), %rsi
- #movq %rdx, %r11
- #adcq \$0, %r11
- adcq \$0, %rdx
- movq %rdx, %r11
- mulq %rbx
- addq %rax, %r12
- movq 40(%rbp), %rax
- adcq \$0, %rdx
- imulq %r8, %rsi
- addq %r12, %r11
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r13
- movq 48(%rbp), %rax
- adcq \$0, %rdx
- addq %r13, %r12
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r14
- movq 56(%rbp), %rax
- adcq \$0, %rdx
- addq %r14, %r13
- movq %rdx, %r14
- adcq \$0, %r14
- mulq %rbx
- movq %rsi, %rbx
- addq %rax, %r15
- movq 0(%rbp), %rax
- adcq \$0, %rdx
- addq %r15, %r14
- movq %rdx, %r15
- adcq \$0, %r15
- decl %ecx
- jne .Lreduction_loop
- ret
- .cfi_endproc
- .size __rsaz_512_reduce,.-__rsaz_512_reduce
- ___
- }
- if ($addx) {
- # __rsaz_512_reducex
- #
- # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
- # output: %r8-%r15
- # clobbers: everything except %rbp and %rdi
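- #
- # Same word-serial Montgomery reduction as __rsaz_512_reduce, but built
- # on MULX plus the two independent ADCX/ADOX carry chains, so the eight
- # partial products of each iteration accumulate without serializing on
- # a single flags bit.  The caller preloads n0 in %rdx.
- #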
- $code.=<<___;
- .type __rsaz_512_reducex,\@abi-omnipotent
- .align 32
- __rsaz_512_reducex:
- .cfi_startproc
- #movq 128+8(%rsp), %rdx # pull $n0
- imulq %r8, %rdx
- xorq %rsi, %rsi # cf=0,of=0
- movl \$8, %ecx
- jmp .Lreduction_loopx
- .align 32
- .Lreduction_loopx:
- mov %r8, %rbx
- mulx 0(%rbp), %rax, %r8
- adcx %rbx, %rax
- adox %r9, %r8
- mulx 8(%rbp), %rax, %r9
- adcx %rax, %r8
- adox %r10, %r9
- mulx 16(%rbp), %rbx, %r10
- adcx %rbx, %r9
- adox %r11, %r10
- mulx 24(%rbp), %rbx, %r11
- adcx %rbx, %r10
- adox %r12, %r11
- .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
- mov %rdx, %rax
- mov %r8, %rdx
- adcx %rbx, %r11
- adox %r13, %r12
- mulx 128+8(%rsp), %rbx, %rdx
- mov %rax, %rdx
- mulx 40(%rbp), %rax, %r13
- adcx %rax, %r12
- adox %r14, %r13
- .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
- adcx %rax, %r13
- adox %r15, %r14
- mulx 56(%rbp), %rax, %r15
- mov %rbx, %rdx
- adcx %rax, %r14
- adox %rsi, %r15 # %rsi is 0
- adcx %rsi, %r15 # cf=0
- decl %ecx # of=0
- jne .Lreduction_loopx
- ret
- .cfi_endproc
- .size __rsaz_512_reducex,.-__rsaz_512_reducex
- ___
- }
- { # __rsaz_512_subtract
- # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
- # output:
- # clobbers: everything but %rdi, %rsi and %rbp
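- #
- # Stores the 8-limb result to $out and then conditionally subtracts the
- # modulus: %rcx is expected to be either 0 or all-ones (the callers
- # produce it with "sbbq %rcx,%rcx"), and the code adds (-mod) & mask,
- # i.e. out = t - (mask ? mod : 0), without any data-dependent branch.
- # The negation is formed as neg of the lowest limb plus not of the
- # remaining limbs, which equals -mod for any modulus with a non-zero
- # low limb (always true for an odd RSA modulus).
- #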
- $code.=<<___;
- .type __rsaz_512_subtract,\@abi-omnipotent
- .align 32
- __rsaz_512_subtract:
- .cfi_startproc
- movq %r8, ($out)
- movq %r9, 8($out)
- movq %r10, 16($out)
- movq %r11, 24($out)
- movq %r12, 32($out)
- movq %r13, 40($out)
- movq %r14, 48($out)
- movq %r15, 56($out)
- movq 0($mod), %r8
- movq 8($mod), %r9
- negq %r8
- notq %r9
- andq %rcx, %r8
- movq 16($mod), %r10
- andq %rcx, %r9
- notq %r10
- movq 24($mod), %r11
- andq %rcx, %r10
- notq %r11
- movq 32($mod), %r12
- andq %rcx, %r11
- notq %r12
- movq 40($mod), %r13
- andq %rcx, %r12
- notq %r13
- movq 48($mod), %r14
- andq %rcx, %r13
- notq %r14
- movq 56($mod), %r15
- andq %rcx, %r14
- notq %r15
- andq %rcx, %r15
- addq ($out), %r8
- adcq 8($out), %r9
- adcq 16($out), %r10
- adcq 24($out), %r11
- adcq 32($out), %r12
- adcq 40($out), %r13
- adcq 48($out), %r14
- adcq 56($out), %r15
- movq %r8, ($out)
- movq %r9, 8($out)
- movq %r10, 16($out)
- movq %r11, 24($out)
- movq %r12, 32($out)
- movq %r13, 40($out)
- movq %r14, 48($out)
- movq %r15, 56($out)
- ret
- .cfi_endproc
- .size __rsaz_512_subtract,.-__rsaz_512_subtract
- ___
- }
- { # __rsaz_512_mul
- #
- # input: %rsi - ap, %rbp - bp
- # output:
- # clobbers: everything
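- #
- # Plain 512x512->1024-bit schoolbook multiplication: the first b limb
- # (preloaded in %rbx by the caller) is multiplied across all eight a
- # limbs, then .Loop_mul folds in the remaining seven b limbs one at a
- # time, keeping a sliding 8-limb accumulator in %r8-%r15 and spilling
- # one finished low limb to the stack per iteration.
- #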
- my ($ap,$bp) = ("%rsi","%rbp");
- $code.=<<___;
- .type __rsaz_512_mul,\@abi-omnipotent
- .align 32
- __rsaz_512_mul:
- .cfi_startproc
- leaq 8(%rsp), %rdi
- movq ($ap), %rax
- mulq %rbx
- movq %rax, (%rdi)
- movq 8($ap), %rax
- movq %rdx, %r8
- mulq %rbx
- addq %rax, %r8
- movq 16($ap), %rax
- movq %rdx, %r9
- adcq \$0, %r9
- mulq %rbx
- addq %rax, %r9
- movq 24($ap), %rax
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r10
- movq 32($ap), %rax
- movq %rdx, %r11
- adcq \$0, %r11
- mulq %rbx
- addq %rax, %r11
- movq 40($ap), %rax
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r12
- movq 48($ap), %rax
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r13
- movq 56($ap), %rax
- movq %rdx, %r14
- adcq \$0, %r14
- mulq %rbx
- addq %rax, %r14
- movq ($ap), %rax
- movq %rdx, %r15
- adcq \$0, %r15
- leaq 8($bp), $bp
- leaq 8(%rdi), %rdi
- movl \$7, %ecx
- jmp .Loop_mul
- .align 32
- .Loop_mul:
- movq ($bp), %rbx
- mulq %rbx
- addq %rax, %r8
- movq 8($ap), %rax
- movq %r8, (%rdi)
- movq %rdx, %r8
- adcq \$0, %r8
- mulq %rbx
- addq %rax, %r9
- movq 16($ap), %rax
- adcq \$0, %rdx
- addq %r9, %r8
- movq %rdx, %r9
- adcq \$0, %r9
- mulq %rbx
- addq %rax, %r10
- movq 24($ap), %rax
- adcq \$0, %rdx
- addq %r10, %r9
- movq %rdx, %r10
- adcq \$0, %r10
- mulq %rbx
- addq %rax, %r11
- movq 32($ap), %rax
- adcq \$0, %rdx
- addq %r11, %r10
- movq %rdx, %r11
- adcq \$0, %r11
- mulq %rbx
- addq %rax, %r12
- movq 40($ap), %rax
- adcq \$0, %rdx
- addq %r12, %r11
- movq %rdx, %r12
- adcq \$0, %r12
- mulq %rbx
- addq %rax, %r13
- movq 48($ap), %rax
- adcq \$0, %rdx
- addq %r13, %r12
- movq %rdx, %r13
- adcq \$0, %r13
- mulq %rbx
- addq %rax, %r14
- movq 56($ap), %rax
- adcq \$0, %rdx
- addq %r14, %r13
- movq %rdx, %r14
- leaq 8($bp), $bp
- adcq \$0, %r14
- mulq %rbx
- addq %rax, %r15
- movq ($ap), %rax
- adcq \$0, %rdx
- addq %r15, %r14
- movq %rdx, %r15
- adcq \$0, %r15
- leaq 8(%rdi), %rdi
- decl %ecx
- jnz .Loop_mul
- movq %r8, (%rdi)
- movq %r9, 8(%rdi)
- movq %r10, 16(%rdi)
- movq %r11, 24(%rdi)
- movq %r12, 32(%rdi)
- movq %r13, 40(%rdi)
- movq %r14, 48(%rdi)
- movq %r15, 56(%rdi)
- ret
- .cfi_endproc
- .size __rsaz_512_mul,.-__rsaz_512_mul
- ___
- }
- if ($addx) {
- # __rsaz_512_mulx
- #
- # input: %rsi - ap, %rbp - bp
- # output:
- # clobbers: everything
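- #
- # MULX/ADCX/ADOX counterpart of __rsaz_512_mul: MULX produces 128-bit
- # partial products without touching the flags, while ADCX and ADOX
- # maintain two independent carry chains (CF and OF), so consecutive
- # limb additions do not serialize on a single carry bit.  The first b
- # limb is preloaded in %rdx by the caller.
- #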
- my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
- $code.=<<___;
- .type __rsaz_512_mulx,\@abi-omnipotent
- .align 32
- __rsaz_512_mulx:
- .cfi_startproc
- mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
- mov \$-6, %rcx
- mulx 8($ap), %rax, %r9
- movq %rbx, 8(%rsp)
- mulx 16($ap), %rbx, %r10
- adc %rax, %r8
- mulx 24($ap), %rax, %r11
- adc %rbx, %r9
- mulx 32($ap), %rbx, %r12
- adc %rax, %r10
- mulx 40($ap), %rax, %r13
- adc %rbx, %r11
- mulx 48($ap), %rbx, %r14
- adc %rax, %r12
- mulx 56($ap), %rax, %r15
- mov 8($bp), %rdx
- adc %rbx, %r13
- adc %rax, %r14
- adc \$0, %r15
- xor $zero, $zero # cf=0,of=0
- jmp .Loop_mulx
- .align 32
- .Loop_mulx:
- movq %r8, %rbx
- mulx ($ap), %rax, %r8
- adcx %rax, %rbx
- adox %r9, %r8
- mulx 8($ap), %rax, %r9
- adcx %rax, %r8
- adox %r10, %r9
- mulx 16($ap), %rax, %r10
- adcx %rax, %r9
- adox %r11, %r10
- mulx 24($ap), %rax, %r11
- adcx %rax, %r10
- adox %r12, %r11
- .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
- adcx %rax, %r11
- adox %r13, %r12
- mulx 40($ap), %rax, %r13
- adcx %rax, %r12
- adox %r14, %r13
- mulx 48($ap), %rax, %r14
- adcx %rax, %r13
- adox %r15, %r14
- mulx 56($ap), %rax, %r15
- movq 64($bp,%rcx,8), %rdx
- movq %rbx, 8+64-8(%rsp,%rcx,8)
- adcx %rax, %r14
- adox $zero, %r15
- adcx $zero, %r15 # cf=0
- inc %rcx # of=0
- jnz .Loop_mulx
- movq %r8, %rbx
- mulx ($ap), %rax, %r8
- adcx %rax, %rbx
- adox %r9, %r8
- .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
- adcx %rax, %r8
- adox %r10, %r9
- .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
- adcx %rax, %r9
- adox %r11, %r10
- mulx 24($ap), %rax, %r11
- adcx %rax, %r10
- adox %r12, %r11
- mulx 32($ap), %rax, %r12
- adcx %rax, %r11
- adox %r13, %r12
- mulx 40($ap), %rax, %r13
- adcx %rax, %r12
- adox %r14, %r13
- .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
- adcx %rax, %r13
- adox %r15, %r14
- .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
- adcx %rax, %r14
- adox $zero, %r15
- adcx $zero, %r15
- mov %rbx, 8+64-8(%rsp)
- mov %r8, 8+64(%rsp)
- mov %r9, 8+64+8(%rsp)
- mov %r10, 8+64+16(%rsp)
- mov %r11, 8+64+24(%rsp)
- mov %r12, 8+64+32(%rsp)
- mov %r13, 8+64+40(%rsp)
- mov %r14, 8+64+48(%rsp)
- mov %r15, 8+64+56(%rsp)
- ret
- .cfi_endproc
- .size __rsaz_512_mulx,.-__rsaz_512_mulx
- ___
- }
- {
- my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
- $code.=<<___;
- .globl rsaz_512_scatter4
- .type rsaz_512_scatter4,\@abi-omnipotent
- .align 16
- rsaz_512_scatter4:
- .cfi_startproc
- leaq ($out,$power,8), $out
- movl \$8, %r9d
- jmp .Loop_scatter
- .align 16
- .Loop_scatter:
- movq ($inp), %rax
- leaq 8($inp), $inp
- movq %rax, ($out)
- leaq 128($out), $out
- decl %r9d
- jnz .Loop_scatter
- ret
- .cfi_endproc
- .size rsaz_512_scatter4,.-rsaz_512_scatter4
- .globl rsaz_512_gather4
- .type rsaz_512_gather4,\@abi-omnipotent
- .align 16
- rsaz_512_gather4:
- .cfi_startproc
- ___
- $code.=<<___ if ($win64);
- .LSEH_begin_rsaz_512_gather4:
- .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
- .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
- .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
- .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
- .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
- .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
- .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
- .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
- .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
- .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
- .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
- ___
- $code.=<<___;
- movd $power,%xmm8
- movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
- movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
- pshufd \$0,%xmm8,%xmm8 # broadcast $power
- movdqa %xmm1,%xmm7
- movdqa %xmm1,%xmm2
- ___
- ########################################################################
- # calculate mask by comparing 0..15 to $power
- #
- for($i=0;$i<4;$i++) {
- $code.=<<___;
- paddd %xmm`$i`,%xmm`$i+1`
- pcmpeqd %xmm8,%xmm`$i`
- movdqa %xmm7,%xmm`$i+3`
- ___
- }
- for(;$i<7;$i++) {
- $code.=<<___;
- paddd %xmm`$i`,%xmm`$i+1`
- pcmpeqd %xmm8,%xmm`$i`
- ___
- }
- $code.=<<___;
- pcmpeqd %xmm8,%xmm7
- movl \$8, %r9d
- jmp .Loop_gather
- .align 16
- .Loop_gather:
- movdqa 16*0($inp),%xmm8
- movdqa 16*1($inp),%xmm9
- movdqa 16*2($inp),%xmm10
- movdqa 16*3($inp),%xmm11
- pand %xmm0,%xmm8
- movdqa 16*4($inp),%xmm12
- pand %xmm1,%xmm9
- movdqa 16*5($inp),%xmm13
- pand %xmm2,%xmm10
- movdqa 16*6($inp),%xmm14
- pand %xmm3,%xmm11
- movdqa 16*7($inp),%xmm15
- leaq 128($inp), $inp
- pand %xmm4,%xmm12
- pand %xmm5,%xmm13
- pand %xmm6,%xmm14
- pand %xmm7,%xmm15
- por %xmm10,%xmm8
- por %xmm11,%xmm9
- por %xmm12,%xmm8
- por %xmm13,%xmm9
- por %xmm14,%xmm8
- por %xmm15,%xmm9
- por %xmm9,%xmm8
- pshufd \$0x4e,%xmm8,%xmm9
- por %xmm9,%xmm8
- movq %xmm8,($out)
- leaq 8($out), $out
- decl %r9d
- jnz .Loop_gather
- ___
- $code.=<<___ if ($win64);
- movaps 0x00(%rsp),%xmm6
- movaps 0x10(%rsp),%xmm7
- movaps 0x20(%rsp),%xmm8
- movaps 0x30(%rsp),%xmm9
- movaps 0x40(%rsp),%xmm10
- movaps 0x50(%rsp),%xmm11
- movaps 0x60(%rsp),%xmm12
- movaps 0x70(%rsp),%xmm13
- movaps 0x80(%rsp),%xmm14
- movaps 0x90(%rsp),%xmm15
- add \$0xa8,%rsp
- ___
- $code.=<<___;
- ret
- .LSEH_end_rsaz_512_gather4:
- .cfi_endproc
- .size rsaz_512_gather4,.-rsaz_512_gather4
- .align 64
- .Linc:
- .long 0,0, 1,1
- .long 2,2, 2,2
- ___
- }
- # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
- # CONTEXT *context,DISPATCHER_CONTEXT *disp)
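- #
- # se_handler below is the usual perlasm unwind helper: it checks
- # whether the faulting RIP lies between the "body" and "epilogue"
- # labels recorded in HandlerData[], and if so restores the saved
- # non-volatile GPRs from the frame (plus, for rsaz_512_mul_gather4,
- # copies the ten saved XMM registers back into the CONTEXT) before
- # completing the unwind via RtlVirtualUnwind.
- #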
- if ($win64) {
- $rec="%rcx";
- $frame="%rdx";
- $context="%r8";
- $disp="%r9";
- $code.=<<___;
- .extern __imp_RtlVirtualUnwind
- .type se_handler,\@abi-omnipotent
- .align 16
- se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # end of prologue label
- cmp %r10,%rbx # context->Rip<end of prologue label
- jb .Lcommon_seh_tail
- mov 152($context),%rax # pull context->Rsp
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
- lea 128+24+48(%rax),%rax
- lea .Lmul_gather4_epilogue(%rip),%rbx
- cmp %r10,%rbx
- jne .Lse_not_in_mul_gather4
- lea 0xb0(%rax),%rax
- lea -48-0xa8(%rax),%rsi
- lea 512($context),%rdi
- mov \$20,%ecx
- .long 0xa548f3fc # cld; rep movsq
- .Lse_not_in_mul_gather4:
- mov -8(%rax),%rbx
- mov -16(%rax),%rbp
- mov -24(%rax),%r12
- mov -32(%rax),%r13
- mov -40(%rax),%r14
- mov -48(%rax),%r15
- mov %rbx,144($context) # restore context->Rbx
- mov %rbp,160($context) # restore context->Rbp
- mov %r12,216($context) # restore context->R12
- mov %r13,224($context) # restore context->R13
- mov %r14,232($context) # restore context->R14
- mov %r15,240($context) # restore context->R15
- .Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
- .size se_handler,.-se_handler
- .section .pdata
- .align 4
- .rva .LSEH_begin_rsaz_512_sqr
- .rva .LSEH_end_rsaz_512_sqr
- .rva .LSEH_info_rsaz_512_sqr
- .rva .LSEH_begin_rsaz_512_mul
- .rva .LSEH_end_rsaz_512_mul
- .rva .LSEH_info_rsaz_512_mul
- .rva .LSEH_begin_rsaz_512_mul_gather4
- .rva .LSEH_end_rsaz_512_mul_gather4
- .rva .LSEH_info_rsaz_512_mul_gather4
- .rva .LSEH_begin_rsaz_512_mul_scatter4
- .rva .LSEH_end_rsaz_512_mul_scatter4
- .rva .LSEH_info_rsaz_512_mul_scatter4
- .rva .LSEH_begin_rsaz_512_mul_by_one
- .rva .LSEH_end_rsaz_512_mul_by_one
- .rva .LSEH_info_rsaz_512_mul_by_one
- .rva .LSEH_begin_rsaz_512_gather4
- .rva .LSEH_end_rsaz_512_gather4
- .rva .LSEH_info_rsaz_512_gather4
- .section .xdata
- .align 8
- .LSEH_info_rsaz_512_sqr:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
- .LSEH_info_rsaz_512_mul:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
- .LSEH_info_rsaz_512_mul_gather4:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
- .LSEH_info_rsaz_512_mul_scatter4:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
- .LSEH_info_rsaz_512_mul_by_one:
- .byte 9,0,0,0
- .rva se_handler
- .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
- .LSEH_info_rsaz_512_gather4:
- .byte 0x01,0x46,0x16,0x00
- .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
- .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
- .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
- .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
- .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
- .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
- .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
- .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
- .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
- .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
- .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
- ___
- }
- $code =~ s/\`([^\`]*)\`/eval $1/gem;
- print $code;
- close STDOUT or die "error closing STDOUT: $!";