1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761777177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945194619471948194919501951195219531954195519561957195819591960196119621963196419651966196719681969197019711972197319741975197619771978197919801981198219831984 |
- #! /usr/bin/env perl
- # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
- # Copyright (c) 2012, Intel Corporation. All Rights Reserved.
- #
- # Licensed under the Apache License 2.0 (the "License"). You may not use
- # this file except in compliance with the License. You can obtain a copy
- # in the file LICENSE in the source distribution or at
- # https://www.openssl.org/source/license.html
- #
- # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
- # (1) Intel Corporation, Israel Development Center, Haifa, Israel
- # (2) University of Haifa, Israel
- #
- # References:
- # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
- # Exponentiation, Using Advanced Vector Instructions Architectures",
- # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
- # pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012
- # [2] S. Gueron: "Efficient Software Implementations of Modular
- # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
- # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
- # Proceedings of 9th International Conference on Information Technology:
- # New Generations (ITNG 2012), pp.821-823 (2012)
- # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis
- # resistant 1024-bit modular exponentiation, for optimizing RSA2048
- # on AVX2 capable x86_64 platforms",
- # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest
- #
- # +13% improvement over original submission by <appro@openssl.org>
- #
- # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
- # 2.3GHz Haswell 621 765/+23% 1113/+79%
- # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
- #
- # (*) if system doesn't support AVX2, for reference purposes;
- # (**) scaled to 2.3GHz to simplify comparison;
- # (***) scalar AD*X code is faster than AVX2 and is preferred code
- # path for Broadwell;
- # $output is the last argument if it looks like a file (it has an extension)
- # $flavour is the first argument if it doesn't look like a file
- $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
- $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
- if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
- =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
- $addx = ($1>=2.23);
- }
- if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
- `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.09) + ($1>=2.10);
- $addx = ($1>=2.10);
- }
- if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
- `ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
- $addx = ($1>=11);
- }
- if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) {
- my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
- $avx = ($ver>=3.0) + ($ver>=3.01);
- $addx = ($ver>=3.03);
- }
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
- or die "can't call $xlate: $!";
- *STDOUT = *OUT;
- if ($avx>1) {{{
- { # void AMS_WW(
- my $rp="%rdi"; # BN_ULONG *rp,
- my $ap="%rsi"; # const BN_ULONG *ap,
- my $np="%rdx"; # const BN_ULONG *np,
- my $n0="%ecx"; # const BN_ULONG n0,
- my $rep="%r8d"; # int repeat);
- # The registers that hold the accumulated redundant result
- # The AMM works on 1024 bit operands, and redundant word size is 29
- # Therefore: ceil(1024/29)/4 = 9
- my $ACC0="%ymm0";
- my $ACC1="%ymm1";
- my $ACC2="%ymm2";
- my $ACC3="%ymm3";
- my $ACC4="%ymm4";
- my $ACC5="%ymm5";
- my $ACC6="%ymm6";
- my $ACC7="%ymm7";
- my $ACC8="%ymm8";
- my $ACC9="%ymm9";
- # Registers that hold the broadcasted words of bp, currently used
- my $B1="%ymm10";
- my $B2="%ymm11";
- # Registers that hold the broadcasted words of Y, currently used
- my $Y1="%ymm12";
- my $Y2="%ymm13";
- # Helper registers
- my $TEMP1="%ymm14";
- my $AND_MASK="%ymm15";
- # alu registers that hold the first words of the ACC
- my $r0="%r9";
- my $r1="%r10";
- my $r2="%r11";
- my $r3="%r12";
- my $i="%r14d"; # loop counter
- my $tmp = "%r15";
- my $FrameSize=32*18+32*8; # place for A^2 and 2*A
- my $aap=$r0;
- my $tp0="%rbx";
- my $tp1=$r3;
- my $tpa=$tmp;
- $np="%r13"; # reassigned argument
- $code.=<<___;
- .text
- .globl rsaz_1024_sqr_avx2
- .type rsaz_1024_sqr_avx2,\@function,5
- .align 64
- rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
- .cfi_startproc
- lea (%rsp), %rax
- .cfi_def_cfa_register %rax
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- lea -0xa8(%rsp),%rsp
- vmovaps %xmm6,-0xd8(%rax)
- vmovaps %xmm7,-0xc8(%rax)
- vmovaps %xmm8,-0xb8(%rax)
- vmovaps %xmm9,-0xa8(%rax)
- vmovaps %xmm10,-0x98(%rax)
- vmovaps %xmm11,-0x88(%rax)
- vmovaps %xmm12,-0x78(%rax)
- vmovaps %xmm13,-0x68(%rax)
- vmovaps %xmm14,-0x58(%rax)
- vmovaps %xmm15,-0x48(%rax)
- .Lsqr_1024_body:
- ___
- $code.=<<___;
- mov %rax,%rbp
- .cfi_def_cfa_register %rbp
- mov %rdx, $np # reassigned argument
- sub \$$FrameSize, %rsp
- mov $np, $tmp
- sub \$-128, $rp # size optimization
- sub \$-128, $ap
- sub \$-128, $np
- and \$4095, $tmp # see if $np crosses page
- add \$32*10, $tmp
- shr \$12, $tmp
- vpxor $ACC9,$ACC9,$ACC9
- jz .Lsqr_1024_no_n_copy
- # unaligned 256-bit load that crosses page boundary can
- # cause >2x performance degradation here, so if $np does
- # cross page boundary, copy it to stack and make sure stack
- # frame doesn't...
- sub \$32*10,%rsp
- vmovdqu 32*0-128($np), $ACC0
- and \$-2048, %rsp
- vmovdqu 32*1-128($np), $ACC1
- vmovdqu 32*2-128($np), $ACC2
- vmovdqu 32*3-128($np), $ACC3
- vmovdqu 32*4-128($np), $ACC4
- vmovdqu 32*5-128($np), $ACC5
- vmovdqu 32*6-128($np), $ACC6
- vmovdqu 32*7-128($np), $ACC7
- vmovdqu 32*8-128($np), $ACC8
- lea $FrameSize+128(%rsp),$np
- vmovdqu $ACC0, 32*0-128($np)
- vmovdqu $ACC1, 32*1-128($np)
- vmovdqu $ACC2, 32*2-128($np)
- vmovdqu $ACC3, 32*3-128($np)
- vmovdqu $ACC4, 32*4-128($np)
- vmovdqu $ACC5, 32*5-128($np)
- vmovdqu $ACC6, 32*6-128($np)
- vmovdqu $ACC7, 32*7-128($np)
- vmovdqu $ACC8, 32*8-128($np)
- vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
- .Lsqr_1024_no_n_copy:
- and \$-1024, %rsp
- vmovdqu 32*1-128($ap), $ACC1
- vmovdqu 32*2-128($ap), $ACC2
- vmovdqu 32*3-128($ap), $ACC3
- vmovdqu 32*4-128($ap), $ACC4
- vmovdqu 32*5-128($ap), $ACC5
- vmovdqu 32*6-128($ap), $ACC6
- vmovdqu 32*7-128($ap), $ACC7
- vmovdqu 32*8-128($ap), $ACC8
- lea 192(%rsp), $tp0 # 64+128=192
- vmovdqu .Land_mask(%rip), $AND_MASK
- jmp .LOOP_GRANDE_SQR_1024
- .align 32
- .LOOP_GRANDE_SQR_1024:
- lea 32*18+128(%rsp), $aap # size optimization
- lea 448(%rsp), $tp1 # 64+128+256=448
- # the squaring is performed as described in Variant B of
- # "Speeding up Big-Number Squaring", so start by calculating
- # the A*2=A+A vector
- vpaddq $ACC1, $ACC1, $ACC1
- vpbroadcastq 32*0-128($ap), $B1
- vpaddq $ACC2, $ACC2, $ACC2
- vmovdqa $ACC1, 32*0-128($aap)
- vpaddq $ACC3, $ACC3, $ACC3
- vmovdqa $ACC2, 32*1-128($aap)
- vpaddq $ACC4, $ACC4, $ACC4
- vmovdqa $ACC3, 32*2-128($aap)
- vpaddq $ACC5, $ACC5, $ACC5
- vmovdqa $ACC4, 32*3-128($aap)
- vpaddq $ACC6, $ACC6, $ACC6
- vmovdqa $ACC5, 32*4-128($aap)
- vpaddq $ACC7, $ACC7, $ACC7
- vmovdqa $ACC6, 32*5-128($aap)
- vpaddq $ACC8, $ACC8, $ACC8
- vmovdqa $ACC7, 32*6-128($aap)
- vpxor $ACC9, $ACC9, $ACC9
- vmovdqa $ACC8, 32*7-128($aap)
- vpmuludq 32*0-128($ap), $B1, $ACC0
- vpbroadcastq 32*1-128($ap), $B2
- vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
- vpmuludq $B1, $ACC1, $ACC1
- vmovdqu $ACC9, 32*10-448($tp1)
- vpmuludq $B1, $ACC2, $ACC2
- vmovdqu $ACC9, 32*11-448($tp1)
- vpmuludq $B1, $ACC3, $ACC3
- vmovdqu $ACC9, 32*12-448($tp1)
- vpmuludq $B1, $ACC4, $ACC4
- vmovdqu $ACC9, 32*13-448($tp1)
- vpmuludq $B1, $ACC5, $ACC5
- vmovdqu $ACC9, 32*14-448($tp1)
- vpmuludq $B1, $ACC6, $ACC6
- vmovdqu $ACC9, 32*15-448($tp1)
- vpmuludq $B1, $ACC7, $ACC7
- vmovdqu $ACC9, 32*16-448($tp1)
- vpmuludq $B1, $ACC8, $ACC8
- vpbroadcastq 32*2-128($ap), $B1
- vmovdqu $ACC9, 32*17-448($tp1)
- mov $ap, $tpa
- mov \$4, $i
- jmp .Lsqr_entry_1024
- ___
- $TEMP0=$Y1;
- $TEMP2=$Y2;
- $code.=<<___;
- .align 32
- .LOOP_SQR_1024:
- vpbroadcastq 32*1-128($tpa), $B2
- vpmuludq 32*0-128($ap), $B1, $ACC0
- vpaddq 32*0-192($tp0), $ACC0, $ACC0
- vpmuludq 32*0-128($aap), $B1, $ACC1
- vpaddq 32*1-192($tp0), $ACC1, $ACC1
- vpmuludq 32*1-128($aap), $B1, $ACC2
- vpaddq 32*2-192($tp0), $ACC2, $ACC2
- vpmuludq 32*2-128($aap), $B1, $ACC3
- vpaddq 32*3-192($tp0), $ACC3, $ACC3
- vpmuludq 32*3-128($aap), $B1, $ACC4
- vpaddq 32*4-192($tp0), $ACC4, $ACC4
- vpmuludq 32*4-128($aap), $B1, $ACC5
- vpaddq 32*5-192($tp0), $ACC5, $ACC5
- vpmuludq 32*5-128($aap), $B1, $ACC6
- vpaddq 32*6-192($tp0), $ACC6, $ACC6
- vpmuludq 32*6-128($aap), $B1, $ACC7
- vpaddq 32*7-192($tp0), $ACC7, $ACC7
- vpmuludq 32*7-128($aap), $B1, $ACC8
- vpbroadcastq 32*2-128($tpa), $B1
- vpaddq 32*8-192($tp0), $ACC8, $ACC8
- .Lsqr_entry_1024:
- vmovdqu $ACC0, 32*0-192($tp0)
- vmovdqu $ACC1, 32*1-192($tp0)
- vpmuludq 32*1-128($ap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC2, $ACC2
- vpmuludq 32*1-128($aap), $B2, $TEMP1
- vpaddq $TEMP1, $ACC3, $ACC3
- vpmuludq 32*2-128($aap), $B2, $TEMP2
- vpaddq $TEMP2, $ACC4, $ACC4
- vpmuludq 32*3-128($aap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC5, $ACC5
- vpmuludq 32*4-128($aap), $B2, $TEMP1
- vpaddq $TEMP1, $ACC6, $ACC6
- vpmuludq 32*5-128($aap), $B2, $TEMP2
- vpaddq $TEMP2, $ACC7, $ACC7
- vpmuludq 32*6-128($aap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC8, $ACC8
- vpmuludq 32*7-128($aap), $B2, $ACC0
- vpbroadcastq 32*3-128($tpa), $B2
- vpaddq 32*9-192($tp0), $ACC0, $ACC0
- vmovdqu $ACC2, 32*2-192($tp0)
- vmovdqu $ACC3, 32*3-192($tp0)
- vpmuludq 32*2-128($ap), $B1, $TEMP2
- vpaddq $TEMP2, $ACC4, $ACC4
- vpmuludq 32*2-128($aap), $B1, $TEMP0
- vpaddq $TEMP0, $ACC5, $ACC5
- vpmuludq 32*3-128($aap), $B1, $TEMP1
- vpaddq $TEMP1, $ACC6, $ACC6
- vpmuludq 32*4-128($aap), $B1, $TEMP2
- vpaddq $TEMP2, $ACC7, $ACC7
- vpmuludq 32*5-128($aap), $B1, $TEMP0
- vpaddq $TEMP0, $ACC8, $ACC8
- vpmuludq 32*6-128($aap), $B1, $TEMP1
- vpaddq $TEMP1, $ACC0, $ACC0
- vpmuludq 32*7-128($aap), $B1, $ACC1
- vpbroadcastq 32*4-128($tpa), $B1
- vpaddq 32*10-448($tp1), $ACC1, $ACC1
- vmovdqu $ACC4, 32*4-192($tp0)
- vmovdqu $ACC5, 32*5-192($tp0)
- vpmuludq 32*3-128($ap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC6, $ACC6
- vpmuludq 32*3-128($aap), $B2, $TEMP1
- vpaddq $TEMP1, $ACC7, $ACC7
- vpmuludq 32*4-128($aap), $B2, $TEMP2
- vpaddq $TEMP2, $ACC8, $ACC8
- vpmuludq 32*5-128($aap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC0, $ACC0
- vpmuludq 32*6-128($aap), $B2, $TEMP1
- vpaddq $TEMP1, $ACC1, $ACC1
- vpmuludq 32*7-128($aap), $B2, $ACC2
- vpbroadcastq 32*5-128($tpa), $B2
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
- vmovdqu $ACC6, 32*6-192($tp0)
- vmovdqu $ACC7, 32*7-192($tp0)
- vpmuludq 32*4-128($ap), $B1, $TEMP0
- vpaddq $TEMP0, $ACC8, $ACC8
- vpmuludq 32*4-128($aap), $B1, $TEMP1
- vpaddq $TEMP1, $ACC0, $ACC0
- vpmuludq 32*5-128($aap), $B1, $TEMP2
- vpaddq $TEMP2, $ACC1, $ACC1
- vpmuludq 32*6-128($aap), $B1, $TEMP0
- vpaddq $TEMP0, $ACC2, $ACC2
- vpmuludq 32*7-128($aap), $B1, $ACC3
- vpbroadcastq 32*6-128($tpa), $B1
- vpaddq 32*12-448($tp1), $ACC3, $ACC3
- vmovdqu $ACC8, 32*8-192($tp0)
- vmovdqu $ACC0, 32*9-192($tp0)
- lea 8($tp0), $tp0
- vpmuludq 32*5-128($ap), $B2, $TEMP2
- vpaddq $TEMP2, $ACC1, $ACC1
- vpmuludq 32*5-128($aap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC2, $ACC2
- vpmuludq 32*6-128($aap), $B2, $TEMP1
- vpaddq $TEMP1, $ACC3, $ACC3
- vpmuludq 32*7-128($aap), $B2, $ACC4
- vpbroadcastq 32*7-128($tpa), $B2
- vpaddq 32*13-448($tp1), $ACC4, $ACC4
- vmovdqu $ACC1, 32*10-448($tp1)
- vmovdqu $ACC2, 32*11-448($tp1)
- vpmuludq 32*6-128($ap), $B1, $TEMP0
- vpaddq $TEMP0, $ACC3, $ACC3
- vpmuludq 32*6-128($aap), $B1, $TEMP1
- vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
- vpaddq $TEMP1, $ACC4, $ACC4
- vpmuludq 32*7-128($aap), $B1, $ACC5
- vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
- vpaddq 32*14-448($tp1), $ACC5, $ACC5
- vmovdqu $ACC3, 32*12-448($tp1)
- vmovdqu $ACC4, 32*13-448($tp1)
- lea 8($tpa), $tpa
- vpmuludq 32*7-128($ap), $B2, $TEMP0
- vpaddq $TEMP0, $ACC5, $ACC5
- vpmuludq 32*7-128($aap), $B2, $ACC6
- vpaddq 32*15-448($tp1), $ACC6, $ACC6
- vpmuludq 32*8-128($ap), $ACC0, $ACC7
- vmovdqu $ACC5, 32*14-448($tp1)
- vpaddq 32*16-448($tp1), $ACC7, $ACC7
- vmovdqu $ACC6, 32*15-448($tp1)
- vmovdqu $ACC7, 32*16-448($tp1)
- lea 8($tp1), $tp1
- dec $i
- jnz .LOOP_SQR_1024
- ___
- $ZERO = $ACC9;
- $TEMP0 = $B1;
- $TEMP2 = $B2;
- $TEMP3 = $Y1;
- $TEMP4 = $Y2;
- $code.=<<___;
- # we need to fix indices 32-39 to avoid overflow
- vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
- vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
- vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
- lea 192(%rsp), $tp0 # 64+128=192
- vpsrlq \$29, $ACC8, $TEMP1
- vpand $AND_MASK, $ACC8, $ACC8
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpermq \$0x93, $TEMP1, $TEMP1
- vpxor $ZERO, $ZERO, $ZERO
- vpermq \$0x93, $TEMP2, $TEMP2
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC8, $ACC8
- vpblendd \$3, $TEMP2, $ZERO, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpaddq $TEMP2, $ACC2, $ACC2
- vmovdqu $ACC1, 32*9-192($tp0)
- vmovdqu $ACC2, 32*10-192($tp0)
- mov (%rsp), %rax
- mov 8(%rsp), $r1
- mov 16(%rsp), $r2
- mov 24(%rsp), $r3
- vmovdqu 32*1(%rsp), $ACC1
- vmovdqu 32*2-192($tp0), $ACC2
- vmovdqu 32*3-192($tp0), $ACC3
- vmovdqu 32*4-192($tp0), $ACC4
- vmovdqu 32*5-192($tp0), $ACC5
- vmovdqu 32*6-192($tp0), $ACC6
- vmovdqu 32*7-192($tp0), $ACC7
- mov %rax, $r0
- imull $n0, %eax
- and \$0x1fffffff, %eax
- vmovd %eax, $Y1
- mov %rax, %rdx
- imulq -128($np), %rax
- vpbroadcastq $Y1, $Y1
- add %rax, $r0
- mov %rdx, %rax
- imulq 8-128($np), %rax
- shr \$29, $r0
- add %rax, $r1
- mov %rdx, %rax
- imulq 16-128($np), %rax
- add $r0, $r1
- add %rax, $r2
- imulq 24-128($np), %rdx
- add %rdx, $r3
- mov $r1, %rax
- imull $n0, %eax
- and \$0x1fffffff, %eax
- mov \$9, $i
- jmp .LOOP_REDUCE_1024
- .align 32
- .LOOP_REDUCE_1024:
- vmovd %eax, $Y2
- vpbroadcastq $Y2, $Y2
- vpmuludq 32*1-128($np), $Y1, $TEMP0
- mov %rax, %rdx
- imulq -128($np), %rax
- vpaddq $TEMP0, $ACC1, $ACC1
- add %rax, $r1
- vpmuludq 32*2-128($np), $Y1, $TEMP1
- mov %rdx, %rax
- imulq 8-128($np), %rax
- vpaddq $TEMP1, $ACC2, $ACC2
- vpmuludq 32*3-128($np), $Y1, $TEMP2
- .byte 0x67
- add %rax, $r2
- .byte 0x67
- mov %rdx, %rax
- imulq 16-128($np), %rax
- shr \$29, $r1
- vpaddq $TEMP2, $ACC3, $ACC3
- vpmuludq 32*4-128($np), $Y1, $TEMP0
- add %rax, $r3
- add $r1, $r2
- vpaddq $TEMP0, $ACC4, $ACC4
- vpmuludq 32*5-128($np), $Y1, $TEMP1
- mov $r2, %rax
- imull $n0, %eax
- vpaddq $TEMP1, $ACC5, $ACC5
- vpmuludq 32*6-128($np), $Y1, $TEMP2
- and \$0x1fffffff, %eax
- vpaddq $TEMP2, $ACC6, $ACC6
- vpmuludq 32*7-128($np), $Y1, $TEMP0
- vpaddq $TEMP0, $ACC7, $ACC7
- vpmuludq 32*8-128($np), $Y1, $TEMP1
- vmovd %eax, $Y1
- #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
- vpaddq $TEMP1, $ACC8, $ACC8
- #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
- vpbroadcastq $Y1, $Y1
- vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
- vmovdqu 32*3-8-128($np), $TEMP1
- mov %rax, %rdx
- imulq -128($np), %rax
- vpaddq $TEMP2, $ACC1, $ACC1
- vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
- vmovdqu 32*4-8-128($np), $TEMP2
- add %rax, $r2
- mov %rdx, %rax
- imulq 8-128($np), %rax
- vpaddq $TEMP0, $ACC2, $ACC2
- add $r3, %rax
- shr \$29, $r2
- vpmuludq $Y2, $TEMP1, $TEMP1
- vmovdqu 32*5-8-128($np), $TEMP0
- add $r2, %rax
- vpaddq $TEMP1, $ACC3, $ACC3
- vpmuludq $Y2, $TEMP2, $TEMP2
- vmovdqu 32*6-8-128($np), $TEMP1
- .byte 0x67
- mov %rax, $r3
- imull $n0, %eax
- vpaddq $TEMP2, $ACC4, $ACC4
- vpmuludq $Y2, $TEMP0, $TEMP0
- .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
- and \$0x1fffffff, %eax
- vpaddq $TEMP0, $ACC5, $ACC5
- vpmuludq $Y2, $TEMP1, $TEMP1
- vmovdqu 32*8-8-128($np), $TEMP0
- vpaddq $TEMP1, $ACC6, $ACC6
- vpmuludq $Y2, $TEMP2, $TEMP2
- vmovdqu 32*9-8-128($np), $ACC9
- vmovd %eax, $ACC0 # borrow ACC0 for Y2
- imulq -128($np), %rax
- vpaddq $TEMP2, $ACC7, $ACC7
- vpmuludq $Y2, $TEMP0, $TEMP0
- vmovdqu 32*1-16-128($np), $TEMP1
- vpbroadcastq $ACC0, $ACC0
- vpaddq $TEMP0, $ACC8, $ACC8
- vpmuludq $Y2, $ACC9, $ACC9
- vmovdqu 32*2-16-128($np), $TEMP2
- add %rax, $r3
- ___
- ($ACC0,$Y2)=($Y2,$ACC0);
- $code.=<<___;
- vmovdqu 32*1-24-128($np), $ACC0
- vpmuludq $Y1, $TEMP1, $TEMP1
- vmovdqu 32*3-16-128($np), $TEMP0
- vpaddq $TEMP1, $ACC1, $ACC1
- vpmuludq $Y2, $ACC0, $ACC0
- vpmuludq $Y1, $TEMP2, $TEMP2
- .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
- vpaddq $ACC1, $ACC0, $ACC0
- vpaddq $TEMP2, $ACC2, $ACC2
- vpmuludq $Y1, $TEMP0, $TEMP0
- vmovdqu 32*5-16-128($np), $TEMP2
- .byte 0x67
- vmovq $ACC0, %rax
- vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
- vpaddq $TEMP0, $ACC3, $ACC3
- vpmuludq $Y1, $TEMP1, $TEMP1
- vmovdqu 32*6-16-128($np), $TEMP0
- vpaddq $TEMP1, $ACC4, $ACC4
- vpmuludq $Y1, $TEMP2, $TEMP2
- vmovdqu 32*7-16-128($np), $TEMP1
- vpaddq $TEMP2, $ACC5, $ACC5
- vpmuludq $Y1, $TEMP0, $TEMP0
- vmovdqu 32*8-16-128($np), $TEMP2
- vpaddq $TEMP0, $ACC6, $ACC6
- vpmuludq $Y1, $TEMP1, $TEMP1
- shr \$29, $r3
- vmovdqu 32*9-16-128($np), $TEMP0
- add $r3, %rax
- vpaddq $TEMP1, $ACC7, $ACC7
- vpmuludq $Y1, $TEMP2, $TEMP2
- #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
- mov %rax, $r0
- imull $n0, %eax
- vpaddq $TEMP2, $ACC8, $ACC8
- vpmuludq $Y1, $TEMP0, $TEMP0
- and \$0x1fffffff, %eax
- vmovd %eax, $Y1
- vmovdqu 32*3-24-128($np), $TEMP2
- .byte 0x67
- vpaddq $TEMP0, $ACC9, $ACC9
- vpbroadcastq $Y1, $Y1
- vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
- vmovdqu 32*4-24-128($np), $TEMP0
- mov %rax, %rdx
- imulq -128($np), %rax
- mov 8(%rsp), $r1
- vpaddq $TEMP1, $ACC2, $ACC1
- vpmuludq $Y2, $TEMP2, $TEMP2
- vmovdqu 32*5-24-128($np), $TEMP1
- add %rax, $r0
- mov %rdx, %rax
- imulq 8-128($np), %rax
- .byte 0x67
- shr \$29, $r0
- mov 16(%rsp), $r2
- vpaddq $TEMP2, $ACC3, $ACC2
- vpmuludq $Y2, $TEMP0, $TEMP0
- vmovdqu 32*6-24-128($np), $TEMP2
- add %rax, $r1
- mov %rdx, %rax
- imulq 16-128($np), %rax
- vpaddq $TEMP0, $ACC4, $ACC3
- vpmuludq $Y2, $TEMP1, $TEMP1
- vmovdqu 32*7-24-128($np), $TEMP0
- imulq 24-128($np), %rdx # future $r3
- add %rax, $r2
- lea ($r0,$r1), %rax
- vpaddq $TEMP1, $ACC5, $ACC4
- vpmuludq $Y2, $TEMP2, $TEMP2
- vmovdqu 32*8-24-128($np), $TEMP1
- mov %rax, $r1
- imull $n0, %eax
- vpmuludq $Y2, $TEMP0, $TEMP0
- vpaddq $TEMP2, $ACC6, $ACC5
- vmovdqu 32*9-24-128($np), $TEMP2
- and \$0x1fffffff, %eax
- vpaddq $TEMP0, $ACC7, $ACC6
- vpmuludq $Y2, $TEMP1, $TEMP1
- add 24(%rsp), %rdx
- vpaddq $TEMP1, $ACC8, $ACC7
- vpmuludq $Y2, $TEMP2, $TEMP2
- vpaddq $TEMP2, $ACC9, $ACC8
- vmovq $r3, $ACC9
- mov %rdx, $r3
- dec $i
- jnz .LOOP_REDUCE_1024
- ___
- ($ACC0,$Y2)=($Y2,$ACC0);
- $code.=<<___;
- lea 448(%rsp), $tp1 # size optimization
- vpaddq $ACC9, $Y2, $ACC0
- vpxor $ZERO, $ZERO, $ZERO
- vpaddq 32*9-192($tp0), $ACC0, $ACC0
- vpaddq 32*10-448($tp1), $ACC1, $ACC1
- vpaddq 32*11-448($tp1), $ACC2, $ACC2
- vpaddq 32*12-448($tp1), $ACC3, $ACC3
- vpaddq 32*13-448($tp1), $ACC4, $ACC4
- vpaddq 32*14-448($tp1), $ACC5, $ACC5
- vpaddq 32*15-448($tp1), $ACC6, $ACC6
- vpaddq 32*16-448($tp1), $ACC7, $ACC7
- vpaddq 32*17-448($tp1), $ACC8, $ACC8
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
- vpermq \$0x93, $TEMP3, $TEMP3
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vpaddq $TEMP4, $ACC4, $ACC4
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
- vpermq \$0x93, $TEMP3, $TEMP3
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vmovdqu $ACC0, 32*0-128($rp)
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vmovdqu $ACC1, 32*1-128($rp)
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vmovdqu $ACC2, 32*2-128($rp)
- vpaddq $TEMP4, $ACC4, $ACC4
- vmovdqu $ACC3, 32*3-128($rp)
- ___
- $TEMP5=$ACC0;
- $code.=<<___;
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vpaddq $TEMP4, $ACC8, $ACC8
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vmovdqu $ACC4, 32*4-128($rp)
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vmovdqu $ACC5, 32*5-128($rp)
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vmovdqu $ACC6, 32*6-128($rp)
- vpaddq $TEMP4, $ACC8, $ACC8
- vmovdqu $ACC7, 32*7-128($rp)
- vmovdqu $ACC8, 32*8-128($rp)
- mov $rp, $ap
- dec $rep
- jne .LOOP_GRANDE_SQR_1024
- vzeroall
- mov %rbp, %rax
- .cfi_def_cfa_register %rax
- ___
- $code.=<<___ if ($win64);
- .Lsqr_1024_in_tail:
- movaps -0xd8(%rax),%xmm6
- movaps -0xc8(%rax),%xmm7
- movaps -0xb8(%rax),%xmm8
- movaps -0xa8(%rax),%xmm9
- movaps -0x98(%rax),%xmm10
- movaps -0x88(%rax),%xmm11
- movaps -0x78(%rax),%xmm12
- movaps -0x68(%rax),%xmm13
- movaps -0x58(%rax),%xmm14
- movaps -0x48(%rax),%xmm15
- ___
- $code.=<<___;
- mov -48(%rax),%r15
- .cfi_restore %r15
- mov -40(%rax),%r14
- .cfi_restore %r14
- mov -32(%rax),%r13
- .cfi_restore %r13
- mov -24(%rax),%r12
- .cfi_restore %r12
- mov -16(%rax),%rbp
- .cfi_restore %rbp
- mov -8(%rax),%rbx
- .cfi_restore %rbx
- lea (%rax),%rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lsqr_1024_epilogue:
- ret
- .cfi_endproc
- .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
- ___
- }
- { # void AMM_WW(
- my $rp="%rdi"; # BN_ULONG *rp,
- my $ap="%rsi"; # const BN_ULONG *ap,
- my $bp="%rdx"; # const BN_ULONG *bp,
- my $np="%rcx"; # const BN_ULONG *np,
- my $n0="%r8d"; # unsigned int n0);
- # The registers that hold the accumulated redundant result
- # The AMM works on 1024 bit operands, and redundant word size is 29
- # Therefore: ceil(1024/29)/4 = 9
- my $ACC0="%ymm0";
- my $ACC1="%ymm1";
- my $ACC2="%ymm2";
- my $ACC3="%ymm3";
- my $ACC4="%ymm4";
- my $ACC5="%ymm5";
- my $ACC6="%ymm6";
- my $ACC7="%ymm7";
- my $ACC8="%ymm8";
- my $ACC9="%ymm9";
- # Registers that hold the broadcasted words of multiplier, currently used
- my $Bi="%ymm10";
- my $Yi="%ymm11";
- # Helper registers
- my $TEMP0=$ACC0;
- my $TEMP1="%ymm12";
- my $TEMP2="%ymm13";
- my $ZERO="%ymm14";
- my $AND_MASK="%ymm15";
- # alu registers that hold the first words of the ACC
- my $r0="%r9";
- my $r1="%r10";
- my $r2="%r11";
- my $r3="%r12";
- my $i="%r14d";
- my $tmp="%r15";
- $bp="%r13"; # reassigned argument
- $code.=<<___;
- .globl rsaz_1024_mul_avx2
- .type rsaz_1024_mul_avx2,\@function,5
- .align 64
- rsaz_1024_mul_avx2:
- .cfi_startproc
- lea (%rsp), %rax
- .cfi_def_cfa_register %rax
- push %rbx
- .cfi_push %rbx
- push %rbp
- .cfi_push %rbp
- push %r12
- .cfi_push %r12
- push %r13
- .cfi_push %r13
- push %r14
- .cfi_push %r14
- push %r15
- .cfi_push %r15
- ___
- $code.=<<___ if ($win64);
- vzeroupper
- lea -0xa8(%rsp),%rsp
- vmovaps %xmm6,-0xd8(%rax)
- vmovaps %xmm7,-0xc8(%rax)
- vmovaps %xmm8,-0xb8(%rax)
- vmovaps %xmm9,-0xa8(%rax)
- vmovaps %xmm10,-0x98(%rax)
- vmovaps %xmm11,-0x88(%rax)
- vmovaps %xmm12,-0x78(%rax)
- vmovaps %xmm13,-0x68(%rax)
- vmovaps %xmm14,-0x58(%rax)
- vmovaps %xmm15,-0x48(%rax)
- .Lmul_1024_body:
- ___
- $code.=<<___;
- mov %rax,%rbp
- .cfi_def_cfa_register %rbp
- vzeroall
- mov %rdx, $bp # reassigned argument
- sub \$64,%rsp
- # unaligned 256-bit load that crosses page boundary can
- # cause severe performance degradation here, so if $ap does
- # cross page boundary, swap it with $bp [meaning that caller
- # is advised to lay down $ap and $bp next to each other, so
- # that only one can cross page boundary].
- .byte 0x67,0x67
- mov $ap, $tmp
- and \$4095, $tmp
- add \$32*10, $tmp
- shr \$12, $tmp
- mov $ap, $tmp
- cmovnz $bp, $ap
- cmovnz $tmp, $bp
- mov $np, $tmp
- sub \$-128,$ap # size optimization
- sub \$-128,$np
- sub \$-128,$rp
- and \$4095, $tmp # see if $np crosses page
- add \$32*10, $tmp
- .byte 0x67,0x67
- shr \$12, $tmp
- jz .Lmul_1024_no_n_copy
- # unaligned 256-bit load that crosses page boundary can
- # cause severe performance degradation here, so if $np does
- # cross page boundary, copy it to stack and make sure stack
- # frame doesn't...
- sub \$32*10,%rsp
- vmovdqu 32*0-128($np), $ACC0
- and \$-512, %rsp
- vmovdqu 32*1-128($np), $ACC1
- vmovdqu 32*2-128($np), $ACC2
- vmovdqu 32*3-128($np), $ACC3
- vmovdqu 32*4-128($np), $ACC4
- vmovdqu 32*5-128($np), $ACC5
- vmovdqu 32*6-128($np), $ACC6
- vmovdqu 32*7-128($np), $ACC7
- vmovdqu 32*8-128($np), $ACC8
- lea 64+128(%rsp),$np
- vmovdqu $ACC0, 32*0-128($np)
- vpxor $ACC0, $ACC0, $ACC0
- vmovdqu $ACC1, 32*1-128($np)
- vpxor $ACC1, $ACC1, $ACC1
- vmovdqu $ACC2, 32*2-128($np)
- vpxor $ACC2, $ACC2, $ACC2
- vmovdqu $ACC3, 32*3-128($np)
- vpxor $ACC3, $ACC3, $ACC3
- vmovdqu $ACC4, 32*4-128($np)
- vpxor $ACC4, $ACC4, $ACC4
- vmovdqu $ACC5, 32*5-128($np)
- vpxor $ACC5, $ACC5, $ACC5
- vmovdqu $ACC6, 32*6-128($np)
- vpxor $ACC6, $ACC6, $ACC6
- vmovdqu $ACC7, 32*7-128($np)
- vpxor $ACC7, $ACC7, $ACC7
- vmovdqu $ACC8, 32*8-128($np)
- vmovdqa $ACC0, $ACC8
- vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
- .Lmul_1024_no_n_copy:
- and \$-64,%rsp
- mov ($bp), %rbx
- vpbroadcastq ($bp), $Bi
- vmovdqu $ACC0, (%rsp) # clear top of stack
- xor $r0, $r0
- .byte 0x67
- xor $r1, $r1
- xor $r2, $r2
- xor $r3, $r3
- vmovdqu .Land_mask(%rip), $AND_MASK
- mov \$9, $i
- vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
- jmp .Loop_mul_1024
- .align 32
- .Loop_mul_1024:
- vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
- mov %rbx, %rax
- imulq -128($ap), %rax
- add $r0, %rax
- mov %rbx, $r1
- imulq 8-128($ap), $r1
- add 8(%rsp), $r1
- mov %rax, $r0
- imull $n0, %eax
- and \$0x1fffffff, %eax
- mov %rbx, $r2
- imulq 16-128($ap), $r2
- add 16(%rsp), $r2
- mov %rbx, $r3
- imulq 24-128($ap), $r3
- add 24(%rsp), $r3
- vpmuludq 32*1-128($ap),$Bi,$TEMP0
- vmovd %eax, $Yi
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq 32*2-128($ap),$Bi,$TEMP1
- vpbroadcastq $Yi, $Yi
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq 32*3-128($ap),$Bi,$TEMP2
- vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq 32*4-128($ap),$Bi,$TEMP0
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq 32*5-128($ap),$Bi,$TEMP1
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq 32*6-128($ap),$Bi,$TEMP2
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq 32*7-128($ap),$Bi,$TEMP0
- vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq 32*8-128($ap),$Bi,$TEMP1
- vpbroadcastq 8($bp), $Bi
- vpaddq $TEMP1,$ACC8,$ACC8
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r0
- mov %rdx,%rax
- imulq 8-128($np),%rax
- add %rax,$r1
- mov %rdx,%rax
- imulq 16-128($np),%rax
- add %rax,$r2
- shr \$29, $r0
- imulq 24-128($np),%rdx
- add %rdx,$r3
- add $r0, $r1
- vpmuludq 32*1-128($np),$Yi,$TEMP2
- vmovq $Bi, %rbx
- vpaddq $TEMP2,$ACC1,$ACC1
- vpmuludq 32*2-128($np),$Yi,$TEMP0
- vpaddq $TEMP0,$ACC2,$ACC2
- vpmuludq 32*3-128($np),$Yi,$TEMP1
- vpaddq $TEMP1,$ACC3,$ACC3
- vpmuludq 32*4-128($np),$Yi,$TEMP2
- vpaddq $TEMP2,$ACC4,$ACC4
- vpmuludq 32*5-128($np),$Yi,$TEMP0
- vpaddq $TEMP0,$ACC5,$ACC5
- vpmuludq 32*6-128($np),$Yi,$TEMP1
- vpaddq $TEMP1,$ACC6,$ACC6
- vpmuludq 32*7-128($np),$Yi,$TEMP2
- vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
- vpaddq $TEMP2,$ACC7,$ACC7
- vpmuludq 32*8-128($np),$Yi,$TEMP0
- vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
- vpaddq $TEMP0,$ACC8,$ACC8
- mov %rbx, %rax
- imulq -128($ap),%rax
- add %rax,$r1
- vmovdqu -8+32*1-128($ap),$TEMP1
- mov %rbx, %rax
- imulq 8-128($ap),%rax
- add %rax,$r2
- vmovdqu -8+32*2-128($ap),$TEMP2
- mov $r1, %rax
- vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
- imull $n0, %eax
- vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
- and \$0x1fffffff, %eax
- imulq 16-128($ap),%rbx
- add %rbx,$r3
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovd %eax, $Yi
- vmovdqu -8+32*3-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq $Yi, $Yi
- vmovdqu -8+32*4-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -8+32*5-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -8+32*6-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -8+32*7-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -8+32*8-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -8+32*9-128($ap),$ACC9
- vpaddq $TEMP1,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpaddq $TEMP2,$ACC8,$ACC8
- vpmuludq $Bi,$ACC9,$ACC9
- vpbroadcastq 16($bp), $Bi
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r1
- vmovdqu -8+32*1-128($np),$TEMP0
- mov %rdx,%rax
- imulq 8-128($np),%rax
- add %rax,$r2
- vmovdqu -8+32*2-128($np),$TEMP1
- shr \$29, $r1
- imulq 16-128($np),%rdx
- add %rdx,$r3
- add $r1, $r2
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -8+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -8+32*4-128($np),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -8+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -8+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -8+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -8+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -8+32*9-128($np),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Yi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Yi,$TEMP2,$TEMP2
- vpaddq $TEMP2,$ACC9,$ACC9
- vmovdqu -16+32*1-128($ap),$TEMP0
- mov %rbx,%rax
- imulq -128($ap),%rax
- add $r2,%rax
- vmovdqu -16+32*2-128($ap),$TEMP1
- mov %rax,$r2
- imull $n0, %eax
- and \$0x1fffffff, %eax
- imulq 8-128($ap),%rbx
- add %rbx,$r3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovd %eax, $Yi
- vmovdqu -16+32*3-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpbroadcastq $Yi, $Yi
- vmovdqu -16+32*4-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -16+32*5-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -16+32*6-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -16+32*7-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -16+32*8-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -16+32*9-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq 24($bp), $Bi
- vpaddq $TEMP2,$ACC9,$ACC9
- vmovdqu -16+32*1-128($np),$TEMP0
- mov %rax,%rdx
- imulq -128($np),%rax
- add %rax,$r2
- vmovdqu -16+32*2-128($np),$TEMP1
- imulq 8-128($np),%rdx
- add %rdx,$r3
- shr \$29, $r2
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -16+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -16+32*4-128($np),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -16+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -16+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -16+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -16+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -16+32*9-128($np),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -24+32*1-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*2-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC9,$ACC9
- add $r2, $r3
- imulq -128($ap),%rbx
- add %rbx,$r3
- mov $r3, %rax
- imull $n0, %eax
- and \$0x1fffffff, %eax
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovd %eax, $Yi
- vmovdqu -24+32*3-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC1
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpbroadcastq $Yi, $Yi
- vmovdqu -24+32*4-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC2,$ACC2
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -24+32*5-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC3
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -24+32*6-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC4
- vpmuludq $Bi,$TEMP1,$TEMP1
- vmovdqu -24+32*7-128($ap),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC5
- vpmuludq $Bi,$TEMP2,$TEMP2
- vmovdqu -24+32*8-128($ap),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC6
- vpmuludq $Bi,$TEMP0,$TEMP0
- vmovdqu -24+32*9-128($ap),$TEMP2
- vpaddq $TEMP0,$ACC7,$ACC7
- vpmuludq $Bi,$TEMP1,$TEMP1
- vpaddq $TEMP1,$ACC8,$ACC8
- vpmuludq $Bi,$TEMP2,$TEMP2
- vpbroadcastq 32($bp), $Bi
- vpaddq $TEMP2,$ACC9,$ACC9
- add \$32, $bp # $bp++
- vmovdqu -24+32*1-128($np),$TEMP0
- imulq -128($np),%rax
- add %rax,$r3
- shr \$29, $r3
- vmovdqu -24+32*2-128($np),$TEMP1
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovq $Bi, %rbx
- vmovdqu -24+32*3-128($np),$TEMP2
- vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
- vpaddq $TEMP1,$ACC2,$ACC1
- vmovdqu -24+32*4-128($np),$TEMP0
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*5-128($np),$TEMP1
- vpaddq $TEMP2,$ACC3,$ACC2
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -24+32*6-128($np),$TEMP2
- vpaddq $TEMP0,$ACC4,$ACC3
- vpmuludq $Yi,$TEMP1,$TEMP1
- vmovdqu -24+32*7-128($np),$TEMP0
- vpaddq $TEMP1,$ACC5,$ACC4
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovdqu -24+32*8-128($np),$TEMP1
- vpaddq $TEMP2,$ACC6,$ACC5
- vpmuludq $Yi,$TEMP0,$TEMP0
- vmovdqu -24+32*9-128($np),$TEMP2
- mov $r3, $r0
- vpaddq $TEMP0,$ACC7,$ACC6
- vpmuludq $Yi,$TEMP1,$TEMP1
- add (%rsp), $r0
- vpaddq $TEMP1,$ACC8,$ACC7
- vpmuludq $Yi,$TEMP2,$TEMP2
- vmovq $r3, $TEMP1
- vpaddq $TEMP2,$ACC9,$ACC8
- dec $i
- jnz .Loop_mul_1024
- ___
- # (*) Original implementation was correcting ACC1-ACC3 for overflow
- # after 7 loop runs, or after 28 iterations, or 56 additions.
- # But as we underutilize resources, it's possible to correct in
- # each iteration with marginal performance loss. But then, as
- # we do it in each iteration, we can correct less digits, and
- # avoid performance penalties completely.
- $TEMP0 = $ACC9;
- $TEMP3 = $Bi;
- $TEMP4 = $Yi;
- $code.=<<___;
- vpaddq (%rsp), $TEMP1, $ACC0
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP3, $TEMP3
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpermq \$0x93, $TEMP4, $TEMP4
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vpaddq $TEMP4, $ACC4, $ACC4
- vpsrlq \$29, $ACC0, $TEMP1
- vpand $AND_MASK, $ACC0, $ACC0
- vpsrlq \$29, $ACC1, $TEMP2
- vpand $AND_MASK, $ACC1, $ACC1
- vpsrlq \$29, $ACC2, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC2, $ACC2
- vpsrlq \$29, $ACC3, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC3, $ACC3
- vpermq \$0x93, $TEMP3, $TEMP3
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC0, $ACC0
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC1, $ACC1
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC2, $ACC2
- vpblendd \$3, $TEMP4, $ZERO, $TEMP4
- vpaddq $TEMP3, $ACC3, $ACC3
- vpaddq $TEMP4, $ACC4, $ACC4
- vmovdqu $ACC0, 0-128($rp)
- vmovdqu $ACC1, 32-128($rp)
- vmovdqu $ACC2, 64-128($rp)
- vmovdqu $ACC3, 96-128($rp)
- ___
- $TEMP5=$ACC0;
- $code.=<<___;
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vpaddq $TEMP4, $ACC8, $ACC8
- vpsrlq \$29, $ACC4, $TEMP1
- vpand $AND_MASK, $ACC4, $ACC4
- vpsrlq \$29, $ACC5, $TEMP2
- vpand $AND_MASK, $ACC5, $ACC5
- vpsrlq \$29, $ACC6, $TEMP3
- vpermq \$0x93, $TEMP1, $TEMP1
- vpand $AND_MASK, $ACC6, $ACC6
- vpsrlq \$29, $ACC7, $TEMP4
- vpermq \$0x93, $TEMP2, $TEMP2
- vpand $AND_MASK, $ACC7, $ACC7
- vpsrlq \$29, $ACC8, $TEMP5
- vpermq \$0x93, $TEMP3, $TEMP3
- vpand $AND_MASK, $ACC8, $ACC8
- vpermq \$0x93, $TEMP4, $TEMP4
- vpblendd \$3, $ZERO, $TEMP1, $TEMP0
- vpermq \$0x93, $TEMP5, $TEMP5
- vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
- vpaddq $TEMP0, $ACC4, $ACC4
- vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
- vpaddq $TEMP1, $ACC5, $ACC5
- vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
- vpaddq $TEMP2, $ACC6, $ACC6
- vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
- vpaddq $TEMP3, $ACC7, $ACC7
- vpaddq $TEMP4, $ACC8, $ACC8
- vmovdqu $ACC4, 128-128($rp)
- vmovdqu $ACC5, 160-128($rp)
- vmovdqu $ACC6, 192-128($rp)
- vmovdqu $ACC7, 224-128($rp)
- vmovdqu $ACC8, 256-128($rp)
- vzeroupper
- mov %rbp, %rax
- .cfi_def_cfa_register %rax
- ___
- $code.=<<___ if ($win64);
- .Lmul_1024_in_tail:
- movaps -0xd8(%rax),%xmm6
- movaps -0xc8(%rax),%xmm7
- movaps -0xb8(%rax),%xmm8
- movaps -0xa8(%rax),%xmm9
- movaps -0x98(%rax),%xmm10
- movaps -0x88(%rax),%xmm11
- movaps -0x78(%rax),%xmm12
- movaps -0x68(%rax),%xmm13
- movaps -0x58(%rax),%xmm14
- movaps -0x48(%rax),%xmm15
- ___
- $code.=<<___;
- mov -48(%rax),%r15
- .cfi_restore %r15
- mov -40(%rax),%r14
- .cfi_restore %r14
- mov -32(%rax),%r13
- .cfi_restore %r13
- mov -24(%rax),%r12
- .cfi_restore %r12
- mov -16(%rax),%rbp
- .cfi_restore %rbp
- mov -8(%rax),%rbx
- .cfi_restore %rbx
- lea (%rax),%rsp # restore %rsp
- .cfi_def_cfa_register %rsp
- .Lmul_1024_epilogue:
- ret
- .cfi_endproc
- .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
- ___
- }
- {
- my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
- my @T = map("%r$_",(8..11));
- $code.=<<___;
- .globl rsaz_1024_red2norm_avx2
- .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_red2norm_avx2:
- .cfi_startproc
- sub \$-128,$inp # size optimization
- xor %rax,%rax
- ___
- for ($j=0,$i=0; $i<16; $i++) {
- my $k=0;
- while (29*$j<64*($i+1)) { # load data till boundary
- $code.=" mov `8*$j-128`($inp), @T[0]\n";
- $j++; $k++; push(@T,shift(@T));
- }
- $l=$k;
- while ($k>1) { # shift loaded data but last value
- $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
- $k--;
- }
- $code.=<<___; # shift last value
- mov @T[-1], @T[0]
- shl \$`29*($j-1)`, @T[-1]
- shr \$`-29*($j-1)`, @T[0]
- ___
- while ($l) { # accumulate all values
- $code.=" add @T[-$l], %rax\n";
- $l--;
- }
- $code.=<<___;
- adc \$0, @T[0] # consume eventual carry
- mov %rax, 8*$i($out)
- mov @T[0], %rax
- ___
- push(@T,shift(@T));
- }
- $code.=<<___;
- ret
- .cfi_endproc
- .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
- .globl rsaz_1024_norm2red_avx2
- .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_norm2red_avx2:
- .cfi_startproc
- sub \$-128,$out # size optimization
- mov ($inp),@T[0]
- mov \$0x1fffffff,%eax
- ___
- for ($j=0,$i=0; $i<16; $i++) {
- $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
- $code.=" xor @T[1],@T[1]\n" if ($i==15);
- my $k=1;
- while (29*($j+1)<64*($i+1)) {
- $code.=<<___;
- mov @T[0],@T[-$k]
- shr \$`29*$j`,@T[-$k]
- and %rax,@T[-$k] # &0x1fffffff
- mov @T[-$k],`8*$j-128`($out)
- ___
- $j++; $k++;
- }
- $code.=<<___;
- shrd \$`29*$j`,@T[1],@T[0]
- and %rax,@T[0]
- mov @T[0],`8*$j-128`($out)
- ___
- $j++;
- push(@T,shift(@T));
- }
- $code.=<<___;
- mov @T[0],`8*$j-128`($out) # zero
- mov @T[0],`8*($j+1)-128`($out)
- mov @T[0],`8*($j+2)-128`($out)
- mov @T[0],`8*($j+3)-128`($out)
- ret
- .cfi_endproc
- .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
- ___
- }
- {
- my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
- $code.=<<___;
- .globl rsaz_1024_scatter5_avx2
- .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_scatter5_avx2:
- .cfi_startproc
- vzeroupper
- vmovdqu .Lscatter_permd(%rip),%ymm5
- shl \$4,$power
- lea ($out,$power),$out
- mov \$9,%eax
- jmp .Loop_scatter_1024
- .align 32
- .Loop_scatter_1024:
- vmovdqu ($inp),%ymm0
- lea 32($inp),$inp
- vpermd %ymm0,%ymm5,%ymm0
- vmovdqu %xmm0,($out)
- lea 16*32($out),$out
- dec %eax
- jnz .Loop_scatter_1024
- vzeroupper
- ret
- .cfi_endproc
- .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
- .globl rsaz_1024_gather5_avx2
- .type rsaz_1024_gather5_avx2,\@abi-omnipotent
- .align 32
- rsaz_1024_gather5_avx2:
- .cfi_startproc
- vzeroupper
- mov %rsp,%r11
- .cfi_def_cfa_register %r11
- ___
- $code.=<<___ if ($win64);
- lea -0x88(%rsp),%rax
- .LSEH_begin_rsaz_1024_gather5:
- # I can't trust assembler to use specific encoding:-(
- .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
- .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
- .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
- .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
- .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
- .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
- .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
- .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
- .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
- .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
- .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
- ___
- $code.=<<___;
- lea -0x100(%rsp),%rsp
- and \$-32, %rsp
- lea .Linc(%rip), %r10
- lea -128(%rsp),%rax # control u-op density
- vmovd $power, %xmm4
- vmovdqa (%r10),%ymm0
- vmovdqa 32(%r10),%ymm1
- vmovdqa 64(%r10),%ymm5
- vpbroadcastd %xmm4,%ymm4
- vpaddd %ymm5, %ymm0, %ymm2
- vpcmpeqd %ymm4, %ymm0, %ymm0
- vpaddd %ymm5, %ymm1, %ymm3
- vpcmpeqd %ymm4, %ymm1, %ymm1
- vmovdqa %ymm0, 32*0+128(%rax)
- vpaddd %ymm5, %ymm2, %ymm0
- vpcmpeqd %ymm4, %ymm2, %ymm2
- vmovdqa %ymm1, 32*1+128(%rax)
- vpaddd %ymm5, %ymm3, %ymm1
- vpcmpeqd %ymm4, %ymm3, %ymm3
- vmovdqa %ymm2, 32*2+128(%rax)
- vpaddd %ymm5, %ymm0, %ymm2
- vpcmpeqd %ymm4, %ymm0, %ymm0
- vmovdqa %ymm3, 32*3+128(%rax)
- vpaddd %ymm5, %ymm1, %ymm3
- vpcmpeqd %ymm4, %ymm1, %ymm1
- vmovdqa %ymm0, 32*4+128(%rax)
- vpaddd %ymm5, %ymm2, %ymm8
- vpcmpeqd %ymm4, %ymm2, %ymm2
- vmovdqa %ymm1, 32*5+128(%rax)
- vpaddd %ymm5, %ymm3, %ymm9
- vpcmpeqd %ymm4, %ymm3, %ymm3
- vmovdqa %ymm2, 32*6+128(%rax)
- vpaddd %ymm5, %ymm8, %ymm10
- vpcmpeqd %ymm4, %ymm8, %ymm8
- vmovdqa %ymm3, 32*7+128(%rax)
- vpaddd %ymm5, %ymm9, %ymm11
- vpcmpeqd %ymm4, %ymm9, %ymm9
- vpaddd %ymm5, %ymm10, %ymm12
- vpcmpeqd %ymm4, %ymm10, %ymm10
- vpaddd %ymm5, %ymm11, %ymm13
- vpcmpeqd %ymm4, %ymm11, %ymm11
- vpaddd %ymm5, %ymm12, %ymm14
- vpcmpeqd %ymm4, %ymm12, %ymm12
- vpaddd %ymm5, %ymm13, %ymm15
- vpcmpeqd %ymm4, %ymm13, %ymm13
- vpcmpeqd %ymm4, %ymm14, %ymm14
- vpcmpeqd %ymm4, %ymm15, %ymm15
- vmovdqa -32(%r10),%ymm7 # .Lgather_permd
- lea 128($inp), $inp
- mov \$9,$power
- .Loop_gather_1024:
- vmovdqa 32*0-128($inp), %ymm0
- vmovdqa 32*1-128($inp), %ymm1
- vmovdqa 32*2-128($inp), %ymm2
- vmovdqa 32*3-128($inp), %ymm3
- vpand 32*0+128(%rax), %ymm0, %ymm0
- vpand 32*1+128(%rax), %ymm1, %ymm1
- vpand 32*2+128(%rax), %ymm2, %ymm2
- vpor %ymm0, %ymm1, %ymm4
- vpand 32*3+128(%rax), %ymm3, %ymm3
- vmovdqa 32*4-128($inp), %ymm0
- vmovdqa 32*5-128($inp), %ymm1
- vpor %ymm2, %ymm3, %ymm5
- vmovdqa 32*6-128($inp), %ymm2
- vmovdqa 32*7-128($inp), %ymm3
- vpand 32*4+128(%rax), %ymm0, %ymm0
- vpand 32*5+128(%rax), %ymm1, %ymm1
- vpand 32*6+128(%rax), %ymm2, %ymm2
- vpor %ymm0, %ymm4, %ymm4
- vpand 32*7+128(%rax), %ymm3, %ymm3
- vpand 32*8-128($inp), %ymm8, %ymm0
- vpor %ymm1, %ymm5, %ymm5
- vpand 32*9-128($inp), %ymm9, %ymm1
- vpor %ymm2, %ymm4, %ymm4
- vpand 32*10-128($inp),%ymm10, %ymm2
- vpor %ymm3, %ymm5, %ymm5
- vpand 32*11-128($inp),%ymm11, %ymm3
- vpor %ymm0, %ymm4, %ymm4
- vpand 32*12-128($inp),%ymm12, %ymm0
- vpor %ymm1, %ymm5, %ymm5
- vpand 32*13-128($inp),%ymm13, %ymm1
- vpor %ymm2, %ymm4, %ymm4
- vpand 32*14-128($inp),%ymm14, %ymm2
- vpor %ymm3, %ymm5, %ymm5
- vpand 32*15-128($inp),%ymm15, %ymm3
- lea 32*16($inp), $inp
- vpor %ymm0, %ymm4, %ymm4
- vpor %ymm1, %ymm5, %ymm5
- vpor %ymm2, %ymm4, %ymm4
- vpor %ymm3, %ymm5, %ymm5
- vpor %ymm5, %ymm4, %ymm4
- vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
- vpor %xmm4, %xmm5, %xmm5
- vpermd %ymm5,%ymm7,%ymm5
- vmovdqu %ymm5,($out)
- lea 32($out),$out
- dec $power
- jnz .Loop_gather_1024
- vpxor %ymm0,%ymm0,%ymm0
- vmovdqu %ymm0,($out)
- vzeroupper
- ___
- $code.=<<___ if ($win64);
- movaps -0xa8(%r11),%xmm6
- movaps -0x98(%r11),%xmm7
- movaps -0x88(%r11),%xmm8
- movaps -0x78(%r11),%xmm9
- movaps -0x68(%r11),%xmm10
- movaps -0x58(%r11),%xmm11
- movaps -0x48(%r11),%xmm12
- movaps -0x38(%r11),%xmm13
- movaps -0x28(%r11),%xmm14
- movaps -0x18(%r11),%xmm15
- ___
- $code.=<<___;
- lea (%r11),%rsp
- .cfi_def_cfa_register %rsp
- ret
- .cfi_endproc
- .LSEH_end_rsaz_1024_gather5:
- .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
- ___
- }
- $code.=<<___;
- .extern OPENSSL_ia32cap_P
- .globl rsaz_avx2_eligible
- .type rsaz_avx2_eligible,\@abi-omnipotent
- .align 32
- rsaz_avx2_eligible:
- mov OPENSSL_ia32cap_P+8(%rip),%eax
- ___
- $code.=<<___ if ($addx);
- mov \$`1<<8|1<<19`,%ecx
- mov \$0,%edx
- and %eax,%ecx
- cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
- cmove %edx,%eax
- ___
- $code.=<<___;
- and \$`1<<5`,%eax
- shr \$5,%eax
- ret
- .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
- .align 64
- .Land_mask:
- .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff
- .Lscatter_permd:
- .long 0,2,4,6,7,7,7,7
- .Lgather_permd:
- .long 0,7,1,7,2,7,3,7
- .Linc:
- .long 0,0,0,0, 1,1,1,1
- .long 2,2,2,2, 3,3,3,3
- .long 4,4,4,4, 4,4,4,4
- .align 64
- ___
- if ($win64) {
- $rec="%rcx";
- $frame="%rdx";
- $context="%r8";
- $disp="%r9";
- $code.=<<___
- .extern __imp_RtlVirtualUnwind
- .type rsaz_se_handler,\@abi-omnipotent
- .align 16
- rsaz_se_handler:
- push %rsi
- push %rdi
- push %rbx
- push %rbp
- push %r12
- push %r13
- push %r14
- push %r15
- pushfq
- sub \$64,%rsp
- mov 120($context),%rax # pull context->Rax
- mov 248($context),%rbx # pull context->Rip
- mov 8($disp),%rsi # disp->ImageBase
- mov 56($disp),%r11 # disp->HandlerData
- mov 0(%r11),%r10d # HandlerData[0]
- lea (%rsi,%r10),%r10 # prologue label
- cmp %r10,%rbx # context->Rip<prologue label
- jb .Lcommon_seh_tail
- mov 4(%r11),%r10d # HandlerData[1]
- lea (%rsi,%r10),%r10 # epilogue label
- cmp %r10,%rbx # context->Rip>=epilogue label
- jae .Lcommon_seh_tail
- mov 160($context),%rbp # pull context->Rbp
- mov 8(%r11),%r10d # HandlerData[2]
- lea (%rsi,%r10),%r10 # "in tail" label
- cmp %r10,%rbx # context->Rip>="in tail" label
- cmovc %rbp,%rax
- mov -48(%rax),%r15
- mov -40(%rax),%r14
- mov -32(%rax),%r13
- mov -24(%rax),%r12
- mov -16(%rax),%rbp
- mov -8(%rax),%rbx
- mov %r15,240($context)
- mov %r14,232($context)
- mov %r13,224($context)
- mov %r12,216($context)
- mov %rbp,160($context)
- mov %rbx,144($context)
- lea -0xd8(%rax),%rsi # %xmm save area
- lea 512($context),%rdi # & context.Xmm6
- mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
- .long 0xa548f3fc # cld; rep movsq
- .Lcommon_seh_tail:
- mov 8(%rax),%rdi
- mov 16(%rax),%rsi
- mov %rax,152($context) # restore context->Rsp
- mov %rsi,168($context) # restore context->Rsi
- mov %rdi,176($context) # restore context->Rdi
- mov 40($disp),%rdi # disp->ContextRecord
- mov $context,%rsi # context
- mov \$154,%ecx # sizeof(CONTEXT)
- .long 0xa548f3fc # cld; rep movsq
- mov $disp,%rsi
- xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
- mov 8(%rsi),%rdx # arg2, disp->ImageBase
- mov 0(%rsi),%r8 # arg3, disp->ControlPc
- mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
- mov 40(%rsi),%r10 # disp->ContextRecord
- lea 56(%rsi),%r11 # &disp->HandlerData
- lea 24(%rsi),%r12 # &disp->EstablisherFrame
- mov %r10,32(%rsp) # arg5
- mov %r11,40(%rsp) # arg6
- mov %r12,48(%rsp) # arg7
- mov %rcx,56(%rsp) # arg8, (NULL)
- call *__imp_RtlVirtualUnwind(%rip)
- mov \$1,%eax # ExceptionContinueSearch
- add \$64,%rsp
- popfq
- pop %r15
- pop %r14
- pop %r13
- pop %r12
- pop %rbp
- pop %rbx
- pop %rdi
- pop %rsi
- ret
- .size rsaz_se_handler,.-rsaz_se_handler
- .section .pdata
- .align 4
- .rva .LSEH_begin_rsaz_1024_sqr_avx2
- .rva .LSEH_end_rsaz_1024_sqr_avx2
- .rva .LSEH_info_rsaz_1024_sqr_avx2
- .rva .LSEH_begin_rsaz_1024_mul_avx2
- .rva .LSEH_end_rsaz_1024_mul_avx2
- .rva .LSEH_info_rsaz_1024_mul_avx2
- .rva .LSEH_begin_rsaz_1024_gather5
- .rva .LSEH_end_rsaz_1024_gather5
- .rva .LSEH_info_rsaz_1024_gather5
- .section .xdata
- .align 8
- .LSEH_info_rsaz_1024_sqr_avx2:
- .byte 9,0,0,0
- .rva rsaz_se_handler
- .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail
- .long 0
- .LSEH_info_rsaz_1024_mul_avx2:
- .byte 9,0,0,0
- .rva rsaz_se_handler
- .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail
- .long 0
- .LSEH_info_rsaz_1024_gather5:
- .byte 0x01,0x36,0x17,0x0b
- .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
- .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
- .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
- .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
- .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
- .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
- .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
- .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
- .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
- .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
- .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8
- .byte 0x00,0xb3,0x00,0x00 # set_frame r11
- ___
- }
- foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/ge;
- s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
- s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
- s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
- print $_,"\n";
- }
- }}} else {{{
- print <<___; # assembler is too old
- .text
- .globl rsaz_avx2_eligible
- .type rsaz_avx2_eligible,\@abi-omnipotent
- rsaz_avx2_eligible:
- xor %eax,%eax
- ret
- .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
- .globl rsaz_1024_sqr_avx2
- .globl rsaz_1024_mul_avx2
- .globl rsaz_1024_norm2red_avx2
- .globl rsaz_1024_red2norm_avx2
- .globl rsaz_1024_scatter5_avx2
- .globl rsaz_1024_gather5_avx2
- .type rsaz_1024_sqr_avx2,\@abi-omnipotent
- rsaz_1024_sqr_avx2:
- rsaz_1024_mul_avx2:
- rsaz_1024_norm2red_avx2:
- rsaz_1024_red2norm_avx2:
- rsaz_1024_scatter5_avx2:
- rsaz_1024_gather5_avx2:
- .byte 0x0f,0x0b # ud2
- ret
- .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
- ___
- }}}
- close STDOUT or die "error closing STDOUT: $!";
|