- /* chacha_asm
- *
- * Copyright (C) 2006-2022 wolfSSL Inc.
- *
- * This file is part of wolfSSL.
- *
- * wolfSSL is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * wolfSSL is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
- */
- #ifdef WOLFSSL_USER_SETTINGS
- #ifdef WOLFSSL_USER_SETTINGS_ASM
- /*
- * user_settings_asm.h is a file generated by the script user_settings_asm.sh.
- * The script takes in a user_settings.h and produces user_settings_asm.h, which
- * is a stripped down version of user_settings.h containing only preprocessor
- * directives. This makes the header safe to include in assembly (.S) files.
- */
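- /*
-  * For illustration only (not generated output): such a stripped-down header
-  * is expected to hold nothing but plain directives, e.g.
-  *
-  *     #define WOLFSSL_X86_64_BUILD
-  *     #define USE_INTEL_SPEEDUP
-  *
-  * with every typedef, prototype and other C construct removed.
-  */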
- #include "user_settings_asm.h"
- #else
- /*
- * Note: if user_settings.h contains any C code (e.g. a typedef or function
- * prototype), including it here in an assembly (.S) file will cause an
- * assembler failure. See user_settings_asm.h above.
- */
- #include "user_settings.h"
- #endif /* WOLFSSL_USER_SETTINGS_ASM */
- #endif /* WOLFSSL_USER_SETTINGS */
- #ifndef HAVE_INTEL_AVX1
- #define HAVE_INTEL_AVX1
- #endif /* HAVE_INTEL_AVX1 */
- #ifndef NO_AVX2_SUPPORT
- #define HAVE_INTEL_AVX2
- #endif /* NO_AVX2_SUPPORT */
- #ifdef WOLFSSL_X86_64_BUILD
- #ifndef __APPLE__
- .text
- .globl chacha_encrypt_x64
- .type chacha_encrypt_x64,@function
- .align 16
- chacha_encrypt_x64:
- #else
- .section __TEXT,__text
- .globl _chacha_encrypt_x64
- .p2align 4
- _chacha_encrypt_x64:
- #endif /* __APPLE__ */
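- /*
-  * Entry conditions (System V AMD64 convention; not stated in this file, so
-  * read from the code): %rdi appears to hold the ChaCha state/context (the
-  * 16 state words at offsets 0..63, with the 32-bit block counter at offset
-  * 48), %rsi the input, %rdx the output and %ecx the byte count.  Whole
-  * 64-byte blocks are handled first; a trailing partial block is handled at
-  * L_chacha_x64_small.
-  */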
- pushq %rbx
- pushq %rbp
- pushq %r12
- pushq %r13
- pushq %r14
- pushq %r15
- subq $0x40, %rsp
- cmpl $0x40, %ecx
- jl L_chacha_x64_small
- L_chacha_x64_start:
- subq $48, %rsp
- movq %rdx, 24(%rsp)
- movq %rsi, 32(%rsp)
- movq %rcx, 40(%rsp)
- movq 32(%rdi), %rax
- movq 40(%rdi), %rbx
- movq %rax, 8(%rsp)
- movq %rbx, 16(%rsp)
- movl (%rdi), %eax
- movl 4(%rdi), %ebx
- movl 8(%rdi), %ecx
- movl 12(%rdi), %edx
- movl 16(%rdi), %r8d
- movl 20(%rdi), %r9d
- movl 24(%rdi), %r10d
- movl 28(%rdi), %r11d
- movl 48(%rdi), %r12d
- movl 52(%rdi), %r13d
- movl 56(%rdi), %r14d
- movl 60(%rdi), %r15d
- movb $10, (%rsp)
- movl 8(%rsp), %esi
- movl 12(%rsp), %ebp
- L_chacha_x64_block_crypt_start:
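- /*
-  * Each pass through this loop is one ChaCha double round (10 passes = 20
-  * rounds).  Every quarter round follows the standard pattern, shown here as
-  * pseudocode for reference:
-  *     a += b; d ^= a; d <<<= 16;
-  *     c += d; b ^= c; b <<<= 12;
-  *     a += b; d ^= a; d <<<=  8;
-  *     c += d; b ^= c; b <<<=  7;
-  * Two quarter rounds are interleaved at a time.  The third row of the state
-  * (words 8..11) lives at 8(%rsp)..20(%rsp), with two of its words at a time
-  * held in %esi/%ebp and swapped in and out around each pair.
-  */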
- addl %r8d, %eax
- addl %r9d, %ebx
- xorl %eax, %r12d
- xorl %ebx, %r13d
- roll $16, %r12d
- roll $16, %r13d
- addl %r12d, %esi
- addl %r13d, %ebp
- xorl %esi, %r8d
- xorl %ebp, %r9d
- roll $12, %r8d
- roll $12, %r9d
- addl %r8d, %eax
- addl %r9d, %ebx
- xorl %eax, %r12d
- xorl %ebx, %r13d
- roll $8, %r12d
- roll $8, %r13d
- addl %r12d, %esi
- addl %r13d, %ebp
- xorl %esi, %r8d
- xorl %ebp, %r9d
- roll $7, %r8d
- roll $7, %r9d
- movl %esi, 8(%rsp)
- movl %ebp, 12(%rsp)
- movl 16(%rsp), %esi
- movl 20(%rsp), %ebp
- addl %r10d, %ecx
- addl %r11d, %edx
- xorl %ecx, %r14d
- xorl %edx, %r15d
- roll $16, %r14d
- roll $16, %r15d
- addl %r14d, %esi
- addl %r15d, %ebp
- xorl %esi, %r10d
- xorl %ebp, %r11d
- roll $12, %r10d
- roll $12, %r11d
- addl %r10d, %ecx
- addl %r11d, %edx
- xorl %ecx, %r14d
- xorl %edx, %r15d
- roll $8, %r14d
- roll $8, %r15d
- addl %r14d, %esi
- addl %r15d, %ebp
- xorl %esi, %r10d
- xorl %ebp, %r11d
- roll $7, %r10d
- roll $7, %r11d
- addl %r9d, %eax
- addl %r10d, %ebx
- xorl %eax, %r15d
- xorl %ebx, %r12d
- roll $16, %r15d
- roll $16, %r12d
- addl %r15d, %esi
- addl %r12d, %ebp
- xorl %esi, %r9d
- xorl %ebp, %r10d
- roll $12, %r9d
- roll $12, %r10d
- addl %r9d, %eax
- addl %r10d, %ebx
- xorl %eax, %r15d
- xorl %ebx, %r12d
- roll $8, %r15d
- roll $8, %r12d
- addl %r15d, %esi
- addl %r12d, %ebp
- xorl %esi, %r9d
- xorl %ebp, %r10d
- roll $7, %r9d
- roll $7, %r10d
- movl %esi, 16(%rsp)
- movl %ebp, 20(%rsp)
- movl 8(%rsp), %esi
- movl 12(%rsp), %ebp
- addl %r11d, %ecx
- addl %r8d, %edx
- xorl %ecx, %r13d
- xorl %edx, %r14d
- roll $16, %r13d
- roll $16, %r14d
- addl %r13d, %esi
- addl %r14d, %ebp
- xorl %esi, %r11d
- xorl %ebp, %r8d
- roll $12, %r11d
- roll $12, %r8d
- addl %r11d, %ecx
- addl %r8d, %edx
- xorl %ecx, %r13d
- xorl %edx, %r14d
- roll $8, %r13d
- roll $8, %r14d
- addl %r13d, %esi
- addl %r14d, %ebp
- xorl %esi, %r11d
- xorl %ebp, %r8d
- roll $7, %r11d
- roll $7, %r8d
- decb (%rsp)
- jnz L_chacha_x64_block_crypt_start
- movl %esi, 8(%rsp)
- movl %ebp, 12(%rsp)
- movq 32(%rsp), %rsi
- movq 24(%rsp), %rbp
- addl (%rdi), %eax
- addl 4(%rdi), %ebx
- addl 8(%rdi), %ecx
- addl 12(%rdi), %edx
- addl 16(%rdi), %r8d
- addl 20(%rdi), %r9d
- addl 24(%rdi), %r10d
- addl 28(%rdi), %r11d
- addl 48(%rdi), %r12d
- addl 52(%rdi), %r13d
- addl 56(%rdi), %r14d
- addl 60(%rdi), %r15d
- xorl (%rsi), %eax
- xorl 4(%rsi), %ebx
- xorl 8(%rsi), %ecx
- xorl 12(%rsi), %edx
- xorl 16(%rsi), %r8d
- xorl 20(%rsi), %r9d
- xorl 24(%rsi), %r10d
- xorl 28(%rsi), %r11d
- xorl 48(%rsi), %r12d
- xorl 52(%rsi), %r13d
- xorl 56(%rsi), %r14d
- xorl 60(%rsi), %r15d
- movl %eax, (%rbp)
- movl %ebx, 4(%rbp)
- movl %ecx, 8(%rbp)
- movl %edx, 12(%rbp)
- movl %r8d, 16(%rbp)
- movl %r9d, 20(%rbp)
- movl %r10d, 24(%rbp)
- movl %r11d, 28(%rbp)
- movl %r12d, 48(%rbp)
- movl %r13d, 52(%rbp)
- movl %r14d, 56(%rbp)
- movl %r15d, 60(%rbp)
- movl 8(%rsp), %eax
- movl 12(%rsp), %ebx
- movl 16(%rsp), %ecx
- movl 20(%rsp), %edx
- addl 32(%rdi), %eax
- addl 36(%rdi), %ebx
- addl 40(%rdi), %ecx
- addl 44(%rdi), %edx
- xorl 32(%rsi), %eax
- xorl 36(%rsi), %ebx
- xorl 40(%rsi), %ecx
- xorl 44(%rsi), %edx
- movl %eax, 32(%rbp)
- movl %ebx, 36(%rbp)
- movl %ecx, 40(%rbp)
- movl %edx, 44(%rbp)
- movq 24(%rsp), %rdx
- movq 40(%rsp), %rcx
- addl $0x01, 48(%rdi)
- addq $48, %rsp
- subl $0x40, %ecx
- addq $0x40, %rsi
- addq $0x40, %rdx
- cmpl $0x40, %ecx
- jge L_chacha_x64_start
- L_chacha_x64_small:
- cmpl $0x00, %ecx
- je L_chacha_x64_done
- subq $48, %rsp
- movq %rdx, 24(%rsp)
- movq %rsi, 32(%rsp)
- movq %rcx, 40(%rsp)
- movq 32(%rdi), %rax
- movq 40(%rdi), %rbx
- movq %rax, 8(%rsp)
- movq %rbx, 16(%rsp)
- movl (%rdi), %eax
- movl 4(%rdi), %ebx
- movl 8(%rdi), %ecx
- movl 12(%rdi), %edx
- movl 16(%rdi), %r8d
- movl 20(%rdi), %r9d
- movl 24(%rdi), %r10d
- movl 28(%rdi), %r11d
- movl 48(%rdi), %r12d
- movl 52(%rdi), %r13d
- movl 56(%rdi), %r14d
- movl 60(%rdi), %r15d
- movb $10, (%rsp)
- movl 8(%rsp), %esi
- movl 12(%rsp), %ebp
- L_chacha_x64_partial_crypt_start:
- addl %r8d, %eax
- addl %r9d, %ebx
- xorl %eax, %r12d
- xorl %ebx, %r13d
- roll $16, %r12d
- roll $16, %r13d
- addl %r12d, %esi
- addl %r13d, %ebp
- xorl %esi, %r8d
- xorl %ebp, %r9d
- roll $12, %r8d
- roll $12, %r9d
- addl %r8d, %eax
- addl %r9d, %ebx
- xorl %eax, %r12d
- xorl %ebx, %r13d
- roll $8, %r12d
- roll $8, %r13d
- addl %r12d, %esi
- addl %r13d, %ebp
- xorl %esi, %r8d
- xorl %ebp, %r9d
- roll $7, %r8d
- roll $7, %r9d
- movl %esi, 8(%rsp)
- movl %ebp, 12(%rsp)
- movl 16(%rsp), %esi
- movl 20(%rsp), %ebp
- addl %r10d, %ecx
- addl %r11d, %edx
- xorl %ecx, %r14d
- xorl %edx, %r15d
- roll $16, %r14d
- roll $16, %r15d
- addl %r14d, %esi
- addl %r15d, %ebp
- xorl %esi, %r10d
- xorl %ebp, %r11d
- roll $12, %r10d
- roll $12, %r11d
- addl %r10d, %ecx
- addl %r11d, %edx
- xorl %ecx, %r14d
- xorl %edx, %r15d
- roll $8, %r14d
- roll $8, %r15d
- addl %r14d, %esi
- addl %r15d, %ebp
- xorl %esi, %r10d
- xorl %ebp, %r11d
- roll $7, %r10d
- roll $7, %r11d
- addl %r9d, %eax
- addl %r10d, %ebx
- xorl %eax, %r15d
- xorl %ebx, %r12d
- roll $16, %r15d
- roll $16, %r12d
- addl %r15d, %esi
- addl %r12d, %ebp
- xorl %esi, %r9d
- xorl %ebp, %r10d
- roll $12, %r9d
- roll $12, %r10d
- addl %r9d, %eax
- addl %r10d, %ebx
- xorl %eax, %r15d
- xorl %ebx, %r12d
- roll $8, %r15d
- roll $8, %r12d
- addl %r15d, %esi
- addl %r12d, %ebp
- xorl %esi, %r9d
- xorl %ebp, %r10d
- roll $7, %r9d
- roll $7, %r10d
- movl %esi, 16(%rsp)
- movl %ebp, 20(%rsp)
- movl 8(%rsp), %esi
- movl 12(%rsp), %ebp
- addl %r11d, %ecx
- addl %r8d, %edx
- xorl %ecx, %r13d
- xorl %edx, %r14d
- roll $16, %r13d
- roll $16, %r14d
- addl %r13d, %esi
- addl %r14d, %ebp
- xorl %esi, %r11d
- xorl %ebp, %r8d
- roll $12, %r11d
- roll $12, %r8d
- addl %r11d, %ecx
- addl %r8d, %edx
- xorl %ecx, %r13d
- xorl %edx, %r14d
- roll $8, %r13d
- roll $8, %r14d
- addl %r13d, %esi
- addl %r14d, %ebp
- xorl %esi, %r11d
- xorl %ebp, %r8d
- roll $7, %r11d
- roll $7, %r8d
- decb (%rsp)
- jnz L_chacha_x64_partial_crypt_start
- movl %esi, 8(%rsp)
- movl %ebp, 12(%rsp)
- movq 32(%rsp), %rsi
- addl (%rdi), %eax
- addl 4(%rdi), %ebx
- addl 8(%rdi), %ecx
- addl 12(%rdi), %edx
- addl 16(%rdi), %r8d
- addl 20(%rdi), %r9d
- addl 24(%rdi), %r10d
- addl 28(%rdi), %r11d
- addl 48(%rdi), %r12d
- addl 52(%rdi), %r13d
- addl 56(%rdi), %r14d
- addl 60(%rdi), %r15d
- leaq 80(%rdi), %rbp
- movl %eax, (%rbp)
- movl %ebx, 4(%rbp)
- movl %ecx, 8(%rbp)
- movl %edx, 12(%rbp)
- movl %r8d, 16(%rbp)
- movl %r9d, 20(%rbp)
- movl %r10d, 24(%rbp)
- movl %r11d, 28(%rbp)
- movl %r12d, 48(%rbp)
- movl %r13d, 52(%rbp)
- movl %r14d, 56(%rbp)
- movl %r15d, 60(%rbp)
- movl 8(%rsp), %eax
- movl 12(%rsp), %ebx
- movl 16(%rsp), %ecx
- movl 20(%rsp), %edx
- addl 32(%rdi), %eax
- addl 36(%rdi), %ebx
- addl 40(%rdi), %ecx
- addl 44(%rdi), %edx
- movl %eax, 32(%rbp)
- movl %ebx, 36(%rbp)
- movl %ecx, 40(%rbp)
- movl %edx, 44(%rbp)
- movq 24(%rsp), %rdx
- movq 40(%rsp), %rcx
- addl $0x01, 48(%rdi)
- addq $48, %rsp
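- /*
-  * Partial final block: the full 64-byte keystream block was stored in the
-  * context at 80(%rdi) above, so only the remaining %ecx bytes are XORed
-  * with the input here -- byte by byte for the count modulo 8, then eight
-  * bytes at a time.  The count of unused keystream bytes (64 - %ecx) is
-  * recorded at 76(%rdi), presumably so a later call can consume them.
-  */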
- movl %ecx, %r8d
- xorq %rbx, %rbx
- andl $7, %r8d
- jz L_chacha_x64_partial_start64
- L_chacha_x64_partial_start8:
- movzbl (%rbp,%rbx,1), %eax
- xorb (%rsi,%rbx,1), %al
- movb %al, (%rdx,%rbx,1)
- incl %ebx
- cmpl %r8d, %ebx
- jne L_chacha_x64_partial_start8
- je L_chacha_x64_partial_end64
- L_chacha_x64_partial_start64:
- movq (%rbp,%rbx,1), %rax
- xorq (%rsi,%rbx,1), %rax
- movq %rax, (%rdx,%rbx,1)
- addl $8, %ebx
- L_chacha_x64_partial_end64:
- cmpl %ecx, %ebx
- jne L_chacha_x64_partial_start64
- movl $0x40, %ecx
- subl %ebx, %ecx
- movl %ecx, 76(%rdi)
- L_chacha_x64_done:
- addq $0x40, %rsp
- popq %r15
- popq %r14
- popq %r13
- popq %r12
- popq %rbp
- popq %rbx
- repz retq
- #ifndef __APPLE__
- .size chacha_encrypt_x64,.-chacha_encrypt_x64
- #endif /* __APPLE__ */
- #ifdef HAVE_INTEL_AVX1
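- /*
-  * Constants for the AVX1 path: L_chacha20_avx1_rotl8 and _rotl16 are
-  * vpshufb byte-shuffle masks that rotate each 32-bit lane left by 8 and 16
-  * bits, L_chacha20_avx1_add holds the per-block counter offsets {0,1,2,3},
-  * and L_chacha20_avx1_four holds {4,4,4,4} for stepping the counters after
-  * four blocks.
-  */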
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 16
- #else
- .p2align 4
- #endif /* __APPLE__ */
- L_chacha20_avx1_rotl8:
- .quad 0x605040702010003, 0xe0d0c0f0a09080b
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 16
- #else
- .p2align 4
- #endif /* __APPLE__ */
- L_chacha20_avx1_rotl16:
- .quad 0x504070601000302, 0xd0c0f0e09080b0a
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 16
- #else
- .p2align 4
- #endif /* __APPLE__ */
- L_chacha20_avx1_add:
- .quad 0x100000000, 0x300000002
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 16
- #else
- .p2align 4
- #endif /* __APPLE__ */
- L_chacha20_avx1_four:
- .quad 0x400000004, 0x400000004
- #ifndef __APPLE__
- .text
- .globl chacha_encrypt_avx1
- .type chacha_encrypt_avx1,@function
- .align 16
- chacha_encrypt_avx1:
- #else
- .section __TEXT,__text
- .globl _chacha_encrypt_avx1
- .p2align 4
- _chacha_encrypt_avx1:
- #endif /* __APPLE__ */
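- /*
-  * Outline: while at least 256 bytes remain, four 64-byte blocks are
-  * processed in parallel, each xmm register holding one of the sixteen
-  * state words replicated across the four blocks (vpshufd $0x00 broadcast)
-  * with the four counters offset by L_chacha20_avx1_add.  The 0x190 bytes
-  * of stack give two 16-byte-aligned scratch areas: %r9 for the starting
-  * state, %r10 for the xmm11 spill and half of the keystream.  Remaining
-  * whole blocks use the single-block path at L_chacha20_avx1_block_start,
-  * and any tail the partial path below.
-  */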
- subq $0x190, %rsp
- movq %rsp, %r9
- leaq 256(%rsp), %r10
- andq $-16, %r9
- andq $-16, %r10
- movl %ecx, %eax
- shrl $8, %eax
- jz L_chacha20_avx1_end128
- vpshufd $0x00, (%rdi), %xmm0
- vpshufd $0x00, 4(%rdi), %xmm1
- vpshufd $0x00, 8(%rdi), %xmm2
- vpshufd $0x00, 12(%rdi), %xmm3
- vpshufd $0x00, 16(%rdi), %xmm4
- vpshufd $0x00, 20(%rdi), %xmm5
- vpshufd $0x00, 24(%rdi), %xmm6
- vpshufd $0x00, 28(%rdi), %xmm7
- vpshufd $0x00, 32(%rdi), %xmm8
- vpshufd $0x00, 36(%rdi), %xmm9
- vpshufd $0x00, 40(%rdi), %xmm10
- vpshufd $0x00, 44(%rdi), %xmm11
- vpshufd $0x00, 48(%rdi), %xmm12
- vpshufd $0x00, 52(%rdi), %xmm13
- vpshufd $0x00, 56(%rdi), %xmm14
- vpshufd $0x00, 60(%rdi), %xmm15
- vpaddd L_chacha20_avx1_add(%rip), %xmm12, %xmm12
- vmovdqa %xmm0, (%r9)
- vmovdqa %xmm1, 16(%r9)
- vmovdqa %xmm2, 32(%r9)
- vmovdqa %xmm3, 48(%r9)
- vmovdqa %xmm4, 64(%r9)
- vmovdqa %xmm5, 80(%r9)
- vmovdqa %xmm6, 96(%r9)
- vmovdqa %xmm7, 112(%r9)
- vmovdqa %xmm8, 128(%r9)
- vmovdqa %xmm9, 144(%r9)
- vmovdqa %xmm10, 160(%r9)
- vmovdqa %xmm11, 176(%r9)
- vmovdqa %xmm12, 192(%r9)
- vmovdqa %xmm13, 208(%r9)
- vmovdqa %xmm14, 224(%r9)
- vmovdqa %xmm15, 240(%r9)
- L_chacha20_avx1_start128:
- vmovdqa %xmm11, 48(%r10)
- movb $10, %r8b
- L_chacha20_avx1_loop128:
- vpaddd %xmm4, %xmm0, %xmm0
- vpxor %xmm0, %xmm12, %xmm12
- vmovdqa 48(%r10), %xmm11
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm12, %xmm12
- vpaddd %xmm12, %xmm8, %xmm8
- vpxor %xmm8, %xmm4, %xmm4
- vpaddd %xmm5, %xmm1, %xmm1
- vpxor %xmm1, %xmm13, %xmm13
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm13, %xmm13
- vpaddd %xmm13, %xmm9, %xmm9
- vpxor %xmm9, %xmm5, %xmm5
- vpaddd %xmm6, %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm14, %xmm14
- vpaddd %xmm14, %xmm10, %xmm10
- vpxor %xmm10, %xmm6, %xmm6
- vpaddd %xmm7, %xmm3, %xmm3
- vpxor %xmm3, %xmm15, %xmm15
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm15, %xmm15
- vpaddd %xmm15, %xmm11, %xmm11
- vpxor %xmm11, %xmm7, %xmm7
- vmovdqa %xmm11, 48(%r10)
- vpsrld $20, %xmm4, %xmm11
- vpslld $12, %xmm4, %xmm4
- vpxor %xmm11, %xmm4, %xmm4
- vpsrld $20, %xmm5, %xmm11
- vpslld $12, %xmm5, %xmm5
- vpxor %xmm11, %xmm5, %xmm5
- vpsrld $20, %xmm6, %xmm11
- vpslld $12, %xmm6, %xmm6
- vpxor %xmm11, %xmm6, %xmm6
- vpsrld $20, %xmm7, %xmm11
- vpslld $12, %xmm7, %xmm7
- vpxor %xmm11, %xmm7, %xmm7
- vpaddd %xmm4, %xmm0, %xmm0
- vpxor %xmm0, %xmm12, %xmm12
- vmovdqa 48(%r10), %xmm11
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm12, %xmm12
- vpaddd %xmm12, %xmm8, %xmm8
- vpxor %xmm8, %xmm4, %xmm4
- vpaddd %xmm5, %xmm1, %xmm1
- vpxor %xmm1, %xmm13, %xmm13
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm13, %xmm13
- vpaddd %xmm13, %xmm9, %xmm9
- vpxor %xmm9, %xmm5, %xmm5
- vpaddd %xmm6, %xmm2, %xmm2
- vpxor %xmm2, %xmm14, %xmm14
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm14, %xmm14
- vpaddd %xmm14, %xmm10, %xmm10
- vpxor %xmm10, %xmm6, %xmm6
- vpaddd %xmm7, %xmm3, %xmm3
- vpxor %xmm3, %xmm15, %xmm15
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm15, %xmm15
- vpaddd %xmm15, %xmm11, %xmm11
- vpxor %xmm11, %xmm7, %xmm7
- vmovdqa %xmm11, 48(%r10)
- vpsrld $25, %xmm4, %xmm11
- vpslld $7, %xmm4, %xmm4
- vpxor %xmm11, %xmm4, %xmm4
- vpsrld $25, %xmm5, %xmm11
- vpslld $7, %xmm5, %xmm5
- vpxor %xmm11, %xmm5, %xmm5
- vpsrld $25, %xmm6, %xmm11
- vpslld $7, %xmm6, %xmm6
- vpxor %xmm11, %xmm6, %xmm6
- vpsrld $25, %xmm7, %xmm11
- vpslld $7, %xmm7, %xmm7
- vpxor %xmm11, %xmm7, %xmm7
- vpaddd %xmm5, %xmm0, %xmm0
- vpxor %xmm0, %xmm15, %xmm15
- vmovdqa 48(%r10), %xmm11
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm15, %xmm15
- vpaddd %xmm15, %xmm10, %xmm10
- vpxor %xmm10, %xmm5, %xmm5
- vpaddd %xmm6, %xmm1, %xmm1
- vpxor %xmm1, %xmm12, %xmm12
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm12, %xmm12
- vpaddd %xmm12, %xmm11, %xmm11
- vpxor %xmm11, %xmm6, %xmm6
- vpaddd %xmm7, %xmm2, %xmm2
- vpxor %xmm2, %xmm13, %xmm13
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm13, %xmm13
- vpaddd %xmm13, %xmm8, %xmm8
- vpxor %xmm8, %xmm7, %xmm7
- vpaddd %xmm4, %xmm3, %xmm3
- vpxor %xmm3, %xmm14, %xmm14
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm14, %xmm14
- vpaddd %xmm14, %xmm9, %xmm9
- vpxor %xmm9, %xmm4, %xmm4
- vmovdqa %xmm11, 48(%r10)
- vpsrld $20, %xmm5, %xmm11
- vpslld $12, %xmm5, %xmm5
- vpxor %xmm11, %xmm5, %xmm5
- vpsrld $20, %xmm6, %xmm11
- vpslld $12, %xmm6, %xmm6
- vpxor %xmm11, %xmm6, %xmm6
- vpsrld $20, %xmm7, %xmm11
- vpslld $12, %xmm7, %xmm7
- vpxor %xmm11, %xmm7, %xmm7
- vpsrld $20, %xmm4, %xmm11
- vpslld $12, %xmm4, %xmm4
- vpxor %xmm11, %xmm4, %xmm4
- vpaddd %xmm5, %xmm0, %xmm0
- vpxor %xmm0, %xmm15, %xmm15
- vmovdqa 48(%r10), %xmm11
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm15, %xmm15
- vpaddd %xmm15, %xmm10, %xmm10
- vpxor %xmm10, %xmm5, %xmm5
- vpaddd %xmm6, %xmm1, %xmm1
- vpxor %xmm1, %xmm12, %xmm12
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm12, %xmm12
- vpaddd %xmm12, %xmm11, %xmm11
- vpxor %xmm11, %xmm6, %xmm6
- vpaddd %xmm7, %xmm2, %xmm2
- vpxor %xmm2, %xmm13, %xmm13
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm13, %xmm13
- vpaddd %xmm13, %xmm8, %xmm8
- vpxor %xmm8, %xmm7, %xmm7
- vpaddd %xmm4, %xmm3, %xmm3
- vpxor %xmm3, %xmm14, %xmm14
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm14, %xmm14
- vpaddd %xmm14, %xmm9, %xmm9
- vpxor %xmm9, %xmm4, %xmm4
- vmovdqa %xmm11, 48(%r10)
- vpsrld $25, %xmm5, %xmm11
- vpslld $7, %xmm5, %xmm5
- vpxor %xmm11, %xmm5, %xmm5
- vpsrld $25, %xmm6, %xmm11
- vpslld $7, %xmm6, %xmm6
- vpxor %xmm11, %xmm6, %xmm6
- vpsrld $25, %xmm7, %xmm11
- vpslld $7, %xmm7, %xmm7
- vpxor %xmm11, %xmm7, %xmm7
- vpsrld $25, %xmm4, %xmm11
- vpslld $7, %xmm4, %xmm4
- vpxor %xmm11, %xmm4, %xmm4
- decb %r8b
- jnz L_chacha20_avx1_loop128
- vmovdqa 48(%r10), %xmm11
- vpaddd (%r9), %xmm0, %xmm0
- vpaddd 16(%r9), %xmm1, %xmm1
- vpaddd 32(%r9), %xmm2, %xmm2
- vpaddd 48(%r9), %xmm3, %xmm3
- vpaddd 64(%r9), %xmm4, %xmm4
- vpaddd 80(%r9), %xmm5, %xmm5
- vpaddd 96(%r9), %xmm6, %xmm6
- vpaddd 112(%r9), %xmm7, %xmm7
- vpaddd 128(%r9), %xmm8, %xmm8
- vpaddd 144(%r9), %xmm9, %xmm9
- vpaddd 160(%r9), %xmm10, %xmm10
- vpaddd 176(%r9), %xmm11, %xmm11
- vpaddd 192(%r9), %xmm12, %xmm12
- vpaddd 208(%r9), %xmm13, %xmm13
- vpaddd 224(%r9), %xmm14, %xmm14
- vpaddd 240(%r9), %xmm15, %xmm15
- vmovdqa %xmm8, (%r10)
- vmovdqa %xmm9, 16(%r10)
- vmovdqa %xmm10, 32(%r10)
- vmovdqa %xmm11, 48(%r10)
- vmovdqa %xmm12, 64(%r10)
- vmovdqa %xmm13, 80(%r10)
- vmovdqa %xmm14, 96(%r10)
- vmovdqa %xmm15, 112(%r10)
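- /*
-  * At this point the words are still grouped one state word per register
-  * across the four blocks; the unpack/interleave sequence below transposes
-  * them back into contiguous 64-byte keystream blocks (words 0..7 here,
-  * then words 8..15 reloaded from (%r10)) before XORing with the input.
-  */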
- vpunpckldq %xmm1, %xmm0, %xmm8
- vpunpckldq %xmm3, %xmm2, %xmm9
- vpunpckhdq %xmm1, %xmm0, %xmm12
- vpunpckhdq %xmm3, %xmm2, %xmm13
- vpunpckldq %xmm5, %xmm4, %xmm10
- vpunpckldq %xmm7, %xmm6, %xmm11
- vpunpckhdq %xmm5, %xmm4, %xmm14
- vpunpckhdq %xmm7, %xmm6, %xmm15
- vpunpcklqdq %xmm9, %xmm8, %xmm0
- vpunpcklqdq %xmm11, %xmm10, %xmm1
- vpunpckhqdq %xmm9, %xmm8, %xmm2
- vpunpckhqdq %xmm11, %xmm10, %xmm3
- vpunpcklqdq %xmm13, %xmm12, %xmm4
- vpunpcklqdq %xmm15, %xmm14, %xmm5
- vpunpckhqdq %xmm13, %xmm12, %xmm6
- vpunpckhqdq %xmm15, %xmm14, %xmm7
- vmovdqu (%rsi), %xmm8
- vmovdqu 16(%rsi), %xmm9
- vmovdqu 64(%rsi), %xmm10
- vmovdqu 80(%rsi), %xmm11
- vmovdqu 128(%rsi), %xmm12
- vmovdqu 144(%rsi), %xmm13
- vmovdqu 192(%rsi), %xmm14
- vmovdqu 208(%rsi), %xmm15
- vpxor %xmm8, %xmm0, %xmm0
- vpxor %xmm9, %xmm1, %xmm1
- vpxor %xmm10, %xmm2, %xmm2
- vpxor %xmm11, %xmm3, %xmm3
- vpxor %xmm12, %xmm4, %xmm4
- vpxor %xmm13, %xmm5, %xmm5
- vpxor %xmm14, %xmm6, %xmm6
- vpxor %xmm15, %xmm7, %xmm7
- vmovdqu %xmm0, (%rdx)
- vmovdqu %xmm1, 16(%rdx)
- vmovdqu %xmm2, 64(%rdx)
- vmovdqu %xmm3, 80(%rdx)
- vmovdqu %xmm4, 128(%rdx)
- vmovdqu %xmm5, 144(%rdx)
- vmovdqu %xmm6, 192(%rdx)
- vmovdqu %xmm7, 208(%rdx)
- vmovdqa (%r10), %xmm0
- vmovdqa 16(%r10), %xmm1
- vmovdqa 32(%r10), %xmm2
- vmovdqa 48(%r10), %xmm3
- vmovdqa 64(%r10), %xmm4
- vmovdqa 80(%r10), %xmm5
- vmovdqa 96(%r10), %xmm6
- vmovdqa 112(%r10), %xmm7
- vpunpckldq %xmm1, %xmm0, %xmm8
- vpunpckldq %xmm3, %xmm2, %xmm9
- vpunpckhdq %xmm1, %xmm0, %xmm12
- vpunpckhdq %xmm3, %xmm2, %xmm13
- vpunpckldq %xmm5, %xmm4, %xmm10
- vpunpckldq %xmm7, %xmm6, %xmm11
- vpunpckhdq %xmm5, %xmm4, %xmm14
- vpunpckhdq %xmm7, %xmm6, %xmm15
- vpunpcklqdq %xmm9, %xmm8, %xmm0
- vpunpcklqdq %xmm11, %xmm10, %xmm1
- vpunpckhqdq %xmm9, %xmm8, %xmm2
- vpunpckhqdq %xmm11, %xmm10, %xmm3
- vpunpcklqdq %xmm13, %xmm12, %xmm4
- vpunpcklqdq %xmm15, %xmm14, %xmm5
- vpunpckhqdq %xmm13, %xmm12, %xmm6
- vpunpckhqdq %xmm15, %xmm14, %xmm7
- vmovdqu 32(%rsi), %xmm8
- vmovdqu 48(%rsi), %xmm9
- vmovdqu 96(%rsi), %xmm10
- vmovdqu 112(%rsi), %xmm11
- vmovdqu 160(%rsi), %xmm12
- vmovdqu 176(%rsi), %xmm13
- vmovdqu 224(%rsi), %xmm14
- vmovdqu 240(%rsi), %xmm15
- vpxor %xmm8, %xmm0, %xmm0
- vpxor %xmm9, %xmm1, %xmm1
- vpxor %xmm10, %xmm2, %xmm2
- vpxor %xmm11, %xmm3, %xmm3
- vpxor %xmm12, %xmm4, %xmm4
- vpxor %xmm13, %xmm5, %xmm5
- vpxor %xmm14, %xmm6, %xmm6
- vpxor %xmm15, %xmm7, %xmm7
- vmovdqu %xmm0, 32(%rdx)
- vmovdqu %xmm1, 48(%rdx)
- vmovdqu %xmm2, 96(%rdx)
- vmovdqu %xmm3, 112(%rdx)
- vmovdqu %xmm4, 160(%rdx)
- vmovdqu %xmm5, 176(%rdx)
- vmovdqu %xmm6, 224(%rdx)
- vmovdqu %xmm7, 240(%rdx)
- vmovdqa 192(%r9), %xmm12
- addq $0x100, %rsi
- addq $0x100, %rdx
- vpaddd L_chacha20_avx1_four(%rip), %xmm12, %xmm12
- subl $0x100, %ecx
- vmovdqa %xmm12, 192(%r9)
- cmpl $0x100, %ecx
- jl L_chacha20_avx1_done128
- vmovdqa (%r9), %xmm0
- vmovdqa 16(%r9), %xmm1
- vmovdqa 32(%r9), %xmm2
- vmovdqa 48(%r9), %xmm3
- vmovdqa 64(%r9), %xmm4
- vmovdqa 80(%r9), %xmm5
- vmovdqa 96(%r9), %xmm6
- vmovdqa 112(%r9), %xmm7
- vmovdqa 128(%r9), %xmm8
- vmovdqa 144(%r9), %xmm9
- vmovdqa 160(%r9), %xmm10
- vmovdqa 176(%r9), %xmm11
- vmovdqa 192(%r9), %xmm12
- vmovdqa 208(%r9), %xmm13
- vmovdqa 224(%r9), %xmm14
- vmovdqa 240(%r9), %xmm15
- jmp L_chacha20_avx1_start128
- L_chacha20_avx1_done128:
- shl $2, %eax
- addl %eax, 48(%rdi)
- L_chacha20_avx1_end128:
- cmpl $0x40, %ecx
- jl L_chacha20_avx1_block_done
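- /*
-  * Single-block path: one 64-byte block per iteration with the four state
-  * rows held in xmm0..xmm3.  The vpshufd $57/$0x4e/$0x93 shuffles rotate
-  * the rows between the column and diagonal rounds instead of addressing
-  * the diagonals directly.
-  */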
- L_chacha20_avx1_block_start:
- vmovdqu (%rdi), %xmm0
- vmovdqu 16(%rdi), %xmm1
- vmovdqu 32(%rdi), %xmm2
- vmovdqu 48(%rdi), %xmm3
- vmovdqa %xmm0, %xmm5
- vmovdqa %xmm1, %xmm6
- vmovdqa %xmm2, %xmm7
- vmovdqa %xmm3, %xmm8
- movb $10, %al
- L_chacha20_avx1_block_crypt_start:
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $20, %xmm1, %xmm4
- vpslld $12, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $25, %xmm1, %xmm4
- vpslld $7, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpshufd $57, %xmm1, %xmm1
- vpshufd $0x4e, %xmm2, %xmm2
- vpshufd $0x93, %xmm3, %xmm3
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $20, %xmm1, %xmm4
- vpslld $12, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $25, %xmm1, %xmm4
- vpslld $7, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpshufd $0x93, %xmm1, %xmm1
- vpshufd $0x4e, %xmm2, %xmm2
- vpshufd $57, %xmm3, %xmm3
- decb %al
- jnz L_chacha20_avx1_block_crypt_start
- vpaddd %xmm5, %xmm0, %xmm0
- vpaddd %xmm6, %xmm1, %xmm1
- vpaddd %xmm7, %xmm2, %xmm2
- vpaddd %xmm8, %xmm3, %xmm3
- vmovdqu (%rsi), %xmm5
- vmovdqu 16(%rsi), %xmm6
- vmovdqu 32(%rsi), %xmm7
- vmovdqu 48(%rsi), %xmm8
- vpxor %xmm5, %xmm0, %xmm0
- vpxor %xmm6, %xmm1, %xmm1
- vpxor %xmm7, %xmm2, %xmm2
- vpxor %xmm8, %xmm3, %xmm3
- vmovdqu %xmm0, (%rdx)
- vmovdqu %xmm1, 16(%rdx)
- vmovdqu %xmm2, 32(%rdx)
- vmovdqu %xmm3, 48(%rdx)
- addl $0x01, 48(%rdi)
- subl $0x40, %ecx
- addq $0x40, %rsi
- addq $0x40, %rdx
- cmpl $0x40, %ecx
- jge L_chacha20_avx1_block_start
- L_chacha20_avx1_block_done:
- cmpl $0x00, %ecx
- je L_chacha20_avx1_partial_done
- leaq 80(%rdi), %r10
- vmovdqu (%rdi), %xmm0
- vmovdqu 16(%rdi), %xmm1
- vmovdqu 32(%rdi), %xmm2
- vmovdqu 48(%rdi), %xmm3
- vmovdqa %xmm0, %xmm5
- vmovdqa %xmm1, %xmm6
- vmovdqa %xmm2, %xmm7
- vmovdqa %xmm3, %xmm8
- movb $10, %al
- L_chacha20_avx1_partial_crypt_start:
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $20, %xmm1, %xmm4
- vpslld $12, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $25, %xmm1, %xmm4
- vpslld $7, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpshufd $57, %xmm1, %xmm1
- vpshufd $0x4e, %xmm2, %xmm2
- vpshufd $0x93, %xmm3, %xmm3
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl16(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $20, %xmm1, %xmm4
- vpslld $12, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpaddd %xmm1, %xmm0, %xmm0
- vpxor %xmm0, %xmm3, %xmm3
- vpshufb L_chacha20_avx1_rotl8(%rip), %xmm3, %xmm3
- vpaddd %xmm3, %xmm2, %xmm2
- vpxor %xmm2, %xmm1, %xmm1
- vpsrld $25, %xmm1, %xmm4
- vpslld $7, %xmm1, %xmm1
- vpxor %xmm4, %xmm1, %xmm1
- vpshufd $0x93, %xmm1, %xmm1
- vpshufd $0x4e, %xmm2, %xmm2
- vpshufd $57, %xmm3, %xmm3
- decb %al
- jnz L_chacha20_avx1_partial_crypt_start
- vpaddd %xmm5, %xmm0, %xmm0
- vpaddd %xmm6, %xmm1, %xmm1
- vpaddd %xmm7, %xmm2, %xmm2
- vpaddd %xmm8, %xmm3, %xmm3
- vmovdqu %xmm0, (%r10)
- vmovdqu %xmm1, 16(%r10)
- vmovdqu %xmm2, 32(%r10)
- vmovdqu %xmm3, 48(%r10)
- addl $0x01, 48(%rdi)
- movl %ecx, %r8d
- xorq %r11, %r11
- andl $7, %r8d
- jz L_chacha20_avx1_partial_start64
- L_chacha20_avx1_partial_start8:
- movzbl (%r10,%r11,1), %eax
- xorb (%rsi,%r11,1), %al
- movb %al, (%rdx,%r11,1)
- incl %r11d
- cmpl %r8d, %r11d
- jne L_chacha20_avx1_partial_start8
- je L_chacha20_avx1_partial_end64
- L_chacha20_avx1_partial_start64:
- movq (%r10,%r11,1), %rax
- xorq (%rsi,%r11,1), %rax
- movq %rax, (%rdx,%r11,1)
- addl $8, %r11d
- L_chacha20_avx1_partial_end64:
- cmpl %ecx, %r11d
- jne L_chacha20_avx1_partial_start64
- movl $0x40, %r8d
- subl %r11d, %r8d
- movl %r8d, 76(%rdi)
- L_chacha20_avx1_partial_done:
- addq $0x190, %rsp
- repz retq
- #ifndef __APPLE__
- .size chacha_encrypt_avx1,.-chacha_encrypt_avx1
- #endif /* __APPLE__ */
- #endif /* HAVE_INTEL_AVX1 */
- #ifdef HAVE_INTEL_AVX2
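- /*
-  * AVX2 equivalents of the constants above: the rotate-by-8/16 vpshufb
-  * masks duplicated into both 128-bit lanes, the counter offsets {0..7} in
-  * L_chacha20_avx2_add, and {8,...,8} in L_chacha20_avx2_eight for stepping
-  * the counters after eight blocks.
-  */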
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 32
- #else
- .p2align 5
- #endif /* __APPLE__ */
- L_chacha20_avx2_rotl8:
- .quad 0x605040702010003, 0xe0d0c0f0a09080b
- .quad 0x605040702010003, 0xe0d0c0f0a09080b
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 32
- #else
- .p2align 5
- #endif /* __APPLE__ */
- L_chacha20_avx2_rotl16:
- .quad 0x504070601000302, 0xd0c0f0e09080b0a
- .quad 0x504070601000302, 0xd0c0f0e09080b0a
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 32
- #else
- .p2align 5
- #endif /* __APPLE__ */
- L_chacha20_avx2_add:
- .quad 0x100000000, 0x300000002
- .quad 0x500000004, 0x700000006
- #ifndef __APPLE__
- .data
- #else
- .section __DATA,__data
- #endif /* __APPLE__ */
- #ifndef __APPLE__
- .align 32
- #else
- .p2align 5
- #endif /* __APPLE__ */
- L_chacha20_avx2_eight:
- .quad 0x800000008, 0x800000008
- .quad 0x800000008, 0x800000008
- #ifndef __APPLE__
- .text
- .globl chacha_encrypt_avx2
- .type chacha_encrypt_avx2,@function
- .align 16
- chacha_encrypt_avx2:
- #else
- .section __TEXT,__text
- .globl _chacha_encrypt_avx2
- .p2align 4
- _chacha_encrypt_avx2:
- #endif /* __APPLE__ */
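- /*
-  * Outline: while at least 512 bytes remain, eight 64-byte blocks are
-  * processed in parallel, one state word per ymm register (vpbroadcastd)
-  * with the eight counters offset by L_chacha20_avx2_add; the 0x310-byte
-  * stack frame provides two 32-byte-aligned scratch areas.  Anything left
-  * over (less than 512 bytes) is handed to chacha_encrypt_avx1 via the call
-  * at L_chacha20_avx2_end256.
-  */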
- subq $0x310, %rsp
- movq %rsp, %r9
- leaq 512(%rsp), %r10
- andq $-32, %r9
- andq $-32, %r10
- movl %ecx, %eax
- shrl $9, %eax
- jz L_chacha20_avx2_end256
- vpbroadcastd (%rdi), %ymm0
- vpbroadcastd 4(%rdi), %ymm1
- vpbroadcastd 8(%rdi), %ymm2
- vpbroadcastd 12(%rdi), %ymm3
- vpbroadcastd 16(%rdi), %ymm4
- vpbroadcastd 20(%rdi), %ymm5
- vpbroadcastd 24(%rdi), %ymm6
- vpbroadcastd 28(%rdi), %ymm7
- vpbroadcastd 32(%rdi), %ymm8
- vpbroadcastd 36(%rdi), %ymm9
- vpbroadcastd 40(%rdi), %ymm10
- vpbroadcastd 44(%rdi), %ymm11
- vpbroadcastd 48(%rdi), %ymm12
- vpbroadcastd 52(%rdi), %ymm13
- vpbroadcastd 56(%rdi), %ymm14
- vpbroadcastd 60(%rdi), %ymm15
- vpaddd L_chacha20_avx2_add(%rip), %ymm12, %ymm12
- vmovdqa %ymm0, (%r9)
- vmovdqa %ymm1, 32(%r9)
- vmovdqa %ymm2, 64(%r9)
- vmovdqa %ymm3, 96(%r9)
- vmovdqa %ymm4, 128(%r9)
- vmovdqa %ymm5, 160(%r9)
- vmovdqa %ymm6, 192(%r9)
- vmovdqa %ymm7, 224(%r9)
- vmovdqa %ymm8, 256(%r9)
- vmovdqa %ymm9, 288(%r9)
- vmovdqa %ymm10, 320(%r9)
- vmovdqa %ymm11, 352(%r9)
- vmovdqa %ymm12, 384(%r9)
- vmovdqa %ymm13, 416(%r9)
- vmovdqa %ymm14, 448(%r9)
- vmovdqa %ymm15, 480(%r9)
- L_chacha20_avx2_start256:
- movb $10, %r8b
- vmovdqa %ymm11, 96(%r10)
- L_chacha20_avx2_loop256:
- vpaddd %ymm4, %ymm0, %ymm0
- vpxor %ymm0, %ymm12, %ymm12
- vmovdqa 96(%r10), %ymm11
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm12, %ymm12
- vpaddd %ymm12, %ymm8, %ymm8
- vpxor %ymm8, %ymm4, %ymm4
- vpaddd %ymm5, %ymm1, %ymm1
- vpxor %ymm1, %ymm13, %ymm13
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm13, %ymm13
- vpaddd %ymm13, %ymm9, %ymm9
- vpxor %ymm9, %ymm5, %ymm5
- vpaddd %ymm6, %ymm2, %ymm2
- vpxor %ymm2, %ymm14, %ymm14
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm14, %ymm14
- vpaddd %ymm14, %ymm10, %ymm10
- vpxor %ymm10, %ymm6, %ymm6
- vpaddd %ymm7, %ymm3, %ymm3
- vpxor %ymm3, %ymm15, %ymm15
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm15, %ymm15
- vpaddd %ymm15, %ymm11, %ymm11
- vpxor %ymm11, %ymm7, %ymm7
- vmovdqa %ymm11, 96(%r10)
- vpsrld $20, %ymm4, %ymm11
- vpslld $12, %ymm4, %ymm4
- vpxor %ymm11, %ymm4, %ymm4
- vpsrld $20, %ymm5, %ymm11
- vpslld $12, %ymm5, %ymm5
- vpxor %ymm11, %ymm5, %ymm5
- vpsrld $20, %ymm6, %ymm11
- vpslld $12, %ymm6, %ymm6
- vpxor %ymm11, %ymm6, %ymm6
- vpsrld $20, %ymm7, %ymm11
- vpslld $12, %ymm7, %ymm7
- vpxor %ymm11, %ymm7, %ymm7
- vpaddd %ymm4, %ymm0, %ymm0
- vpxor %ymm0, %ymm12, %ymm12
- vmovdqa 96(%r10), %ymm11
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm12, %ymm12
- vpaddd %ymm12, %ymm8, %ymm8
- vpxor %ymm8, %ymm4, %ymm4
- vpaddd %ymm5, %ymm1, %ymm1
- vpxor %ymm1, %ymm13, %ymm13
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm13, %ymm13
- vpaddd %ymm13, %ymm9, %ymm9
- vpxor %ymm9, %ymm5, %ymm5
- vpaddd %ymm6, %ymm2, %ymm2
- vpxor %ymm2, %ymm14, %ymm14
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm14, %ymm14
- vpaddd %ymm14, %ymm10, %ymm10
- vpxor %ymm10, %ymm6, %ymm6
- vpaddd %ymm7, %ymm3, %ymm3
- vpxor %ymm3, %ymm15, %ymm15
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm15, %ymm15
- vpaddd %ymm15, %ymm11, %ymm11
- vpxor %ymm11, %ymm7, %ymm7
- vmovdqa %ymm11, 96(%r10)
- vpsrld $25, %ymm4, %ymm11
- vpslld $7, %ymm4, %ymm4
- vpxor %ymm11, %ymm4, %ymm4
- vpsrld $25, %ymm5, %ymm11
- vpslld $7, %ymm5, %ymm5
- vpxor %ymm11, %ymm5, %ymm5
- vpsrld $25, %ymm6, %ymm11
- vpslld $7, %ymm6, %ymm6
- vpxor %ymm11, %ymm6, %ymm6
- vpsrld $25, %ymm7, %ymm11
- vpslld $7, %ymm7, %ymm7
- vpxor %ymm11, %ymm7, %ymm7
- vpaddd %ymm5, %ymm0, %ymm0
- vpxor %ymm0, %ymm15, %ymm15
- vmovdqa 96(%r10), %ymm11
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm15, %ymm15
- vpaddd %ymm15, %ymm10, %ymm10
- vpxor %ymm10, %ymm5, %ymm5
- vpaddd %ymm6, %ymm1, %ymm1
- vpxor %ymm1, %ymm12, %ymm12
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm12, %ymm12
- vpaddd %ymm12, %ymm11, %ymm11
- vpxor %ymm11, %ymm6, %ymm6
- vpaddd %ymm7, %ymm2, %ymm2
- vpxor %ymm2, %ymm13, %ymm13
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm13, %ymm13
- vpaddd %ymm13, %ymm8, %ymm8
- vpxor %ymm8, %ymm7, %ymm7
- vpaddd %ymm4, %ymm3, %ymm3
- vpxor %ymm3, %ymm14, %ymm14
- vpshufb L_chacha20_avx2_rotl16(%rip), %ymm14, %ymm14
- vpaddd %ymm14, %ymm9, %ymm9
- vpxor %ymm9, %ymm4, %ymm4
- vmovdqa %ymm11, 96(%r10)
- vpsrld $20, %ymm5, %ymm11
- vpslld $12, %ymm5, %ymm5
- vpxor %ymm11, %ymm5, %ymm5
- vpsrld $20, %ymm6, %ymm11
- vpslld $12, %ymm6, %ymm6
- vpxor %ymm11, %ymm6, %ymm6
- vpsrld $20, %ymm7, %ymm11
- vpslld $12, %ymm7, %ymm7
- vpxor %ymm11, %ymm7, %ymm7
- vpsrld $20, %ymm4, %ymm11
- vpslld $12, %ymm4, %ymm4
- vpxor %ymm11, %ymm4, %ymm4
- vpaddd %ymm5, %ymm0, %ymm0
- vpxor %ymm0, %ymm15, %ymm15
- vmovdqa 96(%r10), %ymm11
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm15, %ymm15
- vpaddd %ymm15, %ymm10, %ymm10
- vpxor %ymm10, %ymm5, %ymm5
- vpaddd %ymm6, %ymm1, %ymm1
- vpxor %ymm1, %ymm12, %ymm12
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm12, %ymm12
- vpaddd %ymm12, %ymm11, %ymm11
- vpxor %ymm11, %ymm6, %ymm6
- vpaddd %ymm7, %ymm2, %ymm2
- vpxor %ymm2, %ymm13, %ymm13
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm13, %ymm13
- vpaddd %ymm13, %ymm8, %ymm8
- vpxor %ymm8, %ymm7, %ymm7
- vpaddd %ymm4, %ymm3, %ymm3
- vpxor %ymm3, %ymm14, %ymm14
- vpshufb L_chacha20_avx2_rotl8(%rip), %ymm14, %ymm14
- vpaddd %ymm14, %ymm9, %ymm9
- vpxor %ymm9, %ymm4, %ymm4
- vmovdqa %ymm11, 96(%r10)
- vpsrld $25, %ymm5, %ymm11
- vpslld $7, %ymm5, %ymm5
- vpxor %ymm11, %ymm5, %ymm5
- vpsrld $25, %ymm6, %ymm11
- vpslld $7, %ymm6, %ymm6
- vpxor %ymm11, %ymm6, %ymm6
- vpsrld $25, %ymm7, %ymm11
- vpslld $7, %ymm7, %ymm7
- vpxor %ymm11, %ymm7, %ymm7
- vpsrld $25, %ymm4, %ymm11
- vpslld $7, %ymm4, %ymm4
- vpxor %ymm11, %ymm4, %ymm4
- decb %r8b
- jnz L_chacha20_avx2_loop256
- vmovdqa 96(%r10), %ymm11
- vpaddd (%r9), %ymm0, %ymm0
- vpaddd 32(%r9), %ymm1, %ymm1
- vpaddd 64(%r9), %ymm2, %ymm2
- vpaddd 96(%r9), %ymm3, %ymm3
- vpaddd 128(%r9), %ymm4, %ymm4
- vpaddd 160(%r9), %ymm5, %ymm5
- vpaddd 192(%r9), %ymm6, %ymm6
- vpaddd 224(%r9), %ymm7, %ymm7
- vpaddd 256(%r9), %ymm8, %ymm8
- vpaddd 288(%r9), %ymm9, %ymm9
- vpaddd 320(%r9), %ymm10, %ymm10
- vpaddd 352(%r9), %ymm11, %ymm11
- vpaddd 384(%r9), %ymm12, %ymm12
- vpaddd 416(%r9), %ymm13, %ymm13
- vpaddd 448(%r9), %ymm14, %ymm14
- vpaddd 480(%r9), %ymm15, %ymm15
- vmovdqa %ymm8, (%r10)
- vmovdqa %ymm9, 32(%r10)
- vmovdqa %ymm10, 64(%r10)
- vmovdqa %ymm11, 96(%r10)
- vmovdqa %ymm12, 128(%r10)
- vmovdqa %ymm13, 160(%r10)
- vmovdqa %ymm14, 192(%r10)
- vmovdqa %ymm15, 224(%r10)
- vpunpckldq %ymm1, %ymm0, %ymm8
- vpunpckldq %ymm3, %ymm2, %ymm9
- vpunpckhdq %ymm1, %ymm0, %ymm12
- vpunpckhdq %ymm3, %ymm2, %ymm13
- vpunpckldq %ymm5, %ymm4, %ymm10
- vpunpckldq %ymm7, %ymm6, %ymm11
- vpunpckhdq %ymm5, %ymm4, %ymm14
- vpunpckhdq %ymm7, %ymm6, %ymm15
- vpunpcklqdq %ymm9, %ymm8, %ymm0
- vpunpcklqdq %ymm11, %ymm10, %ymm1
- vpunpckhqdq %ymm9, %ymm8, %ymm2
- vpunpckhqdq %ymm11, %ymm10, %ymm3
- vpunpcklqdq %ymm13, %ymm12, %ymm4
- vpunpcklqdq %ymm15, %ymm14, %ymm5
- vpunpckhqdq %ymm13, %ymm12, %ymm6
- vpunpckhqdq %ymm15, %ymm14, %ymm7
- vperm2i128 $32, %ymm1, %ymm0, %ymm8
- vperm2i128 $32, %ymm3, %ymm2, %ymm9
- vperm2i128 $49, %ymm1, %ymm0, %ymm12
- vperm2i128 $49, %ymm3, %ymm2, %ymm13
- vperm2i128 $32, %ymm5, %ymm4, %ymm10
- vperm2i128 $32, %ymm7, %ymm6, %ymm11
- vperm2i128 $49, %ymm5, %ymm4, %ymm14
- vperm2i128 $49, %ymm7, %ymm6, %ymm15
- vmovdqu (%rsi), %ymm0
- vmovdqu 64(%rsi), %ymm1
- vmovdqu 128(%rsi), %ymm2
- vmovdqu 192(%rsi), %ymm3
- vmovdqu 256(%rsi), %ymm4
- vmovdqu 320(%rsi), %ymm5
- vmovdqu 384(%rsi), %ymm6
- vmovdqu 448(%rsi), %ymm7
- vpxor %ymm0, %ymm8, %ymm8
- vpxor %ymm1, %ymm9, %ymm9
- vpxor %ymm2, %ymm10, %ymm10
- vpxor %ymm3, %ymm11, %ymm11
- vpxor %ymm4, %ymm12, %ymm12
- vpxor %ymm5, %ymm13, %ymm13
- vpxor %ymm6, %ymm14, %ymm14
- vpxor %ymm7, %ymm15, %ymm15
- vmovdqu %ymm8, (%rdx)
- vmovdqu %ymm9, 64(%rdx)
- vmovdqu %ymm10, 128(%rdx)
- vmovdqu %ymm11, 192(%rdx)
- vmovdqu %ymm12, 256(%rdx)
- vmovdqu %ymm13, 320(%rdx)
- vmovdqu %ymm14, 384(%rdx)
- vmovdqu %ymm15, 448(%rdx)
- vmovdqa (%r10), %ymm0
- vmovdqa 32(%r10), %ymm1
- vmovdqa 64(%r10), %ymm2
- vmovdqa 96(%r10), %ymm3
- vmovdqa 128(%r10), %ymm4
- vmovdqa 160(%r10), %ymm5
- vmovdqa 192(%r10), %ymm6
- vmovdqa 224(%r10), %ymm7
- vpunpckldq %ymm1, %ymm0, %ymm8
- vpunpckldq %ymm3, %ymm2, %ymm9
- vpunpckhdq %ymm1, %ymm0, %ymm12
- vpunpckhdq %ymm3, %ymm2, %ymm13
- vpunpckldq %ymm5, %ymm4, %ymm10
- vpunpckldq %ymm7, %ymm6, %ymm11
- vpunpckhdq %ymm5, %ymm4, %ymm14
- vpunpckhdq %ymm7, %ymm6, %ymm15
- vpunpcklqdq %ymm9, %ymm8, %ymm0
- vpunpcklqdq %ymm11, %ymm10, %ymm1
- vpunpckhqdq %ymm9, %ymm8, %ymm2
- vpunpckhqdq %ymm11, %ymm10, %ymm3
- vpunpcklqdq %ymm13, %ymm12, %ymm4
- vpunpcklqdq %ymm15, %ymm14, %ymm5
- vpunpckhqdq %ymm13, %ymm12, %ymm6
- vpunpckhqdq %ymm15, %ymm14, %ymm7
- vperm2i128 $32, %ymm1, %ymm0, %ymm8
- vperm2i128 $32, %ymm3, %ymm2, %ymm9
- vperm2i128 $49, %ymm1, %ymm0, %ymm12
- vperm2i128 $49, %ymm3, %ymm2, %ymm13
- vperm2i128 $32, %ymm5, %ymm4, %ymm10
- vperm2i128 $32, %ymm7, %ymm6, %ymm11
- vperm2i128 $49, %ymm5, %ymm4, %ymm14
- vperm2i128 $49, %ymm7, %ymm6, %ymm15
- vmovdqu 32(%rsi), %ymm0
- vmovdqu 96(%rsi), %ymm1
- vmovdqu 160(%rsi), %ymm2
- vmovdqu 224(%rsi), %ymm3
- vmovdqu 288(%rsi), %ymm4
- vmovdqu 352(%rsi), %ymm5
- vmovdqu 416(%rsi), %ymm6
- vmovdqu 480(%rsi), %ymm7
- vpxor %ymm0, %ymm8, %ymm8
- vpxor %ymm1, %ymm9, %ymm9
- vpxor %ymm2, %ymm10, %ymm10
- vpxor %ymm3, %ymm11, %ymm11
- vpxor %ymm4, %ymm12, %ymm12
- vpxor %ymm5, %ymm13, %ymm13
- vpxor %ymm6, %ymm14, %ymm14
- vpxor %ymm7, %ymm15, %ymm15
- vmovdqu %ymm8, 32(%rdx)
- vmovdqu %ymm9, 96(%rdx)
- vmovdqu %ymm10, 160(%rdx)
- vmovdqu %ymm11, 224(%rdx)
- vmovdqu %ymm12, 288(%rdx)
- vmovdqu %ymm13, 352(%rdx)
- vmovdqu %ymm14, 416(%rdx)
- vmovdqu %ymm15, 480(%rdx)
- vmovdqa 384(%r9), %ymm12
- addq $0x200, %rsi
- addq $0x200, %rdx
- vpaddd L_chacha20_avx2_eight(%rip), %ymm12, %ymm12
- subl $0x200, %ecx
- vmovdqa %ymm12, 384(%r9)
- cmpl $0x200, %ecx
- jl L_chacha20_avx2_done256
- vmovdqa (%r9), %ymm0
- vmovdqa 32(%r9), %ymm1
- vmovdqa 64(%r9), %ymm2
- vmovdqa 96(%r9), %ymm3
- vmovdqa 128(%r9), %ymm4
- vmovdqa 160(%r9), %ymm5
- vmovdqa 192(%r9), %ymm6
- vmovdqa 224(%r9), %ymm7
- vmovdqa 256(%r9), %ymm8
- vmovdqa 288(%r9), %ymm9
- vmovdqa 320(%r9), %ymm10
- vmovdqa 352(%r9), %ymm11
- vmovdqa 384(%r9), %ymm12
- vmovdqa 416(%r9), %ymm13
- vmovdqa 448(%r9), %ymm14
- vmovdqa 480(%r9), %ymm15
- jmp L_chacha20_avx2_start256
- L_chacha20_avx2_done256:
- shl $3, %eax
- addl %eax, 48(%rdi)
- L_chacha20_avx2_end256:
- #ifndef __APPLE__
- callq chacha_encrypt_avx1@plt
- #else
- callq _chacha_encrypt_avx1
- #endif /* __APPLE__ */
- addq $0x310, %rsp
- repz retq
- #ifndef __APPLE__
- .size chacha_encrypt_avx2,.-chacha_encrypt_avx2
- #endif /* __APPLE__ */
- #endif /* HAVE_INTEL_AVX2 */
- #endif /* WOLFSSL_X86_64_BUILD */
- #if defined(__linux__) && defined(__ELF__)
- .section .note.GNU-stack,"",%progbits
- #endif