
Merge pull request #7319 from SparkiDev/chacha_poly1305_asm_msvc

ChaCha20, Poly1305 ASM for MSVC
Daniel Pouzzner committed 2 weeks ago
commit 5a784c818d

+ 3 - 0
IDE/WIN/user_settings.h

@@ -74,6 +74,9 @@
                 #if 0
                     #define HAVE_INTEL_AVX2
                 #endif
+
+                #define USE_INTEL_CHACHA_SPEEDUP
+                #define USE_INTEL_POLY1305_SPEEDUP
             #endif
 
             /* Single Precision Support for RSA/DH 1024/2048/3072 and
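
The two macros added above are the MSVC-side switches for the code introduced by this PR: with USE_INTEL_CHACHA_SPEEDUP and USE_INTEL_POLY1305_SPEEDUP defined, chacha.c and poly1305.c build their assembly-backed paths (the new chacha_asm.asm below; the PR title indicates a matching Poly1305 file as well) and otherwise keep the portable C rounds. A compilable toy, not wolfSSL code, just to show that the macros act purely at preprocessing time:

    /* Illustrative only: which implementation a build ends up with. */
    #include <stdio.h>

    int main(void)
    {
    #if defined(USE_INTEL_CHACHA_SPEEDUP) && defined(USE_INTEL_POLY1305_SPEEDUP)
        puts("ChaCha20/Poly1305: x86_64 assembly backends compiled in");
    #else
        puts("ChaCha20/Poly1305: portable C implementations");
    #endif
        return 0;
    }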

+ 49 - 19
wolfcrypt/benchmark/benchmark.c

@@ -1971,6 +1971,7 @@ static int    numBlocks  = NUM_BLOCKS;
 static word32 bench_size = BENCH_SIZE;
 static int base2 = 1;
 static int digest_stream = 1;
+static int encrypt_only = 0;
 
 #ifdef MULTI_VALUE_STATISTICS
 static int minimum_runs = 0;
@@ -5820,27 +5821,54 @@ void bench_chacha(void)
     XMEMSET(enc, 0, sizeof(ChaCha));
     wc_Chacha_SetKey(enc, bench_key, 16);
 
-    bench_stats_start(&count, &start);
-    do {
-        for (i = 0; i < numBlocks; i++) {
-            ret = wc_Chacha_SetIV(enc, bench_iv, 0);
-            if (ret < 0) {
-                printf("wc_Chacha_SetIV error: %d\n", ret);
-                goto exit;
+    if (encrypt_only) {
+        ret = wc_Chacha_SetIV(enc, bench_iv, 0);
+        if (ret < 0) {
+            printf("wc_Chacha_SetIV error: %d\n", ret);
+            goto exit;
+        }
+        bench_stats_start(&count, &start);
+        do {
+            for (i = 0; i < numBlocks; i++) {
+                ret = wc_Chacha_Process(enc, bench_cipher, bench_plain,
+                    bench_size);
+                if (ret < 0) {
+                    printf("wc_Chacha_Process error: %d\n", ret);
+                    goto exit;
+                }
+                RECORD_MULTI_VALUE_STATS();
             }
-            ret = wc_Chacha_Process(enc, bench_cipher, bench_plain, bench_size);
-            if (ret < 0) {
-                printf("wc_Chacha_Process error: %d\n", ret);
-                goto exit;
+            count += i;
+        } while (bench_stats_check(start)
+    #ifdef MULTI_VALUE_STATISTICS
+            || runs < minimum_runs
+    #endif
+            );
+    }
+    else {
+        bench_stats_start(&count, &start);
+        do {
+            for (i = 0; i < numBlocks; i++) {
+                ret = wc_Chacha_SetIV(enc, bench_iv, 0);
+                if (ret < 0) {
+                    printf("wc_Chacha_SetIV error: %d\n", ret);
+                    goto exit;
+                }
+                ret = wc_Chacha_Process(enc, bench_cipher, bench_plain,
+                    bench_size);
+                if (ret < 0) {
+                    printf("wc_Chacha_Process error: %d\n", ret);
+                    goto exit;
+                }
+                RECORD_MULTI_VALUE_STATS();
             }
-            RECORD_MULTI_VALUE_STATS();
-        }
-        count += i;
-    } while (bench_stats_check(start)
-#ifdef MULTI_VALUE_STATISTICS
-        || runs < minimum_runs
-#endif
-        );
+            count += i;
+        } while (bench_stats_check(start)
+    #ifdef MULTI_VALUE_STATISTICS
+            || runs < minimum_runs
+    #endif
+            );
+    }
 
     bench_stats_sym_finish("CHACHA", 0, count, bench_size, start, 0);
 #ifdef MULTI_VALUE_STATISTICS
@@ -13470,6 +13498,8 @@ int wolfcrypt_benchmark_main(int argc, char** argv)
 #endif
         else if (string_matches(argv[1], "-dgst_full"))
             digest_stream = 0;
+        else if (string_matches(argv[1], "-enc_only"))
+            encrypt_only = 1;
 #ifndef NO_RSA
         else if (string_matches(argv[1], "-rsa_sign"))
             rsa_sign_verify = 1;

+ 181 - 181
wolfcrypt/src/aes_gcm_asm.asm

File diff suppressed because it is too large


+ 9 - 8
wolfcrypt/src/asn.c

@@ -3496,7 +3496,7 @@ int CheckBitString(const byte* input, word32* inOutIdx, int* len,
 #else
     ASNGetData dataASN[bitStringASN_Length];
     int ret;
-    int bits;
+    int bits = 0;
 
     /* Parse BIT_STRING and check validity of unused bits. */
     XMEMSET(dataASN, 0, sizeof(dataASN));
@@ -7227,7 +7227,7 @@ int wc_CreatePKCS8Key(byte* out, word32* outSz, byte* key, word32 keySz,
     return (int)(tmpSz + sz);
 #else
     DECL_ASNSETDATA(dataASN, pkcs8KeyASN_Length);
-    int sz;
+    int sz = 0;
     int ret = 0;
     word32 keyIdx = 0;
     word32 tmpAlgId = 0;
@@ -8903,7 +8903,7 @@ exit_dc:
     DECL_ASNGETDATA(dataASN, pbes2ParamsASN_Length);
     int    ret = 0;
     int    id = 0;
-    int    version;
+    int    version = 0;
     word32 idx = 0;
     word32 pIdx = 0;
     word32 iterations = 0;
@@ -14430,7 +14430,7 @@ static int GetCertName(DecodedCert* cert, char* full, byte* hash, int nameType,
     DECL_ASNGETDATA(dataASN, rdnASN_Length);
     int    ret = 0;
     word32 idx = 0;
-    int    len;
+    int    len = 0;
     word32 srcIdx = *inOutIdx;
 #ifdef WOLFSSL_X509_NAME_AVAILABLE
     WOLFSSL_X509_NAME* dName = NULL;
@@ -16139,7 +16139,7 @@ word32 wc_EncodeSignature(byte* out, const byte* digest, word32 digSz,
 #else
     DECL_ASNSETDATA(dataASN, digestInfoASN_Length);
     int ret = 0;
-    int sz;
+    int sz = 0;
     unsigned char dgst[WC_MAX_DIGEST_SIZE];
 
     CALLOC_ASNSETDATA(dataASN, digestInfoASN_Length, ret, NULL);
@@ -21727,9 +21727,9 @@ static int DecodeCertInternal(DecodedCert* cert, int verify, int* criticalExt,
     DECL_ASNGETDATA(dataASN, x509CertASN_Length);
     int ret = 0;
     int badDate = 0;
-    byte version;
+    byte version = 0;
     word32 idx;
-    word32 serialSz;
+    word32 serialSz = 0;
     const unsigned char* issuer = NULL;
     word32 issuerSz = 0;
     const unsigned char* subject = NULL;
@@ -34365,7 +34365,8 @@ int wc_BuildEccKeyDer(ecc_key* key, byte* output, word32 *inLen,
     return (int)totalSz;
 #else
     DECL_ASNSETDATA(dataASN, eccKeyASN_Length);
-    word32 privSz, pubSz;
+    word32 privSz = 0;
+    word32 pubSz = 0;
     int sz = 0;
     int ret = 0;
     int curveIdSz = 0;
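
The asn.c hunks are all one pattern: locals that the ASN.1 template code only writes on its success path gain a zero initializer, presumably to quiet "potentially uninitialized local variable" warnings in the MSVC builds this PR targets; behavior is unchanged because the values are only read after the call succeeded. A compilable toy of the same shape (none of these names are wolfSSL's):

    #include <stdio.h>

    /* Assigns *out only on success, like the ASN.1 template getters. */
    static int toy_parse(int ok, int* out)
    {
        if (!ok)
            return -1;
        *out = 7;
        return 0;
    }

    int main(void)
    {
        int bits = 0;                  /* the added initializer */
        int ret  = toy_parse(1, &bits);
        if (ret == 0)                  /* 'bits' is read only on this path */
            printf("bits = %d\n", bits);
        return 0;
    }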

+ 4 - 0
wolfcrypt/src/chacha.c

@@ -72,6 +72,10 @@ Public domain.
     #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
         #undef NO_AVX2_SUPPORT
     #endif
+    #if defined(_MSC_VER) && (_MSC_VER <= 1900)
+        #undef  NO_AVX2_SUPPORT
+        #define NO_AVX2_SUPPORT
+    #endif
 
     #ifndef NO_AVX2_SUPPORT
         #define HAVE_INTEL_AVX2
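
_MSC_VER 1900 is Visual Studio 2015, so the added guard forces the AVX2 ChaCha20 path off for VS 2015 and older toolchains; it parallels the "IF @Version LT 1200" check at the top of the new chacha_asm.asm, which sets NO_AVX2_SUPPORT / NO_MOVBE_SUPPORT for assemblers too old to know those instructions. A toy check of what the cutoff means for a given compiler, not wolfSSL code:

    #include <stdio.h>

    int main(void)
    {
    #if defined(_MSC_VER) && (_MSC_VER <= 1900)
        printf("_MSC_VER=%d: VS 2015 or older, AVX2 path compiled out\n", _MSC_VER);
    #elif defined(_MSC_VER)
        printf("_MSC_VER=%d: AVX2 path eligible\n", _MSC_VER);
    #else
        puts("not building with MSVC");
    #endif
        return 0;
    }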

+ 2 - 2
wolfcrypt/src/chacha_asm.S

@@ -868,7 +868,7 @@ L_chacha20_avx1_loop128:
         vmovdqa	240(%r9), %xmm15
         jmp	L_chacha20_avx1_start128
 L_chacha20_avx1_done128:
-        shl	$2, %eax
+        shll	$2, %eax
         addl	%eax, 48(%rdi)
 L_chacha20_avx1_end128:
         cmpl	$0x40, %ecx
@@ -1456,7 +1456,7 @@ L_chacha20_avx2_loop256:
         vmovdqa	480(%r9), %ymm15
         jmp	L_chacha20_avx2_start256
 L_chacha20_avx2_done256:
-        shl	$3, %eax
+        shll	$3, %eax
         addl	%eax, 48(%rdi)
 L_chacha20_avx2_end256:
 #ifndef __APPLE__

+ 1426 - 0
wolfcrypt/src/chacha_asm.asm

@@ -0,0 +1,1426 @@
+; /* chacha_asm.asm */
+; /*
+;  * Copyright (C) 2006-2024 wolfSSL Inc.
+;  *
+;  * This file is part of wolfSSL.
+;  *
+;  * wolfSSL is free software; you can redistribute it and/or modify
+;  * it under the terms of the GNU General Public License as published by
+;  * the Free Software Foundation; either version 2 of the License, or
+;  * (at your option) any later version.
+;  *
+;  * wolfSSL is distributed in the hope that it will be useful,
+;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  * GNU General Public License for more details.
+;  *
+;  * You should have received a copy of the GNU General Public License
+;  * along with this program; if not, write to the Free Software
+;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+;  */
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+_text SEGMENT READONLY PARA
+chacha_encrypt_x64 PROC
+        push	rbx
+        push	rbp
+        push	r12
+        push	r13
+        push	r14
+        push	r15
+        sub	rsp, 64
+        cmp	r9d, 64
+        jl	L_chacha_x64_small
+L_chacha_x64_start:
+        sub	rsp, 48
+        mov	QWORD PTR [rsp+24], r8
+        mov	QWORD PTR [rsp+32], rdx
+        mov	QWORD PTR [rsp+40], r9
+        mov	rax, QWORD PTR [rcx+32]
+        mov	rbx, QWORD PTR [rcx+40]
+        mov	QWORD PTR [rsp+8], rax
+        mov	QWORD PTR [rsp+16], rbx
+        mov	eax, DWORD PTR [rcx]
+        mov	ebx, DWORD PTR [rcx+4]
+        mov	r9d, DWORD PTR [rcx+8]
+        mov	r8d, DWORD PTR [rcx+12]
+        mov	r8d, DWORD PTR [rcx+16]
+        mov	r9d, DWORD PTR [rcx+20]
+        mov	r10d, DWORD PTR [rcx+24]
+        mov	r11d, DWORD PTR [rcx+28]
+        mov	r12d, DWORD PTR [rcx+48]
+        mov	r13d, DWORD PTR [rcx+52]
+        mov	r14d, DWORD PTR [rcx+56]
+        mov	r15d, DWORD PTR [rcx+60]
+        mov	BYTE PTR [rsp], 10
+        mov	edx, DWORD PTR [rsp+8]
+        mov	ebp, DWORD PTR [rsp+12]
+L_chacha_x64_block_crypt_start:
+        add	eax, r8d
+        add	ebx, r9d
+        xor	r12d, eax
+        xor	r13d, ebx
+        rol	r12d, 16
+        rol	r13d, 16
+        add	edx, r12d
+        add	ebp, r13d
+        xor	r8d, edx
+        xor	r9d, ebp
+        rol	r8d, 12
+        rol	r9d, 12
+        add	eax, r8d
+        add	ebx, r9d
+        xor	r12d, eax
+        xor	r13d, ebx
+        rol	r12d, 8
+        rol	r13d, 8
+        add	edx, r12d
+        add	ebp, r13d
+        xor	r8d, edx
+        xor	r9d, ebp
+        rol	r8d, 7
+        rol	r9d, 7
+        mov	DWORD PTR [rsp+8], edx
+        mov	DWORD PTR [rsp+12], ebp
+        mov	edx, DWORD PTR [rsp+16]
+        mov	ebp, DWORD PTR [rsp+20]
+        add	r9d, r10d
+        add	r8d, r11d
+        xor	r14d, r9d
+        xor	r15d, r8d
+        rol	r14d, 16
+        rol	r15d, 16
+        add	edx, r14d
+        add	ebp, r15d
+        xor	r10d, edx
+        xor	r11d, ebp
+        rol	r10d, 12
+        rol	r11d, 12
+        add	r9d, r10d
+        add	r8d, r11d
+        xor	r14d, r9d
+        xor	r15d, r8d
+        rol	r14d, 8
+        rol	r15d, 8
+        add	edx, r14d
+        add	ebp, r15d
+        xor	r10d, edx
+        xor	r11d, ebp
+        rol	r10d, 7
+        rol	r11d, 7
+        add	eax, r9d
+        add	ebx, r10d
+        xor	r15d, eax
+        xor	r12d, ebx
+        rol	r15d, 16
+        rol	r12d, 16
+        add	edx, r15d
+        add	ebp, r12d
+        xor	r9d, edx
+        xor	r10d, ebp
+        rol	r9d, 12
+        rol	r10d, 12
+        add	eax, r9d
+        add	ebx, r10d
+        xor	r15d, eax
+        xor	r12d, ebx
+        rol	r15d, 8
+        rol	r12d, 8
+        add	edx, r15d
+        add	ebp, r12d
+        xor	r9d, edx
+        xor	r10d, ebp
+        rol	r9d, 7
+        rol	r10d, 7
+        mov	DWORD PTR [rsp+16], edx
+        mov	DWORD PTR [rsp+20], ebp
+        mov	edx, DWORD PTR [rsp+8]
+        mov	ebp, DWORD PTR [rsp+12]
+        add	r9d, r11d
+        add	r8d, r8d
+        xor	r13d, r9d
+        xor	r14d, r8d
+        rol	r13d, 16
+        rol	r14d, 16
+        add	edx, r13d
+        add	ebp, r14d
+        xor	r11d, edx
+        xor	r8d, ebp
+        rol	r11d, 12
+        rol	r8d, 12
+        add	r9d, r11d
+        add	r8d, r8d
+        xor	r13d, r9d
+        xor	r14d, r8d
+        rol	r13d, 8
+        rol	r14d, 8
+        add	edx, r13d
+        add	ebp, r14d
+        xor	r11d, edx
+        xor	r8d, ebp
+        rol	r11d, 7
+        rol	r8d, 7
+        dec	BYTE PTR [rsp]
+        jnz	L_chacha_x64_block_crypt_start
+        mov	DWORD PTR [rsp+8], edx
+        mov	DWORD PTR [rsp+12], ebp
+        mov	rdx, QWORD PTR [rsp+32]
+        mov	rbp, QWORD PTR [rsp+24]
+        add	eax, DWORD PTR [rcx]
+        add	ebx, DWORD PTR [rcx+4]
+        add	r9d, DWORD PTR [rcx+8]
+        add	r8d, DWORD PTR [rcx+12]
+        add	r8d, DWORD PTR [rcx+16]
+        add	r9d, DWORD PTR [rcx+20]
+        add	r10d, DWORD PTR [rcx+24]
+        add	r11d, DWORD PTR [rcx+28]
+        add	r12d, DWORD PTR [rcx+48]
+        add	r13d, DWORD PTR [rcx+52]
+        add	r14d, DWORD PTR [rcx+56]
+        add	r15d, DWORD PTR [rcx+60]
+        xor	eax, DWORD PTR [rdx]
+        xor	ebx, DWORD PTR [rdx+4]
+        xor	r9d, DWORD PTR [rdx+8]
+        xor	r8d, DWORD PTR [rdx+12]
+        xor	r8d, DWORD PTR [rdx+16]
+        xor	r9d, DWORD PTR [rdx+20]
+        xor	r10d, DWORD PTR [rdx+24]
+        xor	r11d, DWORD PTR [rdx+28]
+        xor	r12d, DWORD PTR [rdx+48]
+        xor	r13d, DWORD PTR [rdx+52]
+        xor	r14d, DWORD PTR [rdx+56]
+        xor	r15d, DWORD PTR [rdx+60]
+        mov	DWORD PTR [rbp], eax
+        mov	DWORD PTR [rbp+4], ebx
+        mov	DWORD PTR [rbp+8], r9d
+        mov	DWORD PTR [rbp+12], r8d
+        mov	DWORD PTR [rbp+16], r8d
+        mov	DWORD PTR [rbp+20], r9d
+        mov	DWORD PTR [rbp+24], r10d
+        mov	DWORD PTR [rbp+28], r11d
+        mov	DWORD PTR [rbp+48], r12d
+        mov	DWORD PTR [rbp+52], r13d
+        mov	DWORD PTR [rbp+56], r14d
+        mov	DWORD PTR [rbp+60], r15d
+        mov	eax, DWORD PTR [rsp+8]
+        mov	ebx, DWORD PTR [rsp+12]
+        mov	r9d, DWORD PTR [rsp+16]
+        mov	r8d, DWORD PTR [rsp+20]
+        add	eax, DWORD PTR [rcx+32]
+        add	ebx, DWORD PTR [rcx+36]
+        add	r9d, DWORD PTR [rcx+40]
+        add	r8d, DWORD PTR [rcx+44]
+        xor	eax, DWORD PTR [rdx+32]
+        xor	ebx, DWORD PTR [rdx+36]
+        xor	r9d, DWORD PTR [rdx+40]
+        xor	r8d, DWORD PTR [rdx+44]
+        mov	DWORD PTR [rbp+32], eax
+        mov	DWORD PTR [rbp+36], ebx
+        mov	DWORD PTR [rbp+40], r9d
+        mov	DWORD PTR [rbp+44], r8d
+        mov	r8, QWORD PTR [rsp+24]
+        mov	r9, QWORD PTR [rsp+40]
+        add	DWORD PTR [rcx+48], 1
+        add	rsp, 48
+        sub	r9d, 64
+        add	rdx, 64
+        add	r8, 64
+        cmp	r9d, 64
+        jge	L_chacha_x64_start
+L_chacha_x64_small:
+        cmp	r9d, 0
+        je	L_chacha_x64_done
+        sub	rsp, 48
+        mov	QWORD PTR [rsp+24], r8
+        mov	QWORD PTR [rsp+32], rdx
+        mov	QWORD PTR [rsp+40], r9
+        mov	rax, QWORD PTR [rcx+32]
+        mov	rbx, QWORD PTR [rcx+40]
+        mov	QWORD PTR [rsp+8], rax
+        mov	QWORD PTR [rsp+16], rbx
+        mov	eax, DWORD PTR [rcx]
+        mov	ebx, DWORD PTR [rcx+4]
+        mov	r9d, DWORD PTR [rcx+8]
+        mov	r8d, DWORD PTR [rcx+12]
+        mov	r8d, DWORD PTR [rcx+16]
+        mov	r9d, DWORD PTR [rcx+20]
+        mov	r10d, DWORD PTR [rcx+24]
+        mov	r11d, DWORD PTR [rcx+28]
+        mov	r12d, DWORD PTR [rcx+48]
+        mov	r13d, DWORD PTR [rcx+52]
+        mov	r14d, DWORD PTR [rcx+56]
+        mov	r15d, DWORD PTR [rcx+60]
+        mov	BYTE PTR [rsp], 10
+        mov	edx, DWORD PTR [rsp+8]
+        mov	ebp, DWORD PTR [rsp+12]
+L_chacha_x64_partial_crypt_start:
+        add	eax, r8d
+        add	ebx, r9d
+        xor	r12d, eax
+        xor	r13d, ebx
+        rol	r12d, 16
+        rol	r13d, 16
+        add	edx, r12d
+        add	ebp, r13d
+        xor	r8d, edx
+        xor	r9d, ebp
+        rol	r8d, 12
+        rol	r9d, 12
+        add	eax, r8d
+        add	ebx, r9d
+        xor	r12d, eax
+        xor	r13d, ebx
+        rol	r12d, 8
+        rol	r13d, 8
+        add	edx, r12d
+        add	ebp, r13d
+        xor	r8d, edx
+        xor	r9d, ebp
+        rol	r8d, 7
+        rol	r9d, 7
+        mov	DWORD PTR [rsp+8], edx
+        mov	DWORD PTR [rsp+12], ebp
+        mov	edx, DWORD PTR [rsp+16]
+        mov	ebp, DWORD PTR [rsp+20]
+        add	r9d, r10d
+        add	r8d, r11d
+        xor	r14d, r9d
+        xor	r15d, r8d
+        rol	r14d, 16
+        rol	r15d, 16
+        add	edx, r14d
+        add	ebp, r15d
+        xor	r10d, edx
+        xor	r11d, ebp
+        rol	r10d, 12
+        rol	r11d, 12
+        add	r9d, r10d
+        add	r8d, r11d
+        xor	r14d, r9d
+        xor	r15d, r8d
+        rol	r14d, 8
+        rol	r15d, 8
+        add	edx, r14d
+        add	ebp, r15d
+        xor	r10d, edx
+        xor	r11d, ebp
+        rol	r10d, 7
+        rol	r11d, 7
+        add	eax, r9d
+        add	ebx, r10d
+        xor	r15d, eax
+        xor	r12d, ebx
+        rol	r15d, 16
+        rol	r12d, 16
+        add	edx, r15d
+        add	ebp, r12d
+        xor	r9d, edx
+        xor	r10d, ebp
+        rol	r9d, 12
+        rol	r10d, 12
+        add	eax, r9d
+        add	ebx, r10d
+        xor	r15d, eax
+        xor	r12d, ebx
+        rol	r15d, 8
+        rol	r12d, 8
+        add	edx, r15d
+        add	ebp, r12d
+        xor	r9d, edx
+        xor	r10d, ebp
+        rol	r9d, 7
+        rol	r10d, 7
+        mov	DWORD PTR [rsp+16], edx
+        mov	DWORD PTR [rsp+20], ebp
+        mov	edx, DWORD PTR [rsp+8]
+        mov	ebp, DWORD PTR [rsp+12]
+        add	r9d, r11d
+        add	r8d, r8d
+        xor	r13d, r9d
+        xor	r14d, r8d
+        rol	r13d, 16
+        rol	r14d, 16
+        add	edx, r13d
+        add	ebp, r14d
+        xor	r11d, edx
+        xor	r8d, ebp
+        rol	r11d, 12
+        rol	r8d, 12
+        add	r9d, r11d
+        add	r8d, r8d
+        xor	r13d, r9d
+        xor	r14d, r8d
+        rol	r13d, 8
+        rol	r14d, 8
+        add	edx, r13d
+        add	ebp, r14d
+        xor	r11d, edx
+        xor	r8d, ebp
+        rol	r11d, 7
+        rol	r8d, 7
+        dec	BYTE PTR [rsp]
+        jnz	L_chacha_x64_partial_crypt_start
+        mov	DWORD PTR [rsp+8], edx
+        mov	DWORD PTR [rsp+12], ebp
+        mov	rdx, QWORD PTR [rsp+32]
+        add	eax, DWORD PTR [rcx]
+        add	ebx, DWORD PTR [rcx+4]
+        add	r9d, DWORD PTR [rcx+8]
+        add	r8d, DWORD PTR [rcx+12]
+        add	r8d, DWORD PTR [rcx+16]
+        add	r9d, DWORD PTR [rcx+20]
+        add	r10d, DWORD PTR [rcx+24]
+        add	r11d, DWORD PTR [rcx+28]
+        add	r12d, DWORD PTR [rcx+48]
+        add	r13d, DWORD PTR [rcx+52]
+        add	r14d, DWORD PTR [rcx+56]
+        add	r15d, DWORD PTR [rcx+60]
+        lea	rbp, QWORD PTR [rcx+80]
+        mov	DWORD PTR [rbp], eax
+        mov	DWORD PTR [rbp+4], ebx
+        mov	DWORD PTR [rbp+8], r9d
+        mov	DWORD PTR [rbp+12], r8d
+        mov	DWORD PTR [rbp+16], r8d
+        mov	DWORD PTR [rbp+20], r9d
+        mov	DWORD PTR [rbp+24], r10d
+        mov	DWORD PTR [rbp+28], r11d
+        mov	DWORD PTR [rbp+48], r12d
+        mov	DWORD PTR [rbp+52], r13d
+        mov	DWORD PTR [rbp+56], r14d
+        mov	DWORD PTR [rbp+60], r15d
+        mov	eax, DWORD PTR [rsp+8]
+        mov	ebx, DWORD PTR [rsp+12]
+        mov	r9d, DWORD PTR [rsp+16]
+        mov	r8d, DWORD PTR [rsp+20]
+        add	eax, DWORD PTR [rcx+32]
+        add	ebx, DWORD PTR [rcx+36]
+        add	r9d, DWORD PTR [rcx+40]
+        add	r8d, DWORD PTR [rcx+44]
+        mov	DWORD PTR [rbp+32], eax
+        mov	DWORD PTR [rbp+36], ebx
+        mov	DWORD PTR [rbp+40], r9d
+        mov	DWORD PTR [rbp+44], r8d
+        mov	r8, QWORD PTR [rsp+24]
+        mov	r9, QWORD PTR [rsp+40]
+        add	DWORD PTR [rcx+48], 1
+        add	rsp, 48
+        mov	r8d, r9d
+        xor	rbx, rbx
+        and	r8d, 7
+        jz	L_chacha_x64_partial_start64
+L_chacha_x64_partial_start8:
+        movzx	eax, BYTE PTR [rbp+rbx]
+        xor	al, BYTE PTR [rdx+rbx]
+        mov	BYTE PTR [r8+rbx], al
+        inc	ebx
+        cmp	ebx, r8d
+        jne	L_chacha_x64_partial_start8
+        je	L_chacha_x64_partial_end64
+L_chacha_x64_partial_start64:
+        mov	rax, QWORD PTR [rbp+rbx]
+        xor	rax, QWORD PTR [rdx+rbx]
+        mov	QWORD PTR [r8+rbx], rax
+        add	ebx, 8
+L_chacha_x64_partial_end64:
+        cmp	ebx, r9d
+        jne	L_chacha_x64_partial_start64
+        mov	r9d, 64
+        sub	r9d, ebx
+        mov	DWORD PTR [rcx+76], r9d
+L_chacha_x64_done:
+        add	rsp, 64
+        pop	r15
+        pop	r14
+        pop	r13
+        pop	r12
+        pop	rbp
+        pop	rbx
+        ret
+chacha_encrypt_x64 ENDP
+_text ENDS
+IFDEF HAVE_INTEL_AVX1
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx1_rotl8 QWORD 433757367256023043, 1012478749960636427
+ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx1_rotl16 QWORD 361421592464458498, 940142975169071882
+ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx1_add QWORD 4294967296, 12884901890
+ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx1_four QWORD 17179869188, 17179869188
+ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four
+_DATA ENDS
+_text SEGMENT READONLY PARA
+chacha_encrypt_avx1 PROC
+        push	r12
+        push	r13
+        push	r14
+        push	r15
+        push	rdi
+        push	rsi
+        sub	rsp, 560
+        vmovdqu	OWORD PTR [rsp+400], xmm6
+        vmovdqu	OWORD PTR [rsp+416], xmm7
+        vmovdqu	OWORD PTR [rsp+432], xmm8
+        vmovdqu	OWORD PTR [rsp+448], xmm9
+        vmovdqu	OWORD PTR [rsp+464], xmm10
+        vmovdqu	OWORD PTR [rsp+480], xmm11
+        vmovdqu	OWORD PTR [rsp+496], xmm12
+        vmovdqu	OWORD PTR [rsp+512], xmm13
+        vmovdqu	OWORD PTR [rsp+528], xmm14
+        vmovdqu	OWORD PTR [rsp+544], xmm15
+        mov	r11, rsp
+        lea	r12, QWORD PTR [rsp+256]
+        mov	r14, QWORD PTR [ptr_L_chacha20_avx1_rotl8]
+        mov	r15, QWORD PTR [ptr_L_chacha20_avx1_rotl16]
+        mov	rdi, QWORD PTR [ptr_L_chacha20_avx1_add]
+        mov	rsi, QWORD PTR [ptr_L_chacha20_avx1_four]
+        add	r11, 15
+        add	r12, 15
+        and	r11, -16
+        and	r12, -16
+        mov	eax, r9d
+        shr	eax, 8
+        jz	L_chacha20_avx1_end128
+        vpshufd	xmm0, [rcx], 0
+        vpshufd	xmm1, [rcx+4], 0
+        vpshufd	xmm2, [rcx+8], 0
+        vpshufd	xmm3, [rcx+12], 0
+        vpshufd	xmm4, [rcx+16], 0
+        vpshufd	xmm5, [rcx+20], 0
+        vpshufd	xmm6, [rcx+24], 0
+        vpshufd	xmm7, [rcx+28], 0
+        vpshufd	xmm8, [rcx+32], 0
+        vpshufd	xmm9, [rcx+36], 0
+        vpshufd	xmm10, [rcx+40], 0
+        vpshufd	xmm11, [rcx+44], 0
+        vpshufd	xmm12, [rcx+48], 0
+        vpshufd	xmm13, [rcx+52], 0
+        vpshufd	xmm14, [rcx+56], 0
+        vpshufd	xmm15, [rcx+60], 0
+        vpaddd	xmm12, xmm12, OWORD PTR [rdi]
+        vmovdqa	OWORD PTR [r11], xmm0
+        vmovdqa	OWORD PTR [r11+16], xmm1
+        vmovdqa	OWORD PTR [r11+32], xmm2
+        vmovdqa	OWORD PTR [r11+48], xmm3
+        vmovdqa	OWORD PTR [r11+64], xmm4
+        vmovdqa	OWORD PTR [r11+80], xmm5
+        vmovdqa	OWORD PTR [r11+96], xmm6
+        vmovdqa	OWORD PTR [r11+112], xmm7
+        vmovdqa	OWORD PTR [r11+128], xmm8
+        vmovdqa	OWORD PTR [r11+144], xmm9
+        vmovdqa	OWORD PTR [r11+160], xmm10
+        vmovdqa	OWORD PTR [r11+176], xmm11
+        vmovdqa	OWORD PTR [r11+192], xmm12
+        vmovdqa	OWORD PTR [r11+208], xmm13
+        vmovdqa	OWORD PTR [r11+224], xmm14
+        vmovdqa	OWORD PTR [r11+240], xmm15
+L_chacha20_avx1_start128:
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        mov	r10b, 10
+L_chacha20_avx1_loop128:
+        vpaddd	xmm0, xmm0, xmm4
+        vpxor	xmm12, xmm12, xmm0
+        vmovdqa	xmm11, OWORD PTR [r12+48]
+        vpshufb	xmm12, xmm12, OWORD PTR [r15]
+        vpaddd	xmm8, xmm8, xmm12
+        vpxor	xmm4, xmm4, xmm8
+        vpaddd	xmm1, xmm1, xmm5
+        vpxor	xmm13, xmm13, xmm1
+        vpshufb	xmm13, xmm13, OWORD PTR [r15]
+        vpaddd	xmm9, xmm9, xmm13
+        vpxor	xmm5, xmm5, xmm9
+        vpaddd	xmm2, xmm2, xmm6
+        vpxor	xmm14, xmm14, xmm2
+        vpshufb	xmm14, xmm14, OWORD PTR [r15]
+        vpaddd	xmm10, xmm10, xmm14
+        vpxor	xmm6, xmm6, xmm10
+        vpaddd	xmm3, xmm3, xmm7
+        vpxor	xmm15, xmm15, xmm3
+        vpshufb	xmm15, xmm15, OWORD PTR [r15]
+        vpaddd	xmm11, xmm11, xmm15
+        vpxor	xmm7, xmm7, xmm11
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        vpsrld	xmm11, xmm4, 20
+        vpslld	xmm4, xmm4, 12
+        vpxor	xmm4, xmm4, xmm11
+        vpsrld	xmm11, xmm5, 20
+        vpslld	xmm5, xmm5, 12
+        vpxor	xmm5, xmm5, xmm11
+        vpsrld	xmm11, xmm6, 20
+        vpslld	xmm6, xmm6, 12
+        vpxor	xmm6, xmm6, xmm11
+        vpsrld	xmm11, xmm7, 20
+        vpslld	xmm7, xmm7, 12
+        vpxor	xmm7, xmm7, xmm11
+        vpaddd	xmm0, xmm0, xmm4
+        vpxor	xmm12, xmm12, xmm0
+        vmovdqa	xmm11, OWORD PTR [r12+48]
+        vpshufb	xmm12, xmm12, OWORD PTR [r14]
+        vpaddd	xmm8, xmm8, xmm12
+        vpxor	xmm4, xmm4, xmm8
+        vpaddd	xmm1, xmm1, xmm5
+        vpxor	xmm13, xmm13, xmm1
+        vpshufb	xmm13, xmm13, OWORD PTR [r14]
+        vpaddd	xmm9, xmm9, xmm13
+        vpxor	xmm5, xmm5, xmm9
+        vpaddd	xmm2, xmm2, xmm6
+        vpxor	xmm14, xmm14, xmm2
+        vpshufb	xmm14, xmm14, OWORD PTR [r14]
+        vpaddd	xmm10, xmm10, xmm14
+        vpxor	xmm6, xmm6, xmm10
+        vpaddd	xmm3, xmm3, xmm7
+        vpxor	xmm15, xmm15, xmm3
+        vpshufb	xmm15, xmm15, OWORD PTR [r14]
+        vpaddd	xmm11, xmm11, xmm15
+        vpxor	xmm7, xmm7, xmm11
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        vpsrld	xmm11, xmm4, 25
+        vpslld	xmm4, xmm4, 7
+        vpxor	xmm4, xmm4, xmm11
+        vpsrld	xmm11, xmm5, 25
+        vpslld	xmm5, xmm5, 7
+        vpxor	xmm5, xmm5, xmm11
+        vpsrld	xmm11, xmm6, 25
+        vpslld	xmm6, xmm6, 7
+        vpxor	xmm6, xmm6, xmm11
+        vpsrld	xmm11, xmm7, 25
+        vpslld	xmm7, xmm7, 7
+        vpxor	xmm7, xmm7, xmm11
+        vpaddd	xmm0, xmm0, xmm5
+        vpxor	xmm15, xmm15, xmm0
+        vmovdqa	xmm11, OWORD PTR [r12+48]
+        vpshufb	xmm15, xmm15, OWORD PTR [r15]
+        vpaddd	xmm10, xmm10, xmm15
+        vpxor	xmm5, xmm5, xmm10
+        vpaddd	xmm1, xmm1, xmm6
+        vpxor	xmm12, xmm12, xmm1
+        vpshufb	xmm12, xmm12, OWORD PTR [r15]
+        vpaddd	xmm11, xmm11, xmm12
+        vpxor	xmm6, xmm6, xmm11
+        vpaddd	xmm2, xmm2, xmm7
+        vpxor	xmm13, xmm13, xmm2
+        vpshufb	xmm13, xmm13, OWORD PTR [r15]
+        vpaddd	xmm8, xmm8, xmm13
+        vpxor	xmm7, xmm7, xmm8
+        vpaddd	xmm3, xmm3, xmm4
+        vpxor	xmm14, xmm14, xmm3
+        vpshufb	xmm14, xmm14, OWORD PTR [r15]
+        vpaddd	xmm9, xmm9, xmm14
+        vpxor	xmm4, xmm4, xmm9
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        vpsrld	xmm11, xmm5, 20
+        vpslld	xmm5, xmm5, 12
+        vpxor	xmm5, xmm5, xmm11
+        vpsrld	xmm11, xmm6, 20
+        vpslld	xmm6, xmm6, 12
+        vpxor	xmm6, xmm6, xmm11
+        vpsrld	xmm11, xmm7, 20
+        vpslld	xmm7, xmm7, 12
+        vpxor	xmm7, xmm7, xmm11
+        vpsrld	xmm11, xmm4, 20
+        vpslld	xmm4, xmm4, 12
+        vpxor	xmm4, xmm4, xmm11
+        vpaddd	xmm0, xmm0, xmm5
+        vpxor	xmm15, xmm15, xmm0
+        vmovdqa	xmm11, OWORD PTR [r12+48]
+        vpshufb	xmm15, xmm15, OWORD PTR [r14]
+        vpaddd	xmm10, xmm10, xmm15
+        vpxor	xmm5, xmm5, xmm10
+        vpaddd	xmm1, xmm1, xmm6
+        vpxor	xmm12, xmm12, xmm1
+        vpshufb	xmm12, xmm12, OWORD PTR [r14]
+        vpaddd	xmm11, xmm11, xmm12
+        vpxor	xmm6, xmm6, xmm11
+        vpaddd	xmm2, xmm2, xmm7
+        vpxor	xmm13, xmm13, xmm2
+        vpshufb	xmm13, xmm13, OWORD PTR [r14]
+        vpaddd	xmm8, xmm8, xmm13
+        vpxor	xmm7, xmm7, xmm8
+        vpaddd	xmm3, xmm3, xmm4
+        vpxor	xmm14, xmm14, xmm3
+        vpshufb	xmm14, xmm14, OWORD PTR [r14]
+        vpaddd	xmm9, xmm9, xmm14
+        vpxor	xmm4, xmm4, xmm9
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        vpsrld	xmm11, xmm5, 25
+        vpslld	xmm5, xmm5, 7
+        vpxor	xmm5, xmm5, xmm11
+        vpsrld	xmm11, xmm6, 25
+        vpslld	xmm6, xmm6, 7
+        vpxor	xmm6, xmm6, xmm11
+        vpsrld	xmm11, xmm7, 25
+        vpslld	xmm7, xmm7, 7
+        vpxor	xmm7, xmm7, xmm11
+        vpsrld	xmm11, xmm4, 25
+        vpslld	xmm4, xmm4, 7
+        vpxor	xmm4, xmm4, xmm11
+        dec	r10b
+        jnz	L_chacha20_avx1_loop128
+        vmovdqa	xmm11, OWORD PTR [r12+48]
+        vpaddd	xmm0, xmm0, OWORD PTR [r11]
+        vpaddd	xmm1, xmm1, OWORD PTR [r11+16]
+        vpaddd	xmm2, xmm2, OWORD PTR [r11+32]
+        vpaddd	xmm3, xmm3, OWORD PTR [r11+48]
+        vpaddd	xmm4, xmm4, OWORD PTR [r11+64]
+        vpaddd	xmm5, xmm5, OWORD PTR [r11+80]
+        vpaddd	xmm6, xmm6, OWORD PTR [r11+96]
+        vpaddd	xmm7, xmm7, OWORD PTR [r11+112]
+        vpaddd	xmm8, xmm8, OWORD PTR [r11+128]
+        vpaddd	xmm9, xmm9, OWORD PTR [r11+144]
+        vpaddd	xmm10, xmm10, OWORD PTR [r11+160]
+        vpaddd	xmm11, xmm11, OWORD PTR [r11+176]
+        vpaddd	xmm12, xmm12, OWORD PTR [r11+192]
+        vpaddd	xmm13, xmm13, OWORD PTR [r11+208]
+        vpaddd	xmm14, xmm14, OWORD PTR [r11+224]
+        vpaddd	xmm15, xmm15, OWORD PTR [r11+240]
+        vmovdqa	OWORD PTR [r12], xmm8
+        vmovdqa	OWORD PTR [r12+16], xmm9
+        vmovdqa	OWORD PTR [r12+32], xmm10
+        vmovdqa	OWORD PTR [r12+48], xmm11
+        vmovdqa	OWORD PTR [r12+64], xmm12
+        vmovdqa	OWORD PTR [r12+80], xmm13
+        vmovdqa	OWORD PTR [r12+96], xmm14
+        vmovdqa	OWORD PTR [r12+112], xmm15
+        vpunpckldq	xmm8, xmm0, xmm1
+        vpunpckldq	xmm9, xmm2, xmm3
+        vpunpckhdq	xmm12, xmm0, xmm1
+        vpunpckhdq	xmm13, xmm2, xmm3
+        vpunpckldq	xmm10, xmm4, xmm5
+        vpunpckldq	xmm11, xmm6, xmm7
+        vpunpckhdq	xmm14, xmm4, xmm5
+        vpunpckhdq	xmm15, xmm6, xmm7
+        vpunpcklqdq	xmm0, xmm8, xmm9
+        vpunpcklqdq	xmm1, xmm10, xmm11
+        vpunpckhqdq	xmm2, xmm8, xmm9
+        vpunpckhqdq	xmm3, xmm10, xmm11
+        vpunpcklqdq	xmm4, xmm12, xmm13
+        vpunpcklqdq	xmm5, xmm14, xmm15
+        vpunpckhqdq	xmm6, xmm12, xmm13
+        vpunpckhqdq	xmm7, xmm14, xmm15
+        vmovdqu	xmm8, OWORD PTR [rdx]
+        vmovdqu	xmm9, OWORD PTR [rdx+16]
+        vmovdqu	xmm10, OWORD PTR [rdx+64]
+        vmovdqu	xmm11, OWORD PTR [rdx+80]
+        vmovdqu	xmm12, OWORD PTR [rdx+128]
+        vmovdqu	xmm13, OWORD PTR [rdx+144]
+        vmovdqu	xmm14, OWORD PTR [rdx+192]
+        vmovdqu	xmm15, OWORD PTR [rdx+208]
+        vpxor	xmm0, xmm0, xmm8
+        vpxor	xmm1, xmm1, xmm9
+        vpxor	xmm2, xmm2, xmm10
+        vpxor	xmm3, xmm3, xmm11
+        vpxor	xmm4, xmm4, xmm12
+        vpxor	xmm5, xmm5, xmm13
+        vpxor	xmm6, xmm6, xmm14
+        vpxor	xmm7, xmm7, xmm15
+        vmovdqu	OWORD PTR [r8], xmm0
+        vmovdqu	OWORD PTR [r8+16], xmm1
+        vmovdqu	OWORD PTR [r8+64], xmm2
+        vmovdqu	OWORD PTR [r8+80], xmm3
+        vmovdqu	OWORD PTR [r8+128], xmm4
+        vmovdqu	OWORD PTR [r8+144], xmm5
+        vmovdqu	OWORD PTR [r8+192], xmm6
+        vmovdqu	OWORD PTR [r8+208], xmm7
+        vmovdqa	xmm0, OWORD PTR [r12]
+        vmovdqa	xmm1, OWORD PTR [r12+16]
+        vmovdqa	xmm2, OWORD PTR [r12+32]
+        vmovdqa	xmm3, OWORD PTR [r12+48]
+        vmovdqa	xmm4, OWORD PTR [r12+64]
+        vmovdqa	xmm5, OWORD PTR [r12+80]
+        vmovdqa	xmm6, OWORD PTR [r12+96]
+        vmovdqa	xmm7, OWORD PTR [r12+112]
+        vpunpckldq	xmm8, xmm0, xmm1
+        vpunpckldq	xmm9, xmm2, xmm3
+        vpunpckhdq	xmm12, xmm0, xmm1
+        vpunpckhdq	xmm13, xmm2, xmm3
+        vpunpckldq	xmm10, xmm4, xmm5
+        vpunpckldq	xmm11, xmm6, xmm7
+        vpunpckhdq	xmm14, xmm4, xmm5
+        vpunpckhdq	xmm15, xmm6, xmm7
+        vpunpcklqdq	xmm0, xmm8, xmm9
+        vpunpcklqdq	xmm1, xmm10, xmm11
+        vpunpckhqdq	xmm2, xmm8, xmm9
+        vpunpckhqdq	xmm3, xmm10, xmm11
+        vpunpcklqdq	xmm4, xmm12, xmm13
+        vpunpcklqdq	xmm5, xmm14, xmm15
+        vpunpckhqdq	xmm6, xmm12, xmm13
+        vpunpckhqdq	xmm7, xmm14, xmm15
+        vmovdqu	xmm8, OWORD PTR [rdx+32]
+        vmovdqu	xmm9, OWORD PTR [rdx+48]
+        vmovdqu	xmm10, OWORD PTR [rdx+96]
+        vmovdqu	xmm11, OWORD PTR [rdx+112]
+        vmovdqu	xmm12, OWORD PTR [rdx+160]
+        vmovdqu	xmm13, OWORD PTR [rdx+176]
+        vmovdqu	xmm14, OWORD PTR [rdx+224]
+        vmovdqu	xmm15, OWORD PTR [rdx+240]
+        vpxor	xmm0, xmm0, xmm8
+        vpxor	xmm1, xmm1, xmm9
+        vpxor	xmm2, xmm2, xmm10
+        vpxor	xmm3, xmm3, xmm11
+        vpxor	xmm4, xmm4, xmm12
+        vpxor	xmm5, xmm5, xmm13
+        vpxor	xmm6, xmm6, xmm14
+        vpxor	xmm7, xmm7, xmm15
+        vmovdqu	OWORD PTR [r8+32], xmm0
+        vmovdqu	OWORD PTR [r8+48], xmm1
+        vmovdqu	OWORD PTR [r8+96], xmm2
+        vmovdqu	OWORD PTR [r8+112], xmm3
+        vmovdqu	OWORD PTR [r8+160], xmm4
+        vmovdqu	OWORD PTR [r8+176], xmm5
+        vmovdqu	OWORD PTR [r8+224], xmm6
+        vmovdqu	OWORD PTR [r8+240], xmm7
+        vmovdqa	xmm12, OWORD PTR [r11+192]
+        add	rdx, 256
+        add	r8, 256
+        vpaddd	xmm12, xmm12, OWORD PTR [rsi]
+        sub	r9d, 256
+        vmovdqa	OWORD PTR [r11+192], xmm12
+        cmp	r9d, 256
+        jl	L_chacha20_avx1_done128
+        vmovdqa	xmm0, OWORD PTR [r11]
+        vmovdqa	xmm1, OWORD PTR [r11+16]
+        vmovdqa	xmm2, OWORD PTR [r11+32]
+        vmovdqa	xmm3, OWORD PTR [r11+48]
+        vmovdqa	xmm4, OWORD PTR [r11+64]
+        vmovdqa	xmm5, OWORD PTR [r11+80]
+        vmovdqa	xmm6, OWORD PTR [r11+96]
+        vmovdqa	xmm7, OWORD PTR [r11+112]
+        vmovdqa	xmm8, OWORD PTR [r11+128]
+        vmovdqa	xmm9, OWORD PTR [r11+144]
+        vmovdqa	xmm10, OWORD PTR [r11+160]
+        vmovdqa	xmm11, OWORD PTR [r11+176]
+        vmovdqa	xmm12, OWORD PTR [r11+192]
+        vmovdqa	xmm13, OWORD PTR [r11+208]
+        vmovdqa	xmm14, OWORD PTR [r11+224]
+        vmovdqa	xmm15, OWORD PTR [r11+240]
+        jmp	L_chacha20_avx1_start128
+L_chacha20_avx1_done128:
+        shl	eax, 2
+        add	DWORD PTR [rcx+48], eax
+L_chacha20_avx1_end128:
+        cmp	r9d, 64
+        jl	L_chacha20_avx1_block_done
+L_chacha20_avx1_block_start:
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vmovdqa	xmm5, xmm0
+        vmovdqa	xmm6, xmm1
+        vmovdqa	xmm7, xmm2
+        vmovdqa	xmm8, xmm3
+        mov	al, 10
+L_chacha20_avx1_block_crypt_start:
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r15]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 20
+        vpslld	xmm1, xmm1, 12
+        vpxor	xmm1, xmm1, xmm4
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r14]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 25
+        vpslld	xmm1, xmm1, 7
+        vpxor	xmm1, xmm1, xmm4
+        vpshufd	xmm1, xmm1, 57
+        vpshufd	xmm2, xmm2, 78
+        vpshufd	xmm3, xmm3, 147
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r15]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 20
+        vpslld	xmm1, xmm1, 12
+        vpxor	xmm1, xmm1, xmm4
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r14]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 25
+        vpslld	xmm1, xmm1, 7
+        vpxor	xmm1, xmm1, xmm4
+        vpshufd	xmm1, xmm1, 147
+        vpshufd	xmm2, xmm2, 78
+        vpshufd	xmm3, xmm3, 57
+        dec	al
+        jnz	L_chacha20_avx1_block_crypt_start
+        vpaddd	xmm0, xmm0, xmm5
+        vpaddd	xmm1, xmm1, xmm6
+        vpaddd	xmm2, xmm2, xmm7
+        vpaddd	xmm3, xmm3, xmm8
+        vmovdqu	xmm5, OWORD PTR [rdx]
+        vmovdqu	xmm6, OWORD PTR [rdx+16]
+        vmovdqu	xmm7, OWORD PTR [rdx+32]
+        vmovdqu	xmm8, OWORD PTR [rdx+48]
+        vpxor	xmm0, xmm0, xmm5
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm3, xmm3, xmm8
+        vmovdqu	OWORD PTR [r8], xmm0
+        vmovdqu	OWORD PTR [r8+16], xmm1
+        vmovdqu	OWORD PTR [r8+32], xmm2
+        vmovdqu	OWORD PTR [r8+48], xmm3
+        add	DWORD PTR [rcx+48], 1
+        sub	r9d, 64
+        add	rdx, 64
+        add	r8, 64
+        cmp	r9d, 64
+        jge	L_chacha20_avx1_block_start
+L_chacha20_avx1_block_done:
+        cmp	r9d, 0
+        je	L_chacha20_avx1_partial_done
+        lea	r12, QWORD PTR [rcx+80]
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vmovdqa	xmm5, xmm0
+        vmovdqa	xmm6, xmm1
+        vmovdqa	xmm7, xmm2
+        vmovdqa	xmm8, xmm3
+        mov	al, 10
+L_chacha20_avx1_partial_crypt_start:
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r15]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 20
+        vpslld	xmm1, xmm1, 12
+        vpxor	xmm1, xmm1, xmm4
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r14]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 25
+        vpslld	xmm1, xmm1, 7
+        vpxor	xmm1, xmm1, xmm4
+        vpshufd	xmm1, xmm1, 57
+        vpshufd	xmm2, xmm2, 78
+        vpshufd	xmm3, xmm3, 147
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r15]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 20
+        vpslld	xmm1, xmm1, 12
+        vpxor	xmm1, xmm1, xmm4
+        vpaddd	xmm0, xmm0, xmm1
+        vpxor	xmm3, xmm3, xmm0
+        vpshufb	xmm3, xmm3, OWORD PTR [r14]
+        vpaddd	xmm2, xmm2, xmm3
+        vpxor	xmm1, xmm1, xmm2
+        vpsrld	xmm4, xmm1, 25
+        vpslld	xmm1, xmm1, 7
+        vpxor	xmm1, xmm1, xmm4
+        vpshufd	xmm1, xmm1, 147
+        vpshufd	xmm2, xmm2, 78
+        vpshufd	xmm3, xmm3, 57
+        dec	al
+        jnz	L_chacha20_avx1_partial_crypt_start
+        vpaddd	xmm0, xmm0, xmm5
+        vpaddd	xmm1, xmm1, xmm6
+        vpaddd	xmm2, xmm2, xmm7
+        vpaddd	xmm3, xmm3, xmm8
+        vmovdqu	OWORD PTR [r12], xmm0
+        vmovdqu	OWORD PTR [r12+16], xmm1
+        vmovdqu	OWORD PTR [r12+32], xmm2
+        vmovdqu	OWORD PTR [r12+48], xmm3
+        add	DWORD PTR [rcx+48], 1
+        mov	r10d, r9d
+        xor	r13, r13
+        and	r10d, 7
+        jz	L_chacha20_avx1_partial_start64
+L_chacha20_avx1_partial_start8:
+        movzx	eax, BYTE PTR [r12+r13]
+        xor	al, BYTE PTR [rdx+r13]
+        mov	BYTE PTR [r8+r13], al
+        inc	r13d
+        cmp	r13d, r10d
+        jne	L_chacha20_avx1_partial_start8
+        je	L_chacha20_avx1_partial_end64
+L_chacha20_avx1_partial_start64:
+        mov	rax, QWORD PTR [r12+r13]
+        xor	rax, QWORD PTR [rdx+r13]
+        mov	QWORD PTR [r8+r13], rax
+        add	r13d, 8
+L_chacha20_avx1_partial_end64:
+        cmp	r13d, r9d
+        jne	L_chacha20_avx1_partial_start64
+        mov	r10d, 64
+        sub	r10d, r13d
+        mov	DWORD PTR [rcx+76], r10d
+L_chacha20_avx1_partial_done:
+        vzeroupper
+        vmovdqu	xmm6, OWORD PTR [rsp+400]
+        vmovdqu	xmm7, OWORD PTR [rsp+416]
+        vmovdqu	xmm8, OWORD PTR [rsp+432]
+        vmovdqu	xmm9, OWORD PTR [rsp+448]
+        vmovdqu	xmm10, OWORD PTR [rsp+464]
+        vmovdqu	xmm11, OWORD PTR [rsp+480]
+        vmovdqu	xmm12, OWORD PTR [rsp+496]
+        vmovdqu	xmm13, OWORD PTR [rsp+512]
+        vmovdqu	xmm14, OWORD PTR [rsp+528]
+        vmovdqu	xmm15, OWORD PTR [rsp+544]
+        add	rsp, 560
+        pop	rsi
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r13
+        pop	r12
+        ret
+chacha_encrypt_avx1 ENDP
+_text ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx2_rotl8 QWORD 433757367256023043, 1012478749960636427,
+    433757367256023043, 1012478749960636427
+ptr_L_chacha20_avx2_rotl8 QWORD L_chacha20_avx2_rotl8
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx2_rotl16 QWORD 361421592464458498, 940142975169071882,
+    361421592464458498, 940142975169071882
+ptr_L_chacha20_avx2_rotl16 QWORD L_chacha20_avx2_rotl16
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx2_add QWORD 4294967296, 12884901890,
+    21474836484, 30064771078
+ptr_L_chacha20_avx2_add QWORD L_chacha20_avx2_add
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_chacha20_avx2_eight QWORD 34359738376, 34359738376,
+    34359738376, 34359738376
+ptr_L_chacha20_avx2_eight QWORD L_chacha20_avx2_eight
+_DATA ENDS
+_text SEGMENT READONLY PARA
+chacha_encrypt_avx2 PROC
+        push	r12
+        push	r13
+        push	r14
+        push	r15
+        push	rdi
+        sub	rsp, 960
+        vmovdqu	OWORD PTR [rsp+800], xmm6
+        vmovdqu	OWORD PTR [rsp+816], xmm7
+        vmovdqu	OWORD PTR [rsp+832], xmm8
+        vmovdqu	OWORD PTR [rsp+848], xmm9
+        vmovdqu	OWORD PTR [rsp+864], xmm10
+        vmovdqu	OWORD PTR [rsp+880], xmm11
+        vmovdqu	OWORD PTR [rsp+896], xmm12
+        vmovdqu	OWORD PTR [rsp+912], xmm13
+        vmovdqu	OWORD PTR [rsp+928], xmm14
+        vmovdqu	OWORD PTR [rsp+944], xmm15
+        mov	r11, rsp
+        mov	r13, QWORD PTR [ptr_L_chacha20_avx2_rotl8]
+        mov	r14, QWORD PTR [ptr_L_chacha20_avx2_rotl16]
+        mov	r15, QWORD PTR [ptr_L_chacha20_avx2_add]
+        mov	rdi, QWORD PTR [ptr_L_chacha20_avx2_eight]
+        lea	r12, QWORD PTR [rsp+512]
+        add	r11, 31
+        add	r12, 31
+        and	r11, -32
+        and	r12, -32
+        mov	eax, r9d
+        shr	eax, 9
+        jz	L_chacha20_avx2_end256
+        vpbroadcastd	ymm0, DWORD PTR [rcx]
+        vpbroadcastd	ymm1, DWORD PTR [rcx+4]
+        vpbroadcastd	ymm2, DWORD PTR [rcx+8]
+        vpbroadcastd	ymm3, DWORD PTR [rcx+12]
+        vpbroadcastd	ymm4, DWORD PTR [rcx+16]
+        vpbroadcastd	ymm5, DWORD PTR [rcx+20]
+        vpbroadcastd	ymm6, DWORD PTR [rcx+24]
+        vpbroadcastd	ymm7, DWORD PTR [rcx+28]
+        vpbroadcastd	ymm8, DWORD PTR [rcx+32]
+        vpbroadcastd	ymm9, DWORD PTR [rcx+36]
+        vpbroadcastd	ymm10, DWORD PTR [rcx+40]
+        vpbroadcastd	ymm11, DWORD PTR [rcx+44]
+        vpbroadcastd	ymm12, DWORD PTR [rcx+48]
+        vpbroadcastd	ymm13, DWORD PTR [rcx+52]
+        vpbroadcastd	ymm14, DWORD PTR [rcx+56]
+        vpbroadcastd	ymm15, DWORD PTR [rcx+60]
+        vpaddd	ymm12, ymm12, YMMWORD PTR [r15]
+        vmovdqa	YMMWORD PTR [r11], ymm0
+        vmovdqa	YMMWORD PTR [r11+32], ymm1
+        vmovdqa	YMMWORD PTR [r11+64], ymm2
+        vmovdqa	YMMWORD PTR [r11+96], ymm3
+        vmovdqa	YMMWORD PTR [r11+128], ymm4
+        vmovdqa	YMMWORD PTR [r11+160], ymm5
+        vmovdqa	YMMWORD PTR [r11+192], ymm6
+        vmovdqa	YMMWORD PTR [r11+224], ymm7
+        vmovdqa	YMMWORD PTR [r11+256], ymm8
+        vmovdqa	YMMWORD PTR [r11+288], ymm9
+        vmovdqa	YMMWORD PTR [r11+320], ymm10
+        vmovdqa	YMMWORD PTR [r11+352], ymm11
+        vmovdqa	YMMWORD PTR [r11+384], ymm12
+        vmovdqa	YMMWORD PTR [r11+416], ymm13
+        vmovdqa	YMMWORD PTR [r11+448], ymm14
+        vmovdqa	YMMWORD PTR [r11+480], ymm15
+L_chacha20_avx2_start256:
+        mov	r10b, 10
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+L_chacha20_avx2_loop256:
+        vpaddd	ymm0, ymm0, ymm4
+        vpxor	ymm12, ymm12, ymm0
+        vmovdqa	ymm11, YMMWORD PTR [r12+96]
+        vpshufb	ymm12, ymm12, YMMWORD PTR [r14]
+        vpaddd	ymm8, ymm8, ymm12
+        vpxor	ymm4, ymm4, ymm8
+        vpaddd	ymm1, ymm1, ymm5
+        vpxor	ymm13, ymm13, ymm1
+        vpshufb	ymm13, ymm13, YMMWORD PTR [r14]
+        vpaddd	ymm9, ymm9, ymm13
+        vpxor	ymm5, ymm5, ymm9
+        vpaddd	ymm2, ymm2, ymm6
+        vpxor	ymm14, ymm14, ymm2
+        vpshufb	ymm14, ymm14, YMMWORD PTR [r14]
+        vpaddd	ymm10, ymm10, ymm14
+        vpxor	ymm6, ymm6, ymm10
+        vpaddd	ymm3, ymm3, ymm7
+        vpxor	ymm15, ymm15, ymm3
+        vpshufb	ymm15, ymm15, YMMWORD PTR [r14]
+        vpaddd	ymm11, ymm11, ymm15
+        vpxor	ymm7, ymm7, ymm11
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+        vpsrld	ymm11, ymm4, 20
+        vpslld	ymm4, ymm4, 12
+        vpxor	ymm4, ymm4, ymm11
+        vpsrld	ymm11, ymm5, 20
+        vpslld	ymm5, ymm5, 12
+        vpxor	ymm5, ymm5, ymm11
+        vpsrld	ymm11, ymm6, 20
+        vpslld	ymm6, ymm6, 12
+        vpxor	ymm6, ymm6, ymm11
+        vpsrld	ymm11, ymm7, 20
+        vpslld	ymm7, ymm7, 12
+        vpxor	ymm7, ymm7, ymm11
+        vpaddd	ymm0, ymm0, ymm4
+        vpxor	ymm12, ymm12, ymm0
+        vmovdqa	ymm11, YMMWORD PTR [r12+96]
+        vpshufb	ymm12, ymm12, YMMWORD PTR [r13]
+        vpaddd	ymm8, ymm8, ymm12
+        vpxor	ymm4, ymm4, ymm8
+        vpaddd	ymm1, ymm1, ymm5
+        vpxor	ymm13, ymm13, ymm1
+        vpshufb	ymm13, ymm13, YMMWORD PTR [r13]
+        vpaddd	ymm9, ymm9, ymm13
+        vpxor	ymm5, ymm5, ymm9
+        vpaddd	ymm2, ymm2, ymm6
+        vpxor	ymm14, ymm14, ymm2
+        vpshufb	ymm14, ymm14, YMMWORD PTR [r13]
+        vpaddd	ymm10, ymm10, ymm14
+        vpxor	ymm6, ymm6, ymm10
+        vpaddd	ymm3, ymm3, ymm7
+        vpxor	ymm15, ymm15, ymm3
+        vpshufb	ymm15, ymm15, YMMWORD PTR [r13]
+        vpaddd	ymm11, ymm11, ymm15
+        vpxor	ymm7, ymm7, ymm11
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+        vpsrld	ymm11, ymm4, 25
+        vpslld	ymm4, ymm4, 7
+        vpxor	ymm4, ymm4, ymm11
+        vpsrld	ymm11, ymm5, 25
+        vpslld	ymm5, ymm5, 7
+        vpxor	ymm5, ymm5, ymm11
+        vpsrld	ymm11, ymm6, 25
+        vpslld	ymm6, ymm6, 7
+        vpxor	ymm6, ymm6, ymm11
+        vpsrld	ymm11, ymm7, 25
+        vpslld	ymm7, ymm7, 7
+        vpxor	ymm7, ymm7, ymm11
+        vpaddd	ymm0, ymm0, ymm5
+        vpxor	ymm15, ymm15, ymm0
+        vmovdqa	ymm11, YMMWORD PTR [r12+96]
+        vpshufb	ymm15, ymm15, YMMWORD PTR [r14]
+        vpaddd	ymm10, ymm10, ymm15
+        vpxor	ymm5, ymm5, ymm10
+        vpaddd	ymm1, ymm1, ymm6
+        vpxor	ymm12, ymm12, ymm1
+        vpshufb	ymm12, ymm12, YMMWORD PTR [r14]
+        vpaddd	ymm11, ymm11, ymm12
+        vpxor	ymm6, ymm6, ymm11
+        vpaddd	ymm2, ymm2, ymm7
+        vpxor	ymm13, ymm13, ymm2
+        vpshufb	ymm13, ymm13, YMMWORD PTR [r14]
+        vpaddd	ymm8, ymm8, ymm13
+        vpxor	ymm7, ymm7, ymm8
+        vpaddd	ymm3, ymm3, ymm4
+        vpxor	ymm14, ymm14, ymm3
+        vpshufb	ymm14, ymm14, YMMWORD PTR [r14]
+        vpaddd	ymm9, ymm9, ymm14
+        vpxor	ymm4, ymm4, ymm9
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+        vpsrld	ymm11, ymm5, 20
+        vpslld	ymm5, ymm5, 12
+        vpxor	ymm5, ymm5, ymm11
+        vpsrld	ymm11, ymm6, 20
+        vpslld	ymm6, ymm6, 12
+        vpxor	ymm6, ymm6, ymm11
+        vpsrld	ymm11, ymm7, 20
+        vpslld	ymm7, ymm7, 12
+        vpxor	ymm7, ymm7, ymm11
+        vpsrld	ymm11, ymm4, 20
+        vpslld	ymm4, ymm4, 12
+        vpxor	ymm4, ymm4, ymm11
+        vpaddd	ymm0, ymm0, ymm5
+        vpxor	ymm15, ymm15, ymm0
+        vmovdqa	ymm11, YMMWORD PTR [r12+96]
+        vpshufb	ymm15, ymm15, YMMWORD PTR [r13]
+        vpaddd	ymm10, ymm10, ymm15
+        vpxor	ymm5, ymm5, ymm10
+        vpaddd	ymm1, ymm1, ymm6
+        vpxor	ymm12, ymm12, ymm1
+        vpshufb	ymm12, ymm12, YMMWORD PTR [r13]
+        vpaddd	ymm11, ymm11, ymm12
+        vpxor	ymm6, ymm6, ymm11
+        vpaddd	ymm2, ymm2, ymm7
+        vpxor	ymm13, ymm13, ymm2
+        vpshufb	ymm13, ymm13, YMMWORD PTR [r13]
+        vpaddd	ymm8, ymm8, ymm13
+        vpxor	ymm7, ymm7, ymm8
+        vpaddd	ymm3, ymm3, ymm4
+        vpxor	ymm14, ymm14, ymm3
+        vpshufb	ymm14, ymm14, YMMWORD PTR [r13]
+        vpaddd	ymm9, ymm9, ymm14
+        vpxor	ymm4, ymm4, ymm9
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+        vpsrld	ymm11, ymm5, 25
+        vpslld	ymm5, ymm5, 7
+        vpxor	ymm5, ymm5, ymm11
+        vpsrld	ymm11, ymm6, 25
+        vpslld	ymm6, ymm6, 7
+        vpxor	ymm6, ymm6, ymm11
+        vpsrld	ymm11, ymm7, 25
+        vpslld	ymm7, ymm7, 7
+        vpxor	ymm7, ymm7, ymm11
+        vpsrld	ymm11, ymm4, 25
+        vpslld	ymm4, ymm4, 7
+        vpxor	ymm4, ymm4, ymm11
+        dec	r10b
+        jnz	L_chacha20_avx2_loop256
+        vmovdqa	ymm11, YMMWORD PTR [r12+96]
+        vpaddd	ymm0, ymm0, YMMWORD PTR [r11]
+        vpaddd	ymm1, ymm1, YMMWORD PTR [r11+32]
+        vpaddd	ymm2, ymm2, YMMWORD PTR [r11+64]
+        vpaddd	ymm3, ymm3, YMMWORD PTR [r11+96]
+        vpaddd	ymm4, ymm4, YMMWORD PTR [r11+128]
+        vpaddd	ymm5, ymm5, YMMWORD PTR [r11+160]
+        vpaddd	ymm6, ymm6, YMMWORD PTR [r11+192]
+        vpaddd	ymm7, ymm7, YMMWORD PTR [r11+224]
+        vpaddd	ymm8, ymm8, YMMWORD PTR [r11+256]
+        vpaddd	ymm9, ymm9, YMMWORD PTR [r11+288]
+        vpaddd	ymm10, ymm10, YMMWORD PTR [r11+320]
+        vpaddd	ymm11, ymm11, YMMWORD PTR [r11+352]
+        vpaddd	ymm12, ymm12, YMMWORD PTR [r11+384]
+        vpaddd	ymm13, ymm13, YMMWORD PTR [r11+416]
+        vpaddd	ymm14, ymm14, YMMWORD PTR [r11+448]
+        vpaddd	ymm15, ymm15, YMMWORD PTR [r11+480]
+        vmovdqa	YMMWORD PTR [r12], ymm8
+        vmovdqa	YMMWORD PTR [r12+32], ymm9
+        vmovdqa	YMMWORD PTR [r12+64], ymm10
+        vmovdqa	YMMWORD PTR [r12+96], ymm11
+        vmovdqa	YMMWORD PTR [r12+128], ymm12
+        vmovdqa	YMMWORD PTR [r12+160], ymm13
+        vmovdqa	YMMWORD PTR [r12+192], ymm14
+        vmovdqa	YMMWORD PTR [r12+224], ymm15
+        vpunpckldq	ymm8, ymm0, ymm1
+        vpunpckldq	ymm9, ymm2, ymm3
+        vpunpckhdq	ymm12, ymm0, ymm1
+        vpunpckhdq	ymm13, ymm2, ymm3
+        vpunpckldq	ymm10, ymm4, ymm5
+        vpunpckldq	ymm11, ymm6, ymm7
+        vpunpckhdq	ymm14, ymm4, ymm5
+        vpunpckhdq	ymm15, ymm6, ymm7
+        vpunpcklqdq	ymm0, ymm8, ymm9
+        vpunpcklqdq	ymm1, ymm10, ymm11
+        vpunpckhqdq	ymm2, ymm8, ymm9
+        vpunpckhqdq	ymm3, ymm10, ymm11
+        vpunpcklqdq	ymm4, ymm12, ymm13
+        vpunpcklqdq	ymm5, ymm14, ymm15
+        vpunpckhqdq	ymm6, ymm12, ymm13
+        vpunpckhqdq	ymm7, ymm14, ymm15
+        vperm2i128	ymm8, ymm0, ymm1, 32
+        vperm2i128	ymm9, ymm2, ymm3, 32
+        vperm2i128	ymm12, ymm0, ymm1, 49
+        vperm2i128	ymm13, ymm2, ymm3, 49
+        vperm2i128	ymm10, ymm4, ymm5, 32
+        vperm2i128	ymm11, ymm6, ymm7, 32
+        vperm2i128	ymm14, ymm4, ymm5, 49
+        vperm2i128	ymm15, ymm6, ymm7, 49
+        vmovdqu	ymm0, YMMWORD PTR [rdx]
+        vmovdqu	ymm1, YMMWORD PTR [rdx+64]
+        vmovdqu	ymm2, YMMWORD PTR [rdx+128]
+        vmovdqu	ymm3, YMMWORD PTR [rdx+192]
+        vmovdqu	ymm4, YMMWORD PTR [rdx+256]
+        vmovdqu	ymm5, YMMWORD PTR [rdx+320]
+        vmovdqu	ymm6, YMMWORD PTR [rdx+384]
+        vmovdqu	ymm7, YMMWORD PTR [rdx+448]
+        vpxor	ymm8, ymm8, ymm0
+        vpxor	ymm9, ymm9, ymm1
+        vpxor	ymm10, ymm10, ymm2
+        vpxor	ymm11, ymm11, ymm3
+        vpxor	ymm12, ymm12, ymm4
+        vpxor	ymm13, ymm13, ymm5
+        vpxor	ymm14, ymm14, ymm6
+        vpxor	ymm15, ymm15, ymm7
+        vmovdqu	YMMWORD PTR [r8], ymm8
+        vmovdqu	YMMWORD PTR [r8+64], ymm9
+        vmovdqu	YMMWORD PTR [r8+128], ymm10
+        vmovdqu	YMMWORD PTR [r8+192], ymm11
+        vmovdqu	YMMWORD PTR [r8+256], ymm12
+        vmovdqu	YMMWORD PTR [r8+320], ymm13
+        vmovdqu	YMMWORD PTR [r8+384], ymm14
+        vmovdqu	YMMWORD PTR [r8+448], ymm15
+        vmovdqa	ymm0, YMMWORD PTR [r12]
+        vmovdqa	ymm1, YMMWORD PTR [r12+32]
+        vmovdqa	ymm2, YMMWORD PTR [r12+64]
+        vmovdqa	ymm3, YMMWORD PTR [r12+96]
+        vmovdqa	ymm4, YMMWORD PTR [r12+128]
+        vmovdqa	ymm5, YMMWORD PTR [r12+160]
+        vmovdqa	ymm6, YMMWORD PTR [r12+192]
+        vmovdqa	ymm7, YMMWORD PTR [r12+224]
+        vpunpckldq	ymm8, ymm0, ymm1
+        vpunpckldq	ymm9, ymm2, ymm3
+        vpunpckhdq	ymm12, ymm0, ymm1
+        vpunpckhdq	ymm13, ymm2, ymm3
+        vpunpckldq	ymm10, ymm4, ymm5
+        vpunpckldq	ymm11, ymm6, ymm7
+        vpunpckhdq	ymm14, ymm4, ymm5
+        vpunpckhdq	ymm15, ymm6, ymm7
+        vpunpcklqdq	ymm0, ymm8, ymm9
+        vpunpcklqdq	ymm1, ymm10, ymm11
+        vpunpckhqdq	ymm2, ymm8, ymm9
+        vpunpckhqdq	ymm3, ymm10, ymm11
+        vpunpcklqdq	ymm4, ymm12, ymm13
+        vpunpcklqdq	ymm5, ymm14, ymm15
+        vpunpckhqdq	ymm6, ymm12, ymm13
+        vpunpckhqdq	ymm7, ymm14, ymm15
+        vperm2i128	ymm8, ymm0, ymm1, 32
+        vperm2i128	ymm9, ymm2, ymm3, 32
+        vperm2i128	ymm12, ymm0, ymm1, 49
+        vperm2i128	ymm13, ymm2, ymm3, 49
+        vperm2i128	ymm10, ymm4, ymm5, 32
+        vperm2i128	ymm11, ymm6, ymm7, 32
+        vperm2i128	ymm14, ymm4, ymm5, 49
+        vperm2i128	ymm15, ymm6, ymm7, 49
+        vmovdqu	ymm0, YMMWORD PTR [rdx+32]
+        vmovdqu	ymm1, YMMWORD PTR [rdx+96]
+        vmovdqu	ymm2, YMMWORD PTR [rdx+160]
+        vmovdqu	ymm3, YMMWORD PTR [rdx+224]
+        vmovdqu	ymm4, YMMWORD PTR [rdx+288]
+        vmovdqu	ymm5, YMMWORD PTR [rdx+352]
+        vmovdqu	ymm6, YMMWORD PTR [rdx+416]
+        vmovdqu	ymm7, YMMWORD PTR [rdx+480]
+        vpxor	ymm8, ymm8, ymm0
+        vpxor	ymm9, ymm9, ymm1
+        vpxor	ymm10, ymm10, ymm2
+        vpxor	ymm11, ymm11, ymm3
+        vpxor	ymm12, ymm12, ymm4
+        vpxor	ymm13, ymm13, ymm5
+        vpxor	ymm14, ymm14, ymm6
+        vpxor	ymm15, ymm15, ymm7
+        vmovdqu	YMMWORD PTR [r8+32], ymm8
+        vmovdqu	YMMWORD PTR [r8+96], ymm9
+        vmovdqu	YMMWORD PTR [r8+160], ymm10
+        vmovdqu	YMMWORD PTR [r8+224], ymm11
+        vmovdqu	YMMWORD PTR [r8+288], ymm12
+        vmovdqu	YMMWORD PTR [r8+352], ymm13
+        vmovdqu	YMMWORD PTR [r8+416], ymm14
+        vmovdqu	YMMWORD PTR [r8+480], ymm15
+        vmovdqa	ymm12, YMMWORD PTR [r11+384]
+        add	rdx, 512
+        add	r8, 512
+        vpaddd	ymm12, ymm12, YMMWORD PTR [rdi]
+        sub	r9d, 512
+        vmovdqa	YMMWORD PTR [r11+384], ymm12
+        cmp	r9d, 512
+        jl	L_chacha20_avx2_done256
+        vmovdqa	ymm0, YMMWORD PTR [r11]
+        vmovdqa	ymm1, YMMWORD PTR [r11+32]
+        vmovdqa	ymm2, YMMWORD PTR [r11+64]
+        vmovdqa	ymm3, YMMWORD PTR [r11+96]
+        vmovdqa	ymm4, YMMWORD PTR [r11+128]
+        vmovdqa	ymm5, YMMWORD PTR [r11+160]
+        vmovdqa	ymm6, YMMWORD PTR [r11+192]
+        vmovdqa	ymm7, YMMWORD PTR [r11+224]
+        vmovdqa	ymm8, YMMWORD PTR [r11+256]
+        vmovdqa	ymm9, YMMWORD PTR [r11+288]
+        vmovdqa	ymm10, YMMWORD PTR [r11+320]
+        vmovdqa	ymm11, YMMWORD PTR [r11+352]
+        vmovdqa	ymm12, YMMWORD PTR [r11+384]
+        vmovdqa	ymm13, YMMWORD PTR [r11+416]
+        vmovdqa	ymm14, YMMWORD PTR [r11+448]
+        vmovdqa	ymm15, YMMWORD PTR [r11+480]
+        jmp	L_chacha20_avx2_start256
+L_chacha20_avx2_done256:
+        shl	eax, 3
+        add	DWORD PTR [rcx+48], eax
+L_chacha20_avx2_end256:
+        call	chacha_encrypt_avx1
+        vzeroupper
+        vmovdqu	xmm6, OWORD PTR [rsp+800]
+        vmovdqu	xmm7, OWORD PTR [rsp+816]
+        vmovdqu	xmm8, OWORD PTR [rsp+832]
+        vmovdqu	xmm9, OWORD PTR [rsp+848]
+        vmovdqu	xmm10, OWORD PTR [rsp+864]
+        vmovdqu	xmm11, OWORD PTR [rsp+880]
+        vmovdqu	xmm12, OWORD PTR [rsp+896]
+        vmovdqu	xmm13, OWORD PTR [rsp+912]
+        vmovdqu	xmm14, OWORD PTR [rsp+928]
+        vmovdqu	xmm15, OWORD PTR [rsp+944]
+        add	rsp, 960
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r13
+        pop	r12
+        ret
+chacha_encrypt_avx2 ENDP
+_text ENDS
+ENDIF
+END
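
The new file is the MASM/Win64 translation of the routines in chacha_asm.S: arguments arrive per the Windows x64 convention (rcx = ChaCha state, rdx = input, r8 = output, r9d = byte count), the non-volatile xmm6-xmm15 registers are saved to and restored from the stack around the AVX code, and the IF @Version guards at the top drop AVX2/MOVBE for assemblers that cannot encode them. A declaration sketch of how the C side would reference these routines, inferred from the register usage above; the real prototypes live in wolfcrypt/src/chacha.c and may differ:

    #include <stdint.h>

    typedef struct ChaCha ChaCha;   /* opaque here; the real struct is in
                                       wolfssl/wolfcrypt/chacha.h */

    extern void chacha_encrypt_x64(ChaCha* ctx, const uint8_t* m, uint8_t* c,
                                   uint32_t bytes);
    extern void chacha_encrypt_avx1(ChaCha* ctx, const uint8_t* m, uint8_t* c,
                                    uint32_t bytes);
    extern void chacha_encrypt_avx2(ChaCha* ctx, const uint8_t* m, uint8_t* c,
                                    uint32_t bytes);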

+ 41 - 41
wolfcrypt/src/fe_x25519_asm.S

@@ -1,6 +1,6 @@
 /* fe_x25519_asm.S */
 /*
- * Copyright (C) 2006-2023 wolfSSL Inc.
+ * Copyright (C) 2006-2024 wolfSSL Inc.
  *
  * This file is part of wolfSSL.
  *
@@ -4140,7 +4140,7 @@ _fe_sq2_x64:
         adcq	%r11, %r8
         adcq	%r12, %r9
         adcq	%r13, %r10
-        mov	%r10, %rax
+        movq	%r10, %rax
         shldq	$0x01, %r9, %r10
         shldq	$0x01, %r8, %r9
         shldq	$0x01, %rcx, %r8
@@ -5946,7 +5946,7 @@ _ge_p2_dbl_x64:
         adcq	%r13, %r10
         adcq	%r14, %r11
         adcq	%r15, %r12
-        mov	%r12, %rax
+        movq	%r12, %rax
         shldq	$0x01, %r11, %r12
         shldq	$0x01, %r10, %r11
         shldq	$0x01, %r9, %r10
@@ -8676,7 +8676,7 @@ _sc_reduce_x64:
         movq	$0xa7ed9ce5a30a2c13, %rcx
         movq	%r12, %rax
         mulq	%rcx
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         addq	%rax, %r8
         adcq	%rdx, %rbp
         movq	%r13, %rax
@@ -8689,7 +8689,7 @@ _sc_reduce_x64:
         addq	%rbp, %r9
         adcq	%rax, %r10
         adcq	%rdx, %r11
-        mov	$0x00, %rbx
+        movq	$0x00, %rbx
         adcq	$0x00, %rbx
         movq	%r15, %rax
         mulq	%rcx
@@ -8699,7 +8699,7 @@ _sc_reduce_x64:
         movq	$0xeb2106215d086329, %rcx
         movq	%r12, %rax
         mulq	%rcx
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         addq	%rax, %r9
         adcq	%rdx, %rbp
         movq	%r13, %rax
@@ -8712,7 +8712,7 @@ _sc_reduce_x64:
         addq	%rbp, %r10
         adcq	%rax, %r11
         adcq	%rdx, %rbx
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         adcq	$0x00, %rbp
         movq	%r15, %rax
         mulq	%rcx
@@ -8962,7 +8962,7 @@ _sc_muladd_x64:
         movq	$0xa7ed9ce5a30a2c13, %rbx
         movq	%r12, %rax
         mulq	%rbx
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         addq	%rax, %r8
         adcq	%rdx, %rbp
         movq	%r13, %rax
@@ -8975,7 +8975,7 @@ _sc_muladd_x64:
         addq	%rbp, %r9
         adcq	%rax, %r10
         adcq	%rdx, %r11
-        mov	$0x00, %rsi
+        movq	$0x00, %rsi
         adcq	$0x00, %rsi
         movq	%r15, %rax
         mulq	%rbx
@@ -8985,7 +8985,7 @@ _sc_muladd_x64:
         movq	$0xeb2106215d086329, %rbx
         movq	%r12, %rax
         mulq	%rbx
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         addq	%rax, %r9
         adcq	%rdx, %rbp
         movq	%r13, %rax
@@ -8998,7 +8998,7 @@ _sc_muladd_x64:
         addq	%rbp, %r10
         adcq	%rax, %r11
         adcq	%rdx, %rsi
-        mov	$0x00, %rbp
+        movq	$0x00, %rbp
         adcq	$0x00, %rbp
         movq	%r15, %rax
         mulq	%rbx
@@ -11371,7 +11371,7 @@ _fe_sq2_avx2:
         adcxq	%rax, %r10
         adoxq	%r14, %r11
         adcxq	%rcx, %r11
-        mov	%r11, %rax
+        movq	%r11, %rax
         shldq	$0x01, %r10, %r11
         shldq	$0x01, %r9, %r10
         shldq	$0x01, %r8, %r9
@@ -12862,7 +12862,7 @@ _ge_p2_dbl_avx2:
         adcxq	%r9, %r12
         adoxq	%rbx, %r13
         adcxq	%rcx, %r13
-        mov	%r13, %r9
+        movq	%r13, %r9
         shldq	$0x01, %r12, %r13
         shldq	$0x01, %r11, %r12
         shldq	$0x01, %r10, %r11
@@ -15206,33 +15206,33 @@ _sc_reduce_avx2:
         adcq	$0x00, %r15
         # Sub product of top 4 words and order
         movq	$0xa7ed9ce5a30a2c13, %rdx
-        mulx	%r12, %rcx, %rax
+        mulxq	%r12, %rcx, %rax
         addq	%rcx, %r8
         adcq	%rax, %r9
-        mulx	%r14, %rcx, %rax
+        mulxq	%r14, %rcx, %rax
         adcq	%rcx, %r10
         adcq	%rax, %r11
-        mov	$0x00, %rsi
+        movq	$0x00, %rsi
         adcq	$0x00, %rsi
-        mulx	%r13, %rcx, %rax
+        mulxq	%r13, %rcx, %rax
         addq	%rcx, %r9
         adcq	%rax, %r10
-        mulx	%r15, %rcx, %rax
+        mulxq	%r15, %rcx, %rax
         adcq	%rcx, %r11
         adcq	%rax, %rsi
         movq	$0xeb2106215d086329, %rdx
-        mulx	%r12, %rcx, %rax
+        mulxq	%r12, %rcx, %rax
         addq	%rcx, %r9
         adcq	%rax, %r10
-        mulx	%r14, %rcx, %rax
+        mulxq	%r14, %rcx, %rax
         adcq	%rcx, %r11
         adcq	%rax, %rsi
-        mov	$0x00, %rbx
+        movq	$0x00, %rbx
         adcq	$0x00, %rbx
-        mulx	%r13, %rcx, %rax
+        mulxq	%r13, %rcx, %rax
         addq	%rcx, %r10
         adcq	%rax, %r11
-        mulx	%r15, %rcx, %rax
+        mulxq	%r15, %rcx, %rax
         adcq	%rcx, %rsi
         adcq	%rax, %rbx
         subq	%r12, %r10
@@ -15265,21 +15265,21 @@ _sc_reduce_avx2:
         # Sub product of top 2 words and order
         #   * -5812631a5cf5d3ed
         movq	$0xa7ed9ce5a30a2c13, %rdx
-        mulx	%r12, %rbp, %rax
+        mulxq	%r12, %rbp, %rax
         movq	$0x00, %rsi
         addq	%rbp, %r8
         adcq	%rax, %r9
-        mulx	%r13, %rbp, %rax
+        mulxq	%r13, %rbp, %rax
         adcq	$0x00, %rsi
         addq	%rbp, %r9
         adcq	%rax, %rsi
         #   * -14def9dea2f79cd7
         movq	$0xeb2106215d086329, %rdx
-        mulx	%r12, %rbp, %rax
+        mulxq	%r12, %rbp, %rax
         movq	$0x00, %rbx
         addq	%rbp, %r9
         adcq	%rax, %r10
-        mulx	%r13, %rbp, %rax
+        mulxq	%r13, %rbp, %rax
         adcq	$0x00, %rbx
         addq	%rbp, %r10
         adcq	%rax, %rbx
@@ -15450,33 +15450,33 @@ _sc_muladd_avx2:
         adcq	$0x00, %rbp
         # Sub product of top 4 words and order
         movq	$0xa7ed9ce5a30a2c13, %rdx
-        mulx	%r14, %rcx, %rax
+        mulxq	%r14, %rcx, %rax
         addq	%rcx, %r10
         adcq	%rax, %r11
-        mulx	%rbx, %rcx, %rax
+        mulxq	%rbx, %rcx, %rax
         adcq	%rcx, %r12
         adcq	%rax, %r13
-        mov	$0x00, %rsi
+        movq	$0x00, %rsi
         adcq	$0x00, %rsi
-        mulx	%r15, %rcx, %rax
+        mulxq	%r15, %rcx, %rax
         addq	%rcx, %r11
         adcq	%rax, %r12
-        mulx	%rbp, %rcx, %rax
+        mulxq	%rbp, %rcx, %rax
         adcq	%rcx, %r13
         adcq	%rax, %rsi
         movq	$0xeb2106215d086329, %rdx
-        mulx	%r14, %rcx, %rax
+        mulxq	%r14, %rcx, %rax
         addq	%rcx, %r11
         adcq	%rax, %r12
-        mulx	%rbx, %rcx, %rax
+        mulxq	%rbx, %rcx, %rax
         adcq	%rcx, %r13
         adcq	%rax, %rsi
-        mov	$0x00, %r8
+        movq	$0x00, %r8
         adcq	$0x00, %r8
-        mulx	%r15, %rcx, %rax
+        mulxq	%r15, %rcx, %rax
         addq	%rcx, %r12
         adcq	%rax, %r13
-        mulx	%rbp, %rcx, %rax
+        mulxq	%rbp, %rcx, %rax
         adcq	%rcx, %rsi
         adcq	%rax, %r8
         subq	%r14, %r12
@@ -15509,21 +15509,21 @@ _sc_muladd_avx2:
         # Sub product of top 2 words and order
         #   * -5812631a5cf5d3ed
         movq	$0xa7ed9ce5a30a2c13, %rdx
-        mulx	%r14, %r9, %rax
+        mulxq	%r14, %r9, %rax
         movq	$0x00, %rsi
         addq	%r9, %r10
         adcq	%rax, %r11
-        mulx	%r15, %r9, %rax
+        mulxq	%r15, %r9, %rax
         adcq	$0x00, %rsi
         addq	%r9, %r11
         adcq	%rax, %rsi
         #   * -14def9dea2f79cd7
         movq	$0xeb2106215d086329, %rdx
-        mulx	%r14, %r9, %rax
+        mulxq	%r14, %r9, %rax
         movq	$0x00, %r8
         addq	%r9, %r11
         adcq	%rax, %r12
-        mulx	%r15, %r9, %rax
+        mulxq	%r15, %r9, %rax
         adcq	$0x00, %r8
         addq	%r9, %r12
         adcq	%rax, %r8

+ 2 - 0
wolfcrypt/src/include.am

@@ -15,6 +15,8 @@ EXTRA_DIST += wolfcrypt/src/evp.c
 EXTRA_DIST += wolfcrypt/src/asm.c
 EXTRA_DIST += wolfcrypt/src/aes_asm.asm
 EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
+EXTRA_DIST += wolfcrypt/src/chacha_asm.asm
+EXTRA_DIST += wolfcrypt/src/poly1305_asm.asm
 EXTRA_DIST += wolfcrypt/src/wc_dsp.c
 EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
 EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm

+ 15 - 13
wolfcrypt/src/poly1305.c

@@ -55,7 +55,7 @@ and Daniel J. Bernstein
     #pragma warning(disable: 4127)
 #endif
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     #include <emmintrin.h>
     #include <immintrin.h>
 
@@ -70,6 +70,10 @@ and Daniel J. Bernstein
     #elif defined(__clang__) && defined(NO_AVX2_SUPPORT)
         #undef NO_AVX2_SUPPORT
     #endif
+    #if defined(_MSC_VER) && (_MSC_VER <= 1900)
+        #undef  NO_AVX2_SUPPORT
+        #define NO_AVX2_SUPPORT
+    #endif
 
     #define HAVE_INTEL_AVX1
     #ifndef NO_AVX2_SUPPORT
@@ -77,13 +81,12 @@ and Daniel J. Bernstein
     #endif
 #endif
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
 static word32 intel_flags = 0;
 static word32 cpu_flags_set = 0;
 #endif
 
-#if (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) || \
-        defined(POLY130564)
+#if defined(USE_INTEL_POLY1305_SPEEDUP) || defined(POLY130564)
     #if defined(_MSC_VER)
         #define POLY1305_NOINLINE __declspec(noinline)
     #elif defined(__GNUC__)
@@ -123,7 +126,7 @@ static word32 cpu_flags_set = 0;
     #endif
 #endif
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
 #ifdef __cplusplus
     extern "C" {
 #endif
@@ -266,7 +269,7 @@ with a given ctx pointer to a Poly1305 structure.
 static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
                      size_t bytes)
 {
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     /* AVX2 is handled in wc_Poly1305Update. */
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     poly1305_blocks_avx(ctx, m, bytes);
@@ -400,7 +403,7 @@ number of bytes is less than the block size.
 */
 static int poly1305_block(Poly1305* ctx, const unsigned char *m)
 {
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     poly1305_block_avx(ctx, m);
@@ -415,8 +418,7 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m)
 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
 {
-#if defined(POLY130564) && \
-    !(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP))
+#if defined(POLY130564) && !defined(USE_INTEL_POLY1305_SPEEDUP)
     word64 t0,t1;
 #endif
 
@@ -437,7 +439,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
     if (keySz != 32 || ctx == NULL)
         return BAD_FUNC_ARG;
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     if (!cpu_flags_set) {
         intel_flags = cpuid_get_flags();
         cpu_flags_set = 1;
@@ -504,7 +506,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
 
 int wc_Poly1305Final(Poly1305* ctx, byte* mac)
 {
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
 #elif defined(POLY130564)
 
     word64 h0,h1,h2,c;
@@ -523,7 +525,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
     if (ctx == NULL || mac == NULL)
         return BAD_FUNC_ARG;
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags))
@@ -709,7 +711,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
     printf("\n");
 #endif
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags)) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);

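The poly1305.c hunks above are mostly mechanical: the compound guard defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) is collapsed into the single USE_INTEL_POLY1305_SPEEDUP macro (derived in wolfssl/wolfcrypt/poly1305.h further down), and NO_AVX2_SUPPORT is forced on for MSVC toolsets at or below _MSC_VER 1900 (VS2015 and older), presumably because those compilers cannot build the AVX2 path. A minimal sketch of the call-site pattern that results, with the bodies elided:

    /* Illustrative shape only -- not the actual wolfSSL source. */
    int poly1305_op_sketch(void)
    {
    #ifdef USE_INTEL_POLY1305_SPEEDUP
        /* AVX/AVX2 assembly path (poly1305_asm.S or poly1305_asm.asm) */
    #elif defined(POLY130564)
        /* portable 64-bit C path */
    #else
        /* portable 32-bit C path */
    #endif
        return 0;
    }
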
+ 1 - 1
wolfcrypt/src/poly1305_asm.S

@@ -1106,7 +1106,7 @@ L_poly1305_avx2_final_start_copy:
         incb	%cl
         incb	%dl
 L_poly1305_avx2_final_cmp_copy:
-        cmp	%rcx, %rax
+        cmpb	%cl, %al
         jne	L_poly1305_avx2_final_start_copy
 #ifndef __APPLE__
         callq	poly1305_final_avx@plt

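The one-line poly1305_asm.S change above touches the AVX2 finalisation path: the loop that moves the sub-16-byte remainder to the front of the internal buffer only increments the byte registers (incb %cl / incb %dl), so the termination test is tightened to compare bytes as well; the MASM port below uses the byte form (cmp al, cl) from the start. In C terms the copy amounts to something like the following sketch, where the names and call shape are illustrative:

    /* Move the unprocessed remainder to the start of the buffer. */
    static void poly1305_tail_copy_sketch(unsigned char* buf,
                                          unsigned long leftover)
    {
        unsigned char src = (unsigned char)(leftover & ~0xfUL);
        unsigned char dst = 0;
        while (src != (unsigned char)leftover)  /* byte-sized compare */
            buf[dst++] = buf[src++];
    }
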
+ 1060 - 0
wolfcrypt/src/poly1305_asm.asm

@@ -0,0 +1,1060 @@
+; /* poly1305_asm.asm */
+; /*
+;  * Copyright (C) 2006-2024 wolfSSL Inc.
+;  *
+;  * This file is part of wolfSSL.
+;  *
+;  * wolfSSL is free software; you can redistribute it and/or modify
+;  * it under the terms of the GNU General Public License as published by
+;  * the Free Software Foundation; either version 2 of the License, or
+;  * (at your option) any later version.
+;  *
+;  * wolfSSL is distributed in the hope that it will be useful,
+;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  * GNU General Public License for more details.
+;  *
+;  * You should have received a copy of the GNU General Public License
+;  * along with this program; if not, write to the Free Software
+;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+;  */
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+IFDEF HAVE_INTEL_AVX1
+_text SEGMENT READONLY PARA
+poly1305_setkey_avx PROC
+        push	r12
+        push	r13
+        mov	r12, 1152921487695413247
+        mov	r13, 1152921487695413244
+        mov	rax, QWORD PTR [rdx]
+        mov	r8, QWORD PTR [rdx+8]
+        mov	r9, QWORD PTR [rdx+16]
+        mov	r10, QWORD PTR [rdx+24]
+        and	rax, r12
+        and	r8, r13
+        mov	r12, rax
+        mov	r13, r8
+        xor	r11, r11
+        mov	QWORD PTR [rcx], rax
+        mov	QWORD PTR [rcx+8], r8
+        mov	QWORD PTR [rcx+24], r11
+        mov	QWORD PTR [rcx+32], r11
+        mov	QWORD PTR [rcx+40], r11
+        mov	QWORD PTR [rcx+48], r9
+        mov	QWORD PTR [rcx+56], r10
+        mov	QWORD PTR [rcx+352], r11
+        mov	QWORD PTR [rcx+408], r11
+        mov	QWORD PTR [rcx+360], rax
+        mov	QWORD PTR [rcx+416], r8
+        add	r12, rax
+        add	r13, r8
+        mov	QWORD PTR [rcx+368], r12
+        mov	QWORD PTR [rcx+424], r13
+        add	r12, rax
+        add	r13, r8
+        mov	QWORD PTR [rcx+376], r12
+        mov	QWORD PTR [rcx+432], r13
+        add	r12, rax
+        add	r13, r8
+        mov	QWORD PTR [rcx+384], r12
+        mov	QWORD PTR [rcx+440], r13
+        add	r12, rax
+        add	r13, r8
+        mov	QWORD PTR [rcx+392], r12
+        mov	QWORD PTR [rcx+448], r13
+        add	r12, rax
+        add	r13, r8
+        mov	QWORD PTR [rcx+400], r12
+        mov	QWORD PTR [rcx+456], r13
+        mov	QWORD PTR [rcx+608], r11
+        mov	BYTE PTR [rcx+616], 1
+        pop	r13
+        pop	r12
+        ret
+poly1305_setkey_avx ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+poly1305_block_avx PROC
+        push	r15
+        push	rbx
+        push	r12
+        push	r13
+        push	r14
+        mov	r15, QWORD PTR [rcx]
+        mov	rbx, QWORD PTR [rcx+8]
+        mov	r8, QWORD PTR [rcx+24]
+        mov	r9, QWORD PTR [rcx+32]
+        mov	r10, QWORD PTR [rcx+40]
+        xor	r14, r14
+        mov	r14b, BYTE PTR [rcx+616]
+        ; h += m
+        mov	r11, QWORD PTR [rdx]
+        mov	r12, QWORD PTR [rdx+8]
+        add	r8, r11
+        adc	r9, r12
+        mov	rax, rbx
+        adc	r10, r14
+        ; r[1] * h[0] => rdx, rax ==> t2, t1
+        mul	r8
+        mov	r12, rax
+        mov	r13, rdx
+        ; r[0] * h[1] => rdx, rax ++> t2, t1
+        mov	rax, r15
+        mul	r9
+        add	r12, rax
+        mov	rax, r15
+        adc	r13, rdx
+        ; r[0] * h[0] => rdx, rax ==> t4, t0
+        mul	r8
+        mov	r11, rax
+        mov	r8, rdx
+        ; r[1] * h[1] => rdx, rax =+> t3, t2
+        mov	rax, rbx
+        mul	r9
+        ;   r[0] * h[2] +> t2
+        add	r13, QWORD PTR [rcx+8*r10+352]
+        mov	r14, rdx
+        add	r12, r8
+        adc	r13, rax
+        ;   r[1] * h[2] +> t3
+        adc	r14, QWORD PTR [rcx+8*r10+408]
+        ; r * h in r14, r13, r12, r11
+        ; h = (r * h) mod 2^130 - 5
+        mov	r10, r13
+        and	r13, -4
+        and	r10, 3
+        add	r11, r13
+        mov	r8, r13
+        adc	r12, r14
+        adc	r10, 0
+        shrd	r8, r14, 2
+        shr	r14, 2
+        add	r8, r11
+        adc	r12, r14
+        mov	r9, r12
+        adc	r10, 0
+        ; h in r10, r9, r8
+        ; Store h to ctx
+        mov	QWORD PTR [rcx+24], r8
+        mov	QWORD PTR [rcx+32], r9
+        mov	QWORD PTR [rcx+40], r10
+        pop	r14
+        pop	r13
+        pop	r12
+        pop	rbx
+        pop	r15
+        ret
+poly1305_block_avx ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+poly1305_blocks_avx PROC
+        push	rdi
+        push	rsi
+        push	r15
+        push	rbx
+        push	r12
+        push	r13
+        push	r14
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	rcx, r8
+        mov	r15, QWORD PTR [rdi]
+        mov	rbx, QWORD PTR [rdi+8]
+        mov	r8, QWORD PTR [rdi+24]
+        mov	r9, QWORD PTR [rdi+32]
+        mov	r10, QWORD PTR [rdi+40]
+L_poly1305_avx_blocks_start:
+        ; h += m
+        mov	r11, QWORD PTR [rsi]
+        mov	r12, QWORD PTR [rsi+8]
+        add	r8, r11
+        adc	r9, r12
+        mov	rax, rbx
+        adc	r10, 0
+        ; r[1] * h[0] => rdx, rax ==> t2, t1
+        mul	r8
+        mov	r12, rax
+        mov	r13, rdx
+        ; r[0] * h[1] => rdx, rax ++> t2, t1
+        mov	rax, r15
+        mul	r9
+        add	r12, rax
+        mov	rax, r15
+        adc	r13, rdx
+        ; r[0] * h[0] => rdx, rax ==> t4, t0
+        mul	r8
+        mov	r11, rax
+        mov	r8, rdx
+        ; r[1] * h[1] => rdx, rax =+> t3, t2
+        mov	rax, rbx
+        mul	r9
+        ;   r[0] * h[2] +> t2
+        add	r13, QWORD PTR [rdi+8*r10+360]
+        mov	r14, rdx
+        add	r12, r8
+        adc	r13, rax
+        ;   r[1] * h[2] +> t3
+        adc	r14, QWORD PTR [rdi+8*r10+416]
+        ; r * h in r14, r13, r12, r11
+        ; h = (r * h) mod 2^130 - 5
+        mov	r10, r13
+        and	r13, -4
+        and	r10, 3
+        add	r11, r13
+        mov	r8, r13
+        adc	r12, r14
+        adc	r10, 0
+        shrd	r8, r14, 2
+        shr	r14, 2
+        add	r8, r11
+        adc	r12, r14
+        mov	r9, r12
+        adc	r10, 0
+        ; h in r10, r9, r8
+        ; Next block from message
+        add	rsi, 16
+        sub	rcx, 16
+        jg	L_poly1305_avx_blocks_start
+        ; Store h to ctx
+        mov	QWORD PTR [rdi+24], r8
+        mov	QWORD PTR [rdi+32], r9
+        mov	QWORD PTR [rdi+40], r10
+        pop	r14
+        pop	r13
+        pop	r12
+        pop	rbx
+        pop	r15
+        pop	rsi
+        pop	rdi
+        ret
+poly1305_blocks_avx ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+poly1305_final_avx PROC
+        push	rdi
+        push	rbx
+        push	r12
+        mov	rdi, rcx
+        mov	rbx, rdx
+        mov	rax, QWORD PTR [rdi+608]
+        test	rax, rax
+        je	L_poly1305_avx_final_no_more
+        mov	BYTE PTR [rdi+rax+480], 1
+        jmp	L_poly1305_avx_final_cmp_rem
+L_poly1305_avx_final_zero_rem:
+        mov	BYTE PTR [rdi+rax+480], 0
+L_poly1305_avx_final_cmp_rem:
+        inc	al
+        cmp	rax, 16
+        jl	L_poly1305_avx_final_zero_rem
+        mov	BYTE PTR [rdi+616], 0
+        lea	rdx, QWORD PTR [rdi+480]
+        call	poly1305_block_avx
+L_poly1305_avx_final_no_more:
+        mov	rax, QWORD PTR [rdi+24]
+        mov	rdx, QWORD PTR [rdi+32]
+        mov	rcx, QWORD PTR [rdi+40]
+        mov	r11, QWORD PTR [rdi+48]
+        mov	r12, QWORD PTR [rdi+56]
+        ; h %= p
+        ; h = (h + pad)
+        ; mod 2^130 - 5
+        mov	r8, rcx
+        and	rcx, 3
+        shr	r8, 2
+        ;   Multiply by 5
+        lea	r8, QWORD PTR [r8+4*r8+0]
+        add	rax, r8
+        adc	rdx, 0
+        adc	rcx, 0
+        ; Fixup when between (1 << 130) - 1 and (1 << 130) - 5
+        mov	r8, rax
+        mov	r9, rdx
+        mov	r10, rcx
+        add	r8, 5
+        adc	r9, 0
+        adc	r10, 0
+        cmp	r10, 4
+        cmove	rax, r8
+        cmove	rdx, r9
+        ; h += pad
+        add	rax, r11
+        adc	rdx, r12
+        mov	QWORD PTR [rbx], rax
+        mov	QWORD PTR [rbx+8], rdx
+        ; Zero out r
+        mov	QWORD PTR [rdi], 0
+        mov	QWORD PTR [rdi+8], 0
+        ; Zero out h
+        mov	QWORD PTR [rdi+24], 0
+        mov	QWORD PTR [rdi+32], 0
+        mov	QWORD PTR [rdi+40], 0
+        ; Zero out pad
+        mov	QWORD PTR [rdi+48], 0
+        mov	QWORD PTR [rdi+56], 0
+        pop	r12
+        pop	rbx
+        pop	rdi
+        ret
+poly1305_final_avx ENDP
+_text ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_text SEGMENT READONLY PARA
+poly1305_calc_powers_avx2 PROC
+        push	r12
+        push	r13
+        push	r14
+        push	r15
+        push	rdi
+        push	rsi
+        push	rbx
+        push	rbp
+        mov	r8, QWORD PTR [rcx]
+        mov	r9, QWORD PTR [rcx+8]
+        xor	r10, r10
+        ; Convert to 26 bits in 32
+        mov	rax, r8
+        mov	rdx, r8
+        mov	rsi, r8
+        mov	rbx, r9
+        mov	rbp, r9
+        shr	rdx, 26
+        shrd	rsi, r9, 52
+        shr	rbx, 14
+        shrd	rbp, r10, 40
+        and	rax, 67108863
+        and	rdx, 67108863
+        and	rsi, 67108863
+        and	rbx, 67108863
+        and	rbp, 67108863
+        mov	DWORD PTR [rcx+224], eax
+        mov	DWORD PTR [rcx+228], edx
+        mov	DWORD PTR [rcx+232], esi
+        mov	DWORD PTR [rcx+236], ebx
+        mov	DWORD PTR [rcx+240], ebp
+        mov	DWORD PTR [rcx+244], 0
+        ; Square 128-bit
+        mov	rax, r9
+        mul	r8
+        xor	r14, r14
+        mov	r12, rax
+        mov	r13, rdx
+        add	r12, rax
+        adc	r13, rdx
+        adc	r14, 0
+        mov	rax, r8
+        mul	rax
+        mov	r11, rax
+        mov	rdi, rdx
+        mov	rax, r9
+        mul	rax
+        add	r12, rdi
+        adc	r13, rax
+        adc	r14, rdx
+        ; Reduce 256-bit to 130-bit
+        mov	rax, r13
+        mov	rdx, r14
+        and	rax, -4
+        and	r13, 3
+        add	r11, rax
+        adc	r12, rdx
+        adc	r13, 0
+        shrd	rax, rdx, 2
+        shr	rdx, 2
+        add	r11, rax
+        adc	r12, rdx
+        adc	r13, 0
+        mov	rax, r13
+        shr	rax, 2
+        lea	rax, QWORD PTR [rax+4*rax+0]
+        and	r13, 3
+        add	r11, rax
+        adc	r12, 0
+        adc	r13, 0
+        ; Convert to 26 bits in 32
+        mov	rax, r11
+        mov	rdx, r11
+        mov	rsi, r11
+        mov	rbx, r12
+        mov	rbp, r12
+        shr	rdx, 26
+        shrd	rsi, r12, 52
+        shr	rbx, 14
+        shrd	rbp, r13, 40
+        and	rax, 67108863
+        and	rdx, 67108863
+        and	rsi, 67108863
+        and	rbx, 67108863
+        and	rbp, 67108863
+        mov	DWORD PTR [rcx+256], eax
+        mov	DWORD PTR [rcx+260], edx
+        mov	DWORD PTR [rcx+264], esi
+        mov	DWORD PTR [rcx+268], ebx
+        mov	DWORD PTR [rcx+272], ebp
+        mov	DWORD PTR [rcx+276], 0
+        ; Multiply 128-bit by 130-bit
+        ;   r1[0] * r2[0]
+        mov	rax, r8
+        mul	r11
+        mov	r14, rax
+        mov	r15, rdx
+        ;   r1[0] * r2[1]
+        mov	rax, r8
+        mul	r12
+        mov	rdi, 0
+        add	r15, rax
+        adc	rdi, rdx
+        ;   r1[1] * r2[0]
+        mov	rax, r9
+        mul	r11
+        mov	rsi, 0
+        add	r15, rax
+        adc	rdi, rdx
+        adc	rsi, 0
+        ;   r1[0] * r2[2]
+        mov	rax, r8
+        mul	r13
+        add	rdi, rax
+        adc	rsi, rdx
+        ;   r1[1] * r2[1]
+        mov	rax, r9
+        mul	r12
+        mov	rbx, 0
+        add	rdi, rax
+        adc	rsi, rdx
+        adc	rbx, 0
+        ;   r1[1] * r2[2]
+        mov	rax, r9
+        mul	r13
+        add	rsi, rax
+        adc	rbx, rdx
+        ; Reduce 260-bit to 130-bit
+        mov	rax, rdi
+        mov	rdx, rsi
+        mov	rbx, rbx
+        and	rax, -4
+        and	rdi, 3
+        add	r14, rax
+        adc	r15, rdx
+        adc	rdi, rbx
+        shrd	rax, rdx, 2
+        shrd	rdx, rbx, 2
+        shr	rbx, 2
+        add	r14, rax
+        adc	r15, rdx
+        adc	rdi, rbx
+        mov	rax, rdi
+        and	rdi, 3
+        shr	rax, 2
+        lea	rax, QWORD PTR [rax+4*rax+0]
+        add	r14, rax
+        adc	r15, 0
+        adc	rdi, 0
+        ; Convert to 26 bits in 32
+        mov	rax, r14
+        mov	rdx, r14
+        mov	rsi, r14
+        mov	rbx, r15
+        mov	rbp, r15
+        shr	rdx, 26
+        shrd	rsi, r15, 52
+        shr	rbx, 14
+        shrd	rbp, rdi, 40
+        and	rax, 67108863
+        and	rdx, 67108863
+        and	rsi, 67108863
+        and	rbx, 67108863
+        and	rbp, 67108863
+        mov	DWORD PTR [rcx+288], eax
+        mov	DWORD PTR [rcx+292], edx
+        mov	DWORD PTR [rcx+296], esi
+        mov	DWORD PTR [rcx+300], ebx
+        mov	DWORD PTR [rcx+304], ebp
+        mov	DWORD PTR [rcx+308], 0
+        ; Square 130-bit
+        mov	rax, r12
+        mul	r11
+        xor	r14, r14
+        mov	r9, rax
+        mov	r10, rdx
+        add	r9, rax
+        adc	r10, rdx
+        adc	r14, 0
+        mov	rax, r11
+        mul	rax
+        mov	r8, rax
+        mov	rdi, rdx
+        mov	rax, r12
+        mul	rax
+        add	r9, rdi
+        adc	r10, rax
+        adc	r14, rdx
+        mov	rax, r13
+        mul	rax
+        mov	r15, rax
+        mov	rax, r13
+        mul	r11
+        add	r10, rax
+        adc	r14, rdx
+        adc	r15, 0
+        add	r10, rax
+        adc	r14, rdx
+        adc	r15, 0
+        mov	rax, r13
+        mul	r12
+        add	r14, rax
+        adc	r15, rdx
+        add	r14, rax
+        adc	r15, rdx
+        ; Reduce 260-bit to 130-bit
+        mov	rax, r10
+        mov	rdx, r14
+        mov	rdi, r15
+        and	rax, -4
+        and	r10, 3
+        add	r8, rax
+        adc	r9, rdx
+        adc	r10, rdi
+        shrd	rax, rdx, 2
+        shrd	rdx, rdi, 2
+        shr	rdi, 2
+        add	r8, rax
+        adc	r9, rdx
+        adc	r10, rdi
+        mov	rax, r10
+        and	r10, 3
+        shr	rax, 2
+        lea	rax, QWORD PTR [rax+4*rax+0]
+        add	r8, rax
+        adc	r9, 0
+        adc	r10, 0
+        ; Convert to 26 bits in 32
+        mov	rax, r8
+        mov	rdx, r8
+        mov	rsi, r8
+        mov	rbx, r9
+        mov	rbp, r9
+        shr	rdx, 26
+        shrd	rsi, r9, 52
+        shr	rbx, 14
+        shrd	rbp, r10, 40
+        and	rax, 67108863
+        and	rdx, 67108863
+        and	rsi, 67108863
+        and	rbx, 67108863
+        and	rbp, 67108863
+        mov	DWORD PTR [rcx+320], eax
+        mov	DWORD PTR [rcx+324], edx
+        mov	DWORD PTR [rcx+328], esi
+        mov	DWORD PTR [rcx+332], ebx
+        mov	DWORD PTR [rcx+336], ebp
+        mov	DWORD PTR [rcx+340], 0
+        pop	rbp
+        pop	rbx
+        pop	rsi
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r13
+        pop	r12
+        ret
+poly1305_calc_powers_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+poly1305_setkey_avx2 PROC
+        call	poly1305_setkey_avx
+        vpxor	ymm0, ymm0, ymm0
+        vmovdqu	YMMWORD PTR [rcx+64], ymm0
+        vmovdqu	YMMWORD PTR [rcx+96], ymm0
+        vmovdqu	YMMWORD PTR [rcx+128], ymm0
+        vmovdqu	YMMWORD PTR [rcx+160], ymm0
+        vmovdqu	YMMWORD PTR [rcx+192], ymm0
+        mov	QWORD PTR [rcx+608], 0
+        mov	WORD PTR [rcx+616], 0
+        ret
+poly1305_setkey_avx2 ENDP
+_text ENDS
+_DATA SEGMENT
+ALIGN 16
+L_poly1305_avx2_blocks_mask QWORD 67108863, 67108863,
+    67108863, 67108863
+ptr_L_poly1305_avx2_blocks_mask QWORD L_poly1305_avx2_blocks_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_poly1305_avx2_blocks_hibit QWORD 16777216, 16777216,
+    16777216, 16777216
+ptr_L_poly1305_avx2_blocks_hibit QWORD L_poly1305_avx2_blocks_hibit
+_DATA ENDS
+_text SEGMENT READONLY PARA
+poly1305_blocks_avx2 PROC
+        push	r12
+        push	rdi
+        push	rsi
+        push	rbx
+        push	r13
+        push	r14
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	rdx, r8
+        sub	rsp, 480
+        vmovdqu	OWORD PTR [rsp+320], xmm6
+        vmovdqu	OWORD PTR [rsp+336], xmm7
+        vmovdqu	OWORD PTR [rsp+352], xmm8
+        vmovdqu	OWORD PTR [rsp+368], xmm9
+        vmovdqu	OWORD PTR [rsp+384], xmm10
+        vmovdqu	OWORD PTR [rsp+400], xmm11
+        vmovdqu	OWORD PTR [rsp+416], xmm12
+        vmovdqu	OWORD PTR [rsp+432], xmm13
+        vmovdqu	OWORD PTR [rsp+448], xmm14
+        vmovdqu	OWORD PTR [rsp+464], xmm15
+        mov	r13, QWORD PTR [ptr_L_poly1305_avx2_blocks_mask]
+        mov	r14, QWORD PTR [ptr_L_poly1305_avx2_blocks_hibit]
+        mov	rcx, rsp
+        and	rcx, -32
+        add	rcx, 32
+        vpxor	ymm15, ymm15, ymm15
+        mov	rbx, rcx
+        lea	rax, QWORD PTR [rdi+64]
+        add	rbx, 160
+        cmp	WORD PTR [rdi+616], 0
+        jne	L_poly1305_avx2_blocks_begin_h
+        ; Load the message data
+        vmovdqu	ymm0, YMMWORD PTR [rsi]
+        vmovdqu	ymm1, YMMWORD PTR [rsi+32]
+        vperm2i128	ymm2, ymm0, ymm1, 32
+        vperm2i128	ymm0, ymm0, ymm1, 49
+        vpunpckldq	ymm1, ymm2, ymm0
+        vpunpckhdq	ymm3, ymm2, ymm0
+        vpunpckldq	ymm0, ymm1, ymm15
+        vpunpckhdq	ymm1, ymm1, ymm15
+        vpunpckldq	ymm2, ymm3, ymm15
+        vpunpckhdq	ymm3, ymm3, ymm15
+        vmovdqu	ymm4, YMMWORD PTR [r14]
+        vpsllq	ymm1, ymm1, 6
+        vpsllq	ymm2, ymm2, 12
+        vpsllq	ymm3, ymm3, 18
+        vmovdqu	ymm14, YMMWORD PTR [r13]
+        ; Reduce, in place, the message data
+        vpsrlq	ymm10, ymm0, 26
+        vpsrlq	ymm11, ymm3, 26
+        vpand	ymm0, ymm0, ymm14
+        vpand	ymm3, ymm3, ymm14
+        vpaddq	ymm1, ymm10, ymm1
+        vpaddq	ymm4, ymm11, ymm4
+        vpsrlq	ymm10, ymm1, 26
+        vpsrlq	ymm11, ymm4, 26
+        vpand	ymm1, ymm1, ymm14
+        vpand	ymm4, ymm4, ymm14
+        vpaddq	ymm2, ymm10, ymm2
+        vpslld	ymm12, ymm11, 2
+        vpaddd	ymm12, ymm11, ymm12
+        vpsrlq	ymm10, ymm2, 26
+        vpaddq	ymm0, ymm12, ymm0
+        vpsrlq	ymm11, ymm0, 26
+        vpand	ymm2, ymm2, ymm14
+        vpand	ymm0, ymm0, ymm14
+        vpaddq	ymm3, ymm10, ymm3
+        vpaddq	ymm1, ymm11, ymm1
+        vpsrlq	ymm10, ymm3, 26
+        vpand	ymm3, ymm3, ymm14
+        vpaddq	ymm4, ymm10, ymm4
+        add	rsi, 64
+        sub	rdx, 64
+        jz	L_poly1305_avx2_blocks_store
+        jmp	L_poly1305_avx2_blocks_load_r4
+L_poly1305_avx2_blocks_begin_h:
+        ; Load the H values.
+        vmovdqu	ymm0, YMMWORD PTR [rax]
+        vmovdqu	ymm1, YMMWORD PTR [rax+32]
+        vmovdqu	ymm2, YMMWORD PTR [rax+64]
+        vmovdqu	ymm3, YMMWORD PTR [rax+96]
+        vmovdqu	ymm4, YMMWORD PTR [rax+128]
+        ; Check if there is a power of r to load - otherwise use r^4.
+        cmp	BYTE PTR [rdi+616], 0
+        je	L_poly1305_avx2_blocks_load_r4
+        ; Load the 4 powers of r - r^4, r^3, r^2, r^1.
+        vmovdqu	ymm8, YMMWORD PTR [rdi+224]
+        vmovdqu	ymm7, YMMWORD PTR [rdi+256]
+        vmovdqu	ymm6, YMMWORD PTR [rdi+288]
+        vmovdqu	ymm5, YMMWORD PTR [rdi+320]
+        vpermq	ymm5, ymm5, 216
+        vpermq	ymm6, ymm6, 216
+        vpermq	ymm7, ymm7, 216
+        vpermq	ymm8, ymm8, 216
+        vpunpcklqdq	ymm10, ymm5, ymm6
+        vpunpckhqdq	ymm11, ymm5, ymm6
+        vpunpcklqdq	ymm12, ymm7, ymm8
+        vpunpckhqdq	ymm13, ymm7, ymm8
+        vperm2i128	ymm5, ymm10, ymm12, 32
+        vperm2i128	ymm7, ymm10, ymm12, 49
+        vperm2i128	ymm9, ymm11, ymm13, 32
+        vpsrlq	ymm6, ymm5, 32
+        vpsrlq	ymm8, ymm7, 32
+        jmp	L_poly1305_avx2_blocks_mul_5
+L_poly1305_avx2_blocks_load_r4:
+        ; Load r^4 into all four positions.
+        vmovdqu	ymm13, YMMWORD PTR [rdi+320]
+        vpermq	ymm5, ymm13, 0
+        vpsrlq	ymm14, ymm13, 32
+        vpermq	ymm7, ymm13, 85
+        vpermq	ymm9, ymm13, 170
+        vpermq	ymm6, ymm14, 0
+        vpermq	ymm8, ymm14, 85
+L_poly1305_avx2_blocks_mul_5:
+        ; Multiply top 4 26-bit values of all four H by 5
+        vpslld	ymm10, ymm6, 2
+        vpslld	ymm11, ymm7, 2
+        vpslld	ymm12, ymm8, 2
+        vpslld	ymm13, ymm9, 2
+        vpaddq	ymm10, ymm6, ymm10
+        vpaddq	ymm11, ymm7, ymm11
+        vpaddq	ymm12, ymm8, ymm12
+        vpaddq	ymm13, ymm9, ymm13
+        ; Store powers of r and multiple of 5 for use in multiply.
+        vmovdqa	YMMWORD PTR [rbx], ymm10
+        vmovdqa	YMMWORD PTR [rbx+32], ymm11
+        vmovdqa	YMMWORD PTR [rbx+64], ymm12
+        vmovdqa	YMMWORD PTR [rbx+96], ymm13
+        vmovdqa	YMMWORD PTR [rcx], ymm5
+        vmovdqa	YMMWORD PTR [rcx+32], ymm6
+        vmovdqa	YMMWORD PTR [rcx+64], ymm7
+        vmovdqa	YMMWORD PTR [rcx+96], ymm8
+        vmovdqa	YMMWORD PTR [rcx+128], ymm9
+        vmovdqu	ymm14, YMMWORD PTR [r13]
+        ; If not finished then loop over data
+        cmp	BYTE PTR [rdi+616], 1
+        jne	L_poly1305_avx2_blocks_start
+        ; Do last multiply, reduce, add the four H together and move to
+        ; 32-bit registers
+        vpmuludq	ymm5, ymm4, [rbx]
+        vpmuludq	ymm10, ymm3, [rbx+32]
+        vpmuludq	ymm6, ymm4, [rbx+32]
+        vpmuludq	ymm11, ymm3, [rbx+64]
+        vpmuludq	ymm7, ymm4, [rbx+64]
+        vpaddq	ymm5, ymm10, ymm5
+        vpmuludq	ymm12, ymm2, [rbx+64]
+        vpmuludq	ymm8, ymm4, [rbx+96]
+        vpaddq	ymm6, ymm11, ymm6
+        vpmuludq	ymm13, ymm1, [rbx+96]
+        vpmuludq	ymm10, ymm2, [rbx+96]
+        vpaddq	ymm5, ymm12, ymm5
+        vpmuludq	ymm11, ymm3, [rbx+96]
+        vpmuludq	ymm12, ymm3, [rcx]
+        vpaddq	ymm5, ymm13, ymm5
+        vpmuludq	ymm9, ymm4, [rcx]
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm13, ymm0, [rcx]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm10, ymm1, [rcx]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm11, ymm2, [rcx]
+        vpmuludq	ymm12, ymm2, [rcx+32]
+        vpaddq	ymm5, ymm13, ymm5
+        vpmuludq	ymm13, ymm3, [rcx+32]
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm10, ymm0, [rcx+32]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm11, ymm1, [rcx+32]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm12, ymm1, [rcx+64]
+        vpaddq	ymm9, ymm13, ymm9
+        vpmuludq	ymm13, ymm2, [rcx+64]
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm10, ymm0, [rcx+64]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm11, ymm0, [rcx+96]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm12, ymm1, [rcx+96]
+        vpaddq	ymm9, ymm13, ymm9
+        vpaddq	ymm7, ymm10, ymm7
+        vpmuludq	ymm13, ymm0, [rcx+128]
+        vpaddq	ymm8, ymm11, ymm8
+        vpaddq	ymm9, ymm12, ymm9
+        vpaddq	ymm9, ymm13, ymm9
+        vpsrlq	ymm10, ymm5, 26
+        vpsrlq	ymm11, ymm8, 26
+        vpand	ymm5, ymm5, ymm14
+        vpand	ymm8, ymm8, ymm14
+        vpaddq	ymm6, ymm10, ymm6
+        vpaddq	ymm9, ymm11, ymm9
+        vpsrlq	ymm10, ymm6, 26
+        vpsrlq	ymm11, ymm9, 26
+        vpand	ymm1, ymm6, ymm14
+        vpand	ymm4, ymm9, ymm14
+        vpaddq	ymm7, ymm10, ymm7
+        vpslld	ymm12, ymm11, 2
+        vpaddd	ymm12, ymm11, ymm12
+        vpsrlq	ymm10, ymm7, 26
+        vpaddq	ymm5, ymm12, ymm5
+        vpsrlq	ymm11, ymm5, 26
+        vpand	ymm2, ymm7, ymm14
+        vpand	ymm0, ymm5, ymm14
+        vpaddq	ymm8, ymm10, ymm8
+        vpaddq	ymm1, ymm11, ymm1
+        vpsrlq	ymm10, ymm8, 26
+        vpand	ymm3, ymm8, ymm14
+        vpaddq	ymm4, ymm10, ymm4
+        vpsrldq	ymm5, ymm0, 8
+        vpsrldq	ymm6, ymm1, 8
+        vpsrldq	ymm7, ymm2, 8
+        vpsrldq	ymm8, ymm3, 8
+        vpsrldq	ymm9, ymm4, 8
+        vpaddq	ymm0, ymm5, ymm0
+        vpaddq	ymm1, ymm6, ymm1
+        vpaddq	ymm2, ymm7, ymm2
+        vpaddq	ymm3, ymm8, ymm3
+        vpaddq	ymm4, ymm9, ymm4
+        vpermq	ymm5, ymm0, 2
+        vpermq	ymm6, ymm1, 2
+        vpermq	ymm7, ymm2, 2
+        vpermq	ymm8, ymm3, 2
+        vpermq	ymm9, ymm4, 2
+        vpaddq	ymm0, ymm5, ymm0
+        vpaddq	ymm1, ymm6, ymm1
+        vpaddq	ymm2, ymm7, ymm2
+        vpaddq	ymm3, ymm8, ymm3
+        vpaddq	ymm4, ymm9, ymm4
+        vmovd	r8d, xmm0
+        vmovd	r9d, xmm1
+        vmovd	r10d, xmm2
+        vmovd	r11d, xmm3
+        vmovd	r12d, xmm4
+        jmp	L_poly1305_avx2_blocks_end_calc
+L_poly1305_avx2_blocks_start:
+        vmovdqu	ymm5, YMMWORD PTR [rsi]
+        vmovdqu	ymm6, YMMWORD PTR [rsi+32]
+        vperm2i128	ymm7, ymm5, ymm6, 32
+        vperm2i128	ymm5, ymm5, ymm6, 49
+        vpunpckldq	ymm6, ymm7, ymm5
+        vpunpckhdq	ymm8, ymm7, ymm5
+        vpunpckldq	ymm5, ymm6, ymm15
+        vpunpckhdq	ymm6, ymm6, ymm15
+        vpunpckldq	ymm7, ymm8, ymm15
+        vpunpckhdq	ymm8, ymm8, ymm15
+        vmovdqu	ymm9, YMMWORD PTR [r14]
+        vpsllq	ymm6, ymm6, 6
+        vpsllq	ymm7, ymm7, 12
+        vpsllq	ymm8, ymm8, 18
+        vpmuludq	ymm10, ymm4, [rbx]
+        vpaddq	ymm5, ymm10, ymm5
+        vpmuludq	ymm10, ymm3, [rbx+32]
+        vpmuludq	ymm11, ymm4, [rbx+32]
+        vpaddq	ymm6, ymm11, ymm6
+        vpmuludq	ymm11, ymm3, [rbx+64]
+        vpmuludq	ymm12, ymm4, [rbx+64]
+        vpaddq	ymm7, ymm12, ymm7
+        vpaddq	ymm5, ymm10, ymm5
+        vpmuludq	ymm12, ymm2, [rbx+64]
+        vpmuludq	ymm13, ymm4, [rbx+96]
+        vpaddq	ymm8, ymm13, ymm8
+        vpaddq	ymm6, ymm11, ymm6
+        vpmuludq	ymm13, ymm1, [rbx+96]
+        vpmuludq	ymm10, ymm2, [rbx+96]
+        vpaddq	ymm5, ymm12, ymm5
+        vpmuludq	ymm11, ymm3, [rbx+96]
+        vpmuludq	ymm12, ymm3, [rcx]
+        vpaddq	ymm5, ymm13, ymm5
+        vpmuludq	ymm13, ymm4, [rcx]
+        vpaddq	ymm9, ymm13, ymm9
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm13, ymm0, [rcx]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm10, ymm1, [rcx]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm11, ymm2, [rcx]
+        vpmuludq	ymm12, ymm2, [rcx+32]
+        vpaddq	ymm5, ymm13, ymm5
+        vpmuludq	ymm13, ymm3, [rcx+32]
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm10, ymm0, [rcx+32]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm11, ymm1, [rcx+32]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm12, ymm1, [rcx+64]
+        vpaddq	ymm9, ymm13, ymm9
+        vpmuludq	ymm13, ymm2, [rcx+64]
+        vpaddq	ymm6, ymm10, ymm6
+        vpmuludq	ymm10, ymm0, [rcx+64]
+        vpaddq	ymm7, ymm11, ymm7
+        vpmuludq	ymm11, ymm0, [rcx+96]
+        vpaddq	ymm8, ymm12, ymm8
+        vpmuludq	ymm12, ymm1, [rcx+96]
+        vpaddq	ymm9, ymm13, ymm9
+        vpaddq	ymm7, ymm10, ymm7
+        vpmuludq	ymm13, ymm0, [rcx+128]
+        vpaddq	ymm8, ymm11, ymm8
+        vpaddq	ymm9, ymm12, ymm9
+        vpaddq	ymm9, ymm13, ymm9
+        vpsrlq	ymm10, ymm5, 26
+        vpsrlq	ymm11, ymm8, 26
+        vpand	ymm5, ymm5, ymm14
+        vpand	ymm8, ymm8, ymm14
+        vpaddq	ymm6, ymm10, ymm6
+        vpaddq	ymm9, ymm11, ymm9
+        vpsrlq	ymm10, ymm6, 26
+        vpsrlq	ymm11, ymm9, 26
+        vpand	ymm1, ymm6, ymm14
+        vpand	ymm4, ymm9, ymm14
+        vpaddq	ymm7, ymm10, ymm7
+        vpslld	ymm12, ymm11, 2
+        vpaddd	ymm12, ymm11, ymm12
+        vpsrlq	ymm10, ymm7, 26
+        vpaddq	ymm5, ymm12, ymm5
+        vpsrlq	ymm11, ymm5, 26
+        vpand	ymm2, ymm7, ymm14
+        vpand	ymm0, ymm5, ymm14
+        vpaddq	ymm8, ymm10, ymm8
+        vpaddq	ymm1, ymm11, ymm1
+        vpsrlq	ymm10, ymm8, 26
+        vpand	ymm3, ymm8, ymm14
+        vpaddq	ymm4, ymm10, ymm4
+        add	rsi, 64
+        sub	rdx, 64
+        jnz	L_poly1305_avx2_blocks_start
+L_poly1305_avx2_blocks_store:
+        ; Store four H values - state
+        vmovdqu	YMMWORD PTR [rax], ymm0
+        vmovdqu	YMMWORD PTR [rax+32], ymm1
+        vmovdqu	YMMWORD PTR [rax+64], ymm2
+        vmovdqu	YMMWORD PTR [rax+96], ymm3
+        vmovdqu	YMMWORD PTR [rax+128], ymm4
+L_poly1305_avx2_blocks_end_calc:
+        cmp	BYTE PTR [rdi+616], 0
+        je	L_poly1305_avx2_blocks_complete
+        mov	rax, r8
+        mov	rdx, r10
+        mov	rcx, r12
+        shr	rdx, 12
+        shr	rcx, 24
+        shl	r9, 26
+        shl	r10, 52
+        shl	r11, 14
+        shl	r12, 40
+        add	rax, r9
+        adc	rax, r10
+        adc	rdx, r11
+        adc	rdx, r12
+        adc	rcx, 0
+        mov	r8, rcx
+        and	rcx, 3
+        shr	r8, 2
+        lea	r8, QWORD PTR [r8+4*r8+0]
+        add	rax, r8
+        adc	rdx, 0
+        adc	rcx, 0
+        mov	QWORD PTR [rdi+24], rax
+        mov	QWORD PTR [rdi+32], rdx
+        mov	QWORD PTR [rdi+40], rcx
+L_poly1305_avx2_blocks_complete:
+        mov	BYTE PTR [rdi+617], 1
+        vzeroupper
+        vmovdqu	xmm6, OWORD PTR [rsp+320]
+        vmovdqu	xmm7, OWORD PTR [rsp+336]
+        vmovdqu	xmm8, OWORD PTR [rsp+352]
+        vmovdqu	xmm9, OWORD PTR [rsp+368]
+        vmovdqu	xmm10, OWORD PTR [rsp+384]
+        vmovdqu	xmm11, OWORD PTR [rsp+400]
+        vmovdqu	xmm12, OWORD PTR [rsp+416]
+        vmovdqu	xmm13, OWORD PTR [rsp+432]
+        vmovdqu	xmm14, OWORD PTR [rsp+448]
+        vmovdqu	xmm15, OWORD PTR [rsp+464]
+        add	rsp, 480
+        pop	r14
+        pop	r13
+        pop	rbx
+        pop	rsi
+        pop	rdi
+        pop	r12
+        ret
+poly1305_blocks_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+poly1305_final_avx2 PROC
+        push	rdi
+        push	rsi
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	BYTE PTR [rdi+616], 1
+        mov	cl, BYTE PTR [rdi+617]
+        cmp	cl, 0
+        je	L_poly1305_avx2_final_done_blocks_X4
+        push	rsi
+        mov	r8, 64
+        xor	rdx, rdx
+        mov	rcx, rdi
+        call	poly1305_blocks_avx2
+        pop	rsi
+L_poly1305_avx2_final_done_blocks_X4:
+        mov	rax, QWORD PTR [rdi+608]
+        mov	rcx, rax
+        and	rcx, -16
+        cmp	cl, 0
+        je	L_poly1305_avx2_final_done_blocks
+        push	rcx
+        push	rax
+        push	rsi
+        mov	r8, rcx
+        lea	rdx, QWORD PTR [rdi+480]
+        mov	rcx, rdi
+        call	poly1305_blocks_avx
+        pop	rsi
+        pop	rax
+        pop	rcx
+L_poly1305_avx2_final_done_blocks:
+        sub	QWORD PTR [rdi+608], rcx
+        xor	rdx, rdx
+        jmp	L_poly1305_avx2_final_cmp_copy
+L_poly1305_avx2_final_start_copy:
+        mov	r8b, BYTE PTR [rdi+rcx+480]
+        mov	BYTE PTR [rdi+rdx+480], r8b
+        inc	cl
+        inc	dl
+L_poly1305_avx2_final_cmp_copy:
+        cmp	al, cl
+        jne	L_poly1305_avx2_final_start_copy
+        mov	rcx, rdi
+        mov	rdx, rsi
+        call	poly1305_final_avx
+        vpxor	ymm0, ymm0, ymm0
+        vmovdqu	YMMWORD PTR [rdi+64], ymm0
+        vmovdqu	YMMWORD PTR [rdi+96], ymm0
+        vmovdqu	YMMWORD PTR [rdi+128], ymm0
+        vmovdqu	YMMWORD PTR [rdi+160], ymm0
+        vmovdqu	YMMWORD PTR [rdi+192], ymm0
+        vmovdqu	YMMWORD PTR [rdi+224], ymm0
+        vmovdqu	YMMWORD PTR [rdi+256], ymm0
+        vmovdqu	YMMWORD PTR [rdi+288], ymm0
+        vmovdqu	YMMWORD PTR [rdi+320], ymm0
+        mov	QWORD PTR [rdi+608], 0
+        mov	WORD PTR [rdi+616], 0
+        vzeroupper
+        pop	rsi
+        pop	rdi
+        ret
+poly1305_final_avx2 ENDP
+_text ENDS
+ENDIF
+END
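
The comments inside poly1305_block_avx and poly1305_blocks_avx above spell out the per-block arithmetic: add the 16-byte message chunk, plus a 2^128 pad bit read from the context so the final partial block can omit it, into the 130-bit accumulator h; multiply by the clamped key r (the two masks loaded in poly1305_setkey_avx are the standard Poly1305 clamp); and reduce modulo 2^130 - 5 by folding everything above bit 130 back in, multiplied by 5. Below is a C sketch of one block in the same two-64-bit-limbs-plus-high-bits representation, using GCC/Clang's unsigned __int128; it illustrates the math only, is not the constant-time wolfSSL implementation, and assumes a little-endian target.

    #include <string.h>

    typedef unsigned long long u64;
    typedef unsigned __int128  u128;

    /* h = (h + m + pad*2^128) * r mod 2^130 - 5, with
     * h = h[0] + 2^64*h[1] + 2^128*h[2] and h[2] kept small. */
    static void poly1305_block_sketch(const u64 r[2], u64 h[3],
                                      const unsigned char m[16], u64 pad)
    {
        u64 m0, m1, d0, d1, d2, d3;
        u128 t, top;

        memcpy(&m0, m, 8);                 /* little-endian loads */
        memcpy(&m1, m + 8, 8);

        /* h += m (pad is 1 for a full block, 0 for the final partial one) */
        t = (u128)h[0] + m0;                          h[0] = (u64)t;
        t = (u128)h[1] + m1 + (u64)(t >> 64);         h[1] = (u64)t;
        h[2] += (u64)(t >> 64) + pad;

        /* d = r * h; h[2] is small, so the product fits in four words */
        t = (u128)r[0] * h[0];                                       d0 = (u64)t;
        t = (u128)r[0] * h[1] + (u128)r[1] * h[0] + (u64)(t >> 64);  d1 = (u64)t;
        t = (u128)r[1] * h[1] + (u128)r[0] * h[2] + (u64)(t >> 64);  d2 = (u64)t;
        d3 = (u64)((u128)r[1] * h[2] + (u64)(t >> 64));

        /* h = (d mod 2^130) + 5 * (d >> 130) */
        top  = ((u128)d3 << 62) | (d2 >> 2);          /* d >> 130 */
        top += top << 2;                              /* times 5  */
        h[2] = d2 & 3;
        t = (u128)d0 + (u64)top;                            h[0] = (u64)t;
        t = (u128)d1 + (u64)(top >> 64) + (u64)(t >> 64);   h[1] = (u64)t;
        h[2] += (u64)(t >> 64);
    }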

+ 0 - 8
wolfcrypt/test/test.c

@@ -3471,14 +3471,6 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t sha256_test(void)
         ERROR_OUT(WC_TEST_RET_ENC_EC(ret), exit);
     }
     if (XMEMCMP(hash, b.output, WC_SHA256_DIGEST_SIZE) != 0) {
-{
-    for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++)
-        fprintf(stderr, " %02x", hash[ii]);
-    fprintf(stderr, "\n");
-    for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++)
-        fprintf(stderr, " %02x", b.output[ii]);
-    fprintf(stderr, "\n");
-}
         ERROR_OUT(WC_TEST_RET_ENC_NC, exit);
     }
 #endif

+ 12 - 0
wolfssl.vcproj

@@ -199,6 +199,10 @@
 				RelativePath=".\wolfcrypt\src\aes_asm.asm"
 				>
 			</File>
+			<File
+				RelativePath=".\wolfcrypt\src\aes_gcm_asm.asm"
+				>
+			</File>
 			<File
 				RelativePath=".\wolfcrypt\src\arc4.c"
 				>
@@ -227,6 +231,10 @@
 				RelativePath=".\wolfcrypt\src\chacha.c"
 				>
 			</File>
+			<File
+				RelativePath=".\wolfcrypt\src\chacha_asm.asm"
+				>
+			</File>
 			<File
 				RelativePath=".\wolfcrypt\src\chacha20_poly1305.c"
 				>
@@ -331,6 +339,10 @@
 				RelativePath=".\wolfcrypt\src\poly1305.c"
 				>
 			</File>
+			<File
+				RelativePath=".\wolfcrypt\src\poly1305_asm.asm"
+				>
+			</File>
 			<File
 				RelativePath=".\wolfcrypt\src\pwdbased.c"
 				>

+ 28 - 0
wolfssl.vcxproj

@@ -384,6 +384,34 @@
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)%(Filename).obj</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
     </CustomBuild>
+    <CustomBuild Include="wolfcrypt\src\chacha_asm.asm">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+    </CustomBuild>
+    <CustomBuild Include="wolfcrypt\src\poly1305_asm.asm">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+    </CustomBuild>
     <CustomBuild Include="wolfcrypt\src\sp_x86_64_asm.asm">
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>

+ 1 - 1
wolfssl/wolfcrypt/chacha.h

@@ -77,7 +77,7 @@ enum {
 
 typedef struct ChaCha {
     word32 X[CHACHA_CHUNK_WORDS];           /* state of cipher */
-#ifdef HAVE_INTEL_AVX1
+#if defined(USE_INTEL_CHACHA_SPEEDUP)
     /* vpshufd reads 16 bytes but we only use bottom 4. */
     byte extra[12];
 #endif

+ 9 - 2
wolfssl/wolfcrypt/poly1305.h

@@ -48,7 +48,14 @@
 #define WC_HAS_GCC_4_4_64BIT
 #endif
 
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef WOLFSSL_X86_64_BUILD
+#if defined(USE_INTEL_SPEEDUP) && !defined(NO_POLY1305_ASM)
+    #define USE_INTEL_POLY1305_SPEEDUP
+    #define HAVE_INTEL_AVX1
+#endif
+#endif
+
+#if defined(USE_INTEL_POLY1305_SPEEDUP)
 #elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) ||  \
        defined(WC_HAS_GCC_4_4_64BIT))
 #define POLY130564
@@ -67,7 +74,7 @@ enum {
 
 /* Poly1305 state */
 typedef struct Poly1305 {
-#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+#ifdef USE_INTEL_POLY1305_SPEEDUP
     word64 r[3];
     word64 h[3];
     word64 pad[2];

Some files were not shown because too many files changed in this diff