|
@@ -0,0 +1,1426 @@
|
|
|
|
+; /* chacha_asm.asm */
|
|
|
|
+; /*
|
|
|
|
+; * Copyright (C) 2006-2024 wolfSSL Inc.
|
|
|
|
+; *
|
|
|
|
+; * This file is part of wolfSSL.
|
|
|
|
+; *
|
|
|
|
+; * wolfSSL is free software; you can redistribute it and/or modify
|
|
|
|
+; * it under the terms of the GNU General Public License as published by
|
|
|
|
+; * the Free Software Foundation; either version 2 of the License, or
|
|
|
|
+; * (at your option) any later version.
|
|
|
|
+; *
|
|
|
|
+; * wolfSSL is distributed in the hope that it will be useful,
|
|
|
|
+; * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
+; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
+; * GNU General Public License for more details.
|
|
|
|
+; *
|
|
|
|
+; * You should have received a copy of the GNU General Public License
|
|
|
|
+; * along with this program; if not, write to the Free Software
|
|
|
|
+; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
|
|
|
|
+; */
|
|
|
|
; ---------------------------------------------------------------------------
; Assembler capability and feature symbols.
;
; When the assembler reports @Version below 1200 it is treated as too old to
; recognise AVX2 and MOVBE instructions, so the corresponding NO_*_SUPPORT
; symbols are defined here unless the build already provided them.
; ---------------------------------------------------------------------------
IF @Version LT 1200
    IFNDEF NO_AVX2_SUPPORT
        NO_AVX2_SUPPORT = 1
    ENDIF
    IFNDEF NO_MOVBE_SUPPORT
        NO_MOVBE_SUPPORT = 1
    ENDIF
ENDIF

; The AVX1 code path is enabled unless the symbol already exists.
IFNDEF HAVE_INTEL_AVX1
    HAVE_INTEL_AVX1 = 1
ENDIF

; The AVX2 code path is enabled only when nothing above (or the build)
; defined NO_AVX2_SUPPORT.
IFNDEF NO_AVX2_SUPPORT
    HAVE_INTEL_AVX2 = 1
ENDIF

; Make sure _WIN64 is defined for the remainder of the file.
IFNDEF _WIN64
    _WIN64 = 1
ENDIF
|
|
|
|
+
|
|
|
|
; ---------------------------------------------------------------------------
; Helper macros for chacha_encrypt_x64.
;
; Register allocation shared by all of them (x0..x15 are the ChaCha words):
;   x0  = eax      x1  = ebx      x2  = edi      x3  = esi
;   x4  = r8d      x5  = r9d      x6  = r10d     x7  = r11d
;   x12 = r12d     x13 = r13d     x14 = r14d     x15 = r15d
;   x8..x11 live in the stack slots [rsp+8]..[rsp+20]; at any moment one
;   pair (x8/x9 or x10/x11) is held in edx/ebp and the other pair is parked.
;   rcx = context pointer and is never modified.
;
; x2/x3 must NOT share registers with x4/x5: r8d/r9d are already taken by
; x4/x5, so edi/esi are used and rdi/rsi are saved by the procedure.
; ---------------------------------------------------------------------------

; Two interleaved ChaCha quarter-rounds. (a0,b0,d0) and (a1,b1,d1) are the
; a/b/d words of each quarter-round; the c words are edx and ebp respectively.
CHACHA_X64_QR2 MACRO a0, b0, d0, a1, b1, d1
        add     a0, b0
        add     a1, b1
        xor     d0, a0
        xor     d1, a1
        rol     d0, 16
        rol     d1, 16
        add     edx, d0
        add     ebp, d1
        xor     b0, edx
        xor     b1, ebp
        rol     b0, 12
        rol     b1, 12
        add     a0, b0
        add     a1, b1
        xor     d0, a0
        xor     d1, a1
        rol     d0, 8
        rol     d1, 8
        add     edx, d0
        add     ebp, d1
        xor     b0, edx
        xor     b1, ebp
        rol     b0, 7
        rol     b1, 7
ENDM

; One double round (column round + diagonal round) over the whole state.
; Entry and exit: edx/ebp = x8/x9, [rsp+16]/[rsp+20] = x10/x11.
CHACHA_X64_DOUBLE_ROUND MACRO
        ; Columns (0,4,8,12) and (1,5,9,13)
        CHACHA_X64_QR2 eax, r8d, r12d, ebx, r9d, r13d
        ; Park x8/x9, fetch x10/x11
        mov     DWORD PTR [rsp+8], edx
        mov     DWORD PTR [rsp+12], ebp
        mov     edx, DWORD PTR [rsp+16]
        mov     ebp, DWORD PTR [rsp+20]
        ; Columns (2,6,10,14) and (3,7,11,15)
        CHACHA_X64_QR2 edi, r10d, r14d, esi, r11d, r15d
        ; Diagonals (0,5,10,15) and (1,6,11,12)
        CHACHA_X64_QR2 eax, r9d, r15d, ebx, r10d, r12d
        ; Park x10/x11, fetch x8/x9
        mov     DWORD PTR [rsp+16], edx
        mov     DWORD PTR [rsp+20], ebp
        mov     edx, DWORD PTR [rsp+8]
        mov     ebp, DWORD PTR [rsp+12]
        ; Diagonals (2,7,8,13) and (3,4,9,14)
        CHACHA_X64_QR2 edi, r11d, r13d, esi, r8d, r14d
ENDM

; Spill the three pointer/length arguments to the 48-byte local frame, copy
; x8..x11 to their stack slots, load the remaining state words into
; registers and arm the 10-iteration round counter at [rsp].
CHACHA_X64_BLOCK_SETUP MACRO
        mov     QWORD PTR [rsp+24], r8          ; output pointer
        mov     QWORD PTR [rsp+32], rdx         ; input pointer
        mov     QWORD PTR [rsp+40], r9          ; bytes remaining
        mov     rax, QWORD PTR [rcx+32]         ; x8, x9
        mov     rbx, QWORD PTR [rcx+40]         ; x10, x11
        mov     QWORD PTR [rsp+8], rax
        mov     QWORD PTR [rsp+16], rbx
        mov     eax, DWORD PTR [rcx]
        mov     ebx, DWORD PTR [rcx+4]
        mov     edi, DWORD PTR [rcx+8]
        mov     esi, DWORD PTR [rcx+12]
        mov     r8d, DWORD PTR [rcx+16]
        mov     r9d, DWORD PTR [rcx+20]
        mov     r10d, DWORD PTR [rcx+24]
        mov     r11d, DWORD PTR [rcx+28]
        mov     r12d, DWORD PTR [rcx+48]
        mov     r13d, DWORD PTR [rcx+52]
        mov     r14d, DWORD PTR [rcx+56]
        mov     r15d, DWORD PTR [rcx+60]
        mov     BYTE PTR [rsp], 10              ; 10 double rounds
        mov     edx, DWORD PTR [rsp+8]
        mov     ebp, DWORD PTR [rsp+12]
ENDM

; Feed-forward: add the original state words 0..7 and 12..15 from the
; context to the register-resident working words.
CHACHA_X64_ADD_INPUT MACRO
        add     eax, DWORD PTR [rcx]
        add     ebx, DWORD PTR [rcx+4]
        add     edi, DWORD PTR [rcx+8]
        add     esi, DWORD PTR [rcx+12]
        add     r8d, DWORD PTR [rcx+16]
        add     r9d, DWORD PTR [rcx+20]
        add     r10d, DWORD PTR [rcx+24]
        add     r11d, DWORD PTR [rcx+28]
        add     r12d, DWORD PTR [rcx+48]
        add     r13d, DWORD PTR [rcx+52]
        add     r14d, DWORD PTR [rcx+56]
        add     r15d, DWORD PTR [rcx+60]
ENDM

_text SEGMENT READONLY PARA
; ---------------------------------------------------------------------------
; chacha_encrypt_x64 - scalar ChaCha XOR-encrypt, Microsoft x64 ABI.
;
; In:   rcx = context. Bytes 0..63 are the sixteen 32-bit state words; the
;             word at offset 48 is incremented once per generated block.
;             When the length is not a multiple of 64 the final keystream
;             block is written to offset 80 and the number of its unused
;             bytes to offset 76.
;       rdx = input bytes
;       r8  = output bytes
;       r9d = byte count (treated as unsigned)
; Out:  no return value.
; Clobbers: rax, rdx, r8, r9, r10, r11, flags. All nonvolatile registers
;           used (rbx, rbp, rdi, rsi, r12-r15) are saved and restored.
;
; Presumed C prototype (confirm against the caller):
;   void chacha_encrypt_x64(ChaCha* ctx, const byte* m, byte* c, word32 bytes)
;
; NOTE(review): the procedure is declared without FRAME/unwind directives
; although it adjusts rsp and saves nonvolatile registers; confirm whether
; unwind data is required for this build.
; ---------------------------------------------------------------------------
chacha_encrypt_x64 PROC
        push    rbx
        push    rbp
        push    r12
        push    r13
        push    r14
        push    r15
        push    rdi                             ; nonvolatile on Win64; holds x2
        push    rsi                             ; nonvolatile on Win64; holds x3
        sub     rsp, 64
        cmp     r9d, 64
        jb      L_chacha_x64_small              ; unsigned: length is a byte count

        ; ---- whole 64-byte blocks -----------------------------------------
L_chacha_x64_start:
        sub     rsp, 48
        CHACHA_X64_BLOCK_SETUP
L_chacha_x64_block_crypt_start:
        CHACHA_X64_DOUBLE_ROUND
        dec     BYTE PTR [rsp]
        jnz     L_chacha_x64_block_crypt_start

        ; Park x8/x9 so edx/ebp can carry the input/output pointers.
        mov     DWORD PTR [rsp+8], edx
        mov     DWORD PTR [rsp+12], ebp
        mov     rdx, QWORD PTR [rsp+32]         ; rdx = input
        mov     rbp, QWORD PTR [rsp+24]         ; rbp = output
        CHACHA_X64_ADD_INPUT

        ; output[0..31, 48..63] = keystream ^ input
        xor     eax, DWORD PTR [rdx]
        xor     ebx, DWORD PTR [rdx+4]
        xor     edi, DWORD PTR [rdx+8]
        xor     esi, DWORD PTR [rdx+12]
        xor     r8d, DWORD PTR [rdx+16]
        xor     r9d, DWORD PTR [rdx+20]
        xor     r10d, DWORD PTR [rdx+24]
        xor     r11d, DWORD PTR [rdx+28]
        xor     r12d, DWORD PTR [rdx+48]
        xor     r13d, DWORD PTR [rdx+52]
        xor     r14d, DWORD PTR [rdx+56]
        xor     r15d, DWORD PTR [rdx+60]
        mov     DWORD PTR [rbp], eax
        mov     DWORD PTR [rbp+4], ebx
        mov     DWORD PTR [rbp+8], edi
        mov     DWORD PTR [rbp+12], esi
        mov     DWORD PTR [rbp+16], r8d
        mov     DWORD PTR [rbp+20], r9d
        mov     DWORD PTR [rbp+24], r10d
        mov     DWORD PTR [rbp+28], r11d
        mov     DWORD PTR [rbp+48], r12d
        mov     DWORD PTR [rbp+52], r13d
        mov     DWORD PTR [rbp+56], r14d
        mov     DWORD PTR [rbp+60], r15d

        ; Words 8..11 come back from the stack; x0..x3 registers are free now.
        mov     eax, DWORD PTR [rsp+8]
        mov     ebx, DWORD PTR [rsp+12]
        mov     edi, DWORD PTR [rsp+16]
        mov     esi, DWORD PTR [rsp+20]
        add     eax, DWORD PTR [rcx+32]
        add     ebx, DWORD PTR [rcx+36]
        add     edi, DWORD PTR [rcx+40]
        add     esi, DWORD PTR [rcx+44]
        xor     eax, DWORD PTR [rdx+32]
        xor     ebx, DWORD PTR [rdx+36]
        xor     edi, DWORD PTR [rdx+40]
        xor     esi, DWORD PTR [rdx+44]
        mov     DWORD PTR [rbp+32], eax
        mov     DWORD PTR [rbp+36], ebx
        mov     DWORD PTR [rbp+40], edi
        mov     DWORD PTR [rbp+44], esi

        ; Restore output pointer and length, bump the block counter, advance.
        mov     r8, QWORD PTR [rsp+24]
        mov     r9, QWORD PTR [rsp+40]
        add     DWORD PTR [rcx+48], 1
        add     rsp, 48
        sub     r9d, 64
        add     rdx, 64
        add     r8, 64
        cmp     r9d, 64
        jae     L_chacha_x64_start

        ; ---- trailing partial block (1..63 bytes) -------------------------
L_chacha_x64_small:
        test    r9d, r9d
        jz      L_chacha_x64_done
        sub     rsp, 48
        CHACHA_X64_BLOCK_SETUP
L_chacha_x64_partial_crypt_start:
        CHACHA_X64_DOUBLE_ROUND
        dec     BYTE PTR [rsp]
        jnz     L_chacha_x64_partial_crypt_start

        mov     DWORD PTR [rsp+8], edx
        mov     DWORD PTR [rsp+12], ebp
        mov     rdx, QWORD PTR [rsp+32]         ; rdx = input
        CHACHA_X64_ADD_INPUT

        ; The full keystream block is stored in the context at offset 80.
        lea     rbp, QWORD PTR [rcx+80]
        mov     DWORD PTR [rbp], eax
        mov     DWORD PTR [rbp+4], ebx
        mov     DWORD PTR [rbp+8], edi
        mov     DWORD PTR [rbp+12], esi
        mov     DWORD PTR [rbp+16], r8d
        mov     DWORD PTR [rbp+20], r9d
        mov     DWORD PTR [rbp+24], r10d
        mov     DWORD PTR [rbp+28], r11d
        mov     DWORD PTR [rbp+48], r12d
        mov     DWORD PTR [rbp+52], r13d
        mov     DWORD PTR [rbp+56], r14d
        mov     DWORD PTR [rbp+60], r15d
        mov     eax, DWORD PTR [rsp+8]
        mov     ebx, DWORD PTR [rsp+12]
        mov     edi, DWORD PTR [rsp+16]
        mov     esi, DWORD PTR [rsp+20]
        add     eax, DWORD PTR [rcx+32]
        add     ebx, DWORD PTR [rcx+36]
        add     edi, DWORD PTR [rcx+40]
        add     esi, DWORD PTR [rcx+44]
        mov     DWORD PTR [rbp+32], eax
        mov     DWORD PTR [rbp+36], ebx
        mov     DWORD PTR [rbp+40], edi
        mov     DWORD PTR [rbp+44], esi

        mov     r8, QWORD PTR [rsp+24]          ; r8 = output
        mov     r9, QWORD PTR [rsp+40]          ; r9d = bytes remaining
        add     DWORD PTR [rcx+48], 1
        add     rsp, 48

        ; XOR the first (len mod 8) bytes one at a time, the rest 8 at a
        ; time. r10d holds the byte-loop limit; r8 must stay the output
        ; pointer, so it cannot double as the counter.
        mov     r10d, r9d
        xor     ebx, ebx                        ; rbx = bytes done
        and     r10d, 7
        jz      L_chacha_x64_partial_start64
L_chacha_x64_partial_start8:
        movzx   eax, BYTE PTR [rbp+rbx]
        xor     al, BYTE PTR [rdx+rbx]
        mov     BYTE PTR [r8+rbx], al
        inc     ebx
        cmp     ebx, r10d
        jne     L_chacha_x64_partial_start8
        jmp     L_chacha_x64_partial_end64
L_chacha_x64_partial_start64:
        mov     rax, QWORD PTR [rbp+rbx]
        xor     rax, QWORD PTR [rdx+rbx]
        mov     QWORD PTR [r8+rbx], rax
        add     ebx, 8
L_chacha_x64_partial_end64:
        cmp     ebx, r9d
        jne     L_chacha_x64_partial_start64

        ; Record how many keystream bytes of the stored block are unused.
        mov     r9d, 64
        sub     r9d, ebx
        mov     DWORD PTR [rcx+76], r9d

L_chacha_x64_done:
        add     rsp, 64
        pop     rsi
        pop     rdi
        pop     r15
        pop     r14
        pop     r13
        pop     r12
        pop     rbp
        pop     rbx
        ret
chacha_encrypt_x64 ENDP
_text ENDS
|
|
|
|
+IFDEF HAVE_INTEL_AVX1
|
|
|
|
; ---------------------------------------------------------------------------
; Constant tables for chacha_encrypt_avx1. Each 16-byte table is followed by
; a QWORD holding its address, which the code loads into a register.
; ---------------------------------------------------------------------------
_DATA SEGMENT
ALIGN 16
; Byte-shuffle mask: rotate every 32-bit lane left by 8 bits.
L_chacha20_avx1_rotl8 QWORD 0605040702010003h, 0E0D0C0F0A09080Bh
ptr_L_chacha20_avx1_rotl8 QWORD L_chacha20_avx1_rotl8
ALIGN 16
; Byte-shuffle mask: rotate every 32-bit lane left by 16 bits.
L_chacha20_avx1_rotl16 QWORD 0504070601000302h, 0D0C0F0E09080B0Ah
ptr_L_chacha20_avx1_rotl16 QWORD L_chacha20_avx1_rotl16
ALIGN 16
; 32-bit lanes 0, 1, 2, 3: added to the broadcast state word 12.
L_chacha20_avx1_add QWORD 0000000100000000h, 0000000300000002h
ptr_L_chacha20_avx1_add QWORD L_chacha20_avx1_add
ALIGN 16
; 32-bit lanes 4, 4, 4, 4: added to the word-12 lanes after each
; 256-byte iteration.
L_chacha20_avx1_four QWORD 0000000400000004h, 0000000400000004h
ptr_L_chacha20_avx1_four QWORD L_chacha20_avx1_four
_DATA ENDS
|
|
|
|
+_text SEGMENT READONLY PARA
|
|
|
|
+chacha_encrypt_avx1 PROC
|
|
|
|
+ push r12
|
|
|
|
+ push r13
|
|
|
|
+ push r14
|
|
|
|
+ push r15
|
|
|
|
+ push rdi
|
|
|
|
+ push rsi
|
|
|
|
+ sub rsp, 560
|
|
|
|
+ vmovdqu OWORD PTR [rsp+400], xmm6
|
|
|
|
+ vmovdqu OWORD PTR [rsp+416], xmm7
|
|
|
|
+ vmovdqu OWORD PTR [rsp+432], xmm8
|
|
|
|
+ vmovdqu OWORD PTR [rsp+448], xmm9
|
|
|
|
+ vmovdqu OWORD PTR [rsp+464], xmm10
|
|
|
|
+ vmovdqu OWORD PTR [rsp+480], xmm11
|
|
|
|
+ vmovdqu OWORD PTR [rsp+496], xmm12
|
|
|
|
+ vmovdqu OWORD PTR [rsp+512], xmm13
|
|
|
|
+ vmovdqu OWORD PTR [rsp+528], xmm14
|
|
|
|
+ vmovdqu OWORD PTR [rsp+544], xmm15
|
|
|
|
+ mov r11, rsp
|
|
|
|
+ lea r12, QWORD PTR [rsp+256]
|
|
|
|
+ mov r14, QWORD PTR [ptr_L_chacha20_avx1_rotl8]
|
|
|
|
+ mov r15, QWORD PTR [ptr_L_chacha20_avx1_rotl16]
|
|
|
|
+ mov rdi, QWORD PTR [ptr_L_chacha20_avx1_add]
|
|
|
|
+ mov rsi, QWORD PTR [ptr_L_chacha20_avx1_four]
|
|
|
|
+ add r11, 15
|
|
|
|
+ add r12, 15
|
|
|
|
+ and r11, -16
|
|
|
|
+ and r12, -16
|
|
|
|
+ mov eax, r9d
|
|
|
|
+ shr eax, 8
|
|
|
|
+ jz L_chacha20_avx1_end128
|
|
|
|
+ vpshufd xmm0, [rcx], 0
|
|
|
|
+ vpshufd xmm1, [rcx+4], 0
|
|
|
|
+ vpshufd xmm2, [rcx+8], 0
|
|
|
|
+ vpshufd xmm3, [rcx+12], 0
|
|
|
|
+ vpshufd xmm4, [rcx+16], 0
|
|
|
|
+ vpshufd xmm5, [rcx+20], 0
|
|
|
|
+ vpshufd xmm6, [rcx+24], 0
|
|
|
|
+ vpshufd xmm7, [rcx+28], 0
|
|
|
|
+ vpshufd xmm8, [rcx+32], 0
|
|
|
|
+ vpshufd xmm9, [rcx+36], 0
|
|
|
|
+ vpshufd xmm10, [rcx+40], 0
|
|
|
|
+ vpshufd xmm11, [rcx+44], 0
|
|
|
|
+ vpshufd xmm12, [rcx+48], 0
|
|
|
|
+ vpshufd xmm13, [rcx+52], 0
|
|
|
|
+ vpshufd xmm14, [rcx+56], 0
|
|
|
|
+ vpshufd xmm15, [rcx+60], 0
|
|
|
|
+ vpaddd xmm12, xmm12, OWORD PTR [rdi]
|
|
|
|
+ vmovdqa OWORD PTR [r11], xmm0
|
|
|
|
+ vmovdqa OWORD PTR [r11+16], xmm1
|
|
|
|
+ vmovdqa OWORD PTR [r11+32], xmm2
|
|
|
|
+ vmovdqa OWORD PTR [r11+48], xmm3
|
|
|
|
+ vmovdqa OWORD PTR [r11+64], xmm4
|
|
|
|
+ vmovdqa OWORD PTR [r11+80], xmm5
|
|
|
|
+ vmovdqa OWORD PTR [r11+96], xmm6
|
|
|
|
+ vmovdqa OWORD PTR [r11+112], xmm7
|
|
|
|
+ vmovdqa OWORD PTR [r11+128], xmm8
|
|
|
|
+ vmovdqa OWORD PTR [r11+144], xmm9
|
|
|
|
+ vmovdqa OWORD PTR [r11+160], xmm10
|
|
|
|
+ vmovdqa OWORD PTR [r11+176], xmm11
|
|
|
|
+ vmovdqa OWORD PTR [r11+192], xmm12
|
|
|
|
+ vmovdqa OWORD PTR [r11+208], xmm13
|
|
|
|
+ vmovdqa OWORD PTR [r11+224], xmm14
|
|
|
|
+ vmovdqa OWORD PTR [r11+240], xmm15
|
|
|
|
+L_chacha20_avx1_start128:
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ mov r10b, 10
|
|
|
|
+L_chacha20_avx1_loop128:
|
|
|
|
+ vpaddd xmm0, xmm0, xmm4
|
|
|
|
+ vpxor xmm12, xmm12, xmm0
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r12+48]
|
|
|
|
+ vpshufb xmm12, xmm12, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm8, xmm8, xmm12
|
|
|
|
+ vpxor xmm4, xmm4, xmm8
|
|
|
|
+ vpaddd xmm1, xmm1, xmm5
|
|
|
|
+ vpxor xmm13, xmm13, xmm1
|
|
|
|
+ vpshufb xmm13, xmm13, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm9, xmm9, xmm13
|
|
|
|
+ vpxor xmm5, xmm5, xmm9
|
|
|
|
+ vpaddd xmm2, xmm2, xmm6
|
|
|
|
+ vpxor xmm14, xmm14, xmm2
|
|
|
|
+ vpshufb xmm14, xmm14, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm10, xmm10, xmm14
|
|
|
|
+ vpxor xmm6, xmm6, xmm10
|
|
|
|
+ vpaddd xmm3, xmm3, xmm7
|
|
|
|
+ vpxor xmm15, xmm15, xmm3
|
|
|
|
+ vpshufb xmm15, xmm15, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm11, xmm11, xmm15
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ vpsrld xmm11, xmm4, 20
|
|
|
|
+ vpslld xmm4, xmm4, 12
|
|
|
|
+ vpxor xmm4, xmm4, xmm11
|
|
|
|
+ vpsrld xmm11, xmm5, 20
|
|
|
|
+ vpslld xmm5, xmm5, 12
|
|
|
|
+ vpxor xmm5, xmm5, xmm11
|
|
|
|
+ vpsrld xmm11, xmm6, 20
|
|
|
|
+ vpslld xmm6, xmm6, 12
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpsrld xmm11, xmm7, 20
|
|
|
|
+ vpslld xmm7, xmm7, 12
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vpaddd xmm0, xmm0, xmm4
|
|
|
|
+ vpxor xmm12, xmm12, xmm0
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r12+48]
|
|
|
|
+ vpshufb xmm12, xmm12, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm8, xmm8, xmm12
|
|
|
|
+ vpxor xmm4, xmm4, xmm8
|
|
|
|
+ vpaddd xmm1, xmm1, xmm5
|
|
|
|
+ vpxor xmm13, xmm13, xmm1
|
|
|
|
+ vpshufb xmm13, xmm13, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm9, xmm9, xmm13
|
|
|
|
+ vpxor xmm5, xmm5, xmm9
|
|
|
|
+ vpaddd xmm2, xmm2, xmm6
|
|
|
|
+ vpxor xmm14, xmm14, xmm2
|
|
|
|
+ vpshufb xmm14, xmm14, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm10, xmm10, xmm14
|
|
|
|
+ vpxor xmm6, xmm6, xmm10
|
|
|
|
+ vpaddd xmm3, xmm3, xmm7
|
|
|
|
+ vpxor xmm15, xmm15, xmm3
|
|
|
|
+ vpshufb xmm15, xmm15, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm11, xmm11, xmm15
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ vpsrld xmm11, xmm4, 25
|
|
|
|
+ vpslld xmm4, xmm4, 7
|
|
|
|
+ vpxor xmm4, xmm4, xmm11
|
|
|
|
+ vpsrld xmm11, xmm5, 25
|
|
|
|
+ vpslld xmm5, xmm5, 7
|
|
|
|
+ vpxor xmm5, xmm5, xmm11
|
|
|
|
+ vpsrld xmm11, xmm6, 25
|
|
|
|
+ vpslld xmm6, xmm6, 7
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpsrld xmm11, xmm7, 25
|
|
|
|
+ vpslld xmm7, xmm7, 7
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vpaddd xmm0, xmm0, xmm5
|
|
|
|
+ vpxor xmm15, xmm15, xmm0
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r12+48]
|
|
|
|
+ vpshufb xmm15, xmm15, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm10, xmm10, xmm15
|
|
|
|
+ vpxor xmm5, xmm5, xmm10
|
|
|
|
+ vpaddd xmm1, xmm1, xmm6
|
|
|
|
+ vpxor xmm12, xmm12, xmm1
|
|
|
|
+ vpshufb xmm12, xmm12, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm11, xmm11, xmm12
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpaddd xmm2, xmm2, xmm7
|
|
|
|
+ vpxor xmm13, xmm13, xmm2
|
|
|
|
+ vpshufb xmm13, xmm13, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm8, xmm8, xmm13
|
|
|
|
+ vpxor xmm7, xmm7, xmm8
|
|
|
|
+ vpaddd xmm3, xmm3, xmm4
|
|
|
|
+ vpxor xmm14, xmm14, xmm3
|
|
|
|
+ vpshufb xmm14, xmm14, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm9, xmm9, xmm14
|
|
|
|
+ vpxor xmm4, xmm4, xmm9
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ vpsrld xmm11, xmm5, 20
|
|
|
|
+ vpslld xmm5, xmm5, 12
|
|
|
|
+ vpxor xmm5, xmm5, xmm11
|
|
|
|
+ vpsrld xmm11, xmm6, 20
|
|
|
|
+ vpslld xmm6, xmm6, 12
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpsrld xmm11, xmm7, 20
|
|
|
|
+ vpslld xmm7, xmm7, 12
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vpsrld xmm11, xmm4, 20
|
|
|
|
+ vpslld xmm4, xmm4, 12
|
|
|
|
+ vpxor xmm4, xmm4, xmm11
|
|
|
|
+ vpaddd xmm0, xmm0, xmm5
|
|
|
|
+ vpxor xmm15, xmm15, xmm0
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r12+48]
|
|
|
|
+ vpshufb xmm15, xmm15, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm10, xmm10, xmm15
|
|
|
|
+ vpxor xmm5, xmm5, xmm10
|
|
|
|
+ vpaddd xmm1, xmm1, xmm6
|
|
|
|
+ vpxor xmm12, xmm12, xmm1
|
|
|
|
+ vpshufb xmm12, xmm12, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm11, xmm11, xmm12
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpaddd xmm2, xmm2, xmm7
|
|
|
|
+ vpxor xmm13, xmm13, xmm2
|
|
|
|
+ vpshufb xmm13, xmm13, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm8, xmm8, xmm13
|
|
|
|
+ vpxor xmm7, xmm7, xmm8
|
|
|
|
+ vpaddd xmm3, xmm3, xmm4
|
|
|
|
+ vpxor xmm14, xmm14, xmm3
|
|
|
|
+ vpshufb xmm14, xmm14, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm9, xmm9, xmm14
|
|
|
|
+ vpxor xmm4, xmm4, xmm9
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ vpsrld xmm11, xmm5, 25
|
|
|
|
+ vpslld xmm5, xmm5, 7
|
|
|
|
+ vpxor xmm5, xmm5, xmm11
|
|
|
|
+ vpsrld xmm11, xmm6, 25
|
|
|
|
+ vpslld xmm6, xmm6, 7
|
|
|
|
+ vpxor xmm6, xmm6, xmm11
|
|
|
|
+ vpsrld xmm11, xmm7, 25
|
|
|
|
+ vpslld xmm7, xmm7, 7
|
|
|
|
+ vpxor xmm7, xmm7, xmm11
|
|
|
|
+ vpsrld xmm11, xmm4, 25
|
|
|
|
+ vpslld xmm4, xmm4, 7
|
|
|
|
+ vpxor xmm4, xmm4, xmm11
|
|
|
|
+ dec r10b
|
|
|
|
+ jnz L_chacha20_avx1_loop128
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r12+48]
|
|
|
|
+ vpaddd xmm0, xmm0, OWORD PTR [r11]
|
|
|
|
+ vpaddd xmm1, xmm1, OWORD PTR [r11+16]
|
|
|
|
+ vpaddd xmm2, xmm2, OWORD PTR [r11+32]
|
|
|
|
+ vpaddd xmm3, xmm3, OWORD PTR [r11+48]
|
|
|
|
+ vpaddd xmm4, xmm4, OWORD PTR [r11+64]
|
|
|
|
+ vpaddd xmm5, xmm5, OWORD PTR [r11+80]
|
|
|
|
+ vpaddd xmm6, xmm6, OWORD PTR [r11+96]
|
|
|
|
+ vpaddd xmm7, xmm7, OWORD PTR [r11+112]
|
|
|
|
+ vpaddd xmm8, xmm8, OWORD PTR [r11+128]
|
|
|
|
+ vpaddd xmm9, xmm9, OWORD PTR [r11+144]
|
|
|
|
+ vpaddd xmm10, xmm10, OWORD PTR [r11+160]
|
|
|
|
+ vpaddd xmm11, xmm11, OWORD PTR [r11+176]
|
|
|
|
+ vpaddd xmm12, xmm12, OWORD PTR [r11+192]
|
|
|
|
+ vpaddd xmm13, xmm13, OWORD PTR [r11+208]
|
|
|
|
+ vpaddd xmm14, xmm14, OWORD PTR [r11+224]
|
|
|
|
+ vpaddd xmm15, xmm15, OWORD PTR [r11+240]
|
|
|
|
+ vmovdqa OWORD PTR [r12], xmm8
|
|
|
|
+ vmovdqa OWORD PTR [r12+16], xmm9
|
|
|
|
+ vmovdqa OWORD PTR [r12+32], xmm10
|
|
|
|
+ vmovdqa OWORD PTR [r12+48], xmm11
|
|
|
|
+ vmovdqa OWORD PTR [r12+64], xmm12
|
|
|
|
+ vmovdqa OWORD PTR [r12+80], xmm13
|
|
|
|
+ vmovdqa OWORD PTR [r12+96], xmm14
|
|
|
|
+ vmovdqa OWORD PTR [r12+112], xmm15
|
|
|
|
+ vpunpckldq xmm8, xmm0, xmm1
|
|
|
|
+ vpunpckldq xmm9, xmm2, xmm3
|
|
|
|
+ vpunpckhdq xmm12, xmm0, xmm1
|
|
|
|
+ vpunpckhdq xmm13, xmm2, xmm3
|
|
|
|
+ vpunpckldq xmm10, xmm4, xmm5
|
|
|
|
+ vpunpckldq xmm11, xmm6, xmm7
|
|
|
|
+ vpunpckhdq xmm14, xmm4, xmm5
|
|
|
|
+ vpunpckhdq xmm15, xmm6, xmm7
|
|
|
|
+ vpunpcklqdq xmm0, xmm8, xmm9
|
|
|
|
+ vpunpcklqdq xmm1, xmm10, xmm11
|
|
|
|
+ vpunpckhqdq xmm2, xmm8, xmm9
|
|
|
|
+ vpunpckhqdq xmm3, xmm10, xmm11
|
|
|
|
+ vpunpcklqdq xmm4, xmm12, xmm13
|
|
|
|
+ vpunpcklqdq xmm5, xmm14, xmm15
|
|
|
|
+ vpunpckhqdq xmm6, xmm12, xmm13
|
|
|
|
+ vpunpckhqdq xmm7, xmm14, xmm15
|
|
|
|
+ vmovdqu xmm8, OWORD PTR [rdx]
|
|
|
|
+ vmovdqu xmm9, OWORD PTR [rdx+16]
|
|
|
|
+ vmovdqu xmm10, OWORD PTR [rdx+64]
|
|
|
|
+ vmovdqu xmm11, OWORD PTR [rdx+80]
|
|
|
|
+ vmovdqu xmm12, OWORD PTR [rdx+128]
|
|
|
|
+ vmovdqu xmm13, OWORD PTR [rdx+144]
|
|
|
|
+ vmovdqu xmm14, OWORD PTR [rdx+192]
|
|
|
|
+ vmovdqu xmm15, OWORD PTR [rdx+208]
|
|
|
|
+ vpxor xmm0, xmm0, xmm8
|
|
|
|
+ vpxor xmm1, xmm1, xmm9
|
|
|
|
+ vpxor xmm2, xmm2, xmm10
|
|
|
|
+ vpxor xmm3, xmm3, xmm11
|
|
|
|
+ vpxor xmm4, xmm4, xmm12
|
|
|
|
+ vpxor xmm5, xmm5, xmm13
|
|
|
|
+ vpxor xmm6, xmm6, xmm14
|
|
|
|
+ vpxor xmm7, xmm7, xmm15
|
|
|
|
+ vmovdqu OWORD PTR [r8], xmm0
|
|
|
|
+ vmovdqu OWORD PTR [r8+16], xmm1
|
|
|
|
+ vmovdqu OWORD PTR [r8+64], xmm2
|
|
|
|
+ vmovdqu OWORD PTR [r8+80], xmm3
|
|
|
|
+ vmovdqu OWORD PTR [r8+128], xmm4
|
|
|
|
+ vmovdqu OWORD PTR [r8+144], xmm5
|
|
|
|
+ vmovdqu OWORD PTR [r8+192], xmm6
|
|
|
|
+ vmovdqu OWORD PTR [r8+208], xmm7
|
|
|
|
+ vmovdqa xmm0, OWORD PTR [r12]
|
|
|
|
+ vmovdqa xmm1, OWORD PTR [r12+16]
|
|
|
|
+ vmovdqa xmm2, OWORD PTR [r12+32]
|
|
|
|
+ vmovdqa xmm3, OWORD PTR [r12+48]
|
|
|
|
+ vmovdqa xmm4, OWORD PTR [r12+64]
|
|
|
|
+ vmovdqa xmm5, OWORD PTR [r12+80]
|
|
|
|
+ vmovdqa xmm6, OWORD PTR [r12+96]
|
|
|
|
+ vmovdqa xmm7, OWORD PTR [r12+112]
|
|
|
|
+ vpunpckldq xmm8, xmm0, xmm1
|
|
|
|
+ vpunpckldq xmm9, xmm2, xmm3
|
|
|
|
+ vpunpckhdq xmm12, xmm0, xmm1
|
|
|
|
+ vpunpckhdq xmm13, xmm2, xmm3
|
|
|
|
+ vpunpckldq xmm10, xmm4, xmm5
|
|
|
|
+ vpunpckldq xmm11, xmm6, xmm7
|
|
|
|
+ vpunpckhdq xmm14, xmm4, xmm5
|
|
|
|
+ vpunpckhdq xmm15, xmm6, xmm7
|
|
|
|
+ vpunpcklqdq xmm0, xmm8, xmm9
|
|
|
|
+ vpunpcklqdq xmm1, xmm10, xmm11
|
|
|
|
+ vpunpckhqdq xmm2, xmm8, xmm9
|
|
|
|
+ vpunpckhqdq xmm3, xmm10, xmm11
|
|
|
|
+ vpunpcklqdq xmm4, xmm12, xmm13
|
|
|
|
+ vpunpcklqdq xmm5, xmm14, xmm15
|
|
|
|
+ vpunpckhqdq xmm6, xmm12, xmm13
|
|
|
|
+ vpunpckhqdq xmm7, xmm14, xmm15
|
|
|
|
+ vmovdqu xmm8, OWORD PTR [rdx+32]
|
|
|
|
+ vmovdqu xmm9, OWORD PTR [rdx+48]
|
|
|
|
+ vmovdqu xmm10, OWORD PTR [rdx+96]
|
|
|
|
+ vmovdqu xmm11, OWORD PTR [rdx+112]
|
|
|
|
+ vmovdqu xmm12, OWORD PTR [rdx+160]
|
|
|
|
+ vmovdqu xmm13, OWORD PTR [rdx+176]
|
|
|
|
+ vmovdqu xmm14, OWORD PTR [rdx+224]
|
|
|
|
+ vmovdqu xmm15, OWORD PTR [rdx+240]
|
|
|
|
+ vpxor xmm0, xmm0, xmm8
|
|
|
|
+ vpxor xmm1, xmm1, xmm9
|
|
|
|
+ vpxor xmm2, xmm2, xmm10
|
|
|
|
+ vpxor xmm3, xmm3, xmm11
|
|
|
|
+ vpxor xmm4, xmm4, xmm12
|
|
|
|
+ vpxor xmm5, xmm5, xmm13
|
|
|
|
+ vpxor xmm6, xmm6, xmm14
|
|
|
|
+ vpxor xmm7, xmm7, xmm15
|
|
|
|
+ vmovdqu OWORD PTR [r8+32], xmm0
|
|
|
|
+ vmovdqu OWORD PTR [r8+48], xmm1
|
|
|
|
+ vmovdqu OWORD PTR [r8+96], xmm2
|
|
|
|
+ vmovdqu OWORD PTR [r8+112], xmm3
|
|
|
|
+ vmovdqu OWORD PTR [r8+160], xmm4
|
|
|
|
+ vmovdqu OWORD PTR [r8+176], xmm5
|
|
|
|
+ vmovdqu OWORD PTR [r8+224], xmm6
|
|
|
|
+ vmovdqu OWORD PTR [r8+240], xmm7
|
|
|
|
+ vmovdqa xmm12, OWORD PTR [r11+192]
|
|
|
|
+ add rdx, 256
|
|
|
|
+ add r8, 256
|
|
|
|
+ vpaddd xmm12, xmm12, OWORD PTR [rsi]
|
|
|
|
+ sub r9d, 256
|
|
|
|
+ vmovdqa OWORD PTR [r11+192], xmm12
|
|
|
|
+ cmp r9d, 256
|
|
|
|
+ jl L_chacha20_avx1_done128
|
|
|
|
+ vmovdqa xmm0, OWORD PTR [r11]
|
|
|
|
+ vmovdqa xmm1, OWORD PTR [r11+16]
|
|
|
|
+ vmovdqa xmm2, OWORD PTR [r11+32]
|
|
|
|
+ vmovdqa xmm3, OWORD PTR [r11+48]
|
|
|
|
+ vmovdqa xmm4, OWORD PTR [r11+64]
|
|
|
|
+ vmovdqa xmm5, OWORD PTR [r11+80]
|
|
|
|
+ vmovdqa xmm6, OWORD PTR [r11+96]
|
|
|
|
+ vmovdqa xmm7, OWORD PTR [r11+112]
|
|
|
|
+ vmovdqa xmm8, OWORD PTR [r11+128]
|
|
|
|
+ vmovdqa xmm9, OWORD PTR [r11+144]
|
|
|
|
+ vmovdqa xmm10, OWORD PTR [r11+160]
|
|
|
|
+ vmovdqa xmm11, OWORD PTR [r11+176]
|
|
|
|
+ vmovdqa xmm12, OWORD PTR [r11+192]
|
|
|
|
+ vmovdqa xmm13, OWORD PTR [r11+208]
|
|
|
|
+ vmovdqa xmm14, OWORD PTR [r11+224]
|
|
|
|
+ vmovdqa xmm15, OWORD PTR [r11+240]
|
|
|
|
+ jmp L_chacha20_avx1_start128
|
|
|
|
+L_chacha20_avx1_done128:
|
|
|
|
+ shl eax, 2
|
|
|
|
+ add DWORD PTR [rcx+48], eax
|
|
|
|
+L_chacha20_avx1_end128:
|
|
|
|
+ cmp r9d, 64
|
|
|
|
+ jl L_chacha20_avx1_block_done
|
|
|
|
+L_chacha20_avx1_block_start:
|
|
|
|
+ vmovdqu xmm0, OWORD PTR [rcx]
|
|
|
|
+ vmovdqu xmm1, OWORD PTR [rcx+16]
|
|
|
|
+ vmovdqu xmm2, OWORD PTR [rcx+32]
|
|
|
|
+ vmovdqu xmm3, OWORD PTR [rcx+48]
|
|
|
|
+ vmovdqa xmm5, xmm0
|
|
|
|
+ vmovdqa xmm6, xmm1
|
|
|
|
+ vmovdqa xmm7, xmm2
|
|
|
|
+ vmovdqa xmm8, xmm3
|
|
|
|
+ mov al, 10
|
|
|
|
+L_chacha20_avx1_block_crypt_start:
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 20
|
|
|
|
+ vpslld xmm1, xmm1, 12
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 25
|
|
|
|
+ vpslld xmm1, xmm1, 7
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpshufd xmm1, xmm1, 57
|
|
|
|
+ vpshufd xmm2, xmm2, 78
|
|
|
|
+ vpshufd xmm3, xmm3, 147
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 20
|
|
|
|
+ vpslld xmm1, xmm1, 12
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 25
|
|
|
|
+ vpslld xmm1, xmm1, 7
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpshufd xmm1, xmm1, 147
|
|
|
|
+ vpshufd xmm2, xmm2, 78
|
|
|
|
+ vpshufd xmm3, xmm3, 57
|
|
|
|
+ dec al
|
|
|
|
+ jnz L_chacha20_avx1_block_crypt_start
|
|
|
|
+ vpaddd xmm0, xmm0, xmm5
|
|
|
|
+ vpaddd xmm1, xmm1, xmm6
|
|
|
|
+ vpaddd xmm2, xmm2, xmm7
|
|
|
|
+ vpaddd xmm3, xmm3, xmm8
|
|
|
|
+ vmovdqu xmm5, OWORD PTR [rdx]
|
|
|
|
+ vmovdqu xmm6, OWORD PTR [rdx+16]
|
|
|
|
+ vmovdqu xmm7, OWORD PTR [rdx+32]
|
|
|
|
+ vmovdqu xmm8, OWORD PTR [rdx+48]
|
|
|
|
+ vpxor xmm0, xmm0, xmm5
|
|
|
|
+ vpxor xmm1, xmm1, xmm6
|
|
|
|
+ vpxor xmm2, xmm2, xmm7
|
|
|
|
+ vpxor xmm3, xmm3, xmm8
|
|
|
|
+ vmovdqu OWORD PTR [r8], xmm0
|
|
|
|
+ vmovdqu OWORD PTR [r8+16], xmm1
|
|
|
|
+ vmovdqu OWORD PTR [r8+32], xmm2
|
|
|
|
+ vmovdqu OWORD PTR [r8+48], xmm3
|
|
|
|
+ add DWORD PTR [rcx+48], 1
|
|
|
|
+ sub r9d, 64
|
|
|
|
+ add rdx, 64
|
|
|
|
+ add r8, 64
|
|
|
|
+ cmp r9d, 64
|
|
|
|
+ jge L_chacha20_avx1_block_start
|
|
|
|
+L_chacha20_avx1_block_done:
|
|
|
|
+ cmp r9d, 0
|
|
|
|
+ je L_chacha20_avx1_partial_done
|
|
|
|
+ lea r12, QWORD PTR [rcx+80]
|
|
|
|
+ vmovdqu xmm0, OWORD PTR [rcx]
|
|
|
|
+ vmovdqu xmm1, OWORD PTR [rcx+16]
|
|
|
|
+ vmovdqu xmm2, OWORD PTR [rcx+32]
|
|
|
|
+ vmovdqu xmm3, OWORD PTR [rcx+48]
|
|
|
|
+ vmovdqa xmm5, xmm0
|
|
|
|
+ vmovdqa xmm6, xmm1
|
|
|
|
+ vmovdqa xmm7, xmm2
|
|
|
|
+ vmovdqa xmm8, xmm3
|
|
|
|
+ mov al, 10
|
|
|
|
+L_chacha20_avx1_partial_crypt_start:
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 20
|
|
|
|
+ vpslld xmm1, xmm1, 12
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 25
|
|
|
|
+ vpslld xmm1, xmm1, 7
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpshufd xmm1, xmm1, 57
|
|
|
|
+ vpshufd xmm2, xmm2, 78
|
|
|
|
+ vpshufd xmm3, xmm3, 147
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r15]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 20
|
|
|
|
+ vpslld xmm1, xmm1, 12
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpaddd xmm0, xmm0, xmm1
|
|
|
|
+ vpxor xmm3, xmm3, xmm0
|
|
|
|
+ vpshufb xmm3, xmm3, OWORD PTR [r14]
|
|
|
|
+ vpaddd xmm2, xmm2, xmm3
|
|
|
|
+ vpxor xmm1, xmm1, xmm2
|
|
|
|
+ vpsrld xmm4, xmm1, 25
|
|
|
|
+ vpslld xmm1, xmm1, 7
|
|
|
|
+ vpxor xmm1, xmm1, xmm4
|
|
|
|
+ vpshufd xmm1, xmm1, 147
|
|
|
|
+ vpshufd xmm2, xmm2, 78
|
|
|
|
+ vpshufd xmm3, xmm3, 57
|
|
|
|
+ dec al
|
|
|
|
+ jnz L_chacha20_avx1_partial_crypt_start
|
|
|
|
+ vpaddd xmm0, xmm0, xmm5
|
|
|
|
+ vpaddd xmm1, xmm1, xmm6
|
|
|
|
+ vpaddd xmm2, xmm2, xmm7
|
|
|
|
+ vpaddd xmm3, xmm3, xmm8
|
|
|
|
+ vmovdqu OWORD PTR [r12], xmm0
|
|
|
|
+ vmovdqu OWORD PTR [r12+16], xmm1
|
|
|
|
+ vmovdqu OWORD PTR [r12+32], xmm2
|
|
|
|
+ vmovdqu OWORD PTR [r12+48], xmm3
|
|
|
|
+ add DWORD PTR [rcx+48], 1
|
|
|
|
+ mov r10d, r9d
|
|
|
|
+ xor r13, r13
|
|
|
|
+ and r10d, 7
|
|
|
|
+ jz L_chacha20_avx1_partial_start64
|
|
|
|
+L_chacha20_avx1_partial_start8:
|
|
|
|
+ movzx eax, BYTE PTR [r12+r13]
|
|
|
|
+ xor al, BYTE PTR [rdx+r13]
|
|
|
|
+ mov BYTE PTR [r8+r13], al
|
|
|
|
+ inc r13d
|
|
|
|
+ cmp r13d, r10d
|
|
|
|
+ jne L_chacha20_avx1_partial_start8
|
|
|
|
+ je L_chacha20_avx1_partial_end64
|
|
|
|
+L_chacha20_avx1_partial_start64:
|
|
|
|
+ mov rax, QWORD PTR [r12+r13]
|
|
|
|
+ xor rax, QWORD PTR [rdx+r13]
|
|
|
|
+ mov QWORD PTR [r8+r13], rax
|
|
|
|
+ add r13d, 8
|
|
|
|
+L_chacha20_avx1_partial_end64:
|
|
|
|
+ cmp r13d, r9d
|
|
|
|
+ jne L_chacha20_avx1_partial_start64
|
|
|
|
+ mov r10d, 64
|
|
|
|
+ sub r10d, r13d
|
|
|
|
+ mov DWORD PTR [rcx+76], r10d
|
|
|
|
+L_chacha20_avx1_partial_done:
|
|
|
|
+ vzeroupper
|
|
|
|
+ vmovdqu xmm6, OWORD PTR [rsp+400]
|
|
|
|
+ vmovdqu xmm7, OWORD PTR [rsp+416]
|
|
|
|
+ vmovdqu xmm8, OWORD PTR [rsp+432]
|
|
|
|
+ vmovdqu xmm9, OWORD PTR [rsp+448]
|
|
|
|
+ vmovdqu xmm10, OWORD PTR [rsp+464]
|
|
|
|
+ vmovdqu xmm11, OWORD PTR [rsp+480]
|
|
|
|
+ vmovdqu xmm12, OWORD PTR [rsp+496]
|
|
|
|
+ vmovdqu xmm13, OWORD PTR [rsp+512]
|
|
|
|
+ vmovdqu xmm14, OWORD PTR [rsp+528]
|
|
|
|
+ vmovdqu xmm15, OWORD PTR [rsp+544]
|
|
|
|
+ add rsp, 560
|
|
|
|
+ pop rsi
|
|
|
|
+ pop rdi
|
|
|
|
+ pop r15
|
|
|
|
+ pop r14
|
|
|
|
+ pop r13
|
|
|
|
+ pop r12
|
|
|
|
+ ret
|
|
|
|
+chacha_encrypt_avx1 ENDP
|
|
|
|
+_text ENDS
|
|
|
|
+ENDIF
|
|
|
|
+IFDEF HAVE_INTEL_AVX2
|
|
|
|
; ---------------------------------------------------------------------------
; Read-only tables used by chacha_encrypt_avx2.
; Each table is 32 bytes (one YMM operand) and starts on a 16-byte boundary.
; Each ptr_* word holds the address of its table; the code loads these words
; into general registers and then addresses the tables through them.
; ---------------------------------------------------------------------------
_DATA SEGMENT

ALIGN 16
; vpshufb control: per 32-bit lane, source byte order 3,0,1,2
; (a left-rotate by 8 bits), repeated for both 128-bit halves.
L_chacha20_avx2_rotl8 DQ 433757367256023043, 1012478749960636427, 433757367256023043, 1012478749960636427
ptr_L_chacha20_avx2_rotl8 DQ L_chacha20_avx2_rotl8

ALIGN 16
; vpshufb control: per 32-bit lane, source byte order 2,3,0,1
; (a left-rotate by 16 bits), repeated for both 128-bit halves.
L_chacha20_avx2_rotl16 DQ 361421592464458498, 940142975169071882, 361421592464458498, 940142975169071882
ptr_L_chacha20_avx2_rotl16 DQ L_chacha20_avx2_rotl16

ALIGN 16
; Eight 32-bit lanes holding 0,1,2,3,4,5,6,7: added to the broadcast word at
; state offset 48 so that each lane works on a different counter value.
L_chacha20_avx2_add DQ 4294967296, 12884901890, 21474836484, 30064771078
ptr_L_chacha20_avx2_add DQ L_chacha20_avx2_add

ALIGN 16
; Eight 32-bit lanes each holding 8: per-iteration step of the lane counters.
L_chacha20_avx2_eight DQ 34359738376, 34359738376, 34359738376, 34359738376
ptr_L_chacha20_avx2_eight DQ L_chacha20_avx2_eight

_DATA ENDS
|
|
|
|
+_text SEGMENT READONLY PARA
|
|
|
|
; ---------------------------------------------------------------------------
; chacha_encrypt_avx2
;
; XORs the input with ChaCha keystream, 512 bytes (eight 64-byte blocks) per
; loop pass using AVX2, then hands whatever is left to chacha_encrypt_avx1.
;
; ABI:  Microsoft x64.
; In:   rcx = pointer to the 16-word state (word at offset 48 is the value
;             that is stepped per block, i.e. the block counter)
;       rdx = input bytes
;       r8  = output bytes
;       r9d = number of bytes, treated as an unsigned 32-bit count
; Out:  output buffer written; DWORD [rcx+48] advanced by 8 for every
;       512-byte chunk processed here (further updates are done by
;       chacha_encrypt_avx1).
; Saved/restored: r12-r15, rdi, xmm6-xmm15.
; Clobbers: rax, rdx, r8, r9, r10, r11, ymm0-ymm5, flags.
;
; Stack frame (960 bytes):
;   r11 = 32-byte aligned area, 16 x 32 bytes: input state, one word per YMM,
;         each word replicated over eight lanes (lane counter in row 12)
;   r12 = 32-byte aligned area, 8 x 32 bytes: scratch; slot +96 doubles as
;         the spill slot for ymm11 inside the round loop
;   [rsp+800 .. rsp+959] = saved xmm6-xmm15
;
; Register roles inside the loop:
;   ymm0-ymm15 = state words 0-15 for eight blocks in parallel
;   ymm11      = also the temporary for the shift-based rotates, hence the
;                spill/reload through [r12+96]
;   r13/r14    = addresses of the rotl8 / rotl16 vpshufb tables
;   r15/rdi    = addresses of the lane-offset (0..7) and step (8) tables
;   eax        = number of 512-byte chunks (r9d >> 9), kept for the final
;                counter update
;   r10b       = double-round counter
; ---------------------------------------------------------------------------
chacha_encrypt_avx2 PROC
        push         r12
        push         r13
        push         r14
        push         r15
        push         rdi
        sub          rsp, 960
        ; Preserve the non-volatile vector registers.
        vmovdqu      OWORD PTR [rsp+800], xmm6
        vmovdqu      OWORD PTR [rsp+816], xmm7
        vmovdqu      OWORD PTR [rsp+832], xmm8
        vmovdqu      OWORD PTR [rsp+848], xmm9
        vmovdqu      OWORD PTR [rsp+864], xmm10
        vmovdqu      OWORD PTR [rsp+880], xmm11
        vmovdqu      OWORD PTR [rsp+896], xmm12
        vmovdqu      OWORD PTR [rsp+912], xmm13
        vmovdqu      OWORD PTR [rsp+928], xmm14
        vmovdqu      OWORD PTR [rsp+944], xmm15
        mov          r11, rsp
        mov          r13, QWORD PTR [ptr_L_chacha20_avx2_rotl8]
        mov          r14, QWORD PTR [ptr_L_chacha20_avx2_rotl16]
        mov          r15, QWORD PTR [ptr_L_chacha20_avx2_add]
        mov          rdi, QWORD PTR [ptr_L_chacha20_avx2_eight]
        lea          r12, QWORD PTR [rsp+512]
        ; Round both scratch bases up to a 32-byte boundary for vmovdqa.
        add          r11, 31
        add          r12, 31
        and          r11, -32
        and          r12, -32
        ; eax = number of whole 512-byte chunks; none -> straight to the tail.
        mov          eax, r9d
        shr          eax, 9
        jz           L_chacha20_avx2_end256
        ; Replicate each state word across the eight lanes of one YMM.
        vpbroadcastd ymm0, DWORD PTR [rcx]
        vpbroadcastd ymm1, DWORD PTR [rcx+4]
        vpbroadcastd ymm2, DWORD PTR [rcx+8]
        vpbroadcastd ymm3, DWORD PTR [rcx+12]
        vpbroadcastd ymm4, DWORD PTR [rcx+16]
        vpbroadcastd ymm5, DWORD PTR [rcx+20]
        vpbroadcastd ymm6, DWORD PTR [rcx+24]
        vpbroadcastd ymm7, DWORD PTR [rcx+28]
        vpbroadcastd ymm8, DWORD PTR [rcx+32]
        vpbroadcastd ymm9, DWORD PTR [rcx+36]
        vpbroadcastd ymm10, DWORD PTR [rcx+40]
        vpbroadcastd ymm11, DWORD PTR [rcx+44]
        vpbroadcastd ymm12, DWORD PTR [rcx+48]
        vpbroadcastd ymm13, DWORD PTR [rcx+52]
        vpbroadcastd ymm14, DWORD PTR [rcx+56]
        vpbroadcastd ymm15, DWORD PTR [rcx+60]
        ; Lane i gets counter + i.
        vpaddd       ymm12, ymm12, YMMWORD PTR [r15]
        ; Keep the pre-round state for the final feed-forward addition.
        vmovdqa      YMMWORD PTR [r11], ymm0
        vmovdqa      YMMWORD PTR [r11+32], ymm1
        vmovdqa      YMMWORD PTR [r11+64], ymm2
        vmovdqa      YMMWORD PTR [r11+96], ymm3
        vmovdqa      YMMWORD PTR [r11+128], ymm4
        vmovdqa      YMMWORD PTR [r11+160], ymm5
        vmovdqa      YMMWORD PTR [r11+192], ymm6
        vmovdqa      YMMWORD PTR [r11+224], ymm7
        vmovdqa      YMMWORD PTR [r11+256], ymm8
        vmovdqa      YMMWORD PTR [r11+288], ymm9
        vmovdqa      YMMWORD PTR [r11+320], ymm10
        vmovdqa      YMMWORD PTR [r11+352], ymm11
        vmovdqa      YMMWORD PTR [r11+384], ymm12
        vmovdqa      YMMWORD PTR [r11+416], ymm13
        vmovdqa      YMMWORD PTR [r11+448], ymm14
        vmovdqa      YMMWORD PTR [r11+480], ymm15
L_chacha20_avx2_start256:
        ; 10 passes of (column round + diagonal round).
        mov          r10b, 10
        vmovdqa      YMMWORD PTR [r12+96], ymm11
L_chacha20_avx2_loop256:
        ; ---- column round: a += b; d ^= a; d <<<= 16; c += d; b ^= c ----
        vpaddd       ymm0, ymm0, ymm4
        vpxor        ymm12, ymm12, ymm0
        vmovdqa      ymm11, YMMWORD PTR [r12+96]
        vpshufb      ymm12, ymm12, YMMWORD PTR [r14]
        vpaddd       ymm8, ymm8, ymm12
        vpxor        ymm4, ymm4, ymm8
        vpaddd       ymm1, ymm1, ymm5
        vpxor        ymm13, ymm13, ymm1
        vpshufb      ymm13, ymm13, YMMWORD PTR [r14]
        vpaddd       ymm9, ymm9, ymm13
        vpxor        ymm5, ymm5, ymm9
        vpaddd       ymm2, ymm2, ymm6
        vpxor        ymm14, ymm14, ymm2
        vpshufb      ymm14, ymm14, YMMWORD PTR [r14]
        vpaddd       ymm10, ymm10, ymm14
        vpxor        ymm6, ymm6, ymm10
        vpaddd       ymm3, ymm3, ymm7
        vpxor        ymm15, ymm15, ymm3
        vpshufb      ymm15, ymm15, YMMWORD PTR [r14]
        vpaddd       ymm11, ymm11, ymm15
        vpxor        ymm7, ymm7, ymm11
        ; Spill word 11 so ymm11 can serve as the rotate temporary.
        vmovdqa      YMMWORD PTR [r12+96], ymm11
        ; b <<<= 12
        vpsrld       ymm11, ymm4, 20
        vpslld       ymm4, ymm4, 12
        vpxor        ymm4, ymm4, ymm11
        vpsrld       ymm11, ymm5, 20
        vpslld       ymm5, ymm5, 12
        vpxor        ymm5, ymm5, ymm11
        vpsrld       ymm11, ymm6, 20
        vpslld       ymm6, ymm6, 12
        vpxor        ymm6, ymm6, ymm11
        vpsrld       ymm11, ymm7, 20
        vpslld       ymm7, ymm7, 12
        vpxor        ymm7, ymm7, ymm11
        ; ---- column round: a += b; d ^= a; d <<<= 8; c += d; b ^= c ----
        vpaddd       ymm0, ymm0, ymm4
        vpxor        ymm12, ymm12, ymm0
        vmovdqa      ymm11, YMMWORD PTR [r12+96]
        vpshufb      ymm12, ymm12, YMMWORD PTR [r13]
        vpaddd       ymm8, ymm8, ymm12
        vpxor        ymm4, ymm4, ymm8
        vpaddd       ymm1, ymm1, ymm5
        vpxor        ymm13, ymm13, ymm1
        vpshufb      ymm13, ymm13, YMMWORD PTR [r13]
        vpaddd       ymm9, ymm9, ymm13
        vpxor        ymm5, ymm5, ymm9
        vpaddd       ymm2, ymm2, ymm6
        vpxor        ymm14, ymm14, ymm2
        vpshufb      ymm14, ymm14, YMMWORD PTR [r13]
        vpaddd       ymm10, ymm10, ymm14
        vpxor        ymm6, ymm6, ymm10
        vpaddd       ymm3, ymm3, ymm7
        vpxor        ymm15, ymm15, ymm3
        vpshufb      ymm15, ymm15, YMMWORD PTR [r13]
        vpaddd       ymm11, ymm11, ymm15
        vpxor        ymm7, ymm7, ymm11
        vmovdqa      YMMWORD PTR [r12+96], ymm11
        ; b <<<= 7
        vpsrld       ymm11, ymm4, 25
        vpslld       ymm4, ymm4, 7
        vpxor        ymm4, ymm4, ymm11
        vpsrld       ymm11, ymm5, 25
        vpslld       ymm5, ymm5, 7
        vpxor        ymm5, ymm5, ymm11
        vpsrld       ymm11, ymm6, 25
        vpslld       ymm6, ymm6, 7
        vpxor        ymm6, ymm6, ymm11
        vpsrld       ymm11, ymm7, 25
        vpslld       ymm7, ymm7, 7
        vpxor        ymm7, ymm7, ymm11
        ; ---- diagonal round, rotate 16 then 12 ----
        ; quarter rounds on (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14)
        vpaddd       ymm0, ymm0, ymm5
        vpxor        ymm15, ymm15, ymm0
        vmovdqa      ymm11, YMMWORD PTR [r12+96]
        vpshufb      ymm15, ymm15, YMMWORD PTR [r14]
        vpaddd       ymm10, ymm10, ymm15
        vpxor        ymm5, ymm5, ymm10
        vpaddd       ymm1, ymm1, ymm6
        vpxor        ymm12, ymm12, ymm1
        vpshufb      ymm12, ymm12, YMMWORD PTR [r14]
        vpaddd       ymm11, ymm11, ymm12
        vpxor        ymm6, ymm6, ymm11
        vpaddd       ymm2, ymm2, ymm7
        vpxor        ymm13, ymm13, ymm2
        vpshufb      ymm13, ymm13, YMMWORD PTR [r14]
        vpaddd       ymm8, ymm8, ymm13
        vpxor        ymm7, ymm7, ymm8
        vpaddd       ymm3, ymm3, ymm4
        vpxor        ymm14, ymm14, ymm3
        vpshufb      ymm14, ymm14, YMMWORD PTR [r14]
        vpaddd       ymm9, ymm9, ymm14
        vpxor        ymm4, ymm4, ymm9
        vmovdqa      YMMWORD PTR [r12+96], ymm11
        vpsrld       ymm11, ymm5, 20
        vpslld       ymm5, ymm5, 12
        vpxor        ymm5, ymm5, ymm11
        vpsrld       ymm11, ymm6, 20
        vpslld       ymm6, ymm6, 12
        vpxor        ymm6, ymm6, ymm11
        vpsrld       ymm11, ymm7, 20
        vpslld       ymm7, ymm7, 12
        vpxor        ymm7, ymm7, ymm11
        vpsrld       ymm11, ymm4, 20
        vpslld       ymm4, ymm4, 12
        vpxor        ymm4, ymm4, ymm11
        ; ---- diagonal round, rotate 8 then 7 ----
        vpaddd       ymm0, ymm0, ymm5
        vpxor        ymm15, ymm15, ymm0
        vmovdqa      ymm11, YMMWORD PTR [r12+96]
        vpshufb      ymm15, ymm15, YMMWORD PTR [r13]
        vpaddd       ymm10, ymm10, ymm15
        vpxor        ymm5, ymm5, ymm10
        vpaddd       ymm1, ymm1, ymm6
        vpxor        ymm12, ymm12, ymm1
        vpshufb      ymm12, ymm12, YMMWORD PTR [r13]
        vpaddd       ymm11, ymm11, ymm12
        vpxor        ymm6, ymm6, ymm11
        vpaddd       ymm2, ymm2, ymm7
        vpxor        ymm13, ymm13, ymm2
        vpshufb      ymm13, ymm13, YMMWORD PTR [r13]
        vpaddd       ymm8, ymm8, ymm13
        vpxor        ymm7, ymm7, ymm8
        vpaddd       ymm3, ymm3, ymm4
        vpxor        ymm14, ymm14, ymm3
        vpshufb      ymm14, ymm14, YMMWORD PTR [r13]
        vpaddd       ymm9, ymm9, ymm14
        vpxor        ymm4, ymm4, ymm9
        vmovdqa      YMMWORD PTR [r12+96], ymm11
        vpsrld       ymm11, ymm5, 25
        vpslld       ymm5, ymm5, 7
        vpxor        ymm5, ymm5, ymm11
        vpsrld       ymm11, ymm6, 25
        vpslld       ymm6, ymm6, 7
        vpxor        ymm6, ymm6, ymm11
        vpsrld       ymm11, ymm7, 25
        vpslld       ymm7, ymm7, 7
        vpxor        ymm7, ymm7, ymm11
        vpsrld       ymm11, ymm4, 25
        vpslld       ymm4, ymm4, 7
        vpxor        ymm4, ymm4, ymm11
        dec          r10b
        jnz          L_chacha20_avx2_loop256
        ; Reload word 11, then add the saved input state (feed-forward).
        vmovdqa      ymm11, YMMWORD PTR [r12+96]
        vpaddd       ymm0, ymm0, YMMWORD PTR [r11]
        vpaddd       ymm1, ymm1, YMMWORD PTR [r11+32]
        vpaddd       ymm2, ymm2, YMMWORD PTR [r11+64]
        vpaddd       ymm3, ymm3, YMMWORD PTR [r11+96]
        vpaddd       ymm4, ymm4, YMMWORD PTR [r11+128]
        vpaddd       ymm5, ymm5, YMMWORD PTR [r11+160]
        vpaddd       ymm6, ymm6, YMMWORD PTR [r11+192]
        vpaddd       ymm7, ymm7, YMMWORD PTR [r11+224]
        vpaddd       ymm8, ymm8, YMMWORD PTR [r11+256]
        vpaddd       ymm9, ymm9, YMMWORD PTR [r11+288]
        vpaddd       ymm10, ymm10, YMMWORD PTR [r11+320]
        vpaddd       ymm11, ymm11, YMMWORD PTR [r11+352]
        vpaddd       ymm12, ymm12, YMMWORD PTR [r11+384]
        vpaddd       ymm13, ymm13, YMMWORD PTR [r11+416]
        vpaddd       ymm14, ymm14, YMMWORD PTR [r11+448]
        vpaddd       ymm15, ymm15, YMMWORD PTR [r11+480]
        ; Park words 8-15; ymm8-ymm15 are needed as transpose temporaries.
        vmovdqa      YMMWORD PTR [r12], ymm8
        vmovdqa      YMMWORD PTR [r12+32], ymm9
        vmovdqa      YMMWORD PTR [r12+64], ymm10
        vmovdqa      YMMWORD PTR [r12+96], ymm11
        vmovdqa      YMMWORD PTR [r12+128], ymm12
        vmovdqa      YMMWORD PTR [r12+160], ymm13
        vmovdqa      YMMWORD PTR [r12+192], ymm14
        vmovdqa      YMMWORD PTR [r12+224], ymm15
        ; 8x8 dword transpose of words 0-7: afterwards ymm(8+k) holds
        ; words 0-7 (bytes 0-31) of block k.
        vpunpckldq   ymm8, ymm0, ymm1
        vpunpckldq   ymm9, ymm2, ymm3
        vpunpckhdq   ymm12, ymm0, ymm1
        vpunpckhdq   ymm13, ymm2, ymm3
        vpunpckldq   ymm10, ymm4, ymm5
        vpunpckldq   ymm11, ymm6, ymm7
        vpunpckhdq   ymm14, ymm4, ymm5
        vpunpckhdq   ymm15, ymm6, ymm7
        vpunpcklqdq  ymm0, ymm8, ymm9
        vpunpcklqdq  ymm1, ymm10, ymm11
        vpunpckhqdq  ymm2, ymm8, ymm9
        vpunpckhqdq  ymm3, ymm10, ymm11
        vpunpcklqdq  ymm4, ymm12, ymm13
        vpunpcklqdq  ymm5, ymm14, ymm15
        vpunpckhqdq  ymm6, ymm12, ymm13
        vpunpckhqdq  ymm7, ymm14, ymm15
        vperm2i128   ymm8, ymm0, ymm1, 32
        vperm2i128   ymm9, ymm2, ymm3, 32
        vperm2i128   ymm12, ymm0, ymm1, 49
        vperm2i128   ymm13, ymm2, ymm3, 49
        vperm2i128   ymm10, ymm4, ymm5, 32
        vperm2i128   ymm11, ymm6, ymm7, 32
        vperm2i128   ymm14, ymm4, ymm5, 49
        vperm2i128   ymm15, ymm6, ymm7, 49
        ; XOR with the first 32 input bytes of each 64-byte block and store.
        vmovdqu      ymm0, YMMWORD PTR [rdx]
        vmovdqu      ymm1, YMMWORD PTR [rdx+64]
        vmovdqu      ymm2, YMMWORD PTR [rdx+128]
        vmovdqu      ymm3, YMMWORD PTR [rdx+192]
        vmovdqu      ymm4, YMMWORD PTR [rdx+256]
        vmovdqu      ymm5, YMMWORD PTR [rdx+320]
        vmovdqu      ymm6, YMMWORD PTR [rdx+384]
        vmovdqu      ymm7, YMMWORD PTR [rdx+448]
        vpxor        ymm8, ymm8, ymm0
        vpxor        ymm9, ymm9, ymm1
        vpxor        ymm10, ymm10, ymm2
        vpxor        ymm11, ymm11, ymm3
        vpxor        ymm12, ymm12, ymm4
        vpxor        ymm13, ymm13, ymm5
        vpxor        ymm14, ymm14, ymm6
        vpxor        ymm15, ymm15, ymm7
        vmovdqu      YMMWORD PTR [r8], ymm8
        vmovdqu      YMMWORD PTR [r8+64], ymm9
        vmovdqu      YMMWORD PTR [r8+128], ymm10
        vmovdqu      YMMWORD PTR [r8+192], ymm11
        vmovdqu      YMMWORD PTR [r8+256], ymm12
        vmovdqu      YMMWORD PTR [r8+320], ymm13
        vmovdqu      YMMWORD PTR [r8+384], ymm14
        vmovdqu      YMMWORD PTR [r8+448], ymm15
        ; Same again for the parked words 8-15 (bytes 32-63 of each block).
        vmovdqa      ymm0, YMMWORD PTR [r12]
        vmovdqa      ymm1, YMMWORD PTR [r12+32]
        vmovdqa      ymm2, YMMWORD PTR [r12+64]
        vmovdqa      ymm3, YMMWORD PTR [r12+96]
        vmovdqa      ymm4, YMMWORD PTR [r12+128]
        vmovdqa      ymm5, YMMWORD PTR [r12+160]
        vmovdqa      ymm6, YMMWORD PTR [r12+192]
        vmovdqa      ymm7, YMMWORD PTR [r12+224]
        vpunpckldq   ymm8, ymm0, ymm1
        vpunpckldq   ymm9, ymm2, ymm3
        vpunpckhdq   ymm12, ymm0, ymm1
        vpunpckhdq   ymm13, ymm2, ymm3
        vpunpckldq   ymm10, ymm4, ymm5
        vpunpckldq   ymm11, ymm6, ymm7
        vpunpckhdq   ymm14, ymm4, ymm5
        vpunpckhdq   ymm15, ymm6, ymm7
        vpunpcklqdq  ymm0, ymm8, ymm9
        vpunpcklqdq  ymm1, ymm10, ymm11
        vpunpckhqdq  ymm2, ymm8, ymm9
        vpunpckhqdq  ymm3, ymm10, ymm11
        vpunpcklqdq  ymm4, ymm12, ymm13
        vpunpcklqdq  ymm5, ymm14, ymm15
        vpunpckhqdq  ymm6, ymm12, ymm13
        vpunpckhqdq  ymm7, ymm14, ymm15
        vperm2i128   ymm8, ymm0, ymm1, 32
        vperm2i128   ymm9, ymm2, ymm3, 32
        vperm2i128   ymm12, ymm0, ymm1, 49
        vperm2i128   ymm13, ymm2, ymm3, 49
        vperm2i128   ymm10, ymm4, ymm5, 32
        vperm2i128   ymm11, ymm6, ymm7, 32
        vperm2i128   ymm14, ymm4, ymm5, 49
        vperm2i128   ymm15, ymm6, ymm7, 49
        vmovdqu      ymm0, YMMWORD PTR [rdx+32]
        vmovdqu      ymm1, YMMWORD PTR [rdx+96]
        vmovdqu      ymm2, YMMWORD PTR [rdx+160]
        vmovdqu      ymm3, YMMWORD PTR [rdx+224]
        vmovdqu      ymm4, YMMWORD PTR [rdx+288]
        vmovdqu      ymm5, YMMWORD PTR [rdx+352]
        vmovdqu      ymm6, YMMWORD PTR [rdx+416]
        vmovdqu      ymm7, YMMWORD PTR [rdx+480]
        vpxor        ymm8, ymm8, ymm0
        vpxor        ymm9, ymm9, ymm1
        vpxor        ymm10, ymm10, ymm2
        vpxor        ymm11, ymm11, ymm3
        vpxor        ymm12, ymm12, ymm4
        vpxor        ymm13, ymm13, ymm5
        vpxor        ymm14, ymm14, ymm6
        vpxor        ymm15, ymm15, ymm7
        vmovdqu      YMMWORD PTR [r8+32], ymm8
        vmovdqu      YMMWORD PTR [r8+96], ymm9
        vmovdqu      YMMWORD PTR [r8+160], ymm10
        vmovdqu      YMMWORD PTR [r8+224], ymm11
        vmovdqu      YMMWORD PTR [r8+288], ymm12
        vmovdqu      YMMWORD PTR [r8+352], ymm13
        vmovdqu      YMMWORD PTR [r8+416], ymm14
        vmovdqu      YMMWORD PTR [r8+480], ymm15
        ; Advance pointers, remaining length and the saved lane counters
        ; (each lane += 8) for the next 512-byte chunk.
        vmovdqa      ymm12, YMMWORD PTR [r11+384]
        add          rdx, 512
        add          r8, 512
        vpaddd       ymm12, ymm12, YMMWORD PTR [rdi]
        sub          r9d, 512
        vmovdqa      YMMWORD PTR [r11+384], ymm12
        cmp          r9d, 512
        ; Unsigned test: r9d is a byte count. A signed jl would leave the
        ; loop early whenever the remaining count still has bit 31 set,
        ; while eax below still accounts for every 512-byte chunk.
        jb           L_chacha20_avx2_done256
        ; Reload the working state from the saved copy and go again.
        vmovdqa      ymm0, YMMWORD PTR [r11]
        vmovdqa      ymm1, YMMWORD PTR [r11+32]
        vmovdqa      ymm2, YMMWORD PTR [r11+64]
        vmovdqa      ymm3, YMMWORD PTR [r11+96]
        vmovdqa      ymm4, YMMWORD PTR [r11+128]
        vmovdqa      ymm5, YMMWORD PTR [r11+160]
        vmovdqa      ymm6, YMMWORD PTR [r11+192]
        vmovdqa      ymm7, YMMWORD PTR [r11+224]
        vmovdqa      ymm8, YMMWORD PTR [r11+256]
        vmovdqa      ymm9, YMMWORD PTR [r11+288]
        vmovdqa      ymm10, YMMWORD PTR [r11+320]
        vmovdqa      ymm11, YMMWORD PTR [r11+352]
        vmovdqa      ymm12, YMMWORD PTR [r11+384]
        vmovdqa      ymm13, YMMWORD PTR [r11+416]
        vmovdqa      ymm14, YMMWORD PTR [r11+448]
        vmovdqa      ymm15, YMMWORD PTR [r11+480]
        jmp          L_chacha20_avx2_start256
L_chacha20_avx2_done256:
        ; Eight blocks per chunk: counter word += chunks * 8.
        shl          eax, 3
        add          DWORD PTR [rcx+48], eax
L_chacha20_avx2_end256:
        ; rcx/rdx/r8/r9d now describe the remaining (< 512 byte) tail;
        ; pass them on unchanged to the AVX1 routine.
        call         chacha_encrypt_avx1
        vzeroupper
        vmovdqu      xmm6, OWORD PTR [rsp+800]
        vmovdqu      xmm7, OWORD PTR [rsp+816]
        vmovdqu      xmm8, OWORD PTR [rsp+832]
        vmovdqu      xmm9, OWORD PTR [rsp+848]
        vmovdqu      xmm10, OWORD PTR [rsp+864]
        vmovdqu      xmm11, OWORD PTR [rsp+880]
        vmovdqu      xmm12, OWORD PTR [rsp+896]
        vmovdqu      xmm13, OWORD PTR [rsp+912]
        vmovdqu      xmm14, OWORD PTR [rsp+928]
        vmovdqu      xmm15, OWORD PTR [rsp+944]
        add          rsp, 960
        pop          rdi
        pop          r15
        pop          r14
        pop          r13
        pop          r12
        ret
chacha_encrypt_avx2 ENDP
|
|
|
|
+_text ENDS
|
|
|
|
+ENDIF
|
|
|
|
+END
|