
AES-GCM: MSVC use generated assembly instead of inline

AES-GCM for MSVC was implemented with inline assembly.
Add an MSVC (MASM) version of the AES-GCM assembly and use it instead.
Remove the inline assembly.
Sean Parkinson 1 year ago
commit 397537e691
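
For context, a minimal C-side sketch of the change, assuming the prototype implied by the register and stack loads at the top of AES_GCM_encrypt in the new aes_gcm_asm.asm (argument and field names below are assumptions, not the committed declarations):

    /* Sketch only: prototype inferred from the argument handling in
     * aes_gcm_asm.asm (rcx=in, rdx=out, r8=aad, r9=iv, then stack arguments
     * at [rsp+96..144]: tag, nbytes, abytes, ibytes, tbytes, key, nr). */
    void AES_GCM_encrypt(const unsigned char* in, unsigned char* out,
                         const unsigned char* addt, const unsigned char* ivec,
                         unsigned char* tag, unsigned int nbytes,
                         unsigned int abytes, unsigned int ibytes,
                         unsigned int tbytes, const unsigned char* key, int nr);

    /* Illustrative call site (field names assumed): instead of carrying its
     * own inline implementation, the MSVC build of wc_AesGcmEncrypt() can
     * now call the ml64-assembled routine. */
    AES_GCM_encrypt(in, out, authIn, iv, authTag,
                    sz, authInSz, ivSz, authTagSz,
                    (const unsigned char*)aes->key, (int)aes->rounds);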

+ 1 - 1
IDE/ECLIPSE/MICRIUM/README.md

@@ -40,7 +40,7 @@ The folder hierarchy is the same as the wolfSSL folders with an exception of the
 
 4. Right click on each folders, add or link all the source code in the corresponding folder in wolfSSL.
 
-5. Remove non-C platform dependent files from your build. At the moment, only aes_asm.asm and aes_asm.s must be removed from your wolfssl/wolfcrypt/src folder.
+5. Remove non-C platform dependent files from your build. At the moment, only aes_asm.asm, aes_gcm_asm.asm and aes_asm.s must be removed from your wolfssl/wolfcrypt/src folder.
 
 6. In your C/C++ compiler preprocessor settings, add the wolfSSL directories to your include paths.
 Here's an example of the paths that must be added.

+ 8 - 1
IDE/WIN/user_settings.h

@@ -46,6 +46,7 @@
         #define HAVE_SECURE_RENEGOTIATION
 
         #define HAVE_AESGCM
+        #define WOLFSSL_AESGCM_STREAM
         #define WOLFSSL_SHA384
         #define WOLFSSL_SHA512
 
@@ -56,12 +57,19 @@
         #define ECC_SHAMIR
         #define ECC_TIMING_RESISTANT
 
+        #define WOLFSSL_SP_X86_64
+        #define SP_INT_BITS  4096
+
         /* Optional Performance Speedups */
         #if 0
             /* AESNI on x64 */
             #ifdef _WIN64
                 #define HAVE_INTEL_RDSEED
                 #define WOLFSSL_AESNI
+                #define HAVE_INTEL_AVX1
+                #if 0
+                    #define HAVE_INTEL_AVX2
+                #endif
             #endif
 
             /* Single Precision Support for RSA/DH 1024/2048/3072 and
@@ -82,7 +90,6 @@
                 #define WOLFSSL_SP_X86_64_ASM
             #endif
         #endif
-
     #else
         /* The servers and clients */
         #define OPENSSL_EXTRA
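
The new AESNI/AVX defines sit inside the existing "#if 0" Optional Performance Speedups block, so they are off by default. A minimal sketch (assuming an x64 build) of that block with the guards flipped on, which is what actually compiles in aes_asm.asm and the new aes_gcm_asm.asm code paths:

    /* Sketch: guards flipped from 0 to 1 to enable the assembly speedups. */
    #if 1
        /* AESNI on x64 */
        #ifdef _WIN64
            #define HAVE_INTEL_RDSEED
            #define WOLFSSL_AESNI       /* AES-NI/PCLMUL assembly: aes_asm.asm, aes_gcm_asm.asm */
            #define HAVE_INTEL_AVX1     /* AVX1 variants of the AES-GCM routines */
            #if 1
                #define HAVE_INTEL_AVX2 /* optional AVX2 variants */
            #endif
        #endif
    #endif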

+ 8 - 0
IDE/WIN/user_settings_dtls.h

@@ -51,6 +51,7 @@
         #define HAVE_SECURE_RENEGOTIATION
 
         #define HAVE_AESGCM
+        #define WOLFSSL_AESGCM_STREAM
         #define WOLFSSL_SHA384
         #define WOLFSSL_SHA512
 
@@ -61,12 +62,19 @@
         #define ECC_SHAMIR
         #define ECC_TIMING_RESISTANT
 
+        #define WOLFSSL_SP_X86_64
+        #define SP_INT_BITS  4096
+
         /* Optional Performance Speedups */
         #if 0
             /* AESNI on x64 */
             #ifdef _WIN64
                 #define HAVE_INTEL_RDSEED
                 #define WOLFSSL_AESNI
+                #define HAVE_INTEL_AVX1
+                #if 0
+                    #define HAVE_INTEL_AVX2
+                #endif
             #endif
 
             /* Single Precision Support for RSA/DH 1024/2048/3072 and

+ 16 - 0
IDE/WIN/wolfssl-fips.vcxproj

@@ -331,6 +331,22 @@
       <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\wolfcrypt\src\aes_gcm_asm.asm">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <ClInclude Include="user_settings.h" />
   </ItemGroup>
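
In effect, for each x64 configuration the new custom build step assembles the file with MASM before the link; with $(IntDir) and %(Identity) expanded it is roughly the following (the exact intermediate directory depends on the configuration):

    ml64.exe /c /Zi /Fo "<IntDir>aes_gcm_asm.obj" ..\..\wolfcrypt\src\aes_gcm_asm.asm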

+ 16 - 0
IDE/WIN10/wolfssl-fips.vcxproj

@@ -307,6 +307,22 @@
       <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
     </CustomBuild>
   </ItemGroup>
+  <ItemGroup>
+    <CustomBuild Include="..\..\wolfcrypt\src\aes_gcm_asm.asm">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">ml64.exe /DHAVE_FIPS /DHAVE_FIPS_VERSION=5 /DHAVE_FIPS_VERSION_MINOR=1 /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+    </CustomBuild>
+  </ItemGroup>
   <ItemGroup>
     <ClInclude Include="resource.h" />
     <ClInclude Include="user_settings.h" />

+ 1 - 0
IDE/WORKBENCH/README.md

@@ -9,6 +9,7 @@ src and wolfcrypt directories. Uncheck the following:
 
     ```
     wolfcrypt/src/aes_asm.asm
+    wolfcrypt/src/aes_gcm_asm.asm
     wolfcrypt/src/aes_asm.s
     examples/echoclient/
     examples/echoserver/

File diff suppressed because it is too large
+ 0 - 1142
wolfcrypt/src/aes.c


File diff suppressed because it is too large
+ 116 - 116
wolfcrypt/src/aes_gcm_asm.S


+ 15423 - 0
wolfcrypt/src/aes_gcm_asm.asm

@@ -0,0 +1,15423 @@
+; /* aes_gcm_asm
+;  *
+;  * Copyright (C) 2006-2023 wolfSSL Inc.
+;  *
+;  * This file is part of wolfSSL.
+;  *
+;  * wolfSSL is free software; you can redistribute it and/or modify
+;  * it under the terms of the GNU General Public License as published by
+;  * the Free Software Foundation; either version 2 of the License, or
+;  * (at your option) any later version.
+;  *
+;  * wolfSSL is distributed in the hope that it will be useful,
+;  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  * GNU General Public License for more details.
+;  *
+;  * You should have received a copy of the GNU General Public License
+;  * along with this program; if not, write to the Free Software
+;  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+;  */
+IF @Version LT 1200
+; AVX2 instructions not recognized by old versions of MASM
+IFNDEF NO_AVX2_SUPPORT
+NO_AVX2_SUPPORT = 1
+ENDIF
+; MOVBE instruction not recognized by old versions of MASM
+IFNDEF NO_MOVBE_SUPPORT
+NO_MOVBE_SUPPORT = 1
+ENDIF
+ENDIF
+
+IFNDEF HAVE_INTEL_AVX1
+HAVE_INTEL_AVX1 = 1
+ENDIF
+IFNDEF NO_AVX2_SUPPORT
+HAVE_INTEL_AVX2 = 1
+ENDIF
+
+IFNDEF _WIN64
+_WIN64 = 1
+ENDIF
+
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_one QWORD 0, 1
+ptr_L_aes_gcm_one QWORD L_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_two QWORD 0, 2
+ptr_L_aes_gcm_two QWORD L_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_three QWORD 0, 3
+ptr_L_aes_gcm_three QWORD L_aes_gcm_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_four QWORD 0, 4
+ptr_L_aes_gcm_four QWORD L_aes_gcm_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_five QWORD 0, 5
+ptr_L_aes_gcm_five QWORD L_aes_gcm_five
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_six QWORD 0, 6
+ptr_L_aes_gcm_six QWORD L_aes_gcm_six
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_seven QWORD 0, 7
+ptr_L_aes_gcm_seven QWORD L_aes_gcm_seven
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_eight QWORD 0, 8
+ptr_L_aes_gcm_eight QWORD L_aes_gcm_eight
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+ptr_L_aes_gcm_bswap_epi64 QWORD L_aes_gcm_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+ptr_L_aes_gcm_bswap_mask QWORD L_aes_gcm_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+ptr_L_aes_gcm_mod2_128 QWORD L_aes_gcm_mod2_128
+_DATA ENDS
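+; Constants above: L_aes_gcm_one..L_aes_gcm_eight are the 128-bit counter
+; increments for the CTR blocks; L_aes_gcm_bswap_epi64 and L_aes_gcm_bswap_mask
+; are pshufb byte-swap masks (per 64-bit lane and full 128-bit respectively);
+; L_aes_gcm_mod2_128 is the GHASH reduction constant
+; 0xC2000000000000000000000000000001.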
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt PROC
+        push	r13
+        push	rdi
+        push	rsi
+        push	r12
+        push	rbx
+        push	r14
+        push	r15
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r8, QWORD PTR [rsp+96]
+        mov	r9d, DWORD PTR [rsp+104]
+        mov	r11d, DWORD PTR [rsp+112]
+        mov	ebx, DWORD PTR [rsp+120]
+        mov	r14d, DWORD PTR [rsp+128]
+        mov	r15, QWORD PTR [rsp+136]
+        mov	r10d, DWORD PTR [rsp+144]
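+        ; Win64 ABI: the first four arguments arrive in rcx, rdx, r8 and r9;
+        ; the rest are on the stack, starting at [rsp+96] here (return address
+        ; + 32 bytes of shadow space + the 7 pushes above). The loads above
+        ; move them into the registers used throughout the function body.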
+        sub	rsp, 160
+        pxor	xmm4, xmm4
+        pxor	xmm6, xmm6
+        cmp	ebx, 12
+        mov	edx, ebx
+        jne	L_AES_GCM_encrypt_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov	ecx, 16777216
+        pinsrq	xmm4, QWORD PTR [rax], 0
+        pinsrd	xmm4, DWORD PTR [rax+8], 2
+        pinsrd	xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	xmm1, xmm4
+        movdqa	xmm5, OWORD PTR [r15]
+        pxor	xmm1, xmm5
+        movdqa	xmm7, OWORD PTR [r15+16]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+32]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+48]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+64]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+80]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+96]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+112]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+128]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+144]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	r10d, 11
+        movdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+176]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	r10d, 13
+        movdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+208]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_calc_iv_12_last:
+        aesenclast	xmm5, xmm7
+        aesenclast	xmm1, xmm7
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	[rsp+144], xmm1
+        jmp	L_AES_GCM_encrypt_iv_done
+L_AES_GCM_encrypt_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        movdqa	xmm5, OWORD PTR [r15]
+        aesenc	xmm5, [r15+16]
+        aesenc	xmm5, [r15+32]
+        aesenc	xmm5, [r15+48]
+        aesenc	xmm5, [r15+64]
+        aesenc	xmm5, [r15+80]
+        aesenc	xmm5, [r15+96]
+        aesenc	xmm5, [r15+112]
+        aesenc	xmm5, [r15+128]
+        aesenc	xmm5, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
+        aesenclast	xmm5, xmm9
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_encrypt_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_calc_iv_16_loop:
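+        ; GHASH one 16-byte IV block: byte-swap, XOR into the accumulator,
+        ; then Karatsuba carry-less multiply by H and reduce modulo
+        ; x^128 + x^7 + x^2 + x + 1.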
+        movdqu	xmm8, [rax+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_calc_iv_done
+L_AES_GCM_encrypt_calc_iv_lt16:
+        sub	rsp, 16
+        pxor	xmm8, xmm8
+        xor	ebx, ebx
+        movdqu	[rsp], xmm8
+L_AES_GCM_encrypt_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_calc_iv_loop
+        movdqu	xmm8, [rsp]
+        add	rsp, 16
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+L_AES_GCM_encrypt_calc_iv_done:
+        ; T = Encrypt counter
+        pxor	xmm0, xmm0
+        shl	edx, 3
+        pinsrq	xmm0, rdx, 0
+        pxor	xmm4, xmm0
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        movdqa	xmm8, OWORD PTR [r15]
+        pxor	xmm8, xmm4
+        aesenc	xmm8, [r15+16]
+        aesenc	xmm8, [r15+32]
+        aesenc	xmm8, [r15+48]
+        aesenc	xmm8, [r15+64]
+        aesenc	xmm8, [r15+80]
+        aesenc	xmm8, [r15+96]
+        aesenc	xmm8, [r15+112]
+        aesenc	xmm8, [r15+128]
+        aesenc	xmm8, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	[rsp+144], xmm8
+L_AES_GCM_encrypt_iv_done:
+        ; Additional authentication data
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_encrypt_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_calc_aad_16_loop:
+        movdqu	xmm8, [r12+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        pshufd	xmm1, xmm6, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm6, 17
+        pclmulqdq	xmm0, xmm6, 0
+        pxor	xmm1, xmm6
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm6
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm6, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm6, xmm2
+        por	xmm7, xmm0
+        por	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm6, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_calc_aad_done
+L_AES_GCM_encrypt_calc_aad_lt16:
+        sub	rsp, 16
+        pxor	xmm8, xmm8
+        xor	ebx, ebx
+        movdqu	[rsp], xmm8
+L_AES_GCM_encrypt_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_calc_aad_loop
+        movdqu	xmm8, [rsp]
+        add	rsp, 16
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        pshufd	xmm1, xmm6, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm6, 17
+        pclmulqdq	xmm0, xmm6, 0
+        pxor	xmm1, xmm6
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm6
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm6, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm6, xmm2
+        por	xmm7, xmm0
+        por	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm6, xmm2
+L_AES_GCM_encrypt_calc_aad_done:
+        ; Calculate counter and H
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm9, xmm5
+        paddd	xmm4, OWORD PTR L_aes_gcm_one
+        movdqa	xmm8, xmm5
+        movdqu	[rsp+128], xmm4
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        xor	rbx, rbx
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_encrypt_done_128
+        and	r13d, 4294967168
+        movdqa	xmm2, xmm6
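+        ; Precompute H^1..H^8 into [rsp]..[rsp+112] so the main loop can
+        ; GHASH eight blocks per iteration with an aggregated reduction.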
+        ; H ^ 1
+        movdqu	[rsp], xmm5
+        ; H ^ 2
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm5, 78
+        movdqa	xmm11, xmm5
+        movdqa	xmm8, xmm5
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm5
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm0, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm0, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm0, xmm14
+        movdqu	[rsp+16], xmm0
+        ; H ^ 3
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm1, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm1, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm1, xmm14
+        movdqu	[rsp+32], xmm1
+        ; H ^ 4
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm3, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm3, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm3, xmm14
+        movdqu	[rsp+48], xmm3
+        ; H ^ 5
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+64], xmm7
+        ; H ^ 6
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+80], xmm7
+        ; H ^ 7
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+96], xmm7
+        ; H ^ 8
+        pshufd	xmm9, xmm3, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm3, 17
+        pclmulqdq	xmm8, xmm3, 0
+        pxor	xmm9, xmm3
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+112], xmm7
+        ; First 128 bytes of input
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [r15]
+        movdqu	[rsp+128], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+16]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+32]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+48]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+64]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+80]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+96]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+112]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+128]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+144]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r10d, 11
+        movdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_enc_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r10d, 13
+        movdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_enc_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_enc_done:
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [rdi]
+        movdqu	xmm1, [rdi+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[rsi], xmm8
+        movdqu	[rsi+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [rdi+32]
+        movdqu	xmm1, [rdi+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[rsi+32], xmm10
+        movdqu	[rsi+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [rdi+64]
+        movdqu	xmm1, [rdi+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[rsi+64], xmm12
+        movdqu	[rsi+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [rdi+96]
+        movdqu	xmm1, [rdi+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[rsi+96], xmm14
+        movdqu	[rsi+112], xmm15
+        cmp	r13d, 128
+        mov	ebx, 128
+        jle	L_AES_GCM_encrypt_end_128
+        ; More 128 bytes of input
+L_AES_GCM_encrypt_ghash_128:
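+        ; Each iteration encrypts 8 counter blocks while GHASHing the previous
+        ; iteration's 8 ciphertext blocks (read back from [rdx-128..rdx-16])
+        ; against H^8..H^1, interleaving the pclmulqdq work with the AES rounds.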
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [r15]
+        movdqu	[rsp+128], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqu	xmm7, [rsp+112]
+        movdqu	xmm0, [rdx+-128]
+        aesenc	xmm8, [r15+16]
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm0, xmm2
+        pshufd	xmm1, xmm7, 78
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm1, xmm7
+        pxor	xmm5, xmm0
+        movdqa	xmm3, xmm0
+        pclmulqdq	xmm3, xmm7, 17
+        aesenc	xmm9, [r15+16]
+        aesenc	xmm10, [r15+16]
+        movdqa	xmm2, xmm0
+        pclmulqdq	xmm2, xmm7, 0
+        aesenc	xmm11, [r15+16]
+        aesenc	xmm12, [r15+16]
+        pclmulqdq	xmm1, xmm5, 0
+        aesenc	xmm13, [r15+16]
+        aesenc	xmm14, [r15+16]
+        aesenc	xmm15, [r15+16]
+        pxor	xmm1, xmm2
+        pxor	xmm1, xmm3
+        movdqu	xmm7, [rsp+96]
+        movdqu	xmm0, [rdx+-112]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+32]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+32]
+        aesenc	xmm10, [r15+32]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+32]
+        aesenc	xmm12, [r15+32]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+32]
+        aesenc	xmm14, [r15+32]
+        aesenc	xmm15, [r15+32]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+80]
+        movdqu	xmm0, [rdx+-96]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+48]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+48]
+        aesenc	xmm10, [r15+48]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+48]
+        aesenc	xmm12, [r15+48]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+48]
+        aesenc	xmm14, [r15+48]
+        aesenc	xmm15, [r15+48]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+64]
+        movdqu	xmm0, [rdx+-80]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+64]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+64]
+        aesenc	xmm10, [r15+64]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+64]
+        aesenc	xmm12, [r15+64]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+64]
+        aesenc	xmm14, [r15+64]
+        aesenc	xmm15, [r15+64]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+48]
+        movdqu	xmm0, [rdx+-64]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+80]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+80]
+        aesenc	xmm10, [r15+80]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+80]
+        aesenc	xmm12, [r15+80]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+80]
+        aesenc	xmm14, [r15+80]
+        aesenc	xmm15, [r15+80]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+32]
+        movdqu	xmm0, [rdx+-48]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+96]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+96]
+        aesenc	xmm10, [r15+96]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+96]
+        aesenc	xmm12, [r15+96]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+96]
+        aesenc	xmm14, [r15+96]
+        aesenc	xmm15, [r15+96]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+16]
+        movdqu	xmm0, [rdx+-32]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+112]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+112]
+        aesenc	xmm10, [r15+112]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+112]
+        aesenc	xmm12, [r15+112]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+112]
+        aesenc	xmm14, [r15+112]
+        aesenc	xmm15, [r15+112]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp]
+        movdqu	xmm0, [rdx+-16]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+128]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+128]
+        aesenc	xmm10, [r15+128]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+128]
+        aesenc	xmm12, [r15+128]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+128]
+        aesenc	xmm14, [r15+128]
+        aesenc	xmm15, [r15+128]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqa	xmm5, xmm1
+        psrldq	xmm1, 8
+        pslldq	xmm5, 8
+        aesenc	xmm8, [r15+144]
+        pxor	xmm2, xmm5
+        pxor	xmm3, xmm1
+        movdqa	xmm7, xmm2
+        movdqa	xmm4, xmm2
+        movdqa	xmm5, xmm2
+        aesenc	xmm9, [r15+144]
+        pslld	xmm7, 31
+        pslld	xmm4, 30
+        pslld	xmm5, 25
+        aesenc	xmm10, [r15+144]
+        pxor	xmm7, xmm4
+        pxor	xmm7, xmm5
+        aesenc	xmm11, [r15+144]
+        movdqa	xmm4, xmm7
+        pslldq	xmm7, 12
+        psrldq	xmm4, 4
+        aesenc	xmm12, [r15+144]
+        pxor	xmm2, xmm7
+        movdqa	xmm5, xmm2
+        movdqa	xmm1, xmm2
+        movdqa	xmm0, xmm2
+        aesenc	xmm13, [r15+144]
+        psrld	xmm5, 1
+        psrld	xmm1, 2
+        psrld	xmm0, 7
+        aesenc	xmm14, [r15+144]
+        pxor	xmm5, xmm1
+        pxor	xmm5, xmm0
+        aesenc	xmm15, [r15+144]
+        pxor	xmm5, xmm4
+        pxor	xmm2, xmm5
+        pxor	xmm2, xmm3
+        cmp	r10d, 11
+        movdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r10d, 13
+        movdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_aesenc_128_ghash_avx_done:
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [rcx]
+        movdqu	xmm1, [rcx+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[rdx], xmm8
+        movdqu	[rdx+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [rcx+32]
+        movdqu	xmm1, [rcx+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[rdx+32], xmm10
+        movdqu	[rdx+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [rcx+64]
+        movdqu	xmm1, [rcx+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[rdx+64], xmm12
+        movdqu	[rdx+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [rcx+96]
+        movdqu	xmm1, [rcx+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[rdx+96], xmm14
+        movdqu	[rdx+112], xmm15
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_ghash_128
+L_AES_GCM_encrypt_end_128:
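+        ; GHASH the final 8 ciphertext blocks from above (still in xmm8-xmm15)
+        ; against H^8..H^1 and perform the final reduction.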
+        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        pshufb	xmm8, xmm4
+        pshufb	xmm9, xmm4
+        pshufb	xmm10, xmm4
+        pshufb	xmm11, xmm4
+        pxor	xmm8, xmm2
+        pshufb	xmm12, xmm4
+        pshufb	xmm13, xmm4
+        pshufb	xmm14, xmm4
+        pshufb	xmm15, xmm4
+        movdqu	xmm7, [rsp+112]
+        pshufd	xmm1, xmm8, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm8, 17
+        pclmulqdq	xmm0, xmm8, 0
+        pxor	xmm1, xmm8
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm4, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+96]
+        pshufd	xmm1, xmm9, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm9, 17
+        pclmulqdq	xmm0, xmm9, 0
+        pxor	xmm1, xmm9
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+80]
+        pshufd	xmm1, xmm10, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm10, 17
+        pclmulqdq	xmm0, xmm10, 0
+        pxor	xmm1, xmm10
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+64]
+        pshufd	xmm1, xmm11, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm11, 17
+        pclmulqdq	xmm0, xmm11, 0
+        pxor	xmm1, xmm11
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+48]
+        pshufd	xmm1, xmm12, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm12, 17
+        pclmulqdq	xmm0, xmm12, 0
+        pxor	xmm1, xmm12
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+32]
+        pshufd	xmm1, xmm13, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm13, 17
+        pclmulqdq	xmm0, xmm13, 0
+        pxor	xmm1, xmm13
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+16]
+        pshufd	xmm1, xmm14, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm14, 17
+        pclmulqdq	xmm0, xmm14, 0
+        pxor	xmm1, xmm14
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp]
+        pshufd	xmm1, xmm15, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm15, 17
+        pclmulqdq	xmm0, xmm15, 0
+        pxor	xmm1, xmm15
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm4
+        movdqa	xmm1, xmm4
+        movdqa	xmm2, xmm4
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm4, xmm0
+        movdqa	xmm2, xmm4
+        movdqa	xmm3, xmm4
+        movdqa	xmm0, xmm4
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm4
+        pxor	xmm6, xmm2
+        movdqu	xmm5, [rsp]
+L_AES_GCM_encrypt_done_128:
+        mov	edx, r9d
+        cmp	ebx, edx
+        jge	L_AES_GCM_encrypt_done_enc
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_last_block_done
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [r15]
+        movdqu	[rsp+128], xmm9
+        aesenc	xmm8, [r15+16]
+        aesenc	xmm8, [r15+32]
+        aesenc	xmm8, [r15+48]
+        aesenc	xmm8, [r15+64]
+        aesenc	xmm8, [r15+80]
+        aesenc	xmm8, [r15+96]
+        aesenc	xmm8, [r15+112]
+        aesenc	xmm8, [r15+128]
+        aesenc	xmm8, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        add	ebx, 16
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_last_block_ghash
+L_AES_GCM_encrypt_last_block_start:
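+        ; Remaining full 16-byte blocks: encrypt one counter block per pass
+        ; while folding the previous ciphertext block into the GHASH state
+        ; (single gfmul, reduced via L_aes_gcm_mod2_128).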
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [r15]
+        movdqu	[rsp+128], xmm9
+        movdqa	xmm10, xmm6
+        pclmulqdq	xmm10, xmm5, 16
+        aesenc	xmm8, [r15+16]
+        aesenc	xmm8, [r15+32]
+        movdqa	xmm11, xmm6
+        pclmulqdq	xmm11, xmm5, 1
+        aesenc	xmm8, [r15+48]
+        aesenc	xmm8, [r15+64]
+        movdqa	xmm12, xmm6
+        pclmulqdq	xmm12, xmm5, 0
+        aesenc	xmm8, [r15+80]
+        movdqa	xmm1, xmm6
+        pclmulqdq	xmm1, xmm5, 17
+        aesenc	xmm8, [r15+96]
+        pxor	xmm10, xmm11
+        movdqa	xmm2, xmm10
+        psrldq	xmm10, 8
+        pslldq	xmm2, 8
+        aesenc	xmm8, [r15+112]
+        movdqa	xmm3, xmm1
+        pxor	xmm2, xmm12
+        pxor	xmm3, xmm10
+        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
+        movdqa	xmm11, xmm2
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [r15+128]
+        pshufd	xmm10, xmm2, 78
+        pxor	xmm10, xmm11
+        movdqa	xmm11, xmm10
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [r15+144]
+        pshufd	xmm6, xmm10, 78
+        pxor	xmm6, xmm11
+        pxor	xmm6, xmm3
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_aesenc_gfmul_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        add	ebx, 16
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_last_block_start
+L_AES_GCM_encrypt_last_block_ghash:
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+L_AES_GCM_encrypt_last_block_done:
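+        ; Trailing partial block (r9d mod 16 bytes): encrypt one more counter
+        ; block, XOR it with the remaining input bytes via a stack buffer,
+        ; zero-pad that buffer and fold it into the GHASH state.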
+        mov	ecx, r9d
+        mov	edx, ecx
+        and	ecx, 15
+        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
+        movdqu	xmm4, [rsp+128]
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+        pxor	xmm4, [r15]
+        aesenc	xmm4, [r15+16]
+        aesenc	xmm4, [r15+32]
+        aesenc	xmm4, [r15+48]
+        aesenc	xmm4, [r15+64]
+        aesenc	xmm4, [r15+80]
+        aesenc	xmm4, [r15+96]
+        aesenc	xmm4, [r15+112]
+        aesenc	xmm4, [r15+128]
+        aesenc	xmm4, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
+        aesenc	xmm4, xmm9
+        aesenc	xmm4, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
+        aesenc	xmm4, xmm9
+        aesenc	xmm4, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
+        aesenclast	xmm4, xmm9
+        sub	rsp, 16
+        xor	ecx, ecx
+        movdqu	[rsp], xmm4
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [rsi+rbx], r13b
+        mov	BYTE PTR [rsp+rcx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
+        xor	r13, r13
+        cmp	ecx, 16
+        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
+        mov	BYTE PTR [rsp+rcx], r13b
+        inc	ecx
+        cmp	ecx, 16
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
+        movdqu	xmm4, [rsp]
+        add	rsp, 16
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm4
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_done_enc:
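+        ; Finalise the tag: GHASH the 64-bit bit lengths of the plaintext and
+        ; AAD, byte-swap the result and XOR it with the encrypted initial
+        ; counter block saved at [rsp+144]; write r14d tag bytes, with a
+        ; 16-byte fast path.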
+        mov	edx, r9d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        pinsrq	xmm0, rdx, 0
+        pinsrq	xmm0, rcx, 1
+        pxor	xmm6, xmm0
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	xmm0, [rsp+144]
+        pxor	xmm0, xmm6
+        cmp	r14d, 16
+        je	L_AES_GCM_encrypt_store_tag_16
+        xor	rcx, rcx
+        movdqu	[rsp], xmm0
+L_AES_GCM_encrypt_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r8+rcx], r13b
+        inc	ecx
+        cmp	ecx, r14d
+        jne	L_AES_GCM_encrypt_store_tag_loop
+        jmp	L_AES_GCM_encrypt_store_tag_done
+L_AES_GCM_encrypt_store_tag_16:
+        movdqu	[r8], xmm0
+L_AES_GCM_encrypt_store_tag_done:
+        add	rsp, 160
+        pop	r15
+        pop	r14
+        pop	rbx
+        pop	r12
+        pop	rsi
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_encrypt ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt PROC
+        push	r13
+        push	rdi
+        push	rsi
+        push	r12
+        push	rbx
+        push	r14
+        push	r15
+        push	rbp
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r8, QWORD PTR [rsp+104]
+        mov	r9d, DWORD PTR [rsp+112]
+        mov	r11d, DWORD PTR [rsp+120]
+        mov	ebx, DWORD PTR [rsp+128]
+        mov	r14d, DWORD PTR [rsp+136]
+        mov	r15, QWORD PTR [rsp+144]
+        mov	r10d, DWORD PTR [rsp+152]
+        mov	rbp, QWORD PTR [rsp+160]
+        sub	rsp, 168
+        pxor	xmm4, xmm4
+        pxor	xmm6, xmm6
+        cmp	ebx, 12
+        mov	edx, ebx
+        jne	L_AES_GCM_decrypt_iv_not_12
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
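+        ; For a 96-bit IV, J0 = IV || 0x00000001; the constant 16777216
+        ; (0x01000000) supplies that final dword in the block's byte order.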
+        mov	ecx, 16777216
+        pinsrq	xmm4, QWORD PTR [rax], 0
+        pinsrd	xmm4, DWORD PTR [rax+8], 2
+        pinsrd	xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	xmm1, xmm4
+        movdqa	xmm5, OWORD PTR [r15]
+        pxor	xmm1, xmm5
+        movdqa	xmm7, OWORD PTR [r15+16]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+32]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+48]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+64]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+80]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+96]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+112]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+128]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+144]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	r10d, 11
+        movdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+176]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	r10d, 13
+        movdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+208]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_calc_iv_12_last:
+        aesenclast	xmm5, xmm7
+        aesenclast	xmm1, xmm7
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	[rsp+144], xmm1
+        jmp	L_AES_GCM_decrypt_iv_done
+L_AES_GCM_decrypt_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
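+        ; For IVs other than 96 bits, H = E_K(0^128) is computed first and J0
+        ; is derived below as GHASH_H(IV zero-padded || [len(IV) in bits]_64).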
+        movdqa	xmm5, OWORD PTR [r15]
+        aesenc	xmm5, [r15+16]
+        aesenc	xmm5, [r15+32]
+        aesenc	xmm5, [r15+48]
+        aesenc	xmm5, [r15+64]
+        aesenc	xmm5, [r15+80]
+        aesenc	xmm5, [r15+96]
+        aesenc	xmm5, [r15+112]
+        aesenc	xmm5, [r15+128]
+        aesenc	xmm5, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
+        aesenclast	xmm5, xmm9
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_decrypt_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_calc_iv_16_loop:
+        movdqu	xmm8, [rax+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_calc_iv_done
+L_AES_GCM_decrypt_calc_iv_lt16:
+        sub	rsp, 16
+        pxor	xmm8, xmm8
+        xor	ebx, ebx
+        movdqu	[rsp], xmm8
+L_AES_GCM_decrypt_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_calc_iv_loop
+        movdqu	xmm8, [rsp]
+        add	rsp, 16
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+L_AES_GCM_decrypt_calc_iv_done:
+        ; T = Encrypt counter
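+        ; XOR in the 64-bit IV bit length and run one more GHASH multiply to
+        ; finish J0, then encrypt J0; E_K(J0) is stored at [rsp+144] and later
+        ; XORed with the final GHASH value to form the authentication tag.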
+        pxor	xmm0, xmm0
+        shl	edx, 3
+        pinsrq	xmm0, rdx, 0
+        pxor	xmm4, xmm0
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        movdqa	xmm8, OWORD PTR [r15]
+        pxor	xmm8, xmm4
+        aesenc	xmm8, [r15+16]
+        aesenc	xmm8, [r15+32]
+        aesenc	xmm8, [r15+48]
+        aesenc	xmm8, [r15+64]
+        aesenc	xmm8, [r15+80]
+        aesenc	xmm8, [r15+96]
+        aesenc	xmm8, [r15+112]
+        aesenc	xmm8, [r15+128]
+        aesenc	xmm8, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	[rsp+144], xmm8
+L_AES_GCM_decrypt_iv_done:
+        ; Additional authentication data
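+        ; Absorb the AAD into the GHASH accumulator (xmm6) 16 bytes at a
+        ; time; a trailing partial block is zero-padded on the stack first.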
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_decrypt_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_calc_aad_16_loop:
+        movdqu	xmm8, [r12+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        pshufd	xmm1, xmm6, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm6, 17
+        pclmulqdq	xmm0, xmm6, 0
+        pxor	xmm1, xmm6
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm6
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm6, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm6, xmm2
+        por	xmm7, xmm0
+        por	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm6, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_calc_aad_done
+L_AES_GCM_decrypt_calc_aad_lt16:
+        sub	rsp, 16
+        pxor	xmm8, xmm8
+        xor	ebx, ebx
+        movdqu	[rsp], xmm8
+L_AES_GCM_decrypt_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_calc_aad_loop
+        movdqu	xmm8, [rsp]
+        add	rsp, 16
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        pshufd	xmm1, xmm6, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm6, 17
+        pclmulqdq	xmm0, xmm6, 0
+        pxor	xmm1, xmm6
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm6
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm6, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm6, xmm2
+        por	xmm7, xmm0
+        por	xmm6, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm6, xmm2
+L_AES_GCM_decrypt_calc_aad_done:
+        ; Calculate counter and H
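+        ; Advance the counter to J0 + 1 for the first data block and save it
+        ; at [rsp+128]; H is shifted left one bit, conditionally reduced by
+        ; the GCM polynomial (L_aes_gcm_mod2_128), and used as H^1 below.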
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm9, xmm5
+        paddd	xmm4, OWORD PTR L_aes_gcm_one
+        movdqa	xmm8, xmm5
+        movdqu	[rsp+128], xmm4
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        xor	ebx, ebx
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_decrypt_done_128
+        and	r13d, 4294967168
+        movdqa	xmm2, xmm6
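+        ; Precompute H^1..H^8 into [rsp]..[rsp+112] so the eight GHASH
+        ; multiplies in the 128-byte loop below can use descending powers of
+        ; H and be summed before a single reduction per iteration.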
+        ; H ^ 1
+        movdqu	[rsp], xmm5
+        ; H ^ 2
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm5, 78
+        movdqa	xmm11, xmm5
+        movdqa	xmm8, xmm5
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm5
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm0, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm0, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm0, xmm14
+        movdqu	[rsp+16], xmm0
+        ; H ^ 3
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm1, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm1, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm1, xmm14
+        movdqu	[rsp+32], xmm1
+        ; H ^ 4
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm3, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm3, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm3, xmm14
+        movdqu	[rsp+48], xmm3
+        ; H ^ 5
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+64], xmm7
+        ; H ^ 6
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+80], xmm7
+        ; H ^ 7
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+96], xmm7
+        ; H ^ 8
+        pshufd	xmm9, xmm3, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm3, 17
+        pclmulqdq	xmm8, xmm3, 0
+        pxor	xmm9, xmm3
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+112], xmm7
+L_AES_GCM_decrypt_ghash_128:
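+        ; Main loop: build eight counter blocks, run their AES rounds
+        ; interleaved with the GHASH of the eight ciphertext input blocks
+        ; (multiplied by H^8..H^1), then XOR the keystream with the
+        ; ciphertext to produce the plaintext.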
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [r15]
+        movdqu	[rsp+128], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqu	xmm7, [rsp+112]
+        movdqu	xmm0, [rcx]
+        aesenc	xmm8, [r15+16]
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm0, xmm2
+        pshufd	xmm1, xmm7, 78
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm1, xmm7
+        pxor	xmm5, xmm0
+        movdqa	xmm3, xmm0
+        pclmulqdq	xmm3, xmm7, 17
+        aesenc	xmm9, [r15+16]
+        aesenc	xmm10, [r15+16]
+        movdqa	xmm2, xmm0
+        pclmulqdq	xmm2, xmm7, 0
+        aesenc	xmm11, [r15+16]
+        aesenc	xmm12, [r15+16]
+        pclmulqdq	xmm1, xmm5, 0
+        aesenc	xmm13, [r15+16]
+        aesenc	xmm14, [r15+16]
+        aesenc	xmm15, [r15+16]
+        pxor	xmm1, xmm2
+        pxor	xmm1, xmm3
+        movdqu	xmm7, [rsp+96]
+        movdqu	xmm0, [rcx+16]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+32]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+32]
+        aesenc	xmm10, [r15+32]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+32]
+        aesenc	xmm12, [r15+32]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+32]
+        aesenc	xmm14, [r15+32]
+        aesenc	xmm15, [r15+32]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+80]
+        movdqu	xmm0, [rcx+32]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+48]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+48]
+        aesenc	xmm10, [r15+48]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+48]
+        aesenc	xmm12, [r15+48]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+48]
+        aesenc	xmm14, [r15+48]
+        aesenc	xmm15, [r15+48]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+64]
+        movdqu	xmm0, [rcx+48]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+64]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+64]
+        aesenc	xmm10, [r15+64]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+64]
+        aesenc	xmm12, [r15+64]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+64]
+        aesenc	xmm14, [r15+64]
+        aesenc	xmm15, [r15+64]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+48]
+        movdqu	xmm0, [rcx+64]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+80]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+80]
+        aesenc	xmm10, [r15+80]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+80]
+        aesenc	xmm12, [r15+80]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+80]
+        aesenc	xmm14, [r15+80]
+        aesenc	xmm15, [r15+80]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+32]
+        movdqu	xmm0, [rcx+80]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+96]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+96]
+        aesenc	xmm10, [r15+96]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+96]
+        aesenc	xmm12, [r15+96]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+96]
+        aesenc	xmm14, [r15+96]
+        aesenc	xmm15, [r15+96]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+16]
+        movdqu	xmm0, [rcx+96]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+112]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+112]
+        aesenc	xmm10, [r15+112]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+112]
+        aesenc	xmm12, [r15+112]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+112]
+        aesenc	xmm14, [r15+112]
+        aesenc	xmm15, [r15+112]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp]
+        movdqu	xmm0, [rcx+112]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [r15+128]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [r15+128]
+        aesenc	xmm10, [r15+128]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [r15+128]
+        aesenc	xmm12, [r15+128]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [r15+128]
+        aesenc	xmm14, [r15+128]
+        aesenc	xmm15, [r15+128]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqa	xmm5, xmm1
+        psrldq	xmm1, 8
+        pslldq	xmm5, 8
+        aesenc	xmm8, [r15+144]
+        pxor	xmm2, xmm5
+        pxor	xmm3, xmm1
+        movdqa	xmm7, xmm2
+        movdqa	xmm4, xmm2
+        movdqa	xmm5, xmm2
+        aesenc	xmm9, [r15+144]
+        pslld	xmm7, 31
+        pslld	xmm4, 30
+        pslld	xmm5, 25
+        aesenc	xmm10, [r15+144]
+        pxor	xmm7, xmm4
+        pxor	xmm7, xmm5
+        aesenc	xmm11, [r15+144]
+        movdqa	xmm4, xmm7
+        pslldq	xmm7, 12
+        psrldq	xmm4, 4
+        aesenc	xmm12, [r15+144]
+        pxor	xmm2, xmm7
+        movdqa	xmm5, xmm2
+        movdqa	xmm1, xmm2
+        movdqa	xmm0, xmm2
+        aesenc	xmm13, [r15+144]
+        psrld	xmm5, 1
+        psrld	xmm1, 2
+        psrld	xmm0, 7
+        aesenc	xmm14, [r15+144]
+        pxor	xmm5, xmm1
+        pxor	xmm5, xmm0
+        aesenc	xmm15, [r15+144]
+        pxor	xmm5, xmm4
+        pxor	xmm2, xmm5
+        pxor	xmm2, xmm3
+        cmp	r10d, 11
+        movdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r10d, 13
+        movdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_aesenc_128_ghash_avx_done:
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [rcx]
+        movdqu	xmm1, [rcx+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[rdx], xmm8
+        movdqu	[rdx+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [rcx+32]
+        movdqu	xmm1, [rcx+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[rdx+32], xmm10
+        movdqu	[rdx+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [rcx+64]
+        movdqu	xmm1, [rcx+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[rdx+64], xmm12
+        movdqu	[rdx+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [rcx+96]
+        movdqu	xmm1, [rcx+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[rdx+96], xmm14
+        movdqu	[rdx+112], xmm15
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_ghash_128
+        movdqa	xmm6, xmm2
+        movdqu	xmm5, [rsp]
+L_AES_GCM_decrypt_done_128:
+        mov	edx, r9d
+        cmp	ebx, edx
+        jge	L_AES_GCM_decrypt_done_dec
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_decrypt_last_block_done
+L_AES_GCM_decrypt_last_block_start:
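+        ; Remaining full blocks: fold the ciphertext block into GHASH while
+        ; the counter block is encrypted, then XOR the keystream with the
+        ; ciphertext to recover the plaintext.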
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        movdqu	xmm1, [rcx]
+        movdqa	xmm0, xmm5
+        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm1, xmm6
+        movdqu	xmm8, [rsp+128]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [r15]
+        movdqu	[rsp+128], xmm9
+        movdqa	xmm10, xmm1
+        pclmulqdq	xmm10, xmm0, 16
+        aesenc	xmm8, [r15+16]
+        aesenc	xmm8, [r15+32]
+        movdqa	xmm11, xmm1
+        pclmulqdq	xmm11, xmm0, 1
+        aesenc	xmm8, [r15+48]
+        aesenc	xmm8, [r15+64]
+        movdqa	xmm12, xmm1
+        pclmulqdq	xmm12, xmm0, 0
+        aesenc	xmm8, [r15+80]
+        movdqa	xmm1, xmm1
+        pclmulqdq	xmm1, xmm0, 17
+        aesenc	xmm8, [r15+96]
+        pxor	xmm10, xmm11
+        movdqa	xmm2, xmm10
+        psrldq	xmm10, 8
+        pslldq	xmm2, 8
+        aesenc	xmm8, [r15+112]
+        movdqa	xmm3, xmm1
+        pxor	xmm2, xmm12
+        pxor	xmm3, xmm10
+        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
+        movdqa	xmm11, xmm2
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [r15+128]
+        pshufd	xmm10, xmm2, 78
+        pxor	xmm10, xmm11
+        movdqa	xmm11, xmm10
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [r15+144]
+        pshufd	xmm6, xmm10, 78
+        pxor	xmm6, xmm11
+        pxor	xmm6, xmm3
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_aesenc_gfmul_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        add	ebx, 16
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_last_block_start
+L_AES_GCM_decrypt_last_block_done:
+        mov	ecx, r9d
+        mov	edx, ecx
+        and	ecx, 15
+        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
+        movdqu	xmm4, [rsp+128]
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+        pxor	xmm4, [r15]
+        aesenc	xmm4, [r15+16]
+        aesenc	xmm4, [r15+32]
+        aesenc	xmm4, [r15+48]
+        aesenc	xmm4, [r15+64]
+        aesenc	xmm4, [r15+80]
+        aesenc	xmm4, [r15+96]
+        aesenc	xmm4, [r15+112]
+        aesenc	xmm4, [r15+128]
+        aesenc	xmm4, [r15+144]
+        cmp	r10d, 11
+        movdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
+        aesenc	xmm4, xmm9
+        aesenc	xmm4, [r15+176]
+        cmp	r10d, 13
+        movdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
+        aesenc	xmm4, xmm9
+        aesenc	xmm4, [r15+208]
+        movdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
+        aesenclast	xmm4, xmm9
+        sub	rsp, 32
+        xor	ecx, ecx
+        movdqu	[rsp], xmm4
+        pxor	xmm0, xmm0
+        movdqu	[rsp+16], xmm0
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        mov	BYTE PTR [rsp+rcx+16], r13b
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [rsi+rbx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
+        movdqu	xmm4, [rsp+16]
+        add	rsp, 32
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm4
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_done_dec:
+        mov	edx, r9d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        pinsrq	xmm0, rdx, 0
+        pinsrq	xmm0, rcx, 1
+        pxor	xmm6, xmm0
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	xmm0, [rsp+144]
+        pxor	xmm0, xmm6
+        cmp	r14d, 16
+        je	L_AES_GCM_decrypt_cmp_tag_16
+        sub	rsp, 16
+        xor	rcx, rcx
+        xor	rbx, rbx
+        movdqu	[rsp], xmm0
+L_AES_GCM_decrypt_cmp_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        xor	r13b, BYTE PTR [r8+rcx]
+        or	bl, r13b
+        inc	ecx
+        cmp	ecx, r14d
+        jne	L_AES_GCM_decrypt_cmp_tag_loop
+        cmp	rbx, 0
+        sete	bl
+        add	rsp, 16
+        xor	rcx, rcx
+        jmp	L_AES_GCM_decrypt_cmp_tag_done
+L_AES_GCM_decrypt_cmp_tag_16:
+        movdqu	xmm1, [r8]
+        pcmpeqb	xmm0, xmm1
+        pmovmskb	rdx, xmm0
+        ; if edx == 0xFFFF (all 16 tag bytes equal) return 1, else return 0
+        xor	ebx, ebx
+        cmp	edx, 65535
+        sete	bl
+L_AES_GCM_decrypt_cmp_tag_done:
+        mov	DWORD PTR [rbp], ebx
+        add	rsp, 168
+        pop	rbp
+        pop	r15
+        pop	r14
+        pop	rbx
+        pop	r12
+        pop	rsi
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_decrypt ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_init_aesni PROC
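+        ; Computes the GHASH key H = E_K(0^128), the initial counter block
+        ; and E_K(J0), writing them out through pointer arguments taken from
+        ; the stack (Microsoft x64 convention: rcx, rdx, r8, r9, then stack).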
+        push	rdi
+        push	rsi
+        push	r12
+        push	r13
+        push	r14
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r10, r8
+        mov	r11d, r9d
+        mov	rax, QWORD PTR [rsp+80]
+        mov	r8, QWORD PTR [rsp+88]
+        mov	r9, QWORD PTR [rsp+96]
+        sub	rsp, 16
+        pxor	xmm4, xmm4
+        mov	edx, r11d
+        cmp	edx, 12
+        jne	L_AES_GCM_init_aesni_iv_not_12
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov	ecx, 16777216
+        pinsrq	xmm4, QWORD PTR [r10], 0
+        pinsrd	xmm4, DWORD PTR [r10+8], 2
+        pinsrd	xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	xmm1, xmm4
+        movdqa	xmm5, OWORD PTR [rdi]
+        pxor	xmm1, xmm5
+        movdqa	xmm7, OWORD PTR [rdi+16]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+32]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+48]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+64]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+80]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+96]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+112]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+128]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+144]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	esi, 11
+        movdqa	xmm7, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_aesni_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+176]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        cmp	esi, 13
+        movdqa	xmm7, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_aesni_calc_iv_12_last
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+208]
+        aesenc	xmm5, xmm7
+        aesenc	xmm1, xmm7
+        movdqa	xmm7, OWORD PTR [rdi+224]
+L_AES_GCM_init_aesni_calc_iv_12_last:
+        aesenclast	xmm5, xmm7
+        aesenclast	xmm1, xmm7
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	xmm15, xmm1
+        jmp	L_AES_GCM_init_aesni_iv_done
+L_AES_GCM_init_aesni_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        movdqa	xmm5, OWORD PTR [rdi]
+        aesenc	xmm5, [rdi+16]
+        aesenc	xmm5, [rdi+32]
+        aesenc	xmm5, [rdi+48]
+        aesenc	xmm5, [rdi+64]
+        aesenc	xmm5, [rdi+80]
+        aesenc	xmm5, [rdi+96]
+        aesenc	xmm5, [rdi+112]
+        aesenc	xmm5, [rdi+128]
+        aesenc	xmm5, [rdi+144]
+        cmp	esi, 11
+        movdqa	xmm9, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [rdi+176]
+        cmp	esi, 13
+        movdqa	xmm9, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+        aesenc	xmm5, xmm9
+        aesenc	xmm5, [rdi+208]
+        movdqa	xmm9, OWORD PTR [rdi+224]
+L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
+        aesenclast	xmm5, xmm9
+        pshufb	xmm5, OWORD PTR L_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_init_aesni_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_init_aesni_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_init_aesni_calc_iv_16_loop:
+        movdqu	xmm8, [r10+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_init_aesni_calc_iv_done
+L_AES_GCM_init_aesni_calc_iv_lt16:
+        sub	rsp, 16
+        pxor	xmm8, xmm8
+        xor	r13d, r13d
+        movdqu	[rsp], xmm8
+L_AES_GCM_init_aesni_calc_iv_loop:
+        movzx	r12d, BYTE PTR [r10+rcx]
+        mov	BYTE PTR [rsp+r13], r12b
+        inc	ecx
+        inc	r13d
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_aesni_calc_iv_loop
+        movdqu	xmm8, [rsp]
+        add	rsp, 16
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+L_AES_GCM_init_aesni_calc_iv_done:
+        ; T = Encrypt counter
+        pxor	xmm0, xmm0
+        shl	edx, 3
+        pinsrq	xmm0, rdx, 0
+        pxor	xmm4, xmm0
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm7, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm7, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm7, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm7, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm7
+        movdqa	xmm1, xmm7
+        movdqa	xmm2, xmm7
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm7, xmm0
+        movdqa	xmm2, xmm7
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm7
+        pxor	xmm4, xmm2
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        movdqa	xmm8, OWORD PTR [rdi]
+        pxor	xmm8, xmm4
+        aesenc	xmm8, [rdi+16]
+        aesenc	xmm8, [rdi+32]
+        aesenc	xmm8, [rdi+48]
+        aesenc	xmm8, [rdi+64]
+        aesenc	xmm8, [rdi+80]
+        aesenc	xmm8, [rdi+96]
+        aesenc	xmm8, [rdi+112]
+        aesenc	xmm8, [rdi+128]
+        aesenc	xmm8, [rdi+144]
+        cmp	esi, 11
+        movdqa	xmm9, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rdi+176]
+        cmp	esi, 13
+        movdqa	xmm9, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rdi+208]
+        movdqa	xmm9, OWORD PTR [rdi+224]
+L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm15, xmm8
+L_AES_GCM_init_aesni_iv_done:
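+        ;   Store the encrypted counter block (tag mask), H, and the incremented counter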
+        movdqa	OWORD PTR [r9], xmm15
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm4, OWORD PTR L_aes_gcm_one
+        movdqa	OWORD PTR [rax], xmm5
+        movdqa	OWORD PTR [r8], xmm4
+        add	rsp, 16
+        pop	r14
+        pop	r13
+        pop	r12
+        pop	rsi
+        pop	rdi
+        ret
+AES_GCM_init_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
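+; GHASH the additional authenticated data (AAD) into the hash state, 16 bytes at a time.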
+AES_GCM_aad_update_aesni PROC
+        mov	rax, rcx
+        movdqa	xmm5, OWORD PTR [r8]
+        movdqa	xmm6, OWORD PTR [r9]
+        xor	ecx, ecx
+L_AES_GCM_aad_update_aesni_16_loop:
+        movdqu	xmm8, [rax+rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm5, xmm8
+        pshufd	xmm1, xmm5, 78
+        pshufd	xmm2, xmm6, 78
+        movdqa	xmm3, xmm6
+        movdqa	xmm0, xmm6
+        pclmulqdq	xmm3, xmm5, 17
+        pclmulqdq	xmm0, xmm5, 0
+        pxor	xmm1, xmm5
+        pxor	xmm2, xmm6
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm4, xmm0
+        movdqa	xmm5, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm5, xmm1
+        movdqa	xmm0, xmm4
+        movdqa	xmm1, xmm5
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm4, 1
+        pslld	xmm5, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm5, xmm2
+        por	xmm4, xmm0
+        por	xmm5, xmm1
+        movdqa	xmm0, xmm4
+        movdqa	xmm1, xmm4
+        movdqa	xmm2, xmm4
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm4, xmm0
+        movdqa	xmm2, xmm4
+        movdqa	xmm3, xmm4
+        movdqa	xmm0, xmm4
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm4
+        pxor	xmm5, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_aad_update_aesni_16_loop
+        movdqa	OWORD PTR [r8], xmm5
+        ret
+AES_GCM_aad_update_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
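+; Encrypt one 16-byte block in CTR mode: encrypt the counter, XOR with the input,
+; write the output and advance the counter.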
+AES_GCM_encrypt_block_aesni PROC
+        mov	r10, r8
+        mov	r11, r9
+        mov	rax, QWORD PTR [rsp+40]
+        movdqu	xmm8, [rax]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [rcx]
+        movdqu	[rax], xmm9
+        aesenc	xmm8, [rcx+16]
+        aesenc	xmm8, [rcx+32]
+        aesenc	xmm8, [rcx+48]
+        aesenc	xmm8, [rcx+64]
+        aesenc	xmm8, [rcx+80]
+        aesenc	xmm8, [rcx+96]
+        aesenc	xmm8, [rcx+112]
+        aesenc	xmm8, [rcx+128]
+        aesenc	xmm8, [rcx+144]
+        cmp	edx, 11
+        movdqa	xmm9, OWORD PTR [rcx+160]
+        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rcx+176]
+        cmp	edx, 13
+        movdqa	xmm9, OWORD PTR [rcx+192]
+        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rcx+208]
+        movdqa	xmm9, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [r11]
+        pxor	xmm8, xmm9
+        movdqu	[r10], xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        ret
+AES_GCM_encrypt_block_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
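+; GHASH a single 16-byte block into the hash state using the key H.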
+AES_GCM_ghash_block_aesni PROC
+        movdqa	xmm4, OWORD PTR [rdx]
+        movdqa	xmm5, OWORD PTR [r8]
+        movdqu	xmm8, [rcx]
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm4, xmm8
+        pshufd	xmm1, xmm4, 78
+        pshufd	xmm2, xmm5, 78
+        movdqa	xmm3, xmm5
+        movdqa	xmm0, xmm5
+        pclmulqdq	xmm3, xmm4, 17
+        pclmulqdq	xmm0, xmm4, 0
+        pxor	xmm1, xmm4
+        pxor	xmm2, xmm5
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm6, xmm0
+        movdqa	xmm4, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm6, xmm2
+        pxor	xmm4, xmm1
+        movdqa	xmm0, xmm6
+        movdqa	xmm1, xmm4
+        psrld	xmm0, 31
+        psrld	xmm1, 31
+        pslld	xmm6, 1
+        pslld	xmm4, 1
+        movdqa	xmm2, xmm0
+        pslldq	xmm0, 4
+        psrldq	xmm2, 12
+        pslldq	xmm1, 4
+        por	xmm4, xmm2
+        por	xmm6, xmm0
+        por	xmm4, xmm1
+        movdqa	xmm0, xmm6
+        movdqa	xmm1, xmm6
+        movdqa	xmm2, xmm6
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm6, xmm0
+        movdqa	xmm2, xmm6
+        movdqa	xmm3, xmm6
+        movdqa	xmm0, xmm6
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm6
+        pxor	xmm4, xmm2
+        movdqa	OWORD PTR [rdx], xmm4
+        ret
+AES_GCM_ghash_block_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
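+; Encrypt and authenticate the plaintext: 128 bytes per iteration using eight
+; interleaved AES-CTR streams, then any remaining 16-byte blocks.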
+AES_GCM_encrypt_update_aesni PROC
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	r15, QWORD PTR [rsp+104]
+        sub	rsp, 160
+        movdqa	xmm6, OWORD PTR [r12]
+        movdqa	xmm5, OWORD PTR [r14]
+        movdqa	xmm9, xmm5
+        movdqa	xmm8, xmm5
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        xor	rdi, rdi
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_encrypt_update_aesni_done_128
+        and	r13d, 4294967168
+        movdqa	xmm2, xmm6
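+        ; Precompute H^1..H^8 on the stack for the 8-block GHASH below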
+        ; H ^ 1
+        movdqu	[rsp], xmm5
+        ; H ^ 2
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm5, 78
+        movdqa	xmm11, xmm5
+        movdqa	xmm8, xmm5
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm5
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm0, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm0, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm0, xmm14
+        movdqu	[rsp+16], xmm0
+        ; H ^ 3
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm1, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm1, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm1, xmm14
+        movdqu	[rsp+32], xmm1
+        ; H ^ 4
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm3, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm3, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm3, xmm14
+        movdqu	[rsp+48], xmm3
+        ; H ^ 5
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+64], xmm7
+        ; H ^ 6
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+80], xmm7
+        ; H ^ 7
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+96], xmm7
+        ; H ^ 8
+        pshufd	xmm9, xmm3, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm3, 17
+        pclmulqdq	xmm8, xmm3, 0
+        pxor	xmm9, xmm3
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+112], xmm7
+        ; First 128 bytes of input
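+        ;   Encrypt the first 8 counter blocks; this ciphertext is GHASHed on the next pass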
+        movdqu	xmm8, [r15]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [rax]
+        movdqu	[r15], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+16]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+32]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+48]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+64]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+80]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+96]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+112]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+128]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+144]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r8d, 11
+        movdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_aesni_enc_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r8d, 13
+        movdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_aesni_enc_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_aesni_enc_done:
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [r11]
+        movdqu	xmm1, [r11+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[r10], xmm8
+        movdqu	[r10+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [r11+32]
+        movdqu	xmm1, [r11+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[r10+32], xmm10
+        movdqu	[r10+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [r11+64]
+        movdqu	xmm1, [r11+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[r10+64], xmm12
+        movdqu	[r10+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [r11+96]
+        movdqu	xmm1, [r11+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[r10+96], xmm14
+        movdqu	[r10+112], xmm15
+        cmp	r13d, 128
+        mov	edi, 128
+        jle	L_AES_GCM_encrypt_update_aesni_end_128
+        ; More 128 bytes of input
+L_AES_GCM_encrypt_update_aesni_ghash_128:
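+        ;   GHASH the previous 128 bytes of ciphertext while encrypting the next 8 counter blocks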
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        movdqu	xmm8, [r15]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [rax]
+        movdqu	[r15], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqu	xmm7, [rsp+112]
+        movdqu	xmm0, [rdx+-128]
+        aesenc	xmm8, [rax+16]
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm0, xmm2
+        pshufd	xmm1, xmm7, 78
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm1, xmm7
+        pxor	xmm5, xmm0
+        movdqa	xmm3, xmm0
+        pclmulqdq	xmm3, xmm7, 17
+        aesenc	xmm9, [rax+16]
+        aesenc	xmm10, [rax+16]
+        movdqa	xmm2, xmm0
+        pclmulqdq	xmm2, xmm7, 0
+        aesenc	xmm11, [rax+16]
+        aesenc	xmm12, [rax+16]
+        pclmulqdq	xmm1, xmm5, 0
+        aesenc	xmm13, [rax+16]
+        aesenc	xmm14, [rax+16]
+        aesenc	xmm15, [rax+16]
+        pxor	xmm1, xmm2
+        pxor	xmm1, xmm3
+        movdqu	xmm7, [rsp+96]
+        movdqu	xmm0, [rdx+-112]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+32]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+32]
+        aesenc	xmm10, [rax+32]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+32]
+        aesenc	xmm12, [rax+32]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+32]
+        aesenc	xmm14, [rax+32]
+        aesenc	xmm15, [rax+32]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+80]
+        movdqu	xmm0, [rdx+-96]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+48]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+48]
+        aesenc	xmm10, [rax+48]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+48]
+        aesenc	xmm12, [rax+48]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+48]
+        aesenc	xmm14, [rax+48]
+        aesenc	xmm15, [rax+48]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+64]
+        movdqu	xmm0, [rdx+-80]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+64]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+64]
+        aesenc	xmm10, [rax+64]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+64]
+        aesenc	xmm12, [rax+64]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+64]
+        aesenc	xmm14, [rax+64]
+        aesenc	xmm15, [rax+64]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+48]
+        movdqu	xmm0, [rdx+-64]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+80]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+80]
+        aesenc	xmm10, [rax+80]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+80]
+        aesenc	xmm12, [rax+80]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+80]
+        aesenc	xmm14, [rax+80]
+        aesenc	xmm15, [rax+80]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+32]
+        movdqu	xmm0, [rdx+-48]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+96]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+96]
+        aesenc	xmm10, [rax+96]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+96]
+        aesenc	xmm12, [rax+96]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+96]
+        aesenc	xmm14, [rax+96]
+        aesenc	xmm15, [rax+96]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+16]
+        movdqu	xmm0, [rdx+-32]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+112]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+112]
+        aesenc	xmm10, [rax+112]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+112]
+        aesenc	xmm12, [rax+112]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+112]
+        aesenc	xmm14, [rax+112]
+        aesenc	xmm15, [rax+112]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp]
+        movdqu	xmm0, [rdx+-16]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+128]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+128]
+        aesenc	xmm10, [rax+128]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+128]
+        aesenc	xmm12, [rax+128]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+128]
+        aesenc	xmm14, [rax+128]
+        aesenc	xmm15, [rax+128]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqa	xmm5, xmm1
+        psrldq	xmm1, 8
+        pslldq	xmm5, 8
+        aesenc	xmm8, [rax+144]
+        pxor	xmm2, xmm5
+        pxor	xmm3, xmm1
+        movdqa	xmm7, xmm2
+        movdqa	xmm4, xmm2
+        movdqa	xmm5, xmm2
+        aesenc	xmm9, [rax+144]
+        pslld	xmm7, 31
+        pslld	xmm4, 30
+        pslld	xmm5, 25
+        aesenc	xmm10, [rax+144]
+        pxor	xmm7, xmm4
+        pxor	xmm7, xmm5
+        aesenc	xmm11, [rax+144]
+        movdqa	xmm4, xmm7
+        pslldq	xmm7, 12
+        psrldq	xmm4, 4
+        aesenc	xmm12, [rax+144]
+        pxor	xmm2, xmm7
+        movdqa	xmm5, xmm2
+        movdqa	xmm1, xmm2
+        movdqa	xmm0, xmm2
+        aesenc	xmm13, [rax+144]
+        psrld	xmm5, 1
+        psrld	xmm1, 2
+        psrld	xmm0, 7
+        aesenc	xmm14, [rax+144]
+        pxor	xmm5, xmm1
+        pxor	xmm5, xmm0
+        aesenc	xmm15, [rax+144]
+        pxor	xmm5, xmm4
+        pxor	xmm2, xmm5
+        pxor	xmm2, xmm3
+        cmp	r8d, 11
+        movdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r8d, 13
+        movdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_128_ghash_avx_done:
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [rcx]
+        movdqu	xmm1, [rcx+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[rdx], xmm8
+        movdqu	[rdx+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [rcx+32]
+        movdqu	xmm1, [rcx+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[rdx+32], xmm10
+        movdqu	[rdx+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [rcx+64]
+        movdqu	xmm1, [rcx+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[rdx+64], xmm12
+        movdqu	[rdx+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [rcx+96]
+        movdqu	xmm1, [rcx+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[rdx+96], xmm14
+        movdqu	[rdx+112], xmm15
+        add	edi, 128
+        cmp	edi, r13d
+        jl	L_AES_GCM_encrypt_update_aesni_ghash_128
+L_AES_GCM_encrypt_update_aesni_end_128:
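+        ; GHASH the last 128 bytes of ciphertext still held in xmm8-xmm15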
+        movdqa	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        pshufb	xmm8, xmm4
+        pshufb	xmm9, xmm4
+        pshufb	xmm10, xmm4
+        pshufb	xmm11, xmm4
+        pxor	xmm8, xmm2
+        pshufb	xmm12, xmm4
+        pshufb	xmm13, xmm4
+        pshufb	xmm14, xmm4
+        pshufb	xmm15, xmm4
+        movdqu	xmm7, [rsp+112]
+        pshufd	xmm1, xmm8, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm8, 17
+        pclmulqdq	xmm0, xmm8, 0
+        pxor	xmm1, xmm8
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        movdqa	xmm4, xmm0
+        movdqa	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+96]
+        pshufd	xmm1, xmm9, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm9, 17
+        pclmulqdq	xmm0, xmm9, 0
+        pxor	xmm1, xmm9
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+80]
+        pshufd	xmm1, xmm10, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm10, 17
+        pclmulqdq	xmm0, xmm10, 0
+        pxor	xmm1, xmm10
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+64]
+        pshufd	xmm1, xmm11, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm11, 17
+        pclmulqdq	xmm0, xmm11, 0
+        pxor	xmm1, xmm11
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+48]
+        pshufd	xmm1, xmm12, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm12, 17
+        pclmulqdq	xmm0, xmm12, 0
+        pxor	xmm1, xmm12
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+32]
+        pshufd	xmm1, xmm13, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm13, 17
+        pclmulqdq	xmm0, xmm13, 0
+        pxor	xmm1, xmm13
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp+16]
+        pshufd	xmm1, xmm14, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm14, 17
+        pclmulqdq	xmm0, xmm14, 0
+        pxor	xmm1, xmm14
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqu	xmm7, [rsp]
+        pshufd	xmm1, xmm15, 78
+        pshufd	xmm2, xmm7, 78
+        movdqa	xmm3, xmm7
+        movdqa	xmm0, xmm7
+        pclmulqdq	xmm3, xmm15, 17
+        pclmulqdq	xmm0, xmm15, 0
+        pxor	xmm1, xmm15
+        pxor	xmm2, xmm7
+        pclmulqdq	xmm1, xmm2, 0
+        pxor	xmm1, xmm0
+        pxor	xmm1, xmm3
+        movdqa	xmm2, xmm1
+        pxor	xmm4, xmm0
+        pxor	xmm6, xmm3
+        pslldq	xmm2, 8
+        psrldq	xmm1, 8
+        pxor	xmm4, xmm2
+        pxor	xmm6, xmm1
+        movdqa	xmm0, xmm4
+        movdqa	xmm1, xmm4
+        movdqa	xmm2, xmm4
+        pslld	xmm0, 31
+        pslld	xmm1, 30
+        pslld	xmm2, 25
+        pxor	xmm0, xmm1
+        pxor	xmm0, xmm2
+        movdqa	xmm1, xmm0
+        psrldq	xmm1, 4
+        pslldq	xmm0, 12
+        pxor	xmm4, xmm0
+        movdqa	xmm2, xmm4
+        movdqa	xmm3, xmm4
+        movdqa	xmm0, xmm4
+        psrld	xmm2, 1
+        psrld	xmm3, 2
+        psrld	xmm0, 7
+        pxor	xmm2, xmm3
+        pxor	xmm2, xmm0
+        pxor	xmm2, xmm1
+        pxor	xmm2, xmm4
+        pxor	xmm6, xmm2
+        movdqu	xmm5, [rsp]
+L_AES_GCM_encrypt_update_aesni_done_128:
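+        ; Encrypt and GHASH any remaining data, one 16-byte block at a time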
+        mov	edx, r9d
+        cmp	edi, edx
+        jge	L_AES_GCM_encrypt_update_aesni_done_enc
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	edi, r13d
+        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        movdqu	xmm8, [r15]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [rax]
+        movdqu	[r15], xmm9
+        aesenc	xmm8, [rax+16]
+        aesenc	xmm8, [rax+32]
+        aesenc	xmm8, [rax+48]
+        aesenc	xmm8, [rax+64]
+        aesenc	xmm8, [rax+80]
+        aesenc	xmm8, [rax+96]
+        aesenc	xmm8, [rax+112]
+        aesenc	xmm8, [rax+128]
+        aesenc	xmm8, [rax+144]
+        cmp	r8d, 11
+        movdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+176]
+        cmp	r8d, 13
+        movdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+208]
+        movdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        add	edi, 16
+        cmp	edi, r13d
+        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
+L_AES_GCM_encrypt_update_aesni_last_block_start:
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        movdqu	xmm8, [r15]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [rax]
+        movdqu	[r15], xmm9
+        movdqa	xmm10, xmm6
+        pclmulqdq	xmm10, xmm5, 16
+        aesenc	xmm8, [rax+16]
+        aesenc	xmm8, [rax+32]
+        movdqa	xmm11, xmm6
+        pclmulqdq	xmm11, xmm5, 1
+        aesenc	xmm8, [rax+48]
+        aesenc	xmm8, [rax+64]
+        movdqa	xmm12, xmm6
+        pclmulqdq	xmm12, xmm5, 0
+        aesenc	xmm8, [rax+80]
+        movdqa	xmm1, xmm6
+        pclmulqdq	xmm1, xmm5, 17
+        aesenc	xmm8, [rax+96]
+        pxor	xmm10, xmm11
+        movdqa	xmm2, xmm10
+        psrldq	xmm10, 8
+        pslldq	xmm2, 8
+        aesenc	xmm8, [rax+112]
+        movdqa	xmm3, xmm1
+        pxor	xmm2, xmm12
+        pxor	xmm3, xmm10
+        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
+        movdqa	xmm11, xmm2
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [rax+128]
+        pshufd	xmm10, xmm2, 78
+        pxor	xmm10, xmm11
+        movdqa	xmm11, xmm10
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [rax+144]
+        pshufd	xmm6, xmm10, 78
+        pxor	xmm6, xmm11
+        pxor	xmm6, xmm3
+        cmp	r8d, 11
+        movdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+176]
+        cmp	r8d, 13
+        movdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+208]
+        movdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm6, xmm8
+        add	edi, 16
+        cmp	edi, r13d
+        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
+L_AES_GCM_encrypt_update_aesni_last_block_ghash:
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+L_AES_GCM_encrypt_update_aesni_last_block_done:
+L_AES_GCM_encrypt_update_aesni_done_enc:
+        movdqa	OWORD PTR [r12], xmm6
+        add	rsp, 160
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_update_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
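+; Compute the authentication tag: GHASH the AAD and ciphertext bit lengths, then
+; XOR with the encrypted initial counter block and write out the requested tag bytes.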
+AES_GCM_encrypt_final_aesni PROC
+        push	r13
+        push	r12
+        push	r14
+        mov	rax, rcx
+        mov	r10d, r9d
+        mov	r9, rdx
+        mov	r11d, DWORD PTR [rsp+64]
+        mov	r12, QWORD PTR [rsp+72]
+        mov	r14, QWORD PTR [rsp+80]
+        sub	rsp, 16
+        movdqa	xmm4, OWORD PTR [rax]
+        movdqa	xmm5, OWORD PTR [r12]
+        movdqa	xmm6, OWORD PTR [r14]
+        movdqa	xmm9, xmm5
+        movdqa	xmm8, xmm5
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        mov	edx, r10d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        pinsrq	xmm0, rdx, 0
+        pinsrq	xmm0, rcx, 1
+        pxor	xmm4, xmm0
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm4, 78
+        movdqa	xmm11, xmm4
+        movdqa	xmm8, xmm4
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm4
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm4, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm4, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm4, xmm14
+        pshufb	xmm4, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	xmm0, xmm6
+        pxor	xmm0, xmm4
+        cmp	r8d, 16
+        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
+        xor	rcx, rcx
+        movdqu	[rsp], xmm0
+L_AES_GCM_encrypt_final_aesni_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r9+rcx], r13b
+        inc	ecx
+        cmp	ecx, r8d
+        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
+L_AES_GCM_encrypt_final_aesni_store_tag_16:
+        movdqu	[r9], xmm0
+L_AES_GCM_encrypt_final_aesni_store_tag_done:
+        add	rsp, 16
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_final_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
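+; Decrypt and authenticate the ciphertext: same structure as AES_GCM_encrypt_update_aesni,
+; with GHASH applied to the incoming ciphertext.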
+AES_GCM_decrypt_update_aesni PROC
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        push	rsi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+88]
+        mov	r12, QWORD PTR [rsp+96]
+        mov	r14, QWORD PTR [rsp+104]
+        mov	r15, QWORD PTR [rsp+112]
+        sub	rsp, 168
+        movdqa	xmm6, OWORD PTR [r12]
+        movdqa	xmm5, OWORD PTR [r14]
+        movdqa	xmm9, xmm5
+        movdqa	xmm8, xmm5
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        xor	edi, edi
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_decrypt_update_aesni_done_128
+        and	r13d, 4294967168
+        movdqa	xmm2, xmm6
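+        ; Precompute H^1..H^8 on the stack for the 8-block GHASH below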
+        ; H ^ 1
+        movdqu	[rsp], xmm5
+        ; H ^ 2
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm5, 78
+        movdqa	xmm11, xmm5
+        movdqa	xmm8, xmm5
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm5
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm0, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm0, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm0, xmm14
+        movdqu	[rsp+16], xmm0
+        ; H ^ 3
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm1, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm1, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm1, xmm14
+        movdqu	[rsp+32], xmm1
+        ; H ^ 4
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm0, 78
+        movdqa	xmm11, xmm0
+        movdqa	xmm8, xmm0
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm0
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm3, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm3, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm3, xmm14
+        movdqu	[rsp+48], xmm3
+        ; H ^ 5
+        pshufd	xmm9, xmm0, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm0, 17
+        pclmulqdq	xmm8, xmm0, 0
+        pxor	xmm9, xmm0
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+64], xmm7
+        ; H ^ 6
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm1, 78
+        movdqa	xmm11, xmm1
+        movdqa	xmm8, xmm1
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm1
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+80], xmm7
+        ; H ^ 7
+        pshufd	xmm9, xmm1, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm1, 17
+        pclmulqdq	xmm8, xmm1, 0
+        pxor	xmm9, xmm1
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+96], xmm7
+        ; H ^ 8
+        pshufd	xmm9, xmm3, 78
+        pshufd	xmm10, xmm3, 78
+        movdqa	xmm11, xmm3
+        movdqa	xmm8, xmm3
+        pclmulqdq	xmm11, xmm3, 17
+        pclmulqdq	xmm8, xmm3, 0
+        pxor	xmm9, xmm3
+        pxor	xmm10, xmm3
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm7, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm7, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm7, xmm14
+        movdqu	[rsp+112], xmm7
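+        ; Decrypt 128 bytes per iteration: GHASH the ciphertext while generating the keystream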
+L_AES_GCM_decrypt_update_aesni_ghash_128:
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        movdqu	xmm8, [r15]
+        movdqa	xmm1, OWORD PTR L_aes_gcm_bswap_epi64
+        movdqa	xmm0, xmm8
+        pshufb	xmm8, xmm1
+        movdqa	xmm9, xmm0
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pshufb	xmm9, xmm1
+        movdqa	xmm10, xmm0
+        paddd	xmm10, OWORD PTR L_aes_gcm_two
+        pshufb	xmm10, xmm1
+        movdqa	xmm11, xmm0
+        paddd	xmm11, OWORD PTR L_aes_gcm_three
+        pshufb	xmm11, xmm1
+        movdqa	xmm12, xmm0
+        paddd	xmm12, OWORD PTR L_aes_gcm_four
+        pshufb	xmm12, xmm1
+        movdqa	xmm13, xmm0
+        paddd	xmm13, OWORD PTR L_aes_gcm_five
+        pshufb	xmm13, xmm1
+        movdqa	xmm14, xmm0
+        paddd	xmm14, OWORD PTR L_aes_gcm_six
+        pshufb	xmm14, xmm1
+        movdqa	xmm15, xmm0
+        paddd	xmm15, OWORD PTR L_aes_gcm_seven
+        pshufb	xmm15, xmm1
+        paddd	xmm0, OWORD PTR L_aes_gcm_eight
+        movdqa	xmm7, OWORD PTR [rax]
+        movdqu	[r15], xmm0
+        pxor	xmm8, xmm7
+        pxor	xmm9, xmm7
+        pxor	xmm10, xmm7
+        pxor	xmm11, xmm7
+        pxor	xmm12, xmm7
+        pxor	xmm13, xmm7
+        pxor	xmm14, xmm7
+        pxor	xmm15, xmm7
+        movdqu	xmm7, [rsp+112]
+        movdqu	xmm0, [rcx]
+        aesenc	xmm8, [rax+16]
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm0, xmm2
+        pshufd	xmm1, xmm7, 78
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm1, xmm7
+        pxor	xmm5, xmm0
+        movdqa	xmm3, xmm0
+        pclmulqdq	xmm3, xmm7, 17
+        aesenc	xmm9, [rax+16]
+        aesenc	xmm10, [rax+16]
+        movdqa	xmm2, xmm0
+        pclmulqdq	xmm2, xmm7, 0
+        aesenc	xmm11, [rax+16]
+        aesenc	xmm12, [rax+16]
+        pclmulqdq	xmm1, xmm5, 0
+        aesenc	xmm13, [rax+16]
+        aesenc	xmm14, [rax+16]
+        aesenc	xmm15, [rax+16]
+        pxor	xmm1, xmm2
+        pxor	xmm1, xmm3
+        movdqu	xmm7, [rsp+96]
+        movdqu	xmm0, [rcx+16]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+32]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+32]
+        aesenc	xmm10, [rax+32]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+32]
+        aesenc	xmm12, [rax+32]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+32]
+        aesenc	xmm14, [rax+32]
+        aesenc	xmm15, [rax+32]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+80]
+        movdqu	xmm0, [rcx+32]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+48]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+48]
+        aesenc	xmm10, [rax+48]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+48]
+        aesenc	xmm12, [rax+48]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+48]
+        aesenc	xmm14, [rax+48]
+        aesenc	xmm15, [rax+48]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+64]
+        movdqu	xmm0, [rcx+48]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+64]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+64]
+        aesenc	xmm10, [rax+64]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+64]
+        aesenc	xmm12, [rax+64]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+64]
+        aesenc	xmm14, [rax+64]
+        aesenc	xmm15, [rax+64]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+48]
+        movdqu	xmm0, [rcx+64]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+80]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+80]
+        aesenc	xmm10, [rax+80]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+80]
+        aesenc	xmm12, [rax+80]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+80]
+        aesenc	xmm14, [rax+80]
+        aesenc	xmm15, [rax+80]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+32]
+        movdqu	xmm0, [rcx+80]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+96]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+96]
+        aesenc	xmm10, [rax+96]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+96]
+        aesenc	xmm12, [rax+96]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+96]
+        aesenc	xmm14, [rax+96]
+        aesenc	xmm15, [rax+96]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp+16]
+        movdqu	xmm0, [rcx+96]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+112]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+112]
+        aesenc	xmm10, [rax+112]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+112]
+        aesenc	xmm12, [rax+112]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+112]
+        aesenc	xmm14, [rax+112]
+        aesenc	xmm15, [rax+112]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
+        movdqu	xmm7, [rsp]
+        movdqu	xmm0, [rcx+112]
+        pshufd	xmm4, xmm7, 78
+        pshufb	xmm0, OWORD PTR L_aes_gcm_bswap_mask
+        aesenc	xmm8, [rax+128]
+        pxor	xmm4, xmm7
+        pshufd	xmm5, xmm0, 78
+        pxor	xmm5, xmm0
+        movdqa	xmm6, xmm0
+        pclmulqdq	xmm6, xmm7, 17
+        aesenc	xmm9, [rax+128]
+        aesenc	xmm10, [rax+128]
+        pclmulqdq	xmm7, xmm0, 0
+        aesenc	xmm11, [rax+128]
+        aesenc	xmm12, [rax+128]
+        pclmulqdq	xmm4, xmm5, 0
+        aesenc	xmm13, [rax+128]
+        aesenc	xmm14, [rax+128]
+        aesenc	xmm15, [rax+128]
+        pxor	xmm1, xmm7
+        pxor	xmm2, xmm7
+        pxor	xmm1, xmm6
+        pxor	xmm3, xmm6
+        pxor	xmm1, xmm4
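+        ; Fold the Karatsuba middle term into the low/high product halves and reduce modulo the GHASH polynomial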
+        movdqa	xmm5, xmm1
+        psrldq	xmm1, 8
+        pslldq	xmm5, 8
+        aesenc	xmm8, [rax+144]
+        pxor	xmm2, xmm5
+        pxor	xmm3, xmm1
+        movdqa	xmm7, xmm2
+        movdqa	xmm4, xmm2
+        movdqa	xmm5, xmm2
+        aesenc	xmm9, [rax+144]
+        pslld	xmm7, 31
+        pslld	xmm4, 30
+        pslld	xmm5, 25
+        aesenc	xmm10, [rax+144]
+        pxor	xmm7, xmm4
+        pxor	xmm7, xmm5
+        aesenc	xmm11, [rax+144]
+        movdqa	xmm4, xmm7
+        pslldq	xmm7, 12
+        psrldq	xmm4, 4
+        aesenc	xmm12, [rax+144]
+        pxor	xmm2, xmm7
+        movdqa	xmm5, xmm2
+        movdqa	xmm1, xmm2
+        movdqa	xmm0, xmm2
+        aesenc	xmm13, [rax+144]
+        psrld	xmm5, 1
+        psrld	xmm1, 2
+        psrld	xmm0, 7
+        aesenc	xmm14, [rax+144]
+        pxor	xmm5, xmm1
+        pxor	xmm5, xmm0
+        aesenc	xmm15, [rax+144]
+        pxor	xmm5, xmm4
+        pxor	xmm2, xmm5
+        pxor	xmm2, xmm3
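+        ; Extra AES rounds for 192-bit and 256-bit keys (number of rounds in r8d)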
+        cmp	r8d, 11
+        movdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+176]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        cmp	r8d, 13
+        movdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+208]
+        aesenc	xmm8, xmm7
+        aesenc	xmm9, xmm7
+        aesenc	xmm10, xmm7
+        aesenc	xmm11, xmm7
+        aesenc	xmm12, xmm7
+        aesenc	xmm13, xmm7
+        aesenc	xmm14, xmm7
+        aesenc	xmm15, xmm7
+        movdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_128_ghash_avx_done:
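+        ; Last AES round: XOR keystream with ciphertext and store 128 bytes of plaintext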
+        aesenclast	xmm8, xmm7
+        aesenclast	xmm9, xmm7
+        movdqu	xmm0, [rcx]
+        movdqu	xmm1, [rcx+16]
+        pxor	xmm8, xmm0
+        pxor	xmm9, xmm1
+        movdqu	[rdx], xmm8
+        movdqu	[rdx+16], xmm9
+        aesenclast	xmm10, xmm7
+        aesenclast	xmm11, xmm7
+        movdqu	xmm0, [rcx+32]
+        movdqu	xmm1, [rcx+48]
+        pxor	xmm10, xmm0
+        pxor	xmm11, xmm1
+        movdqu	[rdx+32], xmm10
+        movdqu	[rdx+48], xmm11
+        aesenclast	xmm12, xmm7
+        aesenclast	xmm13, xmm7
+        movdqu	xmm0, [rcx+64]
+        movdqu	xmm1, [rcx+80]
+        pxor	xmm12, xmm0
+        pxor	xmm13, xmm1
+        movdqu	[rdx+64], xmm12
+        movdqu	[rdx+80], xmm13
+        aesenclast	xmm14, xmm7
+        aesenclast	xmm15, xmm7
+        movdqu	xmm0, [rcx+96]
+        movdqu	xmm1, [rcx+112]
+        pxor	xmm14, xmm0
+        pxor	xmm15, xmm1
+        movdqu	[rdx+96], xmm14
+        movdqu	[rdx+112], xmm15
+        add	edi, 128
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_aesni_ghash_128
+        movdqa	xmm6, xmm2
+        movdqu	xmm5, [rsp]
+L_AES_GCM_decrypt_update_aesni_done_128:
+        mov	edx, r9d
+        cmp	edi, edx
+        jge	L_AES_GCM_decrypt_update_aesni_done_dec
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	edi, r13d
+        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
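+        ; Decrypt remaining whole 16-byte blocks, interleaving GHASH with the AES-CTR rounds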
+L_AES_GCM_decrypt_update_aesni_last_block_start:
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        movdqu	xmm1, [rcx]
+        movdqa	xmm0, xmm5
+        pshufb	xmm1, OWORD PTR L_aes_gcm_bswap_mask
+        pxor	xmm1, xmm6
+        movdqu	xmm8, [r15]
+        movdqa	xmm9, xmm8
+        pshufb	xmm8, OWORD PTR L_aes_gcm_bswap_epi64
+        paddd	xmm9, OWORD PTR L_aes_gcm_one
+        pxor	xmm8, [rax]
+        movdqu	[r15], xmm9
+        movdqa	xmm10, xmm1
+        pclmulqdq	xmm10, xmm0, 16
+        aesenc	xmm8, [rax+16]
+        aesenc	xmm8, [rax+32]
+        movdqa	xmm11, xmm1
+        pclmulqdq	xmm11, xmm0, 1
+        aesenc	xmm8, [rax+48]
+        aesenc	xmm8, [rax+64]
+        movdqa	xmm12, xmm1
+        pclmulqdq	xmm12, xmm0, 0
+        aesenc	xmm8, [rax+80]
+        movdqa	xmm1, xmm1
+        pclmulqdq	xmm1, xmm0, 17
+        aesenc	xmm8, [rax+96]
+        pxor	xmm10, xmm11
+        movdqa	xmm2, xmm10
+        psrldq	xmm10, 8
+        pslldq	xmm2, 8
+        aesenc	xmm8, [rax+112]
+        movdqa	xmm3, xmm1
+        pxor	xmm2, xmm12
+        pxor	xmm3, xmm10
+        movdqa	xmm0, OWORD PTR L_aes_gcm_mod2_128
+        movdqa	xmm11, xmm2
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [rax+128]
+        pshufd	xmm10, xmm2, 78
+        pxor	xmm10, xmm11
+        movdqa	xmm11, xmm10
+        pclmulqdq	xmm11, xmm0, 16
+        aesenc	xmm8, [rax+144]
+        pshufd	xmm6, xmm10, 78
+        pxor	xmm6, xmm11
+        pxor	xmm6, xmm3
+        cmp	r8d, 11
+        movdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+176]
+        cmp	r8d, 13
+        movdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc	xmm8, xmm9
+        aesenc	xmm8, [rax+208]
+        movdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast	xmm8, xmm9
+        movdqu	xmm9, [rcx]
+        pxor	xmm8, xmm9
+        movdqu	[rdx], xmm8
+        add	edi, 16
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
+L_AES_GCM_decrypt_update_aesni_last_block_done:
+L_AES_GCM_decrypt_update_aesni_done_dec:
+        movdqa	OWORD PTR [r12], xmm6
+        add	rsp, 168
+        pop	rsi
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_decrypt_update_aesni ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt_final_aesni PROC
+        push	r13
+        push	r12
+        push	r14
+        push	rbp
+        push	r15
+        mov	rax, rcx
+        mov	r10d, r9d
+        mov	r9, rdx
+        mov	r11d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	rbp, QWORD PTR [rsp+104]
+        sub	rsp, 16
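+        ; Finalize GHASH with the length block (AAD and ciphertext bit lengths)
+        ; then XOR with the encrypted initial counter block to form the tag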
+        movdqa	xmm6, OWORD PTR [rax]
+        movdqa	xmm5, OWORD PTR [r12]
+        movdqa	xmm15, OWORD PTR [r14]
+        movdqa	xmm9, xmm5
+        movdqa	xmm8, xmm5
+        psrlq	xmm9, 63
+        psllq	xmm8, 1
+        pslldq	xmm9, 8
+        por	xmm8, xmm9
+        pshufd	xmm5, xmm5, 255
+        psrad	xmm5, 31
+        pand	xmm5, OWORD PTR L_aes_gcm_mod2_128
+        pxor	xmm5, xmm8
+        mov	edx, r10d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        pinsrq	xmm0, rdx, 0
+        pinsrq	xmm0, rcx, 1
+        pxor	xmm6, xmm0
+        pshufd	xmm9, xmm5, 78
+        pshufd	xmm10, xmm6, 78
+        movdqa	xmm11, xmm6
+        movdqa	xmm8, xmm6
+        pclmulqdq	xmm11, xmm5, 17
+        pclmulqdq	xmm8, xmm5, 0
+        pxor	xmm9, xmm5
+        pxor	xmm10, xmm6
+        pclmulqdq	xmm9, xmm10, 0
+        pxor	xmm9, xmm8
+        pxor	xmm9, xmm11
+        movdqa	xmm10, xmm9
+        movdqa	xmm6, xmm11
+        pslldq	xmm10, 8
+        psrldq	xmm9, 8
+        pxor	xmm8, xmm10
+        pxor	xmm6, xmm9
+        movdqa	xmm12, xmm8
+        movdqa	xmm13, xmm8
+        movdqa	xmm14, xmm8
+        pslld	xmm12, 31
+        pslld	xmm13, 30
+        pslld	xmm14, 25
+        pxor	xmm12, xmm13
+        pxor	xmm12, xmm14
+        movdqa	xmm13, xmm12
+        psrldq	xmm13, 4
+        pslldq	xmm12, 12
+        pxor	xmm8, xmm12
+        movdqa	xmm14, xmm8
+        movdqa	xmm10, xmm8
+        movdqa	xmm9, xmm8
+        psrld	xmm14, 1
+        psrld	xmm10, 2
+        psrld	xmm9, 7
+        pxor	xmm14, xmm10
+        pxor	xmm14, xmm9
+        pxor	xmm14, xmm13
+        pxor	xmm14, xmm8
+        pxor	xmm6, xmm14
+        pshufb	xmm6, OWORD PTR L_aes_gcm_bswap_mask
+        movdqu	xmm0, xmm15
+        pxor	xmm0, xmm6
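+        ; Compare calculated tag with the supplied tag without branching on tag data
+        ; Write 1 to the result on match, 0 otherwise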
+        cmp	r8d, 16
+        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
+        sub	rsp, 16
+        xor	rcx, rcx
+        xor	r15, r15
+        movdqu	[rsp], xmm0
+L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        xor	r13b, BYTE PTR [r9+rcx]
+        or	r15b, r13b
+        inc	ecx
+        cmp	ecx, r8d
+        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
+        cmp	r15, 0
+        sete	r15b
+        add	rsp, 16
+        xor	rcx, rcx
+        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
+L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
+        movdqu	xmm1, [r9]
+        pcmpeqb	xmm0, xmm1
+        pmovmskb	rdx, xmm0
+        ; if edx == 0xFFFF (all 16 tag bytes equal) then return 1 else return 0
+        xor	r15d, r15d
+        cmp	edx, 65535
+        sete	r15b
+L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
+        mov	DWORD PTR [rbp], r15d
+        add	rsp, 16
+        pop	r15
+        pop	rbp
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_decrypt_final_aesni ENDP
+_text ENDS
+IFDEF HAVE_INTEL_AVX1
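+; Constants for the AVX1 implementation: counter increments, byte-swap masks and the GHASH reduction constant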
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_one QWORD 0, 1
+ptr_L_avx1_aes_gcm_one QWORD L_avx1_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_two QWORD 0, 2
+ptr_L_avx1_aes_gcm_two QWORD L_avx1_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_three QWORD 0, 3
+ptr_L_avx1_aes_gcm_three QWORD L_avx1_aes_gcm_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_four QWORD 0, 4
+ptr_L_avx1_aes_gcm_four QWORD L_avx1_aes_gcm_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_five QWORD 0, 5
+ptr_L_avx1_aes_gcm_five QWORD L_avx1_aes_gcm_five
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_six QWORD 0, 6
+ptr_L_avx1_aes_gcm_six QWORD L_avx1_aes_gcm_six
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_seven QWORD 0, 7
+ptr_L_avx1_aes_gcm_seven QWORD L_avx1_aes_gcm_seven
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_eight QWORD 0, 8
+ptr_L_avx1_aes_gcm_eight QWORD L_avx1_aes_gcm_eight
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+ptr_L_avx1_aes_gcm_bswap_epi64 QWORD L_avx1_aes_gcm_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+ptr_L_avx1_aes_gcm_bswap_mask QWORD L_avx1_aes_gcm_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx1_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+ptr_L_avx1_aes_gcm_mod2_128 QWORD L_avx1_aes_gcm_mod2_128
+_DATA ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt_avx1 PROC
+        push	r13
+        push	rdi
+        push	rsi
+        push	r12
+        push	rbx
+        push	r14
+        push	r15
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r8, QWORD PTR [rsp+96]
+        mov	r9d, DWORD PTR [rsp+104]
+        mov	r11d, DWORD PTR [rsp+112]
+        mov	ebx, DWORD PTR [rsp+120]
+        mov	r14d, DWORD PTR [rsp+128]
+        mov	r15, QWORD PTR [rsp+136]
+        mov	r10d, DWORD PTR [rsp+144]
+        sub	rsp, 160
+        vpxor	xmm4, xmm4, xmm4
+        vpxor	xmm6, xmm6, xmm6
+        mov	edx, ebx
+        cmp	edx, 12
+        jne	L_AES_GCM_encrypt_avx1_iv_not_12
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov	ecx, 16777216
+        vmovq	xmm4, QWORD PTR [rax]
+        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
+        vpinsrd	xmm4, xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	xmm5, OWORD PTR [r15]
+        vpxor	xmm1, xmm4, xmm5
+        vmovdqa	xmm7, OWORD PTR [r15+16]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+32]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+48]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+64]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+80]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+96]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+112]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+128]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+144]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	r10d, 11
+        vmovdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+176]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	r10d, 13
+        vmovdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+208]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm7
+        vaesenclast	xmm1, xmm1, xmm7
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vmovdqu	OWORD PTR [rsp+144], xmm1
+        jmp	L_AES_GCM_encrypt_avx1_iv_done
+L_AES_GCM_encrypt_avx1_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqa	xmm5, OWORD PTR [r15]
+        vaesenc	xmm5, xmm5, [r15+16]
+        vaesenc	xmm5, xmm5, [r15+32]
+        vaesenc	xmm5, xmm5, [r15+48]
+        vaesenc	xmm5, xmm5, [r15+64]
+        vaesenc	xmm5, xmm5, [r15+80]
+        vaesenc	xmm5, xmm5, [r15+96]
+        vaesenc	xmm5, xmm5, [r15+112]
+        vaesenc	xmm5, xmm5, [r15+128]
+        vaesenc	xmm5, xmm5, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm9
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_encrypt_avx1_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
+        vmovdqu	xmm8, OWORD PTR [rax+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_avx1_calc_iv_done
+L_AES_GCM_encrypt_avx1_calc_iv_lt16:
+        sub	rsp, 16
+        vpxor	xmm8, xmm8, xmm8
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm8
+L_AES_GCM_encrypt_avx1_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
+        vmovdqu	xmm8, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+L_AES_GCM_encrypt_avx1_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqa	xmm8, OWORD PTR [r15]
+        vpxor	xmm8, xmm8, xmm4
+        vaesenc	xmm8, xmm8, [r15+16]
+        vaesenc	xmm8, xmm8, [r15+32]
+        vaesenc	xmm8, xmm8, [r15+48]
+        vaesenc	xmm8, xmm8, [r15+64]
+        vaesenc	xmm8, xmm8, [r15+80]
+        vaesenc	xmm8, xmm8, [r15+96]
+        vaesenc	xmm8, xmm8, [r15+112]
+        vaesenc	xmm8, xmm8, [r15+128]
+        vaesenc	xmm8, xmm8, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	OWORD PTR [rsp+144], xmm8
+L_AES_GCM_encrypt_avx1_iv_done:
+        ; Additional authentication data
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_encrypt_avx1_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
+        vmovdqu	xmm8, OWORD PTR [r12+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm6, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm6, xmm6, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_avx1_calc_aad_done
+L_AES_GCM_encrypt_avx1_calc_aad_lt16:
+        sub	rsp, 16
+        vpxor	xmm8, xmm8, xmm8
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm8
+L_AES_GCM_encrypt_avx1_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
+        vmovdqu	xmm8, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm6, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm6, xmm6, xmm2
+L_AES_GCM_encrypt_avx1_calc_aad_done:
+        ; Calculate counter and H
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
+        vpxor	xmm5, xmm5, xmm8
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        xor	ebx, ebx
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_encrypt_avx1_done_128
+        and	r13d, 4294967168
+        vmovdqa	xmm2, xmm6
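+        ; Precompute H^1..H^8 at [rsp]..[rsp+112] so eight multiplies per
+        ; 128-byte block share a single GHASH reduction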
+        ; H ^ 1
+        vmovdqu	OWORD PTR [rsp], xmm5
+        ; H ^ 2
+        vpclmulqdq	xmm8, xmm5, xmm5, 0
+        vpclmulqdq	xmm0, xmm5, xmm5, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm0, xmm0, xmm14
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm0, 78
+        vpclmulqdq	xmm11, xmm0, xmm5, 17
+        vpclmulqdq	xmm8, xmm0, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm0
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm1, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm1, xmm1, xmm14
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        ; H ^ 4
+        vpclmulqdq	xmm8, xmm0, xmm0, 0
+        vpclmulqdq	xmm3, xmm0, xmm0, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm3, xmm3, xmm14
+        vmovdqu	OWORD PTR [rsp+48], xmm3
+        ; H ^ 5
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm0, 78
+        vpshufd	xmm10, xmm1, 78
+        vpclmulqdq	xmm11, xmm1, xmm0, 17
+        vpclmulqdq	xmm8, xmm1, xmm0, 0
+        vpxor	xmm9, xmm9, xmm0
+        vpxor	xmm10, xmm10, xmm1
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        ; H ^ 6
+        vpclmulqdq	xmm8, xmm1, xmm1, 0
+        vpclmulqdq	xmm7, xmm1, xmm1, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+80], xmm7
+        ; H ^ 7
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm1, 78
+        vpshufd	xmm10, xmm3, 78
+        vpclmulqdq	xmm11, xmm3, xmm1, 17
+        vpclmulqdq	xmm8, xmm3, xmm1, 0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm3
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        ; H ^ 8
+        vpclmulqdq	xmm8, xmm3, xmm3, 0
+        vpclmulqdq	xmm7, xmm3, xmm3, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+112], xmm7
+        ; First 128 bytes of input
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [r15]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+16]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+32]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+48]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+64]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+80]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+96]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+112]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+128]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+144]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r10d, 11
+        vmovdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r10d, 13
+        vmovdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_aesenc_128_enc_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi]
+        vmovdqu	xmm1, OWORD PTR [rdi+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [rsi], xmm8
+        vmovdqu	OWORD PTR [rsi+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi+32]
+        vmovdqu	xmm1, OWORD PTR [rdi+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [rsi+32], xmm10
+        vmovdqu	OWORD PTR [rsi+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi+64]
+        vmovdqu	xmm1, OWORD PTR [rdi+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [rsi+64], xmm12
+        vmovdqu	OWORD PTR [rsi+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi+96]
+        vmovdqu	xmm1, OWORD PTR [rdi+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [rsi+96], xmm14
+        vmovdqu	OWORD PTR [rsi+112], xmm15
+        cmp	r13d, 128
+        mov	ebx, 128
+        jle	L_AES_GCM_encrypt_avx1_end_128
+        ; More 128 bytes of input
+L_AES_GCM_encrypt_avx1_ghash_128:
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [r15]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vmovdqu	xmm0, OWORD PTR [rdx+-128]
+        vaesenc	xmm8, xmm8, [r15+16]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm2
+        vpshufd	xmm1, xmm7, 78
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm3, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+16]
+        vaesenc	xmm10, xmm10, [r15+16]
+        vpclmulqdq	xmm2, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+16]
+        vaesenc	xmm12, xmm12, [r15+16]
+        vpclmulqdq	xmm1, xmm1, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+16]
+        vaesenc	xmm14, xmm14, [r15+16]
+        vaesenc	xmm15, xmm15, [r15+16]
+        vpxor	xmm1, xmm1, xmm2
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm0, OWORD PTR [rdx+-112]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+32]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+32]
+        vaesenc	xmm10, xmm10, [r15+32]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+32]
+        vaesenc	xmm12, xmm12, [r15+32]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+32]
+        vaesenc	xmm14, xmm14, [r15+32]
+        vaesenc	xmm15, xmm15, [r15+32]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vmovdqu	xmm0, OWORD PTR [rdx+-96]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+48]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+48]
+        vaesenc	xmm10, xmm10, [r15+48]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+48]
+        vaesenc	xmm12, xmm12, [r15+48]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+48]
+        vaesenc	xmm14, xmm14, [r15+48]
+        vaesenc	xmm15, xmm15, [r15+48]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm0, OWORD PTR [rdx+-80]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+64]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+64]
+        vaesenc	xmm10, xmm10, [r15+64]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+64]
+        vaesenc	xmm12, xmm12, [r15+64]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+64]
+        vaesenc	xmm14, xmm14, [r15+64]
+        vaesenc	xmm15, xmm15, [r15+64]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vmovdqu	xmm0, OWORD PTR [rdx+-64]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+80]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+80]
+        vaesenc	xmm10, xmm10, [r15+80]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+80]
+        vaesenc	xmm12, xmm12, [r15+80]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+80]
+        vaesenc	xmm14, xmm14, [r15+80]
+        vaesenc	xmm15, xmm15, [r15+80]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm0, OWORD PTR [rdx+-48]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+96]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+96]
+        vaesenc	xmm10, xmm10, [r15+96]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+96]
+        vaesenc	xmm12, xmm12, [r15+96]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+96]
+        vaesenc	xmm14, xmm14, [r15+96]
+        vaesenc	xmm15, xmm15, [r15+96]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vmovdqu	xmm0, OWORD PTR [rdx+-32]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+112]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+112]
+        vaesenc	xmm10, xmm10, [r15+112]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+112]
+        vaesenc	xmm12, xmm12, [r15+112]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+112]
+        vaesenc	xmm14, xmm14, [r15+112]
+        vaesenc	xmm15, xmm15, [r15+112]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm0, OWORD PTR [rdx+-16]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+128]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+128]
+        vaesenc	xmm10, xmm10, [r15+128]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+128]
+        vaesenc	xmm12, xmm12, [r15+128]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+128]
+        vaesenc	xmm14, xmm14, [r15+128]
+        vaesenc	xmm15, xmm15, [r15+128]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vpslldq	xmm5, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vaesenc	xmm8, xmm8, [r15+144]
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm3, xmm3, xmm1
+        vaesenc	xmm9, xmm9, [r15+144]
+        vpslld	xmm7, xmm2, 31
+        vpslld	xmm4, xmm2, 30
+        vpslld	xmm5, xmm2, 25
+        vaesenc	xmm10, xmm10, [r15+144]
+        vpxor	xmm7, xmm7, xmm4
+        vpxor	xmm7, xmm7, xmm5
+        vaesenc	xmm11, xmm11, [r15+144]
+        vpsrldq	xmm4, xmm7, 4
+        vpslldq	xmm7, xmm7, 12
+        vaesenc	xmm12, xmm12, [r15+144]
+        vpxor	xmm2, xmm2, xmm7
+        vpsrld	xmm5, xmm2, 1
+        vaesenc	xmm13, xmm13, [r15+144]
+        vpsrld	xmm1, xmm2, 2
+        vpsrld	xmm0, xmm2, 7
+        vaesenc	xmm14, xmm14, [r15+144]
+        vpxor	xmm5, xmm5, xmm1
+        vpxor	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, [r15+144]
+        vpxor	xmm5, xmm5, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm2, xmm2, xmm3
+        cmp	r10d, 11
+        vmovdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r10d, 13
+        vmovdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_aesenc_128_ghash_avx_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_avx1_ghash_128
+L_AES_GCM_encrypt_avx1_end_128:
+        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpshufb	xmm8, xmm8, xmm4
+        vpshufb	xmm9, xmm9, xmm4
+        vpshufb	xmm10, xmm10, xmm4
+        vpshufb	xmm11, xmm11, xmm4
+        vpxor	xmm8, xmm8, xmm2
+        vpshufb	xmm12, xmm12, xmm4
+        vpshufb	xmm13, xmm13, xmm4
+        vpshufb	xmm14, xmm14, xmm4
+        vpshufb	xmm15, xmm15, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm5, OWORD PTR [rsp+16]
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm15, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm15, 17
+        vpclmulqdq	xmm0, xmm7, xmm15, 0
+        vpxor	xmm1, xmm1, xmm15
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm4, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm14, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm14, 17
+        vpclmulqdq	xmm0, xmm5, xmm14, 0
+        vpxor	xmm1, xmm1, xmm14
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm5, OWORD PTR [rsp+48]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm13, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm13, 17
+        vpclmulqdq	xmm0, xmm7, xmm13, 0
+        vpxor	xmm1, xmm1, xmm13
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm12, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm12, 17
+        vpclmulqdq	xmm0, xmm5, xmm12, 0
+        vpxor	xmm1, xmm1, xmm12
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm5, OWORD PTR [rsp+80]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm11, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm11, 17
+        vpclmulqdq	xmm0, xmm7, xmm11, 0
+        vpxor	xmm1, xmm1, xmm11
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm10, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm10, 17
+        vpclmulqdq	xmm0, xmm5, xmm10, 0
+        vpxor	xmm1, xmm1, xmm10
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm5, OWORD PTR [rsp+112]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm9, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm9, 17
+        vpclmulqdq	xmm0, xmm7, xmm9, 0
+        vpxor	xmm1, xmm1, xmm9
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm8, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm8, 17
+        vpclmulqdq	xmm0, xmm5, xmm8, 0
+        vpxor	xmm1, xmm1, xmm8
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm4, 31
+        vpslld	xmm1, xmm4, 30
+        vpslld	xmm2, xmm4, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm4, xmm4, xmm0
+        vpsrld	xmm2, xmm4, 1
+        vpsrld	xmm3, xmm4, 2
+        vpsrld	xmm0, xmm4, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm4
+        vpxor	xmm6, xmm6, xmm2
+        vmovdqu	xmm5, OWORD PTR [rsp]
+L_AES_GCM_encrypt_avx1_done_128:
+        mov	edx, r9d
+        cmp	ebx, edx
+        jge	L_AES_GCM_encrypt_avx1_done_enc
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_avx1_last_block_done
+        vmovdqu	xmm9, OWORD PTR [rsp+128]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [rsp+128], xmm9
+        vpxor	xmm8, xmm8, [r15]
+        vaesenc	xmm8, xmm8, [r15+16]
+        vaesenc	xmm8, xmm8, [r15+32]
+        vaesenc	xmm8, xmm8, [r15+48]
+        vaesenc	xmm8, xmm8, [r15+64]
+        vaesenc	xmm8, xmm8, [r15+80]
+        vaesenc	xmm8, xmm8, [r15+96]
+        vaesenc	xmm8, xmm8, [r15+112]
+        vaesenc	xmm8, xmm8, [r15+128]
+        vaesenc	xmm8, xmm8, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_aesenc_block_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	xmm9, OWORD PTR [rdi+rbx]
+        vpxor	xmm8, xmm8, xmm9
+        vmovdqu	OWORD PTR [rsi+rbx], xmm8
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        add	ebx, 16
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
+L_AES_GCM_encrypt_avx1_last_block_start:
+        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
+        vmovdqu	xmm9, OWORD PTR [rsp+128]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [rsp+128], xmm9
+        vpxor	xmm8, xmm8, [r15]
+        vpclmulqdq	xmm10, xmm6, xmm5, 16
+        vaesenc	xmm8, xmm8, [r15+16]
+        vaesenc	xmm8, xmm8, [r15+32]
+        vpclmulqdq	xmm11, xmm6, xmm5, 1
+        vaesenc	xmm8, xmm8, [r15+48]
+        vaesenc	xmm8, xmm8, [r15+64]
+        vpclmulqdq	xmm12, xmm6, xmm5, 0
+        vaesenc	xmm8, xmm8, [r15+80]
+        vpclmulqdq	xmm1, xmm6, xmm5, 17
+        vaesenc	xmm8, xmm8, [r15+96]
+        vpxor	xmm10, xmm10, xmm11
+        vpslldq	xmm2, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vaesenc	xmm8, xmm8, [r15+112]
+        vpxor	xmm2, xmm2, xmm12
+        vpxor	xmm3, xmm1, xmm10
+        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpclmulqdq	xmm11, xmm2, xmm0, 16
+        vaesenc	xmm8, xmm8, [r15+128]
+        vpshufd	xmm10, xmm2, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpclmulqdq	xmm11, xmm10, xmm0, 16
+        vaesenc	xmm8, xmm8, [r15+144]
+        vpshufd	xmm10, xmm10, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpxor	xmm6, xmm10, xmm3
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqa	xmm0, xmm13
+        vpxor	xmm8, xmm8, xmm0
+        vmovdqu	OWORD PTR [rsi+rbx], xmm8
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        add	ebx, 16
+        vpxor	xmm6, xmm6, xmm8
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_avx1_last_block_start
+L_AES_GCM_encrypt_avx1_last_block_ghash:
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+L_AES_GCM_encrypt_avx1_last_block_done:
+        mov	ecx, r9d
+        mov	edx, ecx
+        and	ecx, 15
+        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpxor	xmm4, xmm4, [r15]
+        vaesenc	xmm4, xmm4, [r15+16]
+        vaesenc	xmm4, xmm4, [r15+32]
+        vaesenc	xmm4, xmm4, [r15+48]
+        vaesenc	xmm4, xmm4, [r15+64]
+        vaesenc	xmm4, xmm4, [r15+80]
+        vaesenc	xmm4, xmm4, [r15+96]
+        vaesenc	xmm4, xmm4, [r15+112]
+        vaesenc	xmm4, xmm4, [r15+128]
+        vaesenc	xmm4, xmm4, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm9
+        vaesenc	xmm4, xmm4, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm9
+        vaesenc	xmm4, xmm4, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
+        vaesenclast	xmm4, xmm4, xmm9
+        sub	rsp, 16
+        xor	ecx, ecx
+        vmovdqu	OWORD PTR [rsp], xmm4
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [rsi+rbx], r13b
+        mov	BYTE PTR [rsp+rcx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
+        xor	r13, r13
+        cmp	ecx, 16
+        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
+        mov	BYTE PTR [rsp+rcx], r13b
+        inc	ecx
+        cmp	ecx, 16
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
+        vmovdqu	xmm4, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm4
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_avx1_done_enc:
+        mov	edx, r9d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        vmovq	xmm0, rdx
+        vmovq	xmm1, rcx
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vmovdqu	xmm0, OWORD PTR [rsp+144]
+        vpxor	xmm0, xmm0, xmm6
+        cmp	r14d, 16
+        je	L_AES_GCM_encrypt_avx1_store_tag_16
+        xor	rcx, rcx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_avx1_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r8+rcx], r13b
+        inc	ecx
+        cmp	ecx, r14d
+        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
+        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
+L_AES_GCM_encrypt_avx1_store_tag_16:
+        vmovdqu	OWORD PTR [r8], xmm0
+L_AES_GCM_encrypt_avx1_store_tag_done:
+        vzeroupper
+        add	rsp, 160
+        pop	r15
+        pop	r14
+        pop	rbx
+        pop	r12
+        pop	rsi
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_encrypt_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt_avx1 PROC
+        push	r13
+        push	rdi
+        push	rsi
+        push	r12
+        push	rbx
+        push	r14
+        push	r15
+        push	rbp
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r8, QWORD PTR [rsp+104]
+        mov	r9d, DWORD PTR [rsp+112]
+        mov	r11d, DWORD PTR [rsp+120]
+        mov	ebx, DWORD PTR [rsp+128]
+        mov	r14d, DWORD PTR [rsp+136]
+        mov	r15, QWORD PTR [rsp+144]
+        mov	r10d, DWORD PTR [rsp+152]
+        mov	rbp, QWORD PTR [rsp+160]
+        sub	rsp, 168
+        vpxor	xmm4, xmm4, xmm4
+        vpxor	xmm6, xmm6, xmm6
+        cmp	ebx, 12
+        mov	edx, ebx
+        jne	L_AES_GCM_decrypt_avx1_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov	ecx, 16777216
+        vmovq	xmm4, QWORD PTR [rax]
+        vpinsrd	xmm4, xmm4, DWORD PTR [rax+8], 2
+        vpinsrd	xmm4, xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	xmm5, OWORD PTR [r15]
+        vpxor	xmm1, xmm4, xmm5
+        vmovdqa	xmm7, OWORD PTR [r15+16]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+32]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+48]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+64]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+80]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+96]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+112]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+128]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+144]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	r10d, 11
+        vmovdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+176]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	r10d, 13
+        vmovdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+208]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm7
+        vaesenclast	xmm1, xmm1, xmm7
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vmovdqu	OWORD PTR [rsp+144], xmm1
+        jmp	L_AES_GCM_decrypt_avx1_iv_done
+L_AES_GCM_decrypt_avx1_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqa	xmm5, OWORD PTR [r15]
+        vaesenc	xmm5, xmm5, [r15+16]
+        vaesenc	xmm5, xmm5, [r15+32]
+        vaesenc	xmm5, xmm5, [r15+48]
+        vaesenc	xmm5, xmm5, [r15+64]
+        vaesenc	xmm5, xmm5, [r15+80]
+        vaesenc	xmm5, xmm5, [r15+96]
+        vaesenc	xmm5, xmm5, [r15+112]
+        vaesenc	xmm5, xmm5, [r15+128]
+        vaesenc	xmm5, xmm5, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm9
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_decrypt_avx1_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
+        vmovdqu	xmm8, OWORD PTR [rax+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_avx1_calc_iv_done
+L_AES_GCM_decrypt_avx1_calc_iv_lt16:
+        sub	rsp, 16
+        vpxor	xmm8, xmm8, xmm8
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm8
+L_AES_GCM_decrypt_avx1_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
+        vmovdqu	xmm8, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+L_AES_GCM_decrypt_avx1_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqa	xmm8, OWORD PTR [r15]
+        vpxor	xmm8, xmm8, xmm4
+        vaesenc	xmm8, xmm8, [r15+16]
+        vaesenc	xmm8, xmm8, [r15+32]
+        vaesenc	xmm8, xmm8, [r15+48]
+        vaesenc	xmm8, xmm8, [r15+64]
+        vaesenc	xmm8, xmm8, [r15+80]
+        vaesenc	xmm8, xmm8, [r15+96]
+        vaesenc	xmm8, xmm8, [r15+112]
+        vaesenc	xmm8, xmm8, [r15+128]
+        vaesenc	xmm8, xmm8, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	OWORD PTR [rsp+144], xmm8
+L_AES_GCM_decrypt_avx1_iv_done:
+        ; Additional authentication data
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_decrypt_avx1_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
+        vmovdqu	xmm8, OWORD PTR [r12+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm6, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm6, xmm6, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_avx1_calc_aad_done
+L_AES_GCM_decrypt_avx1_calc_aad_lt16:
+        sub	rsp, 16
+        vpxor	xmm8, xmm8, xmm8
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm8
+L_AES_GCM_decrypt_avx1_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
+        vmovdqu	xmm8, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm6, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm6, xmm6, xmm2
+L_AES_GCM_decrypt_avx1_calc_aad_done:
+        ; Calculate counter and H
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
+        vpxor	xmm5, xmm5, xmm8
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        xor	ebx, ebx
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_decrypt_avx1_done_128
+        and	r13d, 4294967168
+        vmovdqa	xmm2, xmm6
+        ; H ^ 1
+        vmovdqu	OWORD PTR [rsp], xmm5
+        ; H ^ 2
+        vpclmulqdq	xmm8, xmm5, xmm5, 0
+        vpclmulqdq	xmm0, xmm5, xmm5, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm0, xmm0, xmm14
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm0, 78
+        vpclmulqdq	xmm11, xmm0, xmm5, 17
+        vpclmulqdq	xmm8, xmm0, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm0
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm1, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm1, xmm1, xmm14
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        ; H ^ 4
+        vpclmulqdq	xmm8, xmm0, xmm0, 0
+        vpclmulqdq	xmm3, xmm0, xmm0, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm3, xmm3, xmm14
+        vmovdqu	OWORD PTR [rsp+48], xmm3
+        ; H ^ 5
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm0, 78
+        vpshufd	xmm10, xmm1, 78
+        vpclmulqdq	xmm11, xmm1, xmm0, 17
+        vpclmulqdq	xmm8, xmm1, xmm0, 0
+        vpxor	xmm9, xmm9, xmm0
+        vpxor	xmm10, xmm10, xmm1
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        ; H ^ 6
+        vpclmulqdq	xmm8, xmm1, xmm1, 0
+        vpclmulqdq	xmm7, xmm1, xmm1, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+80], xmm7
+        ; H ^ 7
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm1, 78
+        vpshufd	xmm10, xmm3, 78
+        vpclmulqdq	xmm11, xmm3, xmm1, 17
+        vpclmulqdq	xmm8, xmm3, xmm1, 0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm3
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        ; H ^ 8
+        vpclmulqdq	xmm8, xmm3, xmm3, 0
+        vpclmulqdq	xmm7, xmm3, xmm3, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+112], xmm7
+L_AES_GCM_decrypt_avx1_ghash_128:
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [rsi+rbx]
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [r15]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vaesenc	xmm8, xmm8, [r15+16]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm2
+        vpshufd	xmm1, xmm7, 78
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm3, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+16]
+        vaesenc	xmm10, xmm10, [r15+16]
+        vpclmulqdq	xmm2, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+16]
+        vaesenc	xmm12, xmm12, [r15+16]
+        vpclmulqdq	xmm1, xmm1, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+16]
+        vaesenc	xmm14, xmm14, [r15+16]
+        vaesenc	xmm15, xmm15, [r15+16]
+        vpxor	xmm1, xmm1, xmm2
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm0, OWORD PTR [rcx+16]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+32]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+32]
+        vaesenc	xmm10, xmm10, [r15+32]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+32]
+        vaesenc	xmm12, xmm12, [r15+32]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+32]
+        vaesenc	xmm14, xmm14, [r15+32]
+        vaesenc	xmm15, xmm15, [r15+32]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+48]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+48]
+        vaesenc	xmm10, xmm10, [r15+48]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+48]
+        vaesenc	xmm12, xmm12, [r15+48]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+48]
+        vaesenc	xmm14, xmm14, [r15+48]
+        vaesenc	xmm15, xmm15, [r15+48]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm0, OWORD PTR [rcx+48]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+64]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+64]
+        vaesenc	xmm10, xmm10, [r15+64]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+64]
+        vaesenc	xmm12, xmm12, [r15+64]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+64]
+        vaesenc	xmm14, xmm14, [r15+64]
+        vaesenc	xmm15, xmm15, [r15+64]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+80]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+80]
+        vaesenc	xmm10, xmm10, [r15+80]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+80]
+        vaesenc	xmm12, xmm12, [r15+80]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+80]
+        vaesenc	xmm14, xmm14, [r15+80]
+        vaesenc	xmm15, xmm15, [r15+80]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm0, OWORD PTR [rcx+80]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+96]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+96]
+        vaesenc	xmm10, xmm10, [r15+96]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+96]
+        vaesenc	xmm12, xmm12, [r15+96]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+96]
+        vaesenc	xmm14, xmm14, [r15+96]
+        vaesenc	xmm15, xmm15, [r15+96]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+112]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+112]
+        vaesenc	xmm10, xmm10, [r15+112]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+112]
+        vaesenc	xmm12, xmm12, [r15+112]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+112]
+        vaesenc	xmm14, xmm14, [r15+112]
+        vaesenc	xmm15, xmm15, [r15+112]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm0, OWORD PTR [rcx+112]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [r15+128]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [r15+128]
+        vaesenc	xmm10, xmm10, [r15+128]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [r15+128]
+        vaesenc	xmm12, xmm12, [r15+128]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [r15+128]
+        vaesenc	xmm14, xmm14, [r15+128]
+        vaesenc	xmm15, xmm15, [r15+128]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vpslldq	xmm5, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vaesenc	xmm8, xmm8, [r15+144]
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm3, xmm3, xmm1
+        vaesenc	xmm9, xmm9, [r15+144]
+        vpslld	xmm7, xmm2, 31
+        vpslld	xmm4, xmm2, 30
+        vpslld	xmm5, xmm2, 25
+        vaesenc	xmm10, xmm10, [r15+144]
+        vpxor	xmm7, xmm7, xmm4
+        vpxor	xmm7, xmm7, xmm5
+        vaesenc	xmm11, xmm11, [r15+144]
+        vpsrldq	xmm4, xmm7, 4
+        vpslldq	xmm7, xmm7, 12
+        vaesenc	xmm12, xmm12, [r15+144]
+        vpxor	xmm2, xmm2, xmm7
+        vpsrld	xmm5, xmm2, 1
+        vaesenc	xmm13, xmm13, [r15+144]
+        vpsrld	xmm1, xmm2, 2
+        vpsrld	xmm0, xmm2, 7
+        vaesenc	xmm14, xmm14, [r15+144]
+        vpxor	xmm5, xmm5, xmm1
+        vpxor	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, [r15+144]
+        vpxor	xmm5, xmm5, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm2, xmm2, xmm3
+        cmp	r10d, 11
+        vmovdqa	xmm7, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r10d, 13
+        vmovdqa	xmm7, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_aesenc_128_ghash_avx_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_avx1_ghash_128
+        vmovdqa	xmm6, xmm2
+        vmovdqu	xmm5, OWORD PTR [rsp]
+L_AES_GCM_decrypt_avx1_done_128:
+        mov	edx, r9d
+        cmp	ebx, edx
+        jge	L_AES_GCM_decrypt_avx1_done_dec
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_decrypt_avx1_last_block_done
+L_AES_GCM_decrypt_avx1_last_block_start:
+        vmovdqu	xmm13, OWORD PTR [rdi+rbx]
+        vmovdqa	xmm0, xmm5
+        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm1, xmm1, xmm6
+        vmovdqu	xmm9, OWORD PTR [rsp+128]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [rsp+128], xmm9
+        vpxor	xmm8, xmm8, [r15]
+        vpclmulqdq	xmm10, xmm1, xmm0, 16
+        vaesenc	xmm8, xmm8, [r15+16]
+        vaesenc	xmm8, xmm8, [r15+32]
+        vpclmulqdq	xmm11, xmm1, xmm0, 1
+        vaesenc	xmm8, xmm8, [r15+48]
+        vaesenc	xmm8, xmm8, [r15+64]
+        vpclmulqdq	xmm12, xmm1, xmm0, 0
+        vaesenc	xmm8, xmm8, [r15+80]
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vaesenc	xmm8, xmm8, [r15+96]
+        vpxor	xmm10, xmm10, xmm11
+        vpslldq	xmm2, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vaesenc	xmm8, xmm8, [r15+112]
+        vpxor	xmm2, xmm2, xmm12
+        vpxor	xmm3, xmm1, xmm10
+        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpclmulqdq	xmm11, xmm2, xmm0, 16
+        vaesenc	xmm8, xmm8, [r15+128]
+        vpshufd	xmm10, xmm2, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpclmulqdq	xmm11, xmm10, xmm0, 16
+        vaesenc	xmm8, xmm8, [r15+144]
+        vpshufd	xmm10, xmm10, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpxor	xmm6, xmm10, xmm3
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqa	xmm0, xmm13
+        vpxor	xmm8, xmm8, xmm0
+        vmovdqu	OWORD PTR [rsi+rbx], xmm8
+        add	ebx, 16
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_avx1_last_block_start
+L_AES_GCM_decrypt_avx1_last_block_done:
+        mov	ecx, r9d
+        mov	edx, ecx
+        and	ecx, 15
+        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpxor	xmm4, xmm4, [r15]
+        vaesenc	xmm4, xmm4, [r15+16]
+        vaesenc	xmm4, xmm4, [r15+32]
+        vaesenc	xmm4, xmm4, [r15+48]
+        vaesenc	xmm4, xmm4, [r15+64]
+        vaesenc	xmm4, xmm4, [r15+80]
+        vaesenc	xmm4, xmm4, [r15+96]
+        vaesenc	xmm4, xmm4, [r15+112]
+        vaesenc	xmm4, xmm4, [r15+128]
+        vaesenc	xmm4, xmm4, [r15+144]
+        cmp	r10d, 11
+        vmovdqa	xmm9, OWORD PTR [r15+160]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm9
+        vaesenc	xmm4, xmm4, [r15+176]
+        cmp	r10d, 13
+        vmovdqa	xmm9, OWORD PTR [r15+192]
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm9
+        vaesenc	xmm4, xmm4, [r15+208]
+        vmovdqa	xmm9, OWORD PTR [r15+224]
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
+        vaesenclast	xmm4, xmm4, xmm9
+        sub	rsp, 32
+        xor	ecx, ecx
+        vmovdqu	OWORD PTR [rsp], xmm4
+        vpxor	xmm0, xmm0, xmm0
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        mov	BYTE PTR [rsp+rcx+16], r13b
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [rsi+rbx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
+        vmovdqu	xmm4, OWORD PTR [rsp+16]
+        add	rsp, 32
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm4
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_avx1_done_dec:
+        mov	edx, r9d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        vmovq	xmm0, rdx
+        vmovq	xmm1, rcx
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vmovdqu	xmm0, OWORD PTR [rsp+144]
+        vpxor	xmm0, xmm0, xmm6
+        cmp	r14d, 16
+        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
+        sub	rsp, 16
+        xor	rcx, rcx
+        xor	rbx, rbx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx1_cmp_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        xor	r13b, BYTE PTR [r8+rcx]
+        or	bl, r13b
+        inc	ecx
+        cmp	ecx, r14d
+        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
+        cmp	rbx, 0
+        sete	bl
+        add	rsp, 16
+        xor	rcx, rcx
+        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
+L_AES_GCM_decrypt_avx1_cmp_tag_16:
+        vmovdqu	xmm1, OWORD PTR [r8]
+        vpcmpeqb	xmm0, xmm0, xmm1
+        vpmovmskb	rdx, xmm0
+        ; if edx == 0xFFFF then return 1, else return 0
+        xor	ebx, ebx
+        cmp	edx, 65535
+        sete	bl
+L_AES_GCM_decrypt_avx1_cmp_tag_done:
+        mov	DWORD PTR [rbp], ebx
+        vzeroupper
+        add	rsp, 168
+        pop	rbp
+        pop	r15
+        pop	r14
+        pop	rbx
+        pop	r12
+        pop	rsi
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_decrypt_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_init_avx1 PROC
+        push	rdi
+        push	rsi
+        push	r12
+        push	r13
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r10, r8
+        mov	r11d, r9d
+        mov	rax, QWORD PTR [rsp+72]
+        mov	r8, QWORD PTR [rsp+80]
+        mov	r9, QWORD PTR [rsp+88]
+        sub	rsp, 16
+        vpxor	xmm4, xmm4, xmm4
+        mov	edx, r11d
+        cmp	edx, 12
+        jne	L_AES_GCM_init_avx1_iv_not_12
+        ; # Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        mov	ecx, 16777216
+        vmovq	xmm4, QWORD PTR [r10]
+        vpinsrd	xmm4, xmm4, DWORD PTR [r10+8], 2
+        vpinsrd	xmm4, xmm4, ecx, 3
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	xmm5, OWORD PTR [rdi]
+        vpxor	xmm1, xmm4, xmm5
+        vmovdqa	xmm7, OWORD PTR [rdi+16]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+32]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+48]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+64]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+80]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+96]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+112]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+128]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+144]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	esi, 11
+        vmovdqa	xmm7, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+176]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        cmp	esi, 13
+        vmovdqa	xmm7, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx1_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+208]
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm1, xmm1, xmm7
+        vmovdqa	xmm7, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx1_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm7
+        vaesenclast	xmm1, xmm1, xmm7
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vmovdqu	xmm15, xmm1
+        jmp	L_AES_GCM_init_avx1_iv_done
+L_AES_GCM_init_avx1_iv_not_12:
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqa	xmm5, OWORD PTR [rdi]
+        vaesenc	xmm5, xmm5, [rdi+16]
+        vaesenc	xmm5, xmm5, [rdi+32]
+        vaesenc	xmm5, xmm5, [rdi+48]
+        vaesenc	xmm5, xmm5, [rdi+64]
+        vaesenc	xmm5, xmm5, [rdi+80]
+        vaesenc	xmm5, xmm5, [rdi+96]
+        vaesenc	xmm5, xmm5, [rdi+112]
+        vaesenc	xmm5, xmm5, [rdi+128]
+        vaesenc	xmm5, xmm5, [rdi+144]
+        cmp	esi, 11
+        vmovdqa	xmm9, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [rdi+176]
+        cmp	esi, 13
+        vmovdqa	xmm9, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm9
+        vaesenc	xmm5, xmm5, [rdi+208]
+        vmovdqa	xmm9, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm9
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_init_avx1_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_init_avx1_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_init_avx1_calc_iv_16_loop:
+        vmovdqu	xmm8, OWORD PTR [r10+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_avx1_calc_iv_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_init_avx1_calc_iv_done
+L_AES_GCM_init_avx1_calc_iv_lt16:
+        sub	rsp, 16
+        vpxor	xmm8, xmm8, xmm8
+        xor	r13d, r13d
+        vmovdqu	OWORD PTR [rsp], xmm8
+L_AES_GCM_init_avx1_calc_iv_loop:
+        movzx	r12d, BYTE PTR [r10+rcx]
+        mov	BYTE PTR [rsp+r13], r12b
+        inc	ecx
+        inc	r13d
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_avx1_calc_iv_loop
+        vmovdqu	xmm8, OWORD PTR [rsp]
+        add	rsp, 16
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+L_AES_GCM_init_avx1_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm7, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm7, xmm7, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm7, 31
+        vpslld	xmm1, xmm7, 30
+        vpslld	xmm2, xmm7, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm7, xmm7, xmm0
+        vpsrld	xmm2, xmm7, 1
+        vpsrld	xmm3, xmm7, 2
+        vpsrld	xmm0, xmm7, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm4, xmm4, xmm2
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqa	xmm8, OWORD PTR [rdi]
+        vpxor	xmm8, xmm8, xmm4
+        vaesenc	xmm8, xmm8, [rdi+16]
+        vaesenc	xmm8, xmm8, [rdi+32]
+        vaesenc	xmm8, xmm8, [rdi+48]
+        vaesenc	xmm8, xmm8, [rdi+64]
+        vaesenc	xmm8, xmm8, [rdi+80]
+        vaesenc	xmm8, xmm8, [rdi+96]
+        vaesenc	xmm8, xmm8, [rdi+112]
+        vaesenc	xmm8, xmm8, [rdi+128]
+        vaesenc	xmm8, xmm8, [rdi+144]
+        cmp	esi, 11
+        vmovdqa	xmm9, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rdi+176]
+        cmp	esi, 13
+        vmovdqa	xmm9, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rdi+208]
+        vmovdqa	xmm9, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	xmm15, xmm8
+L_AES_GCM_init_avx1_iv_done:
+        vmovdqa	OWORD PTR [r9], xmm15
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqa	OWORD PTR [rax], xmm5
+        vmovdqa	OWORD PTR [r8], xmm4
+        vzeroupper
+        add	rsp, 16
+        pop	r13
+        pop	r12
+        pop	rsi
+        pop	rdi
+        ret
+AES_GCM_init_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
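+        ; AES_GCM_aad_update_avx1: fold additional authenticated data into the
+        ; GHASH state 16 bytes at a time: X = (X xor AAD_block) * H.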
+AES_GCM_aad_update_avx1 PROC
+        mov	rax, rcx
+        vmovdqa	xmm5, OWORD PTR [r8]
+        vmovdqa	xmm6, OWORD PTR [r9]
+        xor	ecx, ecx
+L_AES_GCM_aad_update_avx1_16_loop:
+        vmovdqu	xmm8, OWORD PTR [rax+rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm5, 78
+        vpshufd	xmm2, xmm6, 78
+        vpclmulqdq	xmm3, xmm6, xmm5, 17
+        vpclmulqdq	xmm0, xmm6, xmm5, 0
+        vpxor	xmm1, xmm1, xmm5
+        vpxor	xmm2, xmm2, xmm6
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm4, xmm0
+        vmovdqa	xmm5, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm5, xmm5, xmm1
+        vpsrld	xmm0, xmm4, 31
+        vpsrld	xmm1, xmm5, 31
+        vpslld	xmm4, xmm4, 1
+        vpslld	xmm5, xmm5, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm5, xmm5, xmm2
+        vpor	xmm4, xmm4, xmm0
+        vpor	xmm5, xmm5, xmm1
+        vpslld	xmm0, xmm4, 31
+        vpslld	xmm1, xmm4, 30
+        vpslld	xmm2, xmm4, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm4, xmm4, xmm0
+        vpsrld	xmm2, xmm4, 1
+        vpsrld	xmm3, xmm4, 2
+        vpsrld	xmm0, xmm4, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm4
+        vpxor	xmm5, xmm5, xmm2
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_aad_update_avx1_16_loop
+        vmovdqa	OWORD PTR [r8], xmm5
+        vzeroupper
+        ret
+AES_GCM_aad_update_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
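+        ; AES_GCM_encrypt_block_avx1: encrypt a single 16-byte block: read and
+        ; post-increment the counter, AES-encrypt it (round count in edx) and
+        ; XOR the keystream with the input block.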
+AES_GCM_encrypt_block_avx1 PROC
+        mov	r10, r8
+        mov	r11, r9
+        mov	rax, QWORD PTR [rsp+40]
+        vmovdqu	xmm9, OWORD PTR [rax]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [rax], xmm9
+        vpxor	xmm8, xmm8, [rcx]
+        vaesenc	xmm8, xmm8, [rcx+16]
+        vaesenc	xmm8, xmm8, [rcx+32]
+        vaesenc	xmm8, xmm8, [rcx+48]
+        vaesenc	xmm8, xmm8, [rcx+64]
+        vaesenc	xmm8, xmm8, [rcx+80]
+        vaesenc	xmm8, xmm8, [rcx+96]
+        vaesenc	xmm8, xmm8, [rcx+112]
+        vaesenc	xmm8, xmm8, [rcx+128]
+        vaesenc	xmm8, xmm8, [rcx+144]
+        cmp	edx, 11
+        vmovdqa	xmm9, OWORD PTR [rcx+160]
+        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rcx+176]
+        cmp	edx, 13
+        vmovdqa	xmm9, OWORD PTR [rcx+192]
+        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rcx+208]
+        vmovdqa	xmm9, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_avx1_aesenc_block_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	xmm9, OWORD PTR [r11]
+        vpxor	xmm8, xmm8, xmm9
+        vmovdqu	OWORD PTR [r10], xmm8
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vzeroupper
+        ret
+AES_GCM_encrypt_block_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
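+        ; AES_GCM_ghash_block_avx1: XOR one 16-byte block into the GHASH state
+        ; and multiply by H, reducing modulo x^128 + x^7 + x^2 + x + 1.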
+AES_GCM_ghash_block_avx1 PROC
+        vmovdqa	xmm4, OWORD PTR [rdx]
+        vmovdqa	xmm5, OWORD PTR [r8]
+        vmovdqu	xmm8, OWORD PTR [rcx]
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm8
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm4, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpxor	xmm1, xmm1, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm6, xmm0
+        vmovdqa	xmm4, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm6, xmm6, xmm2
+        vpxor	xmm4, xmm4, xmm1
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        vpslld	xmm0, xmm6, 31
+        vpslld	xmm1, xmm6, 30
+        vpslld	xmm2, xmm6, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm6, xmm6, xmm0
+        vpsrld	xmm2, xmm6, 1
+        vpsrld	xmm3, xmm6, 2
+        vpsrld	xmm0, xmm6, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm6
+        vpxor	xmm4, xmm4, xmm2
+        vmovdqa	OWORD PTR [rdx], xmm4
+        vzeroupper
+        ret
+AES_GCM_ghash_block_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
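+        ; AES_GCM_encrypt_update_avx1: bulk encryption.  With 128 or more bytes
+        ; remaining, H^1..H^8 are precomputed and eight counter blocks are
+        ; encrypted per iteration, interleaving the AES rounds with the GHASH
+        ; of the previous chunk's ciphertext; the tail runs one block at a time.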
+AES_GCM_encrypt_update_avx1 PROC
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	r15, QWORD PTR [rsp+104]
+        sub	rsp, 160
+        vmovdqa	xmm6, OWORD PTR [r12]
+        vmovdqa	xmm5, OWORD PTR [r14]
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm8
+        xor	edi, edi
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_encrypt_update_avx1_done_128
+        and	r13d, 4294967168
+        vmovdqa	xmm2, xmm6
+        ; H ^ 1
+        vmovdqu	OWORD PTR [rsp], xmm5
+        ; H ^ 2
+        vpclmulqdq	xmm8, xmm5, xmm5, 0
+        vpclmulqdq	xmm0, xmm5, xmm5, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm0, xmm0, xmm14
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm0, 78
+        vpclmulqdq	xmm11, xmm0, xmm5, 17
+        vpclmulqdq	xmm8, xmm0, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm0
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm1, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm1, xmm1, xmm14
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        ; H ^ 4
+        vpclmulqdq	xmm8, xmm0, xmm0, 0
+        vpclmulqdq	xmm3, xmm0, xmm0, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm3, xmm3, xmm14
+        vmovdqu	OWORD PTR [rsp+48], xmm3
+        ; H ^ 5
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm0, 78
+        vpshufd	xmm10, xmm1, 78
+        vpclmulqdq	xmm11, xmm1, xmm0, 17
+        vpclmulqdq	xmm8, xmm1, xmm0, 0
+        vpxor	xmm9, xmm9, xmm0
+        vpxor	xmm10, xmm10, xmm1
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        ; H ^ 6
+        vpclmulqdq	xmm8, xmm1, xmm1, 0
+        vpclmulqdq	xmm7, xmm1, xmm1, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+80], xmm7
+        ; H ^ 7
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm1, 78
+        vpshufd	xmm10, xmm3, 78
+        vpclmulqdq	xmm11, xmm3, xmm1, 17
+        vpclmulqdq	xmm8, xmm3, xmm1, 0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm3
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        ; H ^ 8
+        vpclmulqdq	xmm8, xmm3, xmm3, 0
+        vpclmulqdq	xmm7, xmm3, xmm3, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+112], xmm7
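+        ; H^1..H^8 now live at [rsp]..[rsp+112]; each 128-byte chunk can be
+        ; GHASHed with eight independent multiplies sharing a single reduction.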
+        ; First 128 bytes of input
+        vmovdqu	xmm0, OWORD PTR [r15]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [r15], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+16]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+32]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+48]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+64]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+80]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+96]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+112]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+128]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+144]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 11
+        vmovdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_128_enc_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11]
+        vmovdqu	xmm1, OWORD PTR [r11+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [r10], xmm8
+        vmovdqu	OWORD PTR [r10+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11+32]
+        vmovdqu	xmm1, OWORD PTR [r11+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [r10+32], xmm10
+        vmovdqu	OWORD PTR [r10+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11+64]
+        vmovdqu	xmm1, OWORD PTR [r11+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [r10+64], xmm12
+        vmovdqu	OWORD PTR [r10+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11+96]
+        vmovdqu	xmm1, OWORD PTR [r11+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [r10+96], xmm14
+        vmovdqu	OWORD PTR [r10+112], xmm15
+        cmp	r13d, 128
+        mov	edi, 128
+        jle	L_AES_GCM_encrypt_update_avx1_end_128
+        ; Process further 128-byte chunks of input
+L_AES_GCM_encrypt_update_avx1_ghash_128:
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        vmovdqu	xmm0, OWORD PTR [r15]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [r15], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
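+        ; AES rounds for the eight counter blocks are interleaved with the
+        ; CLMUL multiplies that GHASH the previous chunk's ciphertext, read
+        ; back from [rdx-128]..[rdx-16].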
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vmovdqu	xmm0, OWORD PTR [rdx+-128]
+        vaesenc	xmm8, xmm8, [rax+16]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm2
+        vpshufd	xmm1, xmm7, 78
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm3, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+16]
+        vaesenc	xmm10, xmm10, [rax+16]
+        vpclmulqdq	xmm2, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+16]
+        vaesenc	xmm12, xmm12, [rax+16]
+        vpclmulqdq	xmm1, xmm1, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+16]
+        vaesenc	xmm14, xmm14, [rax+16]
+        vaesenc	xmm15, xmm15, [rax+16]
+        vpxor	xmm1, xmm1, xmm2
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm0, OWORD PTR [rdx+-112]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+32]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+32]
+        vaesenc	xmm10, xmm10, [rax+32]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+32]
+        vaesenc	xmm12, xmm12, [rax+32]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+32]
+        vaesenc	xmm14, xmm14, [rax+32]
+        vaesenc	xmm15, xmm15, [rax+32]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vmovdqu	xmm0, OWORD PTR [rdx+-96]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+48]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+48]
+        vaesenc	xmm10, xmm10, [rax+48]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+48]
+        vaesenc	xmm12, xmm12, [rax+48]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+48]
+        vaesenc	xmm14, xmm14, [rax+48]
+        vaesenc	xmm15, xmm15, [rax+48]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm0, OWORD PTR [rdx+-80]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+64]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+64]
+        vaesenc	xmm10, xmm10, [rax+64]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+64]
+        vaesenc	xmm12, xmm12, [rax+64]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+64]
+        vaesenc	xmm14, xmm14, [rax+64]
+        vaesenc	xmm15, xmm15, [rax+64]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vmovdqu	xmm0, OWORD PTR [rdx+-64]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+80]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+80]
+        vaesenc	xmm10, xmm10, [rax+80]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+80]
+        vaesenc	xmm12, xmm12, [rax+80]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+80]
+        vaesenc	xmm14, xmm14, [rax+80]
+        vaesenc	xmm15, xmm15, [rax+80]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm0, OWORD PTR [rdx+-48]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+96]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+96]
+        vaesenc	xmm10, xmm10, [rax+96]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+96]
+        vaesenc	xmm12, xmm12, [rax+96]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+96]
+        vaesenc	xmm14, xmm14, [rax+96]
+        vaesenc	xmm15, xmm15, [rax+96]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vmovdqu	xmm0, OWORD PTR [rdx+-32]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+112]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+112]
+        vaesenc	xmm10, xmm10, [rax+112]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+112]
+        vaesenc	xmm12, xmm12, [rax+112]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+112]
+        vaesenc	xmm14, xmm14, [rax+112]
+        vaesenc	xmm15, xmm15, [rax+112]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm0, OWORD PTR [rdx+-16]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+128]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+128]
+        vaesenc	xmm10, xmm10, [rax+128]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+128]
+        vaesenc	xmm12, xmm12, [rax+128]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+128]
+        vaesenc	xmm14, xmm14, [rax+128]
+        vaesenc	xmm15, xmm15, [rax+128]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vpslldq	xmm5, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vaesenc	xmm8, xmm8, [rax+144]
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm3, xmm3, xmm1
+        vaesenc	xmm9, xmm9, [rax+144]
+        vpslld	xmm7, xmm2, 31
+        vpslld	xmm4, xmm2, 30
+        vpslld	xmm5, xmm2, 25
+        vaesenc	xmm10, xmm10, [rax+144]
+        vpxor	xmm7, xmm7, xmm4
+        vpxor	xmm7, xmm7, xmm5
+        vaesenc	xmm11, xmm11, [rax+144]
+        vpsrldq	xmm4, xmm7, 4
+        vpslldq	xmm7, xmm7, 12
+        vaesenc	xmm12, xmm12, [rax+144]
+        vpxor	xmm2, xmm2, xmm7
+        vpsrld	xmm5, xmm2, 1
+        vaesenc	xmm13, xmm13, [rax+144]
+        vpsrld	xmm1, xmm2, 2
+        vpsrld	xmm0, xmm2, 7
+        vaesenc	xmm14, xmm14, [rax+144]
+        vpxor	xmm5, xmm5, xmm1
+        vpxor	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, [rax+144]
+        vpxor	xmm5, xmm5, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm2, xmm2, xmm3
+        cmp	r8d, 11
+        vmovdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_128_ghash_avx_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        add	edi, 128
+        cmp	edi, r13d
+        jl	L_AES_GCM_encrypt_update_avx1_ghash_128
+L_AES_GCM_encrypt_update_avx1_end_128:
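+        ; Fold the last chunk's eight ciphertext blocks into the GHASH state
+        ; using H^8..H^1 from the stack, then perform one reduction.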
+        vmovdqa	xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpshufb	xmm8, xmm8, xmm4
+        vpshufb	xmm9, xmm9, xmm4
+        vpshufb	xmm10, xmm10, xmm4
+        vpshufb	xmm11, xmm11, xmm4
+        vpxor	xmm8, xmm8, xmm2
+        vpshufb	xmm12, xmm12, xmm4
+        vpshufb	xmm13, xmm13, xmm4
+        vpshufb	xmm14, xmm14, xmm4
+        vpshufb	xmm15, xmm15, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm5, OWORD PTR [rsp+16]
+        ; ghash_gfmul_avx
+        vpshufd	xmm1, xmm15, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm15, 17
+        vpclmulqdq	xmm0, xmm7, xmm15, 0
+        vpxor	xmm1, xmm1, xmm15
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqa	xmm4, xmm0
+        vmovdqa	xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm14, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm14, 17
+        vpclmulqdq	xmm0, xmm5, xmm14, 0
+        vpxor	xmm1, xmm1, xmm14
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm5, OWORD PTR [rsp+48]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm13, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm13, 17
+        vpclmulqdq	xmm0, xmm7, xmm13, 0
+        vpxor	xmm1, xmm1, xmm13
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm12, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm12, 17
+        vpclmulqdq	xmm0, xmm5, xmm12, 0
+        vpxor	xmm1, xmm1, xmm12
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm5, OWORD PTR [rsp+80]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm11, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm11, 17
+        vpclmulqdq	xmm0, xmm7, xmm11, 0
+        vpxor	xmm1, xmm1, xmm11
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm10, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm10, 17
+        vpclmulqdq	xmm0, xmm5, xmm10, 0
+        vpxor	xmm1, xmm1, xmm10
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm5, OWORD PTR [rsp+112]
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm9, 78
+        vpshufd	xmm2, xmm7, 78
+        vpclmulqdq	xmm3, xmm7, xmm9, 17
+        vpclmulqdq	xmm0, xmm7, xmm9, 0
+        vpxor	xmm1, xmm1, xmm9
+        vpxor	xmm2, xmm2, xmm7
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        ; ghash_gfmul_xor_avx
+        vpshufd	xmm1, xmm8, 78
+        vpshufd	xmm2, xmm5, 78
+        vpclmulqdq	xmm3, xmm5, xmm8, 17
+        vpclmulqdq	xmm0, xmm5, xmm8, 0
+        vpxor	xmm1, xmm1, xmm8
+        vpxor	xmm2, xmm2, xmm5
+        vpclmulqdq	xmm1, xmm1, xmm2, 0
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm1, xmm1, xmm3
+        vpxor	xmm4, xmm4, xmm0
+        vpxor	xmm6, xmm6, xmm3
+        vpslldq	xmm2, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vpxor	xmm4, xmm4, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpslld	xmm0, xmm4, 31
+        vpslld	xmm1, xmm4, 30
+        vpslld	xmm2, xmm4, 25
+        vpxor	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm2
+        vmovdqa	xmm1, xmm0
+        vpsrldq	xmm1, xmm1, 4
+        vpslldq	xmm0, xmm0, 12
+        vpxor	xmm4, xmm4, xmm0
+        vpsrld	xmm2, xmm4, 1
+        vpsrld	xmm3, xmm4, 2
+        vpsrld	xmm0, xmm4, 7
+        vpxor	xmm2, xmm2, xmm3
+        vpxor	xmm2, xmm2, xmm0
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm2, xmm2, xmm4
+        vpxor	xmm6, xmm6, xmm2
+        vmovdqu	xmm5, OWORD PTR [rsp]
+L_AES_GCM_encrypt_update_avx1_done_128:
+        mov	edx, r9d
+        cmp	edi, edx
+        jge	L_AES_GCM_encrypt_update_avx1_done_enc
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	edi, r13d
+        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
+        vmovdqu	xmm9, OWORD PTR [r15]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [r15], xmm9
+        vpxor	xmm8, xmm8, [rax]
+        vaesenc	xmm8, xmm8, [rax+16]
+        vaesenc	xmm8, xmm8, [rax+32]
+        vaesenc	xmm8, xmm8, [rax+48]
+        vaesenc	xmm8, xmm8, [rax+64]
+        vaesenc	xmm8, xmm8, [rax+80]
+        vaesenc	xmm8, xmm8, [rax+96]
+        vaesenc	xmm8, xmm8, [rax+112]
+        vaesenc	xmm8, xmm8, [rax+128]
+        vaesenc	xmm8, xmm8, [rax+144]
+        cmp	r8d, 11
+        vmovdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+176]
+        cmp	r8d, 13
+        vmovdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+208]
+        vmovdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_block_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqu	xmm9, OWORD PTR [r11+rdi]
+        vpxor	xmm8, xmm8, xmm9
+        vmovdqu	OWORD PTR [r10+rdi], xmm8
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm8
+        add	edi, 16
+        cmp	edi, r13d
+        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
+L_AES_GCM_encrypt_update_avx1_last_block_start:
+        vmovdqu	xmm13, OWORD PTR [r11+rdi]
+        vmovdqu	xmm9, OWORD PTR [r15]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [r15], xmm9
+        vpxor	xmm8, xmm8, [rax]
+        vpclmulqdq	xmm10, xmm6, xmm5, 16
+        vaesenc	xmm8, xmm8, [rax+16]
+        vaesenc	xmm8, xmm8, [rax+32]
+        vpclmulqdq	xmm11, xmm6, xmm5, 1
+        vaesenc	xmm8, xmm8, [rax+48]
+        vaesenc	xmm8, xmm8, [rax+64]
+        vpclmulqdq	xmm12, xmm6, xmm5, 0
+        vaesenc	xmm8, xmm8, [rax+80]
+        vpclmulqdq	xmm1, xmm6, xmm5, 17
+        vaesenc	xmm8, xmm8, [rax+96]
+        vpxor	xmm10, xmm10, xmm11
+        vpslldq	xmm2, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vaesenc	xmm8, xmm8, [rax+112]
+        vpxor	xmm2, xmm2, xmm12
+        vpxor	xmm3, xmm1, xmm10
+        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpclmulqdq	xmm11, xmm2, xmm0, 16
+        vaesenc	xmm8, xmm8, [rax+128]
+        vpshufd	xmm10, xmm2, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpclmulqdq	xmm11, xmm10, xmm0, 16
+        vaesenc	xmm8, xmm8, [rax+144]
+        vpshufd	xmm10, xmm10, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpxor	xmm6, xmm10, xmm3
+        cmp	r8d, 11
+        vmovdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+176]
+        cmp	r8d, 13
+        vmovdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+208]
+        vmovdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqa	xmm0, xmm13
+        vpxor	xmm8, xmm8, xmm0
+        vmovdqu	OWORD PTR [r10+rdi], xmm8
+        vpshufb	xmm8, xmm8, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        add	edi, 16
+        vpxor	xmm6, xmm6, xmm8
+        cmp	edi, r13d
+        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
+L_AES_GCM_encrypt_update_avx1_last_block_ghash:
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+L_AES_GCM_encrypt_update_avx1_last_block_done:
+L_AES_GCM_encrypt_update_avx1_done_enc:
+        vmovdqa	OWORD PTR [r12], xmm6
+        vzeroupper
+        add	rsp, 160
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_update_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
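+        ; AES_GCM_encrypt_final_avx1: fold the bit lengths of the data and AAD
+        ; into the GHASH state, multiply by H once more, byte-swap, XOR with
+        ; the saved E_K(counter0) and write the requested number of tag bytes.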
+AES_GCM_encrypt_final_avx1 PROC
+        push	r13
+        push	r12
+        push	r14
+        mov	rax, rcx
+        mov	r10d, r9d
+        mov	r9, rdx
+        mov	r11d, DWORD PTR [rsp+64]
+        mov	r12, QWORD PTR [rsp+72]
+        mov	r14, QWORD PTR [rsp+80]
+        sub	rsp, 16
+        vmovdqa	xmm4, OWORD PTR [rax]
+        vmovdqa	xmm5, OWORD PTR [r12]
+        vmovdqa	xmm6, OWORD PTR [r14]
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm8
+        mov	edx, r10d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        vmovq	xmm0, rdx
+        vmovq	xmm1, rcx
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm4, 78
+        vpclmulqdq	xmm11, xmm4, xmm5, 17
+        vpclmulqdq	xmm8, xmm4, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm4
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm4, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm4, xmm4, xmm14
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm4, xmm6
+        cmp	r8d, 16
+        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
+        xor	rcx, rcx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_final_avx1_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r9+rcx], r13b
+        inc	ecx
+        cmp	ecx, r8d
+        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
+L_AES_GCM_encrypt_final_avx1_store_tag_16:
+        vmovdqu	OWORD PTR [r9], xmm0
+L_AES_GCM_encrypt_final_avx1_store_tag_done:
+        vzeroupper
+        add	rsp, 16
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_encrypt_final_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
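+        ; AES_GCM_decrypt_update_avx1: bulk decryption.  Mirrors the encrypt
+        ; path (same H^1..H^8 table and 128-byte main loop) except that GHASH
+        ; is computed over the incoming ciphertext rather than the output.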
+AES_GCM_decrypt_update_avx1 PROC
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	r15, QWORD PTR [rsp+104]
+        sub	rsp, 168
+        vmovdqa	xmm6, OWORD PTR [r12]
+        vmovdqa	xmm5, OWORD PTR [r14]
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm8
+        xor	edi, edi
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_decrypt_update_avx1_done_128
+        and	r13d, 4294967168
+        vmovdqa	xmm2, xmm6
+        ; H ^ 1
+        vmovdqu	OWORD PTR [rsp], xmm5
+        ; H ^ 2
+        vpclmulqdq	xmm8, xmm5, xmm5, 0
+        vpclmulqdq	xmm0, xmm5, xmm5, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm0, xmm0, xmm14
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm0, 78
+        vpclmulqdq	xmm11, xmm0, xmm5, 17
+        vpclmulqdq	xmm8, xmm0, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm0
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm1, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm1, xmm1, xmm14
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        ; H ^ 4
+        vpclmulqdq	xmm8, xmm0, xmm0, 0
+        vpclmulqdq	xmm3, xmm0, xmm0, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm3, xmm3, xmm14
+        vmovdqu	OWORD PTR [rsp+48], xmm3
+        ; H ^ 5
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm0, 78
+        vpshufd	xmm10, xmm1, 78
+        vpclmulqdq	xmm11, xmm1, xmm0, 17
+        vpclmulqdq	xmm8, xmm1, xmm0, 0
+        vpxor	xmm9, xmm9, xmm0
+        vpxor	xmm10, xmm10, xmm1
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        ; H ^ 6
+        vpclmulqdq	xmm8, xmm1, xmm1, 0
+        vpclmulqdq	xmm7, xmm1, xmm1, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+80], xmm7
+        ; H ^ 7
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm1, 78
+        vpshufd	xmm10, xmm3, 78
+        vpclmulqdq	xmm11, xmm3, xmm1, 17
+        vpclmulqdq	xmm8, xmm3, xmm1, 0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm3
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm7, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        ; H ^ 8
+        vpclmulqdq	xmm8, xmm3, xmm3, 0
+        vpclmulqdq	xmm7, xmm3, xmm3, 17
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm7, xmm7, xmm14
+        vmovdqu	OWORD PTR [rsp+112], xmm7
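+        ; Main loop: decrypt eight 16-byte blocks per iteration, interleaving
+        ; the AES-CTR rounds with GHASH of the ciphertext just read from [rcx]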
+L_AES_GCM_decrypt_update_avx1_ghash_128:
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        vmovdqu	xmm0, OWORD PTR [r15]
+        vmovdqa	xmm1, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx1_aes_gcm_one
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx1_aes_gcm_two
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx1_aes_gcm_three
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx1_aes_gcm_four
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx1_aes_gcm_five
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx1_aes_gcm_six
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx1_aes_gcm_seven
+        vpshufb	xmm15, xmm15, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_eight
+        vmovdqa	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [r15], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vaesenc	xmm8, xmm8, [rax+16]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm2
+        vpshufd	xmm1, xmm7, 78
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm3, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+16]
+        vaesenc	xmm10, xmm10, [rax+16]
+        vpclmulqdq	xmm2, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+16]
+        vaesenc	xmm12, xmm12, [rax+16]
+        vpclmulqdq	xmm1, xmm1, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+16]
+        vaesenc	xmm14, xmm14, [rax+16]
+        vaesenc	xmm15, xmm15, [rax+16]
+        vpxor	xmm1, xmm1, xmm2
+        vpxor	xmm1, xmm1, xmm3
+        vmovdqu	xmm7, OWORD PTR [rsp+96]
+        vmovdqu	xmm0, OWORD PTR [rcx+16]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+32]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+32]
+        vaesenc	xmm10, xmm10, [rax+32]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+32]
+        vaesenc	xmm12, xmm12, [rax+32]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+32]
+        vaesenc	xmm14, xmm14, [rax+32]
+        vaesenc	xmm15, xmm15, [rax+32]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+48]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+48]
+        vaesenc	xmm10, xmm10, [rax+48]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+48]
+        vaesenc	xmm12, xmm12, [rax+48]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+48]
+        vaesenc	xmm14, xmm14, [rax+48]
+        vaesenc	xmm15, xmm15, [rax+48]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+64]
+        vmovdqu	xmm0, OWORD PTR [rcx+48]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+64]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+64]
+        vaesenc	xmm10, xmm10, [rax+64]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+64]
+        vaesenc	xmm12, xmm12, [rax+64]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+64]
+        vaesenc	xmm14, xmm14, [rax+64]
+        vaesenc	xmm15, xmm15, [rax+64]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+80]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+80]
+        vaesenc	xmm10, xmm10, [rax+80]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+80]
+        vaesenc	xmm12, xmm12, [rax+80]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+80]
+        vaesenc	xmm14, xmm14, [rax+80]
+        vaesenc	xmm15, xmm15, [rax+80]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+32]
+        vmovdqu	xmm0, OWORD PTR [rcx+80]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+96]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+96]
+        vaesenc	xmm10, xmm10, [rax+96]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+96]
+        vaesenc	xmm12, xmm12, [rax+96]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+96]
+        vaesenc	xmm14, xmm14, [rax+96]
+        vaesenc	xmm15, xmm15, [rax+96]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+112]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+112]
+        vaesenc	xmm10, xmm10, [rax+112]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+112]
+        vaesenc	xmm12, xmm12, [rax+112]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+112]
+        vaesenc	xmm14, xmm14, [rax+112]
+        vaesenc	xmm15, xmm15, [rax+112]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vmovdqu	xmm0, OWORD PTR [rcx+112]
+        vpshufd	xmm4, xmm7, 78
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vaesenc	xmm8, xmm8, [rax+128]
+        vpxor	xmm4, xmm4, xmm7
+        vpshufd	xmm5, xmm0, 78
+        vpxor	xmm5, xmm5, xmm0
+        vpclmulqdq	xmm6, xmm0, xmm7, 17
+        vaesenc	xmm9, xmm9, [rax+128]
+        vaesenc	xmm10, xmm10, [rax+128]
+        vpclmulqdq	xmm7, xmm0, xmm7, 0
+        vaesenc	xmm11, xmm11, [rax+128]
+        vaesenc	xmm12, xmm12, [rax+128]
+        vpclmulqdq	xmm4, xmm4, xmm5, 0
+        vaesenc	xmm13, xmm13, [rax+128]
+        vaesenc	xmm14, xmm14, [rax+128]
+        vaesenc	xmm15, xmm15, [rax+128]
+        vpxor	xmm1, xmm1, xmm7
+        vpxor	xmm2, xmm2, xmm7
+        vpxor	xmm1, xmm1, xmm6
+        vpxor	xmm3, xmm3, xmm6
+        vpxor	xmm1, xmm1, xmm4
+        vpslldq	xmm5, xmm1, 8
+        vpsrldq	xmm1, xmm1, 8
+        vaesenc	xmm8, xmm8, [rax+144]
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm3, xmm3, xmm1
+        vaesenc	xmm9, xmm9, [rax+144]
+        vpslld	xmm7, xmm2, 31
+        vpslld	xmm4, xmm2, 30
+        vpslld	xmm5, xmm2, 25
+        vaesenc	xmm10, xmm10, [rax+144]
+        vpxor	xmm7, xmm7, xmm4
+        vpxor	xmm7, xmm7, xmm5
+        vaesenc	xmm11, xmm11, [rax+144]
+        vpsrldq	xmm4, xmm7, 4
+        vpslldq	xmm7, xmm7, 12
+        vaesenc	xmm12, xmm12, [rax+144]
+        vpxor	xmm2, xmm2, xmm7
+        vpsrld	xmm5, xmm2, 1
+        vaesenc	xmm13, xmm13, [rax+144]
+        vpsrld	xmm1, xmm2, 2
+        vpsrld	xmm0, xmm2, 7
+        vaesenc	xmm14, xmm14, [rax+144]
+        vpxor	xmm5, xmm5, xmm1
+        vpxor	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, [rax+144]
+        vpxor	xmm5, xmm5, xmm4
+        vpxor	xmm2, xmm2, xmm5
+        vpxor	xmm2, xmm2, xmm3
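+        ; Extra rounds below only apply to AES-192 (12 rounds) and AES-256 (14 rounds)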
+        cmp	r8d, 11
+        vmovdqa	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqa	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqa	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_avx1_aesenc_128_ghash_avx_done:
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+32]
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vpxor	xmm10, xmm10, xmm0
+        vpxor	xmm11, xmm11, xmm1
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+96]
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vpxor	xmm14, xmm14, xmm0
+        vpxor	xmm15, xmm15, xmm1
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        add	edi, 128
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_avx1_ghash_128
+        vmovdqa	xmm6, xmm2
+        vmovdqu	xmm5, OWORD PTR [rsp]
+L_AES_GCM_decrypt_update_avx1_done_128:
+        mov	edx, r9d
+        cmp	edi, edx
+        jge	L_AES_GCM_decrypt_update_avx1_done_dec
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	edi, r13d
+        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
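+        ; Decrypt any remaining full 16-byte blocks one at a time:
+        ; GHASH the ciphertext block, then XOR it with the encrypted counter block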
+L_AES_GCM_decrypt_update_avx1_last_block_start:
+        vmovdqu	xmm13, OWORD PTR [r11+rdi]
+        vmovdqa	xmm0, xmm5
+        vpshufb	xmm1, xmm13, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm1, xmm1, xmm6
+        vmovdqu	xmm9, OWORD PTR [r15]
+        vpshufb	xmm8, xmm9, OWORD PTR L_avx1_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm9, OWORD PTR L_avx1_aes_gcm_one
+        vmovdqu	OWORD PTR [r15], xmm9
+        vpxor	xmm8, xmm8, [rax]
+        vpclmulqdq	xmm10, xmm1, xmm0, 16
+        vaesenc	xmm8, xmm8, [rax+16]
+        vaesenc	xmm8, xmm8, [rax+32]
+        vpclmulqdq	xmm11, xmm1, xmm0, 1
+        vaesenc	xmm8, xmm8, [rax+48]
+        vaesenc	xmm8, xmm8, [rax+64]
+        vpclmulqdq	xmm12, xmm1, xmm0, 0
+        vaesenc	xmm8, xmm8, [rax+80]
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vaesenc	xmm8, xmm8, [rax+96]
+        vpxor	xmm10, xmm10, xmm11
+        vpslldq	xmm2, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vaesenc	xmm8, xmm8, [rax+112]
+        vpxor	xmm2, xmm2, xmm12
+        vpxor	xmm3, xmm1, xmm10
+        vmovdqa	xmm0, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpclmulqdq	xmm11, xmm2, xmm0, 16
+        vaesenc	xmm8, xmm8, [rax+128]
+        vpshufd	xmm10, xmm2, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpclmulqdq	xmm11, xmm10, xmm0, 16
+        vaesenc	xmm8, xmm8, [rax+144]
+        vpshufd	xmm10, xmm10, 78
+        vpxor	xmm10, xmm10, xmm11
+        vpxor	xmm6, xmm10, xmm3
+        cmp	r8d, 11
+        vmovdqa	xmm9, OWORD PTR [rax+160]
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+176]
+        cmp	r8d, 13
+        vmovdqa	xmm9, OWORD PTR [rax+192]
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	xmm8, xmm8, xmm9
+        vaesenc	xmm8, xmm8, [rax+208]
+        vmovdqa	xmm9, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
+        vaesenclast	xmm8, xmm8, xmm9
+        vmovdqa	xmm0, xmm13
+        vpxor	xmm8, xmm8, xmm0
+        vmovdqu	OWORD PTR [r10+rdi], xmm8
+        add	edi, 16
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
+L_AES_GCM_decrypt_update_avx1_last_block_done:
+L_AES_GCM_decrypt_update_avx1_done_dec:
+        vmovdqa	OWORD PTR [r12], xmm6
+        vzeroupper
+        add	rsp, 168
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_decrypt_update_avx1 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx1 PROC
+        push	r13
+        push	r12
+        push	r14
+        push	rbp
+        push	r15
+        mov	rax, rcx
+        mov	r10d, r9d
+        mov	r9, rdx
+        mov	r11d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	rbp, QWORD PTR [rsp+104]
+        sub	rsp, 16
+        vmovdqa	xmm6, OWORD PTR [rax]
+        vmovdqa	xmm5, OWORD PTR [r12]
+        vmovdqa	xmm15, OWORD PTR [r14]
+        vpsrlq	xmm9, xmm5, 63
+        vpsllq	xmm8, xmm5, 1
+        vpslldq	xmm9, xmm9, 8
+        vpor	xmm8, xmm8, xmm9
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx1_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm8
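+        ; Fold the AAD and ciphertext bit lengths into the GHASH state,
+        ; then do the final GHASH multiply/reduce and byte-swap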
+        mov	edx, r10d
+        mov	ecx, r11d
+        shl	rdx, 3
+        shl	rcx, 3
+        vmovq	xmm0, rdx
+        vmovq	xmm1, rcx
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_red_avx
+        vpshufd	xmm9, xmm5, 78
+        vpshufd	xmm10, xmm6, 78
+        vpclmulqdq	xmm11, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm9, xmm9, xmm5
+        vpxor	xmm10, xmm10, xmm6
+        vpclmulqdq	xmm9, xmm9, xmm10, 0
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm9, xmm9, xmm11
+        vpslldq	xmm10, xmm9, 8
+        vpsrldq	xmm9, xmm9, 8
+        vpxor	xmm8, xmm8, xmm10
+        vpxor	xmm6, xmm11, xmm9
+        vpslld	xmm12, xmm8, 31
+        vpslld	xmm13, xmm8, 30
+        vpslld	xmm14, xmm8, 25
+        vpxor	xmm12, xmm12, xmm13
+        vpxor	xmm12, xmm12, xmm14
+        vpsrldq	xmm13, xmm12, 4
+        vpslldq	xmm12, xmm12, 12
+        vpxor	xmm8, xmm8, xmm12
+        vpsrld	xmm14, xmm8, 1
+        vpsrld	xmm10, xmm8, 2
+        vpsrld	xmm9, xmm8, 7
+        vpxor	xmm14, xmm14, xmm10
+        vpxor	xmm14, xmm14, xmm9
+        vpxor	xmm14, xmm14, xmm13
+        vpxor	xmm14, xmm14, xmm8
+        vpxor	xmm6, xmm6, xmm14
+        vpshufb	xmm6, xmm6, OWORD PTR L_avx1_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm6, xmm15
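+        ; Compare the computed tag with the provided tag: byte-accumulate
+        ; compare for partial tags, vector compare for a full 16-byte tag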
+        cmp	r8d, 16
+        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
+        sub	rsp, 16
+        xor	rcx, rcx
+        xor	r15, r15
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        xor	r13b, BYTE PTR [r9+rcx]
+        or	r15b, r13b
+        inc	ecx
+        cmp	ecx, r8d
+        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
+        cmp	r15, 0
+        sete	r15b
+        add	rsp, 16
+        xor	rcx, rcx
+        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
+L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
+        vmovdqu	xmm1, OWORD PTR [r9]
+        vpcmpeqb	xmm0, xmm0, xmm1
+        vpmovmskb	rdx, xmm0
+        ; if edx == 0xFFFF then return 1 else return 0
+        xor	r15d, r15d
+        cmp	edx, 65535
+        sete	r15b
+L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
+        mov	DWORD PTR [rbp], r15d
+        vzeroupper
+        add	rsp, 16
+        pop	r15
+        pop	rbp
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_decrypt_final_avx1 ENDP
+_text ENDS
+ENDIF
+IFDEF HAVE_INTEL_AVX2
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_one QWORD 0, 1
+ptr_L_avx2_aes_gcm_one QWORD L_avx2_aes_gcm_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_two QWORD 0, 2
+ptr_L_avx2_aes_gcm_two QWORD L_avx2_aes_gcm_two
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_three QWORD 0, 3
+ptr_L_avx2_aes_gcm_three QWORD L_avx2_aes_gcm_three
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_four QWORD 0, 4
+ptr_L_avx2_aes_gcm_four QWORD L_avx2_aes_gcm_four
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_five QWORD 0, 5
+ptr_L_avx2_aes_gcm_five QWORD L_avx2_aes_gcm_five
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_six QWORD 0, 6
+ptr_L_avx2_aes_gcm_six QWORD L_avx2_aes_gcm_six
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_seven QWORD 0, 7
+ptr_L_avx2_aes_gcm_seven QWORD L_avx2_aes_gcm_seven
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_eight QWORD 0, 8
+ptr_L_avx2_aes_gcm_eight QWORD L_avx2_aes_gcm_eight
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_bswap_one QWORD 0, 72057594037927936
+ptr_L_avx2_aes_gcm_bswap_one QWORD L_avx2_aes_gcm_bswap_one
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_bswap_epi64 QWORD 283686952306183, 579005069656919567
+ptr_L_avx2_aes_gcm_bswap_epi64 QWORD L_avx2_aes_gcm_bswap_epi64
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_bswap_mask QWORD 579005069656919567, 283686952306183
+ptr_L_avx2_aes_gcm_bswap_mask QWORD L_avx2_aes_gcm_bswap_mask
+_DATA ENDS
+_DATA SEGMENT
+ALIGN 16
+L_avx2_aes_gcm_mod2_128 QWORD 1, 13979173243358019584
+ptr_L_avx2_aes_gcm_mod2_128 QWORD L_avx2_aes_gcm_mod2_128
+_DATA ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt_avx2 PROC
+        push	r13
+        push	rdi
+        push	r12
+        push	r15
+        push	rbx
+        push	r14
+        push	rsi
+        mov	rdi, rcx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r15, QWORD PTR [rsp+96]
+        mov	r8, rdx
+        mov	r10d, DWORD PTR [rsp+104]
+        mov	r11d, DWORD PTR [rsp+112]
+        mov	ebx, DWORD PTR [rsp+120]
+        mov	r14d, DWORD PTR [rsp+128]
+        mov	rsi, QWORD PTR [rsp+136]
+        mov	r9d, DWORD PTR [rsp+144]
+        sub	rsp, 160
+        vpxor	xmm4, xmm4, xmm4
+        vpxor	xmm6, xmm6, xmm6
+        mov	edx, ebx
+        cmp	edx, 12
+        je	L_AES_GCM_encrypt_avx2_iv_12
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqu	xmm5, OWORD PTR [rsi]
+        vaesenc	xmm5, xmm5, [rsi+16]
+        vaesenc	xmm5, xmm5, [rsi+32]
+        vaesenc	xmm5, xmm5, [rsi+48]
+        vaesenc	xmm5, xmm5, [rsi+64]
+        vaesenc	xmm5, xmm5, [rsi+80]
+        vaesenc	xmm5, xmm5, [rsi+96]
+        vaesenc	xmm5, xmm5, [rsi+112]
+        vaesenc	xmm5, xmm5, [rsi+128]
+        vaesenc	xmm5, xmm5, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_encrypt_avx2_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
+        vmovdqu	xmm0, OWORD PTR [rax+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_avx2_calc_iv_done
+L_AES_GCM_encrypt_avx2_calc_iv_lt16:
+        vpxor	xmm0, xmm0, xmm0
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_avx2_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+L_AES_GCM_encrypt_avx2_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqu	xmm15, OWORD PTR [rsi]
+        vpxor	xmm15, xmm15, xmm4
+        vaesenc	xmm15, xmm15, [rsi+16]
+        vaesenc	xmm15, xmm15, [rsi+32]
+        vaesenc	xmm15, xmm15, [rsi+48]
+        vaesenc	xmm15, xmm15, [rsi+64]
+        vaesenc	xmm15, xmm15, [rsi+80]
+        vaesenc	xmm15, xmm15, [rsi+96]
+        vaesenc	xmm15, xmm15, [rsi+112]
+        vaesenc	xmm15, xmm15, [rsi+128]
+        vaesenc	xmm15, xmm15, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm15, xmm15, xmm0
+        vaesenc	xmm15, xmm15, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm15, xmm15, xmm0
+        vaesenc	xmm15, xmm15, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm15, xmm15, xmm0
+        jmp	L_AES_GCM_encrypt_avx2_iv_done
+L_AES_GCM_encrypt_avx2_iv_12:
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+        vmovdqu	xmm5, OWORD PTR [rsi]
+        vpblendd	xmm4, xmm4, [rax], 7
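+        ; xmm4 = IV (96 bits) || 0x00000001 (big-endian initial counter)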
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	xmm7, OWORD PTR [rsi+16]
+        vpxor	xmm15, xmm4, xmm5
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rsi+32]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+48]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+64]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+80]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+96]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+112]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+128]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+144]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+176]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+208]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vaesenclast	xmm15, xmm15, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+L_AES_GCM_encrypt_avx2_iv_done:
+        ; Additional authentication data
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_encrypt_avx2_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
+        vmovdqu	xmm0, OWORD PTR [r12+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm6, 16
+        vpclmulqdq	xmm1, xmm5, xmm6, 1
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm6, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_encrypt_avx2_calc_aad_done
+L_AES_GCM_encrypt_avx2_calc_aad_lt16:
+        vpxor	xmm0, xmm0, xmm0
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_avx2_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm6, 16
+        vpclmulqdq	xmm1, xmm5, xmm6, 1
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm6, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+L_AES_GCM_encrypt_avx2_calc_aad_done:
+        ; Calculate counter and H
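+        ; Shift H left by one bit with reduction by the GHASH polynomial
+        ; and advance the counter by one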
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm5, xmm5, xmm0
+        xor	ebx, ebx
+        cmp	r10d, 128
+        mov	r13d, r10d
+        jl	L_AES_GCM_encrypt_avx2_done_128
+        and	r13d, 4294967168
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        vmovdqu	OWORD PTR [rsp+144], xmm15
+        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
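+        ; Precompute H^1..H^8 two at a time into [rsp]..[rsp+112],
+        ; reducing with the GHASH polynomial held in xmm3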
+        ; H ^ 1 and H ^ 2
+        vpclmulqdq	xmm9, xmm5, xmm5, 0
+        vpclmulqdq	xmm10, xmm5, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm0, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp], xmm5
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3 and H ^ 4
+        vpclmulqdq	xmm11, xmm0, xmm5, 16
+        vpclmulqdq	xmm10, xmm0, xmm5, 1
+        vpclmulqdq	xmm9, xmm0, xmm5, 0
+        vpclmulqdq	xmm12, xmm0, xmm5, 17
+        vpclmulqdq	xmm13, xmm0, xmm0, 0
+        vpclmulqdq	xmm14, xmm0, xmm0, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm2, xmm13, xmm14
+        vpxor	xmm1, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        vmovdqu	OWORD PTR [rsp+48], xmm2
+        ; H ^ 5 and H ^ 6
+        vpclmulqdq	xmm11, xmm1, xmm0, 16
+        vpclmulqdq	xmm10, xmm1, xmm0, 1
+        vpclmulqdq	xmm9, xmm1, xmm0, 0
+        vpclmulqdq	xmm12, xmm1, xmm0, 17
+        vpclmulqdq	xmm13, xmm1, xmm1, 0
+        vpclmulqdq	xmm14, xmm1, xmm1, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        vmovdqu	OWORD PTR [rsp+80], xmm0
+        ; H ^ 7 and H ^ 8
+        vpclmulqdq	xmm11, xmm2, xmm1, 16
+        vpclmulqdq	xmm10, xmm2, xmm1, 1
+        vpclmulqdq	xmm9, xmm2, xmm1, 0
+        vpclmulqdq	xmm12, xmm2, xmm1, 17
+        vpclmulqdq	xmm13, xmm2, xmm2, 0
+        vpclmulqdq	xmm14, xmm2, xmm2, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        vmovdqu	OWORD PTR [rsp+112], xmm0
+        ; First 128 bytes of input
+        ; aesenc_128
+        ; aesenc_ctr
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rsi]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+16]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+32]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+48]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+64]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+80]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+96]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+112]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+128]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+144]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r9d, 11
+        vmovdqu	xmm7, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r9d, 13
+        vmovdqu	xmm7, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_aesenc_128_enc_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi]
+        vmovdqu	xmm1, OWORD PTR [rdi+16]
+        vmovdqu	xmm2, OWORD PTR [rdi+32]
+        vmovdqu	xmm3, OWORD PTR [rdi+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [r8], xmm8
+        vmovdqu	OWORD PTR [r8+16], xmm9
+        vmovdqu	OWORD PTR [r8+32], xmm10
+        vmovdqu	OWORD PTR [r8+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rdi+64]
+        vmovdqu	xmm1, OWORD PTR [rdi+80]
+        vmovdqu	xmm2, OWORD PTR [rdi+96]
+        vmovdqu	xmm3, OWORD PTR [rdi+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [r8+64], xmm12
+        vmovdqu	OWORD PTR [r8+80], xmm13
+        vmovdqu	OWORD PTR [r8+96], xmm14
+        vmovdqu	OWORD PTR [r8+112], xmm15
+        cmp	r13d, 128
+        mov	ebx, 128
+        jle	L_AES_GCM_encrypt_avx2_end_128
+        ; Process the remaining 128-byte chunks of input
+L_AES_GCM_encrypt_avx2_ghash_128:
+        ; aesenc_128_ghash
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [r8+rbx]
+        ; aesenc_ctr
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rsi]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        ; aesenc_pclmul_1
+        vmovdqu	xmm1, OWORD PTR [rdx+-128]
+        vmovdqu	xmm0, OWORD PTR [rsi+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vmovdqu	xmm2, OWORD PTR [rsp+112]
+        vpxor	xmm1, xmm1, xmm6
+        vpclmulqdq	xmm5, xmm1, xmm2, 16
+        vpclmulqdq	xmm3, xmm1, xmm2, 1
+        vpclmulqdq	xmm6, xmm1, xmm2, 0
+        vpclmulqdq	xmm7, xmm1, xmm2, 17
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_2
+        vmovdqu	xmm1, OWORD PTR [rdx+-112]
+        vmovdqu	xmm0, OWORD PTR [rsp+96]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+32]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-96]
+        vmovdqu	xmm0, OWORD PTR [rsp+80]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+48]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-80]
+        vmovdqu	xmm0, OWORD PTR [rsp+64]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+64]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-64]
+        vmovdqu	xmm0, OWORD PTR [rsp+48]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+80]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-48]
+        vmovdqu	xmm0, OWORD PTR [rsp+32]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+96]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-32]
+        vmovdqu	xmm0, OWORD PTR [rsp+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+112]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-16]
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+128]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_l
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm6, xmm6, xmm4
+        vpxor	xmm5, xmm5, xmm3
+        vpslldq	xmm1, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vmovdqu	xmm4, OWORD PTR [rsi+144]
+        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vaesenc	xmm8, xmm8, xmm4
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm7, xmm7, xmm5
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm9, xmm9, xmm4
+        vaesenc	xmm10, xmm10, xmm4
+        vaesenc	xmm11, xmm11, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm12, xmm12, xmm4
+        vaesenc	xmm13, xmm13, xmm4
+        vaesenc	xmm14, xmm14, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm6, xmm6, xmm7
+        vaesenc	xmm15, xmm15, xmm4
+        cmp	r9d, 11
+        vmovdqu	xmm7, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r9d, 13
+        vmovdqu	xmm7, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_aesenc_128_ghash_avx_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm2, OWORD PTR [rcx+96]
+        vmovdqu	xmm3, OWORD PTR [rcx+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        ; aesenc_128_ghash - end
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_avx2_ghash_128
+L_AES_GCM_encrypt_avx2_end_128:
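+        ; Fold the last eight counter-mode blocks into the GHASH state: each
+        ; block is multiplied by the matching power of H saved on the stack
+        ; (H^1 for the newest block up to H^8 for the oldest) and the partial
+        ; products are combined before a single reduction.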
+        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpshufb	xmm8, xmm8, xmm4
+        vpshufb	xmm9, xmm9, xmm4
+        vpshufb	xmm10, xmm10, xmm4
+        vpshufb	xmm11, xmm11, xmm4
+        vpshufb	xmm12, xmm12, xmm4
+        vpshufb	xmm13, xmm13, xmm4
+        vpshufb	xmm14, xmm14, xmm4
+        vpshufb	xmm15, xmm15, xmm4
+        vpxor	xmm8, xmm8, xmm6
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vpclmulqdq	xmm5, xmm7, xmm15, 16
+        vpclmulqdq	xmm1, xmm7, xmm15, 1
+        vpclmulqdq	xmm4, xmm7, xmm15, 0
+        vpclmulqdq	xmm6, xmm7, xmm15, 17
+        vpxor	xmm5, xmm5, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vpclmulqdq	xmm2, xmm7, xmm14, 16
+        vpclmulqdq	xmm1, xmm7, xmm14, 1
+        vpclmulqdq	xmm0, xmm7, xmm14, 0
+        vpclmulqdq	xmm3, xmm7, xmm14, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+32]
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vpclmulqdq	xmm2, xmm15, xmm13, 16
+        vpclmulqdq	xmm1, xmm15, xmm13, 1
+        vpclmulqdq	xmm0, xmm15, xmm13, 0
+        vpclmulqdq	xmm3, xmm15, xmm13, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm12, 16
+        vpclmulqdq	xmm1, xmm7, xmm12, 1
+        vpclmulqdq	xmm0, xmm7, xmm12, 0
+        vpclmulqdq	xmm3, xmm7, xmm12, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+64]
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vpclmulqdq	xmm2, xmm15, xmm11, 16
+        vpclmulqdq	xmm1, xmm15, xmm11, 1
+        vpclmulqdq	xmm0, xmm15, xmm11, 0
+        vpclmulqdq	xmm3, xmm15, xmm11, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm10, 16
+        vpclmulqdq	xmm1, xmm7, xmm10, 1
+        vpclmulqdq	xmm0, xmm7, xmm10, 0
+        vpclmulqdq	xmm3, xmm7, xmm10, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+96]
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vpclmulqdq	xmm2, xmm15, xmm9, 16
+        vpclmulqdq	xmm1, xmm15, xmm9, 1
+        vpclmulqdq	xmm0, xmm15, xmm9, 0
+        vpclmulqdq	xmm3, xmm15, xmm9, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm8, 16
+        vpclmulqdq	xmm1, xmm7, xmm8, 1
+        vpclmulqdq	xmm0, xmm7, xmm8, 0
+        vpclmulqdq	xmm3, xmm7, xmm8, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpslldq	xmm7, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vpxor	xmm4, xmm4, xmm7
+        vpxor	xmm6, xmm6, xmm5
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm4, xmm2, 16
+        vpshufd	xmm1, xmm4, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm5, OWORD PTR [rsp]
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+        vmovdqu	xmm15, OWORD PTR [rsp+144]
+L_AES_GCM_encrypt_avx2_done_128:
+        cmp	ebx, r10d
+        je	L_AES_GCM_encrypt_avx2_done_enc
+        mov	r13d, r10d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_avx2_last_block_done
+        ; aesenc_block
+        vmovdqu	xmm1, xmm4
+        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm0, xmm0, [rsi]
+        vmovdqu	xmm2, OWORD PTR [rsi+16]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+32]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+48]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+64]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+80]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+96]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+112]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+128]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rsi+144]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm4, xmm1
+        cmp	r9d, 11
+        vmovdqu	xmm1, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rsi+176]
+        vaesenc	xmm0, xmm0, xmm2
+        cmp	r9d, 13
+        vmovdqu	xmm1, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rsi+208]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm1, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_aesenc_block_last:
+        vaesenclast	xmm0, xmm0, xmm1
+        vmovdqu	xmm1, OWORD PTR [rdi+rbx]
+        vpxor	xmm0, xmm0, xmm1
+        vmovdqu	OWORD PTR [r8+rbx], xmm0
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        add	ebx, 16
+        cmp	ebx, r13d
+        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
+L_AES_GCM_encrypt_avx2_last_block_start:
+        vmovdqu	xmm12, OWORD PTR [rdi+rbx]
+        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        ; aesenc_gfmul_sb
+        vpclmulqdq	xmm2, xmm6, xmm5, 1
+        vpclmulqdq	xmm3, xmm6, xmm5, 16
+        vpclmulqdq	xmm1, xmm6, xmm5, 0
+        vpclmulqdq	xmm8, xmm6, xmm5, 17
+        vpxor	xmm11, xmm11, [rsi]
+        vaesenc	xmm11, xmm11, [rsi+16]
+        vpxor	xmm3, xmm3, xmm2
+        vpslldq	xmm2, xmm3, 8
+        vpsrldq	xmm3, xmm3, 8
+        vaesenc	xmm11, xmm11, [rsi+32]
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm11, xmm11, [rsi+48]
+        vaesenc	xmm11, xmm11, [rsi+64]
+        vaesenc	xmm11, xmm11, [rsi+80]
+        vpshufd	xmm2, xmm2, 78
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm11, xmm11, [rsi+96]
+        vaesenc	xmm11, xmm11, [rsi+112]
+        vaesenc	xmm11, xmm11, [rsi+128]
+        vpshufd	xmm2, xmm2, 78
+        vaesenc	xmm11, xmm11, [rsi+144]
+        vpxor	xmm8, xmm8, xmm3
+        vpxor	xmm2, xmm2, xmm8
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        cmp	r9d, 11
+        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm11, xmm11, [rsi+176]
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        cmp	r9d, 13
+        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm11, xmm11, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	xmm11, xmm11, xmm0
+        vpxor	xmm6, xmm2, xmm1
+        vpxor	xmm11, xmm11, xmm12
+        vmovdqu	OWORD PTR [r8+rbx], xmm11
+        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm11
+        add	ebx, 16
+        cmp	ebx, r13d
+        jl	L_AES_GCM_encrypt_avx2_last_block_start
+L_AES_GCM_encrypt_avx2_last_block_ghash:
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm10, xmm6, xmm5, 16
+        vpclmulqdq	xmm9, xmm6, xmm5, 1
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm10, xmm10, xmm9
+        vpslldq	xmm9, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm6, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm6, xmm6, xmm10
+        vpxor	xmm6, xmm6, xmm9
+        vpxor	xmm6, xmm6, xmm8
+L_AES_GCM_encrypt_avx2_last_block_done:
+        mov	ecx, r10d
+        mov	edx, r10d
+        and	ecx, 15
+        jz	L_AES_GCM_encrypt_avx2_done_enc
+        ; aesenc_last15_enc
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpxor	xmm4, xmm4, [rsi]
+        vaesenc	xmm4, xmm4, [rsi+16]
+        vaesenc	xmm4, xmm4, [rsi+32]
+        vaesenc	xmm4, xmm4, [rsi+48]
+        vaesenc	xmm4, xmm4, [rsi+64]
+        vaesenc	xmm4, xmm4, [rsi+80]
+        vaesenc	xmm4, xmm4, [rsi+96]
+        vaesenc	xmm4, xmm4, [rsi+112]
+        vaesenc	xmm4, xmm4, [rsi+128]
+        vaesenc	xmm4, xmm4, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm0
+        vaesenc	xmm4, xmm4, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm0
+        vaesenc	xmm4, xmm4, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
+        vaesenclast	xmm4, xmm4, xmm0
+        xor	ecx, ecx
+        vpxor	xmm0, xmm0, xmm0
+        vmovdqu	OWORD PTR [rsp], xmm4
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [rsp+rcx+16], r13b
+        mov	BYTE PTR [r8+rbx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
+        vmovdqu	xmm4, OWORD PTR [rsp+16]
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm4
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm2, xmm6, xmm5, 16
+        vpclmulqdq	xmm1, xmm6, xmm5, 1
+        vpclmulqdq	xmm0, xmm6, xmm5, 0
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm6, xmm6, xmm5, 17
+        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm6, xmm6, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm6, xmm6, xmm0
+L_AES_GCM_encrypt_avx2_done_enc:
+        ; calc_tag
+        shl	r10, 3
+        shl	r11, 3
+        vmovq	xmm0, r10
+        vmovq	xmm1, r11
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm6
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm4, xmm0, xmm5, 16
+        vpclmulqdq	xmm3, xmm0, xmm5, 1
+        vpclmulqdq	xmm2, xmm0, xmm5, 0
+        vpxor	xmm4, xmm4, xmm3
+        vpslldq	xmm3, xmm4, 8
+        vpsrldq	xmm4, xmm4, 8
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm0, xmm0, xmm5, 17
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm0, xmm0, xmm4
+        vpxor	xmm0, xmm0, xmm3
+        vpxor	xmm0, xmm0, xmm2
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm15
+        ; store_tag
+        cmp	r14d, 16
+        je	L_AES_GCM_encrypt_avx2_store_tag_16
+        xor	rcx, rcx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_avx2_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r15+rcx], r13b
+        inc	ecx
+        cmp	ecx, r14d
+        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
+        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
+L_AES_GCM_encrypt_avx2_store_tag_16:
+        vmovdqu	OWORD PTR [r15], xmm0
+L_AES_GCM_encrypt_avx2_store_tag_done:
+        vzeroupper
+        add	rsp, 160
+        pop	rsi
+        pop	r14
+        pop	rbx
+        pop	r15
+        pop	r12
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_encrypt_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
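+; AES_GCM_decrypt_avx2
+; One-shot AES-GCM decryption using AES-NI and carry-less multiplication.
+; The main loop processes 128 bytes (eight blocks) per pass, followed by
+; single 16-byte blocks and a trailing partial block; the computed tag is
+; then compared with the supplied tag and the match result (1 or 0) is
+; written through the final pointer argument.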
+AES_GCM_decrypt_avx2 PROC
+        push	r13
+        push	rdi
+        push	r12
+        push	r14
+        push	rbx
+        push	r15
+        push	rsi
+        push	rbp
+        mov	rdi, rcx
+        mov	r12, r8
+        mov	rax, r9
+        mov	r14, QWORD PTR [rsp+104]
+        mov	r8, rdx
+        mov	r10d, DWORD PTR [rsp+112]
+        mov	r11d, DWORD PTR [rsp+120]
+        mov	ebx, DWORD PTR [rsp+128]
+        mov	r15d, DWORD PTR [rsp+136]
+        mov	rsi, QWORD PTR [rsp+144]
+        mov	r9d, DWORD PTR [rsp+152]
+        mov	rbp, QWORD PTR [rsp+160]
+        sub	rsp, 168
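+        ; Register usage after the prologue (derived from the moves above):
+        ; rdi = ciphertext in, r8 = plaintext out, r12 = AAD, rax = IV,
+        ; r14 = expected tag, r10d = ciphertext length, r11d = AAD length,
+        ; ebx = IV length, r15d = tag length, rsi = expanded key schedule,
+        ; r9d = AES round count, rbp = pointer receiving the tag-check result.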
+        vpxor	xmm4, xmm4, xmm4
+        vpxor	xmm6, xmm6, xmm6
+        mov	edx, ebx
+        cmp	edx, 12
+        je	L_AES_GCM_decrypt_avx2_iv_12
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqu	xmm5, OWORD PTR [rsi]
+        vaesenc	xmm5, xmm5, [rsi+16]
+        vaesenc	xmm5, xmm5, [rsi+32]
+        vaesenc	xmm5, xmm5, [rsi+48]
+        vaesenc	xmm5, xmm5, [rsi+64]
+        vaesenc	xmm5, xmm5, [rsi+80]
+        vaesenc	xmm5, xmm5, [rsi+96]
+        vaesenc	xmm5, xmm5, [rsi+112]
+        vaesenc	xmm5, xmm5, [rsi+128]
+        vaesenc	xmm5, xmm5, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_decrypt_avx2_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
+        vmovdqu	xmm0, OWORD PTR [rax+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
+        mov	edx, ebx
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_avx2_calc_iv_done
+L_AES_GCM_decrypt_avx2_calc_iv_lt16:
+        vpxor	xmm0, xmm0, xmm0
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx2_calc_iv_loop:
+        movzx	r13d, BYTE PTR [rax+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+L_AES_GCM_decrypt_avx2_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqu	xmm15, OWORD PTR [rsi]
+        vpxor	xmm15, xmm15, xmm4
+        vaesenc	xmm15, xmm15, [rsi+16]
+        vaesenc	xmm15, xmm15, [rsi+32]
+        vaesenc	xmm15, xmm15, [rsi+48]
+        vaesenc	xmm15, xmm15, [rsi+64]
+        vaesenc	xmm15, xmm15, [rsi+80]
+        vaesenc	xmm15, xmm15, [rsi+96]
+        vaesenc	xmm15, xmm15, [rsi+112]
+        vaesenc	xmm15, xmm15, [rsi+128]
+        vaesenc	xmm15, xmm15, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm15, xmm15, xmm0
+        vaesenc	xmm15, xmm15, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm15, xmm15, xmm0
+        vaesenc	xmm15, xmm15, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm15, xmm15, xmm0
+        jmp	L_AES_GCM_decrypt_avx2_iv_done
+L_AES_GCM_decrypt_avx2_iv_12:
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+        vmovdqu	xmm5, OWORD PTR [rsi]
+        vpblendd	xmm4, xmm4, [rax], 7
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	xmm7, OWORD PTR [rsi+16]
+        vpxor	xmm15, xmm4, xmm5
+        vaesenc	xmm5, xmm5, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rsi+32]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+48]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+64]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+80]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+96]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+112]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+128]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+144]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        cmp	r9d, 11
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+176]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        cmp	r9d, 13
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+208]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vaesenclast	xmm15, xmm15, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+L_AES_GCM_decrypt_avx2_iv_done:
+        ; Additional authentication data
+        mov	edx, r11d
+        cmp	edx, 0
+        je	L_AES_GCM_decrypt_avx2_calc_aad_done
+        xor	ecx, ecx
+        cmp	edx, 16
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
+        and	edx, 4294967280
+L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
+        vmovdqu	xmm0, OWORD PTR [r12+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm6, 16
+        vpclmulqdq	xmm1, xmm5, xmm6, 1
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm6, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_decrypt_avx2_calc_aad_done
+L_AES_GCM_decrypt_avx2_calc_aad_lt16:
+        vpxor	xmm0, xmm0, xmm0
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx2_calc_aad_loop:
+        movzx	r13d, BYTE PTR [r12+rcx]
+        mov	BYTE PTR [rsp+rbx], r13b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm6, 16
+        vpclmulqdq	xmm1, xmm5, xmm6, 1
+        vpclmulqdq	xmm0, xmm5, xmm6, 0
+        vpclmulqdq	xmm3, xmm5, xmm6, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm7, xmm0, xmm1
+        vpxor	xmm6, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm7, 31
+        vpsrld	xmm1, xmm6, 31
+        vpslld	xmm7, xmm7, 1
+        vpslld	xmm6, xmm6, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm6, xmm6, xmm2
+        vpor	xmm7, xmm7, xmm0
+        vpor	xmm6, xmm6, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm7, xmm2, 16
+        vpshufd	xmm1, xmm7, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+L_AES_GCM_decrypt_avx2_calc_aad_done:
+        ; Calculate counter and H
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm5, xmm5, xmm0
+        xor	ebx, ebx
+        cmp	r10d, 128
+        mov	r13d, r10d
+        jl	L_AES_GCM_decrypt_avx2_done_128
+        and	r13d, 4294967168
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        vmovdqu	OWORD PTR [rsp+144], xmm15
+        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
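+        ; Precompute H^1..H^8 and store them at [rsp]..[rsp+112] so each pass
+        ; of the 128-byte loop can accumulate eight GHASH products before a
+        ; single reduction.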
+        ; H ^ 1 and H ^ 2
+        vpclmulqdq	xmm9, xmm5, xmm5, 0
+        vpclmulqdq	xmm10, xmm5, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm0, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp], xmm5
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3 and H ^ 4
+        vpclmulqdq	xmm11, xmm0, xmm5, 16
+        vpclmulqdq	xmm10, xmm0, xmm5, 1
+        vpclmulqdq	xmm9, xmm0, xmm5, 0
+        vpclmulqdq	xmm12, xmm0, xmm5, 17
+        vpclmulqdq	xmm13, xmm0, xmm0, 0
+        vpclmulqdq	xmm14, xmm0, xmm0, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm2, xmm13, xmm14
+        vpxor	xmm1, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        vmovdqu	OWORD PTR [rsp+48], xmm2
+        ; H ^ 5 and H ^ 6
+        vpclmulqdq	xmm11, xmm1, xmm0, 16
+        vpclmulqdq	xmm10, xmm1, xmm0, 1
+        vpclmulqdq	xmm9, xmm1, xmm0, 0
+        vpclmulqdq	xmm12, xmm1, xmm0, 17
+        vpclmulqdq	xmm13, xmm1, xmm1, 0
+        vpclmulqdq	xmm14, xmm1, xmm1, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        vmovdqu	OWORD PTR [rsp+80], xmm0
+        ; H ^ 7 and H ^ 8
+        vpclmulqdq	xmm11, xmm2, xmm1, 16
+        vpclmulqdq	xmm10, xmm2, xmm1, 1
+        vpclmulqdq	xmm9, xmm2, xmm1, 0
+        vpclmulqdq	xmm12, xmm2, xmm1, 17
+        vpclmulqdq	xmm13, xmm2, xmm2, 0
+        vpclmulqdq	xmm14, xmm2, xmm2, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        vmovdqu	OWORD PTR [rsp+112], xmm0
+L_AES_GCM_decrypt_avx2_ghash_128:
+        ; aesenc_128_ghash
+        lea	rcx, QWORD PTR [rdi+rbx]
+        lea	rdx, QWORD PTR [r8+rbx]
+        ; aesenc_ctr
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rsi]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        ; aesenc_pclmul_1
+        vmovdqu	xmm1, OWORD PTR [rcx]
+        vmovdqu	xmm0, OWORD PTR [rsi+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vmovdqu	xmm2, OWORD PTR [rsp+112]
+        vpxor	xmm1, xmm1, xmm6
+        vpclmulqdq	xmm5, xmm1, xmm2, 16
+        vpclmulqdq	xmm3, xmm1, xmm2, 1
+        vpclmulqdq	xmm6, xmm1, xmm2, 0
+        vpclmulqdq	xmm7, xmm1, xmm2, 17
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_2
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm0, OWORD PTR [rsp+96]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+32]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+32]
+        vmovdqu	xmm0, OWORD PTR [rsp+80]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+48]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vmovdqu	xmm0, OWORD PTR [rsp+64]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+64]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+64]
+        vmovdqu	xmm0, OWORD PTR [rsp+48]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+80]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm0, OWORD PTR [rsp+32]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+96]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+96]
+        vmovdqu	xmm0, OWORD PTR [rsp+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+112]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rsi+128]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_l
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm6, xmm6, xmm4
+        vpxor	xmm5, xmm5, xmm3
+        vpslldq	xmm1, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vmovdqu	xmm4, OWORD PTR [rsi+144]
+        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vaesenc	xmm8, xmm8, xmm4
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm7, xmm7, xmm5
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm9, xmm9, xmm4
+        vaesenc	xmm10, xmm10, xmm4
+        vaesenc	xmm11, xmm11, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm12, xmm12, xmm4
+        vaesenc	xmm13, xmm13, xmm4
+        vaesenc	xmm14, xmm14, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm6, xmm6, xmm7
+        vaesenc	xmm15, xmm15, xmm4
+        cmp	r9d, 11
+        vmovdqu	xmm7, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r9d, 13
+        vmovdqu	xmm7, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_aesenc_128_ghash_avx_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm2, OWORD PTR [rcx+96]
+        vmovdqu	xmm3, OWORD PTR [rcx+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        ; aesenc_128_ghash - end
+        add	ebx, 128
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_avx2_ghash_128
+        vmovdqu	xmm5, OWORD PTR [rsp]
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+        vmovdqu	xmm15, OWORD PTR [rsp+144]
+L_AES_GCM_decrypt_avx2_done_128:
+        cmp	ebx, r10d
+        jge	L_AES_GCM_decrypt_avx2_done_dec
+        mov	r13d, r10d
+        and	r13d, 4294967280
+        cmp	ebx, r13d
+        jge	L_AES_GCM_decrypt_avx2_last_block_done
+L_AES_GCM_decrypt_avx2_last_block_start:
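+        ; Single-block decrypt: the ciphertext block is folded into the GHASH
+        ; state before decryption, so no further GHASH update is needed after
+        ; the plaintext is written out.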
+        vmovdqu	xmm11, OWORD PTR [rdi+rbx]
+        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm12, xmm12, xmm6
+        ; aesenc_gfmul_sb
+        vpclmulqdq	xmm2, xmm12, xmm5, 1
+        vpclmulqdq	xmm3, xmm12, xmm5, 16
+        vpclmulqdq	xmm1, xmm12, xmm5, 0
+        vpclmulqdq	xmm8, xmm12, xmm5, 17
+        vpxor	xmm10, xmm10, [rsi]
+        vaesenc	xmm10, xmm10, [rsi+16]
+        vpxor	xmm3, xmm3, xmm2
+        vpslldq	xmm2, xmm3, 8
+        vpsrldq	xmm3, xmm3, 8
+        vaesenc	xmm10, xmm10, [rsi+32]
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm10, xmm10, [rsi+48]
+        vaesenc	xmm10, xmm10, [rsi+64]
+        vaesenc	xmm10, xmm10, [rsi+80]
+        vpshufd	xmm2, xmm2, 78
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm10, xmm10, [rsi+96]
+        vaesenc	xmm10, xmm10, [rsi+112]
+        vaesenc	xmm10, xmm10, [rsi+128]
+        vpshufd	xmm2, xmm2, 78
+        vaesenc	xmm10, xmm10, [rsi+144]
+        vpxor	xmm8, xmm8, xmm3
+        vpxor	xmm2, xmm2, xmm8
+        vmovdqu	xmm0, OWORD PTR [rsi+160]
+        cmp	r9d, 11
+        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm10, xmm10, [rsi+176]
+        vmovdqu	xmm0, OWORD PTR [rsi+192]
+        cmp	r9d, 13
+        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm10, xmm10, [rsi+208]
+        vmovdqu	xmm0, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	xmm10, xmm10, xmm0
+        vpxor	xmm6, xmm2, xmm1
+        vpxor	xmm10, xmm10, xmm11
+        vmovdqu	OWORD PTR [r8+rbx], xmm10
+        add	ebx, 16
+        cmp	ebx, r13d
+        jl	L_AES_GCM_decrypt_avx2_last_block_start
+L_AES_GCM_decrypt_avx2_last_block_done:
+        mov	ecx, r10d
+        mov	edx, r10d
+        and	ecx, 15
+        jz	L_AES_GCM_decrypt_avx2_done_dec
+        ; aesenc_last15_dec
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpxor	xmm4, xmm4, [rsi]
+        vaesenc	xmm4, xmm4, [rsi+16]
+        vaesenc	xmm4, xmm4, [rsi+32]
+        vaesenc	xmm4, xmm4, [rsi+48]
+        vaesenc	xmm4, xmm4, [rsi+64]
+        vaesenc	xmm4, xmm4, [rsi+80]
+        vaesenc	xmm4, xmm4, [rsi+96]
+        vaesenc	xmm4, xmm4, [rsi+112]
+        vaesenc	xmm4, xmm4, [rsi+128]
+        vaesenc	xmm4, xmm4, [rsi+144]
+        cmp	r9d, 11
+        vmovdqu	xmm1, OWORD PTR [rsi+160]
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm1
+        vaesenc	xmm4, xmm4, [rsi+176]
+        cmp	r9d, 13
+        vmovdqu	xmm1, OWORD PTR [rsi+192]
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	xmm4, xmm4, xmm1
+        vaesenc	xmm4, xmm4, [rsi+208]
+        vmovdqu	xmm1, OWORD PTR [rsi+224]
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
+        vaesenclast	xmm4, xmm4, xmm1
+        xor	ecx, ecx
+        vpxor	xmm0, xmm0, xmm0
+        vmovdqu	OWORD PTR [rsp], xmm4
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
+        movzx	r13d, BYTE PTR [rdi+rbx]
+        mov	BYTE PTR [rsp+rcx+16], r13b
+        xor	r13b, BYTE PTR [rsp+rcx]
+        mov	BYTE PTR [r8+rbx], r13b
+        inc	ebx
+        inc	ecx
+        cmp	ebx, edx
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
+        vmovdqu	xmm4, OWORD PTR [rsp+16]
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm4
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm2, xmm6, xmm5, 16
+        vpclmulqdq	xmm1, xmm6, xmm5, 1
+        vpclmulqdq	xmm0, xmm6, xmm5, 0
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm6, xmm6, xmm5, 17
+        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm6, xmm6, xmm2
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm6, xmm6, xmm0
+L_AES_GCM_decrypt_avx2_done_dec:
+        ; calc_tag
+        shl	r10, 3
+        shl	r11, 3
+        vmovq	xmm0, r10
+        vmovq	xmm1, r11
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm6
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm4, xmm0, xmm5, 16
+        vpclmulqdq	xmm3, xmm0, xmm5, 1
+        vpclmulqdq	xmm2, xmm0, xmm5, 0
+        vpxor	xmm4, xmm4, xmm3
+        vpslldq	xmm3, xmm4, 8
+        vpsrldq	xmm4, xmm4, 8
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm0, xmm0, xmm5, 17
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm0, xmm0, xmm4
+        vpxor	xmm0, xmm0, xmm3
+        vpxor	xmm0, xmm0, xmm2
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm15
+        ; cmp_tag
+        cmp	r15d, 16
+        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
+        xor	rdx, rdx
+        xor	rax, rax
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_avx2_cmp_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+rdx]
+        xor	r13b, BYTE PTR [r14+rdx]
+        or	al, r13b
+        inc	edx
+        cmp	edx, r15d
+        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
+        cmp	rax, 0
+        sete	al
+        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
+L_AES_GCM_decrypt_avx2_cmp_tag_16:
+        vmovdqu	xmm1, OWORD PTR [r14]
+        vpcmpeqb	xmm0, xmm0, xmm1
+        vpmovmskb	rdx, xmm0
+        ; edx == 0xFFFF (all 16 tag bytes match) => return 1, else return 0
+        xor	eax, eax
+        cmp	edx, 65535
+        sete	al
+L_AES_GCM_decrypt_avx2_cmp_tag_done:
+        mov	DWORD PTR [rbp], eax
+        vzeroupper
+        add	rsp, 168
+        pop	rbp
+        pop	rsi
+        pop	r15
+        pop	rbx
+        pop	r14
+        pop	r12
+        pop	rdi
+        pop	r13
+        ret
+AES_GCM_decrypt_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
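+; AES_GCM_init_avx2
+; Computes the GHASH key H = E_K(0) and the initial counter block from the IV
+; (fast path for the usual 12-byte IV, otherwise the IV is GHASHed), then
+; stores the hash key, the incremented counter and the encrypted initial
+; counter through the three pointer arguments passed on the stack.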
+AES_GCM_init_avx2 PROC
+        push	rbx
+        push	rdi
+        push	rsi
+        push	r12
+        mov	rdi, rcx
+        mov	rsi, rdx
+        mov	r10, r8
+        mov	r11d, r9d
+        mov	rax, QWORD PTR [rsp+72]
+        mov	r8, QWORD PTR [rsp+80]
+        mov	r9, QWORD PTR [rsp+88]
+        sub	rsp, 16
+        vpxor	xmm4, xmm4, xmm4
+        mov	edx, r11d
+        cmp	edx, 12
+        je	L_AES_GCM_init_avx2_iv_12
+        ; Calculate values when IV is not 12 bytes
+        ; H = Encrypt X(=0)
+        vmovdqu	xmm5, OWORD PTR [rdi]
+        vaesenc	xmm5, xmm5, [rdi+16]
+        vaesenc	xmm5, xmm5, [rdi+32]
+        vaesenc	xmm5, xmm5, [rdi+48]
+        vaesenc	xmm5, xmm5, [rdi+64]
+        vaesenc	xmm5, xmm5, [rdi+80]
+        vaesenc	xmm5, xmm5, [rdi+96]
+        vaesenc	xmm5, xmm5, [rdi+112]
+        vaesenc	xmm5, xmm5, [rdi+128]
+        vaesenc	xmm5, xmm5, [rdi+144]
+        cmp	esi, 11
+        vmovdqu	xmm0, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rdi+176]
+        cmp	esi, 13
+        vmovdqu	xmm0, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm5, xmm5, [rdi+208]
+        vmovdqu	xmm0, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ; Calc counter
+        ; Initialization vector
+        cmp	edx, 0
+        mov	rcx, 0
+        je	L_AES_GCM_init_avx2_calc_iv_done
+        cmp	edx, 16
+        jl	L_AES_GCM_init_avx2_calc_iv_lt16
+        and	edx, 4294967280
+L_AES_GCM_init_avx2_calc_iv_16_loop:
+        vmovdqu	xmm0, OWORD PTR [r10+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm6, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm6, xmm2, 16
+        vpshufd	xmm1, xmm6, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_avx2_calc_iv_16_loop
+        mov	edx, r11d
+        cmp	ecx, edx
+        je	L_AES_GCM_init_avx2_calc_iv_done
+L_AES_GCM_init_avx2_calc_iv_lt16:
+        vpxor	xmm0, xmm0, xmm0
+        xor	ebx, ebx
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_init_avx2_calc_iv_loop:
+        movzx	r12d, BYTE PTR [r10+rcx]
+        mov	BYTE PTR [rsp+rbx], r12b
+        inc	ecx
+        inc	ebx
+        cmp	ecx, edx
+        jl	L_AES_GCM_init_avx2_calc_iv_loop
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm6, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm6, xmm2, 16
+        vpshufd	xmm1, xmm6, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+L_AES_GCM_init_avx2_calc_iv_done:
+        ; T = Encrypt counter
+        vpxor	xmm0, xmm0, xmm0
+        shl	edx, 3
+        vmovq	xmm0, rdx
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm6, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm6, xmm2, 16
+        vpshufd	xmm1, xmm6, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        ;   Encrypt counter
+        vmovdqu	xmm7, OWORD PTR [rdi]
+        vpxor	xmm7, xmm7, xmm4
+        vaesenc	xmm7, xmm7, [rdi+16]
+        vaesenc	xmm7, xmm7, [rdi+32]
+        vaesenc	xmm7, xmm7, [rdi+48]
+        vaesenc	xmm7, xmm7, [rdi+64]
+        vaesenc	xmm7, xmm7, [rdi+80]
+        vaesenc	xmm7, xmm7, [rdi+96]
+        vaesenc	xmm7, xmm7, [rdi+112]
+        vaesenc	xmm7, xmm7, [rdi+128]
+        vaesenc	xmm7, xmm7, [rdi+144]
+        cmp	esi, 11
+        vmovdqu	xmm0, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm7, xmm7, xmm0
+        vaesenc	xmm7, xmm7, [rdi+176]
+        cmp	esi, 13
+        vmovdqu	xmm0, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	xmm7, xmm7, xmm0
+        vaesenc	xmm7, xmm7, [rdi+208]
+        vmovdqu	xmm0, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	xmm7, xmm7, xmm0
+        jmp	L_AES_GCM_init_avx2_iv_done
+L_AES_GCM_init_avx2_iv_12:
+        ; Calculate values when IV is 12 bytes
+        ; Set counter based on IV
+        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_one
+        vmovdqu	xmm5, OWORD PTR [rdi]
+        vpblendd	xmm4, xmm4, [r10], 7
+        ; H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	xmm6, OWORD PTR [rdi+16]
+        vpxor	xmm7, xmm4, xmm5
+        vaesenc	xmm5, xmm5, xmm6
+        vaesenc	xmm7, xmm7, xmm6
+        vmovdqu	xmm0, OWORD PTR [rdi+32]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+48]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+64]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+80]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+96]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+112]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+128]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+144]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        cmp	esi, 11
+        vmovdqu	xmm0, OWORD PTR [rdi+160]
+        jl	L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+176]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        cmp	esi, 13
+        vmovdqu	xmm0, OWORD PTR [rdi+192]
+        jl	L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+208]
+        vaesenc	xmm5, xmm5, xmm0
+        vaesenc	xmm7, xmm7, xmm0
+        vmovdqu	xmm0, OWORD PTR [rdi+224]
+L_AES_GCM_init_avx2_calc_iv_12_last:
+        vaesenclast	xmm5, xmm5, xmm0
+        vaesenclast	xmm7, xmm7, xmm0
+        vpshufb	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_bswap_mask
+L_AES_GCM_init_avx2_iv_done:
+        vmovdqu	OWORD PTR [r9], xmm7
+        vpshufb	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        vmovdqu	OWORD PTR [rax], xmm5
+        vmovdqu	OWORD PTR [r8], xmm4
+        vzeroupper
+        add	rsp, 16
+        pop	r12
+        pop	rsi
+        pop	rdi
+        pop	rbx
+        ret
+AES_GCM_init_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_aad_update_avx2 PROC
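+        ; Fold each full 16-byte block of additional authenticated data
+        ; into the GHASH state: X = (X xor block) * H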
+        mov	rax, rcx
+        vmovdqu	xmm4, OWORD PTR [r8]
+        vmovdqu	xmm5, OWORD PTR [r9]
+        xor	ecx, ecx
+L_AES_GCM_aad_update_avx2_16_loop:
+        vmovdqu	xmm0, OWORD PTR [rax+rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm6, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm6, xmm2, 16
+        vpshufd	xmm1, xmm6, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        add	ecx, 16
+        cmp	ecx, edx
+        jl	L_AES_GCM_aad_update_avx2_16_loop
+        vmovdqu	OWORD PTR [r8], xmm4
+        vzeroupper
+        ret
+AES_GCM_aad_update_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt_block_avx2 PROC
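+        ; Encrypt a single 16-byte block: AES-encrypt the current counter
+        ; block, XOR with the input block and store the result, then write
+        ; back the incremented counter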
+        mov	r10, r8
+        mov	r11, r9
+        mov	rax, QWORD PTR [rsp+40]
+        sub	rsp, 152
+        vmovdqu	xmm3, OWORD PTR [rax]
+        ; aesenc_block
+        vmovdqu	xmm1, xmm3
+        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm0, xmm0, [rcx]
+        vmovdqu	xmm2, OWORD PTR [rcx+16]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+48]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+64]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+80]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+96]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+112]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+128]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rcx+144]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm3, xmm1
+        cmp	edx, 11
+        vmovdqu	xmm1, OWORD PTR [rcx+160]
+        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rcx+176]
+        vaesenc	xmm0, xmm0, xmm2
+        cmp	edx, 13
+        vmovdqu	xmm1, OWORD PTR [rcx+192]
+        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rcx+208]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm1, OWORD PTR [rcx+224]
+L_AES_GCM_encrypt_block_avx2_aesenc_block_last:
+        vaesenclast	xmm0, xmm0, xmm1
+        vmovdqu	xmm1, OWORD PTR [r11]
+        vpxor	xmm0, xmm0, xmm1
+        vmovdqu	OWORD PTR [r10], xmm0
+        vmovdqu	OWORD PTR [rax], xmm3
+        vzeroupper
+        add	rsp, 152
+        ret
+AES_GCM_encrypt_block_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_ghash_block_avx2 PROC
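+        ; Absorb one 16-byte block into the GHASH state:
+        ; X = (X xor byteswap(block)) * H, reduced modulo the GCM polynomial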
+        vmovdqu	xmm4, OWORD PTR [rdx]
+        vmovdqu	xmm5, OWORD PTR [r8]
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm4, xmm4, xmm0
+        ; ghash_gfmul_avx
+        vpclmulqdq	xmm2, xmm5, xmm4, 16
+        vpclmulqdq	xmm1, xmm5, xmm4, 1
+        vpclmulqdq	xmm0, xmm5, xmm4, 0
+        vpclmulqdq	xmm3, xmm5, xmm4, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpslldq	xmm1, xmm2, 8
+        vpsrldq	xmm2, xmm2, 8
+        vpxor	xmm6, xmm0, xmm1
+        vpxor	xmm4, xmm3, xmm2
+        ; ghash_mid
+        vpsrld	xmm0, xmm6, 31
+        vpsrld	xmm1, xmm4, 31
+        vpslld	xmm6, xmm6, 1
+        vpslld	xmm4, xmm4, 1
+        vpsrldq	xmm2, xmm0, 12
+        vpslldq	xmm0, xmm0, 4
+        vpslldq	xmm1, xmm1, 4
+        vpor	xmm4, xmm4, xmm2
+        vpor	xmm6, xmm6, xmm0
+        vpor	xmm4, xmm4, xmm1
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm6, xmm2, 16
+        vpshufd	xmm1, xmm6, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm4, xmm4, xmm1
+        vmovdqu	OWORD PTR [rdx], xmm4
+        vzeroupper
+        ret
+AES_GCM_ghash_block_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt_update_avx2 PROC
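+        ; Bulk encryption: 128 bytes (8 blocks) per main-loop iteration
+        ; using 8-way interleaved AES-CTR with deferred GHASH, then single
+        ; 16-byte blocks for the remainder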
+        push	r12
+        push	r13
+        push	r14
+        push	r15
+        push	rdi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r13, QWORD PTR [rsp+96]
+        mov	r14, QWORD PTR [rsp+104]
+        sub	rsp, 152
+        vmovdqu	xmm6, OWORD PTR [r12]
+        vmovdqu	xmm5, OWORD PTR [r13]
+        vmovdqu	xmm4, OWORD PTR [r14]
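+        ; Multiply H by x in GF(2^128): shift left one bit and XOR in the
+        ; reduction polynomial when the top bit was set, giving the form
+        ; the GHASH multiplies below expect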
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm0
+        xor	edi, edi
+        cmp	r9d, 128
+        mov	r15d, r9d
+        jl	L_AES_GCM_encrypt_update_avx2_done_128
+        and	r15d, 4294967168
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
+        ; H ^ 1 and H ^ 2
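+        ;   (precompute H^1..H^8 and spill them to [rsp]..[rsp+112] so the
+        ;   8-block loop can accumulate eight products before one reduction)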
+        vpclmulqdq	xmm9, xmm5, xmm5, 0
+        vpclmulqdq	xmm10, xmm5, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm0, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp], xmm5
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3 and H ^ 4
+        vpclmulqdq	xmm11, xmm0, xmm5, 16
+        vpclmulqdq	xmm10, xmm0, xmm5, 1
+        vpclmulqdq	xmm9, xmm0, xmm5, 0
+        vpclmulqdq	xmm12, xmm0, xmm5, 17
+        vpclmulqdq	xmm13, xmm0, xmm0, 0
+        vpclmulqdq	xmm14, xmm0, xmm0, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm2, xmm13, xmm14
+        vpxor	xmm1, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        vmovdqu	OWORD PTR [rsp+48], xmm2
+        ; H ^ 5 and H ^ 6
+        vpclmulqdq	xmm11, xmm1, xmm0, 16
+        vpclmulqdq	xmm10, xmm1, xmm0, 1
+        vpclmulqdq	xmm9, xmm1, xmm0, 0
+        vpclmulqdq	xmm12, xmm1, xmm0, 17
+        vpclmulqdq	xmm13, xmm1, xmm1, 0
+        vpclmulqdq	xmm14, xmm1, xmm1, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        vmovdqu	OWORD PTR [rsp+80], xmm0
+        ; H ^ 7 and H ^ 8
+        vpclmulqdq	xmm11, xmm2, xmm1, 16
+        vpclmulqdq	xmm10, xmm2, xmm1, 1
+        vpclmulqdq	xmm9, xmm2, xmm1, 0
+        vpclmulqdq	xmm12, xmm2, xmm1, 17
+        vpclmulqdq	xmm13, xmm2, xmm2, 0
+        vpclmulqdq	xmm14, xmm2, xmm2, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        vmovdqu	OWORD PTR [rsp+112], xmm0
+        ; First 128 bytes of input
+        ; aesenc_128
+        ; aesenc_ctr
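+        ;   (build 8 counter blocks by adding 1..8 to the counter and
+        ;   byte-swapping each into AES input order; the counter advanced
+        ;   by 8 is saved back to [rsp+128])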
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+16]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+32]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+48]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+64]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+80]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+96]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+112]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+128]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+144]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 11
+        vmovdqu	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqu	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_128_enc_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11]
+        vmovdqu	xmm1, OWORD PTR [r11+16]
+        vmovdqu	xmm2, OWORD PTR [r11+32]
+        vmovdqu	xmm3, OWORD PTR [r11+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [r10], xmm8
+        vmovdqu	OWORD PTR [r10+16], xmm9
+        vmovdqu	OWORD PTR [r10+32], xmm10
+        vmovdqu	OWORD PTR [r10+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [r11+64]
+        vmovdqu	xmm1, OWORD PTR [r11+80]
+        vmovdqu	xmm2, OWORD PTR [r11+96]
+        vmovdqu	xmm3, OWORD PTR [r11+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [r10+64], xmm12
+        vmovdqu	OWORD PTR [r10+80], xmm13
+        vmovdqu	OWORD PTR [r10+96], xmm14
+        vmovdqu	OWORD PTR [r10+112], xmm15
+        cmp	r15d, 128
+        mov	edi, 128
+        jle	L_AES_GCM_encrypt_update_avx2_end_128
+        ; Process further 128-byte blocks of input
+L_AES_GCM_encrypt_update_avx2_ghash_128:
+        ; aesenc_128_ghash
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        ; aesenc_ctr
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        ; aesenc_pclmul_1
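+        ;   (GHASH of the previous iteration's 8 ciphertext blocks at
+        ;   [rdx-128]..[rdx-16] is interleaved with the AES rounds of the
+        ;   next 8 counter blocks to hide pclmulqdq latency)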
+        vmovdqu	xmm1, OWORD PTR [rdx+-128]
+        vmovdqu	xmm0, OWORD PTR [rax+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vmovdqu	xmm2, OWORD PTR [rsp+112]
+        vpxor	xmm1, xmm1, xmm6
+        vpclmulqdq	xmm5, xmm1, xmm2, 16
+        vpclmulqdq	xmm3, xmm1, xmm2, 1
+        vpclmulqdq	xmm6, xmm1, xmm2, 0
+        vpclmulqdq	xmm7, xmm1, xmm2, 17
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_2
+        vmovdqu	xmm1, OWORD PTR [rdx+-112]
+        vmovdqu	xmm0, OWORD PTR [rsp+96]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+32]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-96]
+        vmovdqu	xmm0, OWORD PTR [rsp+80]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+48]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-80]
+        vmovdqu	xmm0, OWORD PTR [rsp+64]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+64]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-64]
+        vmovdqu	xmm0, OWORD PTR [rsp+48]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+80]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-48]
+        vmovdqu	xmm0, OWORD PTR [rsp+32]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+96]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-32]
+        vmovdqu	xmm0, OWORD PTR [rsp+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+112]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rdx+-16]
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+128]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_l
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm6, xmm6, xmm4
+        vpxor	xmm5, xmm5, xmm3
+        vpslldq	xmm1, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vmovdqu	xmm4, OWORD PTR [rax+144]
+        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vaesenc	xmm8, xmm8, xmm4
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm7, xmm7, xmm5
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm9, xmm9, xmm4
+        vaesenc	xmm10, xmm10, xmm4
+        vaesenc	xmm11, xmm11, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm12, xmm12, xmm4
+        vaesenc	xmm13, xmm13, xmm4
+        vaesenc	xmm14, xmm14, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm6, xmm6, xmm7
+        vaesenc	xmm15, xmm15, xmm4
+        cmp	r8d, 11
+        vmovdqu	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqu	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_128_ghash_avx_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm2, OWORD PTR [rcx+96]
+        vmovdqu	xmm3, OWORD PTR [rcx+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        ; aesenc_128_ghash - end
+        add	edi, 128
+        cmp	edi, r15d
+        jl	L_AES_GCM_encrypt_update_avx2_ghash_128
+L_AES_GCM_encrypt_update_avx2_end_128:
+        vmovdqu	xmm4, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpshufb	xmm8, xmm8, xmm4
+        vpshufb	xmm9, xmm9, xmm4
+        vpshufb	xmm10, xmm10, xmm4
+        vpshufb	xmm11, xmm11, xmm4
+        vpshufb	xmm12, xmm12, xmm4
+        vpshufb	xmm13, xmm13, xmm4
+        vpshufb	xmm14, xmm14, xmm4
+        vpshufb	xmm15, xmm15, xmm4
+        vpxor	xmm8, xmm8, xmm6
+        vmovdqu	xmm7, OWORD PTR [rsp]
+        vpclmulqdq	xmm5, xmm7, xmm15, 16
+        vpclmulqdq	xmm1, xmm7, xmm15, 1
+        vpclmulqdq	xmm4, xmm7, xmm15, 0
+        vpclmulqdq	xmm6, xmm7, xmm15, 17
+        vpxor	xmm5, xmm5, xmm1
+        vmovdqu	xmm7, OWORD PTR [rsp+16]
+        vpclmulqdq	xmm2, xmm7, xmm14, 16
+        vpclmulqdq	xmm1, xmm7, xmm14, 1
+        vpclmulqdq	xmm0, xmm7, xmm14, 0
+        vpclmulqdq	xmm3, xmm7, xmm14, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+32]
+        vmovdqu	xmm7, OWORD PTR [rsp+48]
+        vpclmulqdq	xmm2, xmm15, xmm13, 16
+        vpclmulqdq	xmm1, xmm15, xmm13, 1
+        vpclmulqdq	xmm0, xmm15, xmm13, 0
+        vpclmulqdq	xmm3, xmm15, xmm13, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm12, 16
+        vpclmulqdq	xmm1, xmm7, xmm12, 1
+        vpclmulqdq	xmm0, xmm7, xmm12, 0
+        vpclmulqdq	xmm3, xmm7, xmm12, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+64]
+        vmovdqu	xmm7, OWORD PTR [rsp+80]
+        vpclmulqdq	xmm2, xmm15, xmm11, 16
+        vpclmulqdq	xmm1, xmm15, xmm11, 1
+        vpclmulqdq	xmm0, xmm15, xmm11, 0
+        vpclmulqdq	xmm3, xmm15, xmm11, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm10, 16
+        vpclmulqdq	xmm1, xmm7, xmm10, 1
+        vpclmulqdq	xmm0, xmm7, xmm10, 0
+        vpclmulqdq	xmm3, xmm7, xmm10, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vmovdqu	xmm15, OWORD PTR [rsp+96]
+        vmovdqu	xmm7, OWORD PTR [rsp+112]
+        vpclmulqdq	xmm2, xmm15, xmm9, 16
+        vpclmulqdq	xmm1, xmm15, xmm9, 1
+        vpclmulqdq	xmm0, xmm15, xmm9, 0
+        vpclmulqdq	xmm3, xmm15, xmm9, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpclmulqdq	xmm2, xmm7, xmm8, 16
+        vpclmulqdq	xmm1, xmm7, xmm8, 1
+        vpclmulqdq	xmm0, xmm7, xmm8, 0
+        vpclmulqdq	xmm3, xmm7, xmm8, 17
+        vpxor	xmm2, xmm2, xmm1
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm4, xmm4, xmm0
+        vpslldq	xmm7, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vpxor	xmm4, xmm4, xmm7
+        vpxor	xmm6, xmm6, xmm5
+        ; ghash_red
+        vmovdqu	xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpclmulqdq	xmm0, xmm4, xmm2, 16
+        vpshufd	xmm1, xmm4, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpclmulqdq	xmm0, xmm1, xmm2, 16
+        vpshufd	xmm1, xmm1, 78
+        vpxor	xmm1, xmm1, xmm0
+        vpxor	xmm6, xmm6, xmm1
+        vmovdqu	xmm5, OWORD PTR [rsp]
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+L_AES_GCM_encrypt_update_avx2_done_128:
+        cmp	edi, r9d
+        je	L_AES_GCM_encrypt_update_avx2_done_enc
+        mov	r15d, r9d
+        and	r15d, 4294967280
+        cmp	edi, r15d
+        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
+        ; aesenc_block
+        vmovdqu	xmm1, xmm4
+        vpshufb	xmm0, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm0, xmm0, [rax]
+        vmovdqu	xmm2, OWORD PTR [rax+16]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+32]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+48]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+64]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+80]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+96]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+112]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+128]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm2, OWORD PTR [rax+144]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm4, xmm1
+        cmp	r8d, 11
+        vmovdqu	xmm1, OWORD PTR [rax+160]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rax+176]
+        vaesenc	xmm0, xmm0, xmm2
+        cmp	r8d, 13
+        vmovdqu	xmm1, OWORD PTR [rax+192]
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_last
+        vaesenc	xmm0, xmm0, xmm1
+        vmovdqu	xmm2, OWORD PTR [rax+208]
+        vaesenc	xmm0, xmm0, xmm2
+        vmovdqu	xmm1, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_block_last:
+        vaesenclast	xmm0, xmm0, xmm1
+        vmovdqu	xmm1, OWORD PTR [r11+rdi]
+        vpxor	xmm0, xmm0, xmm1
+        vmovdqu	OWORD PTR [r10+rdi], xmm0
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm0
+        add	edi, 16
+        cmp	edi, r15d
+        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
+L_AES_GCM_encrypt_update_avx2_last_block_start:
+        vmovdqu	xmm12, OWORD PTR [r11+rdi]
+        vpshufb	xmm11, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        ; aesenc_gfmul_sb
+        vpclmulqdq	xmm2, xmm6, xmm5, 1
+        vpclmulqdq	xmm3, xmm6, xmm5, 16
+        vpclmulqdq	xmm1, xmm6, xmm5, 0
+        vpclmulqdq	xmm8, xmm6, xmm5, 17
+        vpxor	xmm11, xmm11, [rax]
+        vaesenc	xmm11, xmm11, [rax+16]
+        vpxor	xmm3, xmm3, xmm2
+        vpslldq	xmm2, xmm3, 8
+        vpsrldq	xmm3, xmm3, 8
+        vaesenc	xmm11, xmm11, [rax+32]
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm11, xmm11, [rax+48]
+        vaesenc	xmm11, xmm11, [rax+64]
+        vaesenc	xmm11, xmm11, [rax+80]
+        vpshufd	xmm2, xmm2, 78
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm11, xmm11, [rax+96]
+        vaesenc	xmm11, xmm11, [rax+112]
+        vaesenc	xmm11, xmm11, [rax+128]
+        vpshufd	xmm2, xmm2, 78
+        vaesenc	xmm11, xmm11, [rax+144]
+        vpxor	xmm8, xmm8, xmm3
+        vpxor	xmm2, xmm2, xmm8
+        vmovdqu	xmm0, OWORD PTR [rax+160]
+        cmp	r8d, 11
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm11, xmm11, [rax+176]
+        vmovdqu	xmm0, OWORD PTR [rax+192]
+        cmp	r8d, 13
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm11, xmm11, [rax+208]
+        vmovdqu	xmm0, OWORD PTR [rax+224]
+L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	xmm11, xmm11, xmm0
+        vpxor	xmm6, xmm2, xmm1
+        vpxor	xmm11, xmm11, xmm12
+        vmovdqu	OWORD PTR [r10+rdi], xmm11
+        vpshufb	xmm11, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm6, xmm6, xmm11
+        add	edi, 16
+        cmp	edi, r15d
+        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
+L_AES_GCM_encrypt_update_avx2_last_block_ghash:
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm10, xmm6, xmm5, 16
+        vpclmulqdq	xmm9, xmm6, xmm5, 1
+        vpclmulqdq	xmm8, xmm6, xmm5, 0
+        vpxor	xmm10, xmm10, xmm9
+        vpslldq	xmm9, xmm10, 8
+        vpsrldq	xmm10, xmm10, 8
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm6, xmm6, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm6, xmm6, xmm10
+        vpxor	xmm6, xmm6, xmm9
+        vpxor	xmm6, xmm6, xmm8
+L_AES_GCM_encrypt_update_avx2_last_block_done:
+L_AES_GCM_encrypt_update_avx2_done_enc:
+        vmovdqu	OWORD PTR [r12], xmm6
+        vmovdqu	OWORD PTR [r14], xmm4
+        vzeroupper
+        add	rsp, 152
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r13
+        pop	r12
+        ret
+AES_GCM_encrypt_update_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_encrypt_final_avx2 PROC
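+        ; Produce the authentication tag: fold the AAD/ciphertext bit
+        ; lengths into GHASH, XOR with the stored encrypted initial counter
+        ; block, and write the r8d-byte tag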
+        push	r12
+        push	r13
+        mov	eax, DWORD PTR [rsp+56]
+        mov	r10, QWORD PTR [rsp+64]
+        mov	r11, QWORD PTR [rsp+72]
+        sub	rsp, 16
+        vmovdqu	xmm4, OWORD PTR [rcx]
+        vmovdqu	xmm5, OWORD PTR [r10]
+        vmovdqu	xmm6, OWORD PTR [r11]
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm0
+        ; calc_tag
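+        ;   (lengths are converted from bytes to bits and packed into one
+        ;   128-bit block before the final GHASH multiply)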
+        shl	r9, 3
+        shl	rax, 3
+        vmovq	xmm0, r9
+        vmovq	xmm1, rax
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm4
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm7, xmm0, xmm5, 16
+        vpclmulqdq	xmm3, xmm0, xmm5, 1
+        vpclmulqdq	xmm2, xmm0, xmm5, 0
+        vpxor	xmm7, xmm7, xmm3
+        vpslldq	xmm3, xmm7, 8
+        vpsrldq	xmm7, xmm7, 8
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm0, xmm0, xmm5, 17
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm0, xmm0, xmm7
+        vpxor	xmm0, xmm0, xmm3
+        vpxor	xmm0, xmm0, xmm2
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm6
+        ; store_tag
+        cmp	r8d, 16
+        je	L_AES_GCM_encrypt_final_avx2_store_tag_16
+        xor	r12, r12
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_encrypt_final_avx2_store_tag_loop:
+        movzx	r13d, BYTE PTR [rsp+r12]
+        mov	BYTE PTR [rdx+r12], r13b
+        inc	r12d
+        cmp	r12d, r8d
+        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
+L_AES_GCM_encrypt_final_avx2_store_tag_16:
+        vmovdqu	OWORD PTR [rdx], xmm0
+L_AES_GCM_encrypt_final_avx2_store_tag_done:
+        vzeroupper
+        add	rsp, 16
+        pop	r13
+        pop	r12
+        ret
+AES_GCM_encrypt_final_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt_update_avx2 PROC
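+        ; Bulk decryption: same 8-block structure as
+        ; AES_GCM_encrypt_update_avx2, but GHASH is computed over the
+        ; incoming ciphertext rather than the produced output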
+        push	r13
+        push	r12
+        push	r14
+        push	r15
+        push	rdi
+        mov	rax, rcx
+        mov	r10, r8
+        mov	r8d, edx
+        mov	r11, r9
+        mov	r9d, DWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        mov	r14, QWORD PTR [rsp+96]
+        mov	r15, QWORD PTR [rsp+104]
+        sub	rsp, 168
+        vmovdqu	xmm6, OWORD PTR [r12]
+        vmovdqu	xmm5, OWORD PTR [r14]
+        vmovdqu	xmm4, OWORD PTR [r15]
+        ; Calculate H
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm0
+        xor	edi, edi
+        cmp	r9d, 128
+        mov	r13d, r9d
+        jl	L_AES_GCM_decrypt_update_avx2_done_128
+        and	r13d, 4294967168
+        vmovdqu	OWORD PTR [rsp+128], xmm4
+        vmovdqu	OWORD PTR [rsp+144], xmm15
+        vmovdqu	xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128
+        ; H ^ 1 and H ^ 2
+        vpclmulqdq	xmm9, xmm5, xmm5, 0
+        vpclmulqdq	xmm10, xmm5, xmm5, 17
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpclmulqdq	xmm8, xmm9, xmm3, 16
+        vpshufd	xmm9, xmm9, 78
+        vpxor	xmm9, xmm9, xmm8
+        vpxor	xmm0, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp], xmm5
+        vmovdqu	OWORD PTR [rsp+16], xmm0
+        ; H ^ 3 and H ^ 4
+        vpclmulqdq	xmm11, xmm0, xmm5, 16
+        vpclmulqdq	xmm10, xmm0, xmm5, 1
+        vpclmulqdq	xmm9, xmm0, xmm5, 0
+        vpclmulqdq	xmm12, xmm0, xmm5, 17
+        vpclmulqdq	xmm13, xmm0, xmm0, 0
+        vpclmulqdq	xmm14, xmm0, xmm0, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm2, xmm13, xmm14
+        vpxor	xmm1, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+32], xmm1
+        vmovdqu	OWORD PTR [rsp+48], xmm2
+        ; H ^ 5 and H ^ 6
+        vpclmulqdq	xmm11, xmm1, xmm0, 16
+        vpclmulqdq	xmm10, xmm1, xmm0, 1
+        vpclmulqdq	xmm9, xmm1, xmm0, 0
+        vpclmulqdq	xmm12, xmm1, xmm0, 17
+        vpclmulqdq	xmm13, xmm1, xmm1, 0
+        vpclmulqdq	xmm14, xmm1, xmm1, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+64], xmm7
+        vmovdqu	OWORD PTR [rsp+80], xmm0
+        ; H ^ 7 and H ^ 8
+        vpclmulqdq	xmm11, xmm2, xmm1, 16
+        vpclmulqdq	xmm10, xmm2, xmm1, 1
+        vpclmulqdq	xmm9, xmm2, xmm1, 0
+        vpclmulqdq	xmm12, xmm2, xmm1, 17
+        vpclmulqdq	xmm13, xmm2, xmm2, 0
+        vpclmulqdq	xmm14, xmm2, xmm2, 17
+        vpxor	xmm11, xmm11, xmm10
+        vpslldq	xmm10, xmm11, 8
+        vpsrldq	xmm11, xmm11, 8
+        vpxor	xmm10, xmm10, xmm9
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm10, xmm10, xmm9
+        vpxor	xmm13, xmm13, xmm8
+        vpclmulqdq	xmm9, xmm10, xmm3, 16
+        vpclmulqdq	xmm8, xmm13, xmm3, 16
+        vpshufd	xmm10, xmm10, 78
+        vpshufd	xmm13, xmm13, 78
+        vpxor	xmm12, xmm12, xmm11
+        vpxor	xmm13, xmm13, xmm8
+        vpxor	xmm10, xmm10, xmm12
+        vpxor	xmm0, xmm13, xmm14
+        vpxor	xmm7, xmm10, xmm9
+        vmovdqu	OWORD PTR [rsp+96], xmm7
+        vmovdqu	OWORD PTR [rsp+112], xmm0
+L_AES_GCM_decrypt_update_avx2_ghash_128:
+        ; aesenc_128_ghash
+        lea	rcx, QWORD PTR [r11+rdi]
+        lea	rdx, QWORD PTR [r10+rdi]
+        ; aesenc_ctr
+        vmovdqu	xmm0, OWORD PTR [rsp+128]
+        vmovdqu	xmm1, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpaddd	xmm9, xmm0, OWORD PTR L_avx2_aes_gcm_one
+        vpshufb	xmm8, xmm0, xmm1
+        vpaddd	xmm10, xmm0, OWORD PTR L_avx2_aes_gcm_two
+        vpshufb	xmm9, xmm9, xmm1
+        vpaddd	xmm11, xmm0, OWORD PTR L_avx2_aes_gcm_three
+        vpshufb	xmm10, xmm10, xmm1
+        vpaddd	xmm12, xmm0, OWORD PTR L_avx2_aes_gcm_four
+        vpshufb	xmm11, xmm11, xmm1
+        vpaddd	xmm13, xmm0, OWORD PTR L_avx2_aes_gcm_five
+        vpshufb	xmm12, xmm12, xmm1
+        vpaddd	xmm14, xmm0, OWORD PTR L_avx2_aes_gcm_six
+        vpshufb	xmm13, xmm13, xmm1
+        vpaddd	xmm15, xmm0, OWORD PTR L_avx2_aes_gcm_seven
+        vpshufb	xmm14, xmm14, xmm1
+        vpaddd	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_eight
+        vpshufb	xmm15, xmm15, xmm1
+        ; aesenc_xor
+        vmovdqu	xmm7, OWORD PTR [rax]
+        vmovdqu	OWORD PTR [rsp+128], xmm0
+        vpxor	xmm8, xmm8, xmm7
+        vpxor	xmm9, xmm9, xmm7
+        vpxor	xmm10, xmm10, xmm7
+        vpxor	xmm11, xmm11, xmm7
+        vpxor	xmm12, xmm12, xmm7
+        vpxor	xmm13, xmm13, xmm7
+        vpxor	xmm14, xmm14, xmm7
+        vpxor	xmm15, xmm15, xmm7
+        ; aesenc_pclmul_1
+        vmovdqu	xmm1, OWORD PTR [rcx]
+        vmovdqu	xmm0, OWORD PTR [rax+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vmovdqu	xmm2, OWORD PTR [rsp+112]
+        vpxor	xmm1, xmm1, xmm6
+        vpclmulqdq	xmm5, xmm1, xmm2, 16
+        vpclmulqdq	xmm3, xmm1, xmm2, 1
+        vpclmulqdq	xmm6, xmm1, xmm2, 0
+        vpclmulqdq	xmm7, xmm1, xmm2, 17
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_2
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm0, OWORD PTR [rsp+96]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+32]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+32]
+        vmovdqu	xmm0, OWORD PTR [rsp+80]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+48]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+48]
+        vmovdqu	xmm0, OWORD PTR [rsp+64]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+64]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+64]
+        vmovdqu	xmm0, OWORD PTR [rsp+48]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+80]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm0, OWORD PTR [rsp+32]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+96]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+96]
+        vmovdqu	xmm0, OWORD PTR [rsp+16]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+112]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_n
+        vmovdqu	xmm1, OWORD PTR [rcx+112]
+        vmovdqu	xmm0, OWORD PTR [rsp]
+        vpshufb	xmm1, xmm1, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm5, xmm5, xmm2
+        vpclmulqdq	xmm2, xmm1, xmm0, 16
+        vpxor	xmm5, xmm5, xmm3
+        vpclmulqdq	xmm3, xmm1, xmm0, 1
+        vpxor	xmm6, xmm6, xmm4
+        vpclmulqdq	xmm4, xmm1, xmm0, 0
+        vpclmulqdq	xmm1, xmm1, xmm0, 17
+        vmovdqu	xmm0, OWORD PTR [rax+128]
+        vpxor	xmm7, xmm7, xmm1
+        vaesenc	xmm8, xmm8, xmm0
+        vaesenc	xmm9, xmm9, xmm0
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm11, xmm11, xmm0
+        vaesenc	xmm12, xmm12, xmm0
+        vaesenc	xmm13, xmm13, xmm0
+        vaesenc	xmm14, xmm14, xmm0
+        vaesenc	xmm15, xmm15, xmm0
+        ; aesenc_pclmul_l
+        vpxor	xmm5, xmm5, xmm2
+        vpxor	xmm6, xmm6, xmm4
+        vpxor	xmm5, xmm5, xmm3
+        vpslldq	xmm1, xmm5, 8
+        vpsrldq	xmm5, xmm5, 8
+        vmovdqu	xmm4, OWORD PTR [rax+144]
+        vmovdqu	xmm0, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vaesenc	xmm8, xmm8, xmm4
+        vpxor	xmm6, xmm6, xmm1
+        vpxor	xmm7, xmm7, xmm5
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm9, xmm9, xmm4
+        vaesenc	xmm10, xmm10, xmm4
+        vaesenc	xmm11, xmm11, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpclmulqdq	xmm3, xmm6, xmm0, 16
+        vaesenc	xmm12, xmm12, xmm4
+        vaesenc	xmm13, xmm13, xmm4
+        vaesenc	xmm14, xmm14, xmm4
+        vpshufd	xmm6, xmm6, 78
+        vpxor	xmm6, xmm6, xmm3
+        vpxor	xmm6, xmm6, xmm7
+        vaesenc	xmm15, xmm15, xmm4
+        cmp	r8d, 11
+        vmovdqu	xmm7, OWORD PTR [rax+160]
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+176]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        cmp	r8d, 13
+        vmovdqu	xmm7, OWORD PTR [rax+192]
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+208]
+        vaesenc	xmm8, xmm8, xmm7
+        vaesenc	xmm9, xmm9, xmm7
+        vaesenc	xmm10, xmm10, xmm7
+        vaesenc	xmm11, xmm11, xmm7
+        vaesenc	xmm12, xmm12, xmm7
+        vaesenc	xmm13, xmm13, xmm7
+        vaesenc	xmm14, xmm14, xmm7
+        vaesenc	xmm15, xmm15, xmm7
+        vmovdqu	xmm7, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_avx2_aesenc_128_ghash_avx_done:
+        ; aesenc_last
+        vaesenclast	xmm8, xmm8, xmm7
+        vaesenclast	xmm9, xmm9, xmm7
+        vaesenclast	xmm10, xmm10, xmm7
+        vaesenclast	xmm11, xmm11, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx]
+        vmovdqu	xmm1, OWORD PTR [rcx+16]
+        vmovdqu	xmm2, OWORD PTR [rcx+32]
+        vmovdqu	xmm3, OWORD PTR [rcx+48]
+        vpxor	xmm8, xmm8, xmm0
+        vpxor	xmm9, xmm9, xmm1
+        vpxor	xmm10, xmm10, xmm2
+        vpxor	xmm11, xmm11, xmm3
+        vmovdqu	OWORD PTR [rdx], xmm8
+        vmovdqu	OWORD PTR [rdx+16], xmm9
+        vmovdqu	OWORD PTR [rdx+32], xmm10
+        vmovdqu	OWORD PTR [rdx+48], xmm11
+        vaesenclast	xmm12, xmm12, xmm7
+        vaesenclast	xmm13, xmm13, xmm7
+        vaesenclast	xmm14, xmm14, xmm7
+        vaesenclast	xmm15, xmm15, xmm7
+        vmovdqu	xmm0, OWORD PTR [rcx+64]
+        vmovdqu	xmm1, OWORD PTR [rcx+80]
+        vmovdqu	xmm2, OWORD PTR [rcx+96]
+        vmovdqu	xmm3, OWORD PTR [rcx+112]
+        vpxor	xmm12, xmm12, xmm0
+        vpxor	xmm13, xmm13, xmm1
+        vpxor	xmm14, xmm14, xmm2
+        vpxor	xmm15, xmm15, xmm3
+        vmovdqu	OWORD PTR [rdx+64], xmm12
+        vmovdqu	OWORD PTR [rdx+80], xmm13
+        vmovdqu	OWORD PTR [rdx+96], xmm14
+        vmovdqu	OWORD PTR [rdx+112], xmm15
+        ; aesenc_128_ghash - end
+        add	edi, 128
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_avx2_ghash_128
+        vmovdqu	xmm5, OWORD PTR [rsp]
+        vmovdqu	xmm4, OWORD PTR [rsp+128]
+        vmovdqu	xmm15, OWORD PTR [rsp+144]
+L_AES_GCM_decrypt_update_avx2_done_128:
+        cmp	edi, r9d
+        jge	L_AES_GCM_decrypt_update_avx2_done_dec
+        mov	r13d, r9d
+        and	r13d, 4294967280
+        cmp	edi, r13d
+        jge	L_AES_GCM_decrypt_update_avx2_last_block_done
+L_AES_GCM_decrypt_update_avx2_last_block_start:
+        vmovdqu	xmm11, OWORD PTR [r11+rdi]
+        vpshufb	xmm10, xmm4, OWORD PTR L_avx2_aes_gcm_bswap_epi64
+        vpshufb	xmm12, xmm11, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpaddd	xmm4, xmm4, OWORD PTR L_avx2_aes_gcm_one
+        vpxor	xmm12, xmm12, xmm6
+        ; aesenc_gfmul_sb
+        vpclmulqdq	xmm2, xmm12, xmm5, 1
+        vpclmulqdq	xmm3, xmm12, xmm5, 16
+        vpclmulqdq	xmm1, xmm12, xmm5, 0
+        vpclmulqdq	xmm8, xmm12, xmm5, 17
+        vpxor	xmm10, xmm10, [rax]
+        vaesenc	xmm10, xmm10, [rax+16]
+        vpxor	xmm3, xmm3, xmm2
+        vpslldq	xmm2, xmm3, 8
+        vpsrldq	xmm3, xmm3, 8
+        vaesenc	xmm10, xmm10, [rax+32]
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm10, xmm10, [rax+48]
+        vaesenc	xmm10, xmm10, [rax+64]
+        vaesenc	xmm10, xmm10, [rax+80]
+        vpshufd	xmm2, xmm2, 78
+        vpxor	xmm2, xmm2, xmm1
+        vpclmulqdq	xmm1, xmm2, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vaesenc	xmm10, xmm10, [rax+96]
+        vaesenc	xmm10, xmm10, [rax+112]
+        vaesenc	xmm10, xmm10, [rax+128]
+        vpshufd	xmm2, xmm2, 78
+        vaesenc	xmm10, xmm10, [rax+144]
+        vpxor	xmm8, xmm8, xmm3
+        vpxor	xmm2, xmm2, xmm8
+        vmovdqu	xmm0, OWORD PTR [rax+160]
+        cmp	r8d, 11
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm10, xmm10, [rax+176]
+        vmovdqu	xmm0, OWORD PTR [rax+192]
+        cmp	r8d, 13
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	xmm10, xmm10, xmm0
+        vaesenc	xmm10, xmm10, [rax+208]
+        vmovdqu	xmm0, OWORD PTR [rax+224]
+L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	xmm10, xmm10, xmm0
+        vpxor	xmm6, xmm2, xmm1
+        vpxor	xmm10, xmm10, xmm11
+        vmovdqu	OWORD PTR [r10+rdi], xmm10
+        add	edi, 16
+        cmp	edi, r13d
+        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
+L_AES_GCM_decrypt_update_avx2_last_block_done:
+L_AES_GCM_decrypt_update_avx2_done_dec:
+        vmovdqu	OWORD PTR [r12], xmm6
+        vmovdqu	OWORD PTR [r15], xmm4
+        vzeroupper
+        add	rsp, 168
+        pop	rdi
+        pop	r15
+        pop	r14
+        pop	r12
+        pop	r13
+        ret
+AES_GCM_decrypt_update_avx2 ENDP
+_text ENDS
+_text SEGMENT READONLY PARA
+AES_GCM_decrypt_final_avx2 PROC
+        push	r12
+        push	r13
+        push	r14
+        mov	eax, DWORD PTR [rsp+64]
+        mov	r10, QWORD PTR [rsp+72]
+        mov	r11, QWORD PTR [rsp+80]
+        mov	r12, QWORD PTR [rsp+88]
+        sub	rsp, 16
+        vmovdqu	xmm4, OWORD PTR [rcx]
+        vmovdqu	xmm5, OWORD PTR [r10]
+        vmovdqu	xmm6, OWORD PTR [r11]
+        vpsrlq	xmm1, xmm5, 63
+        vpsllq	xmm0, xmm5, 1
+        vpslldq	xmm1, xmm1, 8
+        vpor	xmm0, xmm0, xmm1
+        vpshufd	xmm5, xmm5, 255
+        vpsrad	xmm5, xmm5, 31
+        vpand	xmm5, xmm5, OWORD PTR L_avx2_aes_gcm_mod2_128
+        vpxor	xmm5, xmm5, xmm0
+        ; calc_tag
+        shl	r9, 3
+        shl	rax, 3
+        vmovq	xmm0, r9
+        vmovq	xmm1, rax
+        vpunpcklqdq	xmm0, xmm0, xmm1
+        vpxor	xmm0, xmm0, xmm4
+        ; ghash_gfmul_red
+        vpclmulqdq	xmm7, xmm0, xmm5, 16
+        vpclmulqdq	xmm3, xmm0, xmm5, 1
+        vpclmulqdq	xmm2, xmm0, xmm5, 0
+        vpxor	xmm7, xmm7, xmm3
+        vpslldq	xmm3, xmm7, 8
+        vpsrldq	xmm7, xmm7, 8
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm0, xmm0, xmm5, 17
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm3, xmm3, xmm2
+        vpclmulqdq	xmm2, xmm3, OWORD PTR L_avx2_aes_gcm_mod2_128, 16
+        vpshufd	xmm3, xmm3, 78
+        vpxor	xmm0, xmm0, xmm7
+        vpxor	xmm0, xmm0, xmm3
+        vpxor	xmm0, xmm0, xmm2
+        vpshufb	xmm0, xmm0, OWORD PTR L_avx2_aes_gcm_bswap_mask
+        vpxor	xmm0, xmm0, xmm6
+        ; cmp_tag
+        cmp	r8d, 16
+        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
+        xor	r13, r13
+        xor	r10, r10
+        vmovdqu	OWORD PTR [rsp], xmm0
+L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
+        movzx	r14d, BYTE PTR [rsp+r13]
+        xor	r14b, BYTE PTR [rdx+r13]
+        or	r10b, r14b
+        inc	r13d
+        cmp	r13d, r8d
+        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
+        cmp	r10, 0
+        sete	r10b
+        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
+L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
+        vmovdqu	xmm1, OWORD PTR [rdx]
+        vpcmpeqb	xmm0, xmm0, xmm1
+        vpmovmskb	r13, xmm0
+        ; if r13d == 0xFFFF then the tags match: return 1, else return 0
+        xor	r10d, r10d
+        cmp	r13d, 65535
+        sete	r10b
+L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
+        mov	DWORD PTR [r12], r10d
+        vzeroupper
+        add	rsp, 16
+        pop	r14
+        pop	r13
+        pop	r12
+        ret
+AES_GCM_decrypt_final_avx2 ENDP
+_text ENDS
+ENDIF
+END
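The cmp_tag section of AES_GCM_decrypt_final_avx2 above checks the authentication tag without data-dependent branching: for tags shorter than 16 bytes it XORs each byte pair and ORs the differences into one accumulator, and for a full 16-byte tag it uses vpcmpeqb/vpmovmskb and tests for the 0xFFFF mask. A minimal C sketch of the byte-wise variant, for orientation only (the function name and types here are illustrative, not part of this change):

    /* Constant-time tag check mirroring the cmp_tag loop: every byte is
     * examined, differences accumulate, and only the accumulator is tested. */
    static int tag_matches(const unsigned char* calc, const unsigned char* given,
                           unsigned int tagSz)
    {
        unsigned char diff = 0;
        unsigned int i;

        for (i = 0; i < tagSz; i++) {
            diff |= (unsigned char)(calc[i] ^ given[i]);
        }
        return (diff == 0);   /* 1 when the tags match, 0 otherwise */
    }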

+ 1 - 0
wolfcrypt/src/include.am

@@ -14,6 +14,7 @@ EXTRA_DIST += wolfcrypt/src/misc.c
 EXTRA_DIST += wolfcrypt/src/evp.c
 EXTRA_DIST += wolfcrypt/src/asm.c
 EXTRA_DIST += wolfcrypt/src/aes_asm.asm
+EXTRA_DIST += wolfcrypt/src/aes_gcm_asm.asm
 EXTRA_DIST += wolfcrypt/src/wc_dsp.c
 EXTRA_DIST += wolfcrypt/src/sp_dsp32.c
 EXTRA_DIST += wolfcrypt/src/sp_x86_64_asm.asm

+ 6 - 1
wolfcrypt/src/sp_int.c

@@ -618,7 +618,12 @@ This library provides single precision (SP) integer math functions.
     while (0)
 /* Index of highest bit set. */
 #define SP_ASM_HI_BIT_SET_IDX(va, vi)                    \
-    vi = _BitScanReverse64(va)
+    do {                                                 \
+        unsigned long idx;                               \
+        _BitScanReverse64(&idx, va);                     \
+        vi = idx;                                        \
+    }                                                    \
+    while (0)
 #endif
 
 #if !defined(WOLFSSL_SP_DIV_WORD_HALF) && (!defined(_MSC_VER) || \
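The rewritten SP_ASM_HI_BIT_SET_IDX macro matches the MSVC prototype of _BitScanReverse64(), which returns the bit index through its pointer argument and returns zero only when no bit is set; the old single-argument call did not fit that signature. A small stand-alone sketch of the intrinsic, mirroring the fixed macro (the helper name is illustrative):

    #include <intrin.h>

    /* Index of the highest set bit, as the fixed macro computes it.
     * _BitScanReverse64() writes the index through 'idx' and returns 0
     * when 'value' is 0, in which case 'idx' is left undefined. */
    static unsigned int hi_bit_set_idx(unsigned __int64 value)
    {
        unsigned long idx = 0;

        if (_BitScanReverse64(&idx, value) == 0) {
            return 0;                 /* value was 0: no bit set */
        }
        return (unsigned int)idx;     /* 0..63 */
    }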

+ 5 - 5
wolfcrypt/src/sp_x86_64_asm.asm

@@ -20598,7 +20598,7 @@ _text SEGMENT READONLY PARA
 sp_2048_lshift_32 PROC
         push	r12
         push	r13
-        mov	rcx, r8
+        mov	cl, r8b
         mov	rax, rcx
         mov	r12, 0
         mov	r13, QWORD PTR [rdx+216]
@@ -40446,7 +40446,7 @@ _text SEGMENT READONLY PARA
 sp_3072_lshift_48 PROC
         push	r12
         push	r13
-        mov	rcx, r8
+        mov	cl, r8b
         mov	rax, rcx
         mov	r12, 0
         mov	r13, QWORD PTR [rdx+344]
@@ -53973,7 +53973,7 @@ _text SEGMENT READONLY PARA
 sp_4096_lshift_64 PROC
         push	r12
         push	r13
-        mov	rcx, r8
+        mov	cl, r8b
         mov	rax, rcx
         mov	r12, 0
         mov	r13, QWORD PTR [rdx+472]
@@ -67878,7 +67878,7 @@ _text SEGMENT READONLY PARA
 sp_521_lshift_9 PROC
         push	r12
         push	r13
-        mov	rcx, r8
+        mov	cl, r8b
         mov	rax, rcx
         mov	r12, 0
         mov	r13, QWORD PTR [rdx+32]
@@ -67925,7 +67925,7 @@ _text SEGMENT READONLY PARA
 sp_521_lshift_18 PROC
         push	r12
         push	r13
-        mov	rcx, r8
+        mov	cl, r8b
         mov	rax, rcx
         mov	r12, 0
         mov	r13, QWORD PTR [rdx+104]

+ 14 - 0
wolfssl.vcxproj

@@ -362,6 +362,20 @@
       <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)%(Filename).obj</Outputs>
       <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
     </CustomBuild>
+    <CustomBuild Include="wolfcrypt\src\aes_gcm_asm.asm">
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">$(IntDir)%(Filename).obj</Outputs>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</ExcludedFromBuild>
+      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">false</ExcludedFromBuild>
+      <Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">ml64.exe /c /Zi /Fo"$(OutDir)%(Filename).obj" %(Identity)</Command>
+      <Command Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">ml64.exe /c /Zi /Fo"$(IntDir)%(Filename).obj" %(Identity)</Command>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(OutDir)%(Filename).obj</Outputs>
+      <Outputs Condition="'$(Configuration)|$(Platform)'=='DLL Release|x64'">$(IntDir)%(Filename).obj</Outputs>
+    </CustomBuild>
     <CustomBuild Include="wolfcrypt\src\sp_x86_64_asm.asm">
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='DLL Debug|x64'">false</ExcludedFromBuild>

Some files were not shown because too many files changed in this diff
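For orientation: the AES_GCM_decrypt_update_avx2 and AES_GCM_decrypt_final_avx2 routines added above back wolfCrypt's streaming AES-GCM decrypt path. A hedged sketch of driving that path from application code, assuming a build with the streaming AES-GCM API enabled; error handling is abbreviated, and dispatch to the AVX2 code is assumed to follow wolfSSL's usual runtime CPU-feature checks:

    #include <wolfssl/wolfcrypt/settings.h>
    #include <wolfssl/wolfcrypt/aes.h>

    /* Streaming AES-GCM decrypt: AAD and ciphertext may be fed in chunks;
     * the tag is verified in the final call. */
    static int gcm_decrypt_stream(const byte* key, word32 keySz,
                                  const byte* iv, word32 ivSz,
                                  const byte* aad, word32 aadSz,
                                  const byte* cipher, word32 cipherSz,
                                  const byte* tag, word32 tagSz,
                                  byte* plain)
    {
        Aes aes;
        int ret = wc_AesInit(&aes, NULL, INVALID_DEVID);

        if (ret == 0)
            ret = wc_AesGcmInit(&aes, key, keySz, iv, ivSz);
        if (ret == 0)   /* feed AAD only */
            ret = wc_AesGcmDecryptUpdate(&aes, NULL, NULL, 0, aad, aadSz);
        if (ret == 0)   /* feed ciphertext; may be called repeatedly */
            ret = wc_AesGcmDecryptUpdate(&aes, plain, cipher, cipherSz, NULL, 0);
        if (ret == 0)   /* verify the authentication tag */
            ret = wc_AesGcmDecryptFinal(&aes, tag, tagSz);

        wc_AesFree(&aes);
        return ret;     /* 0 on success with a matching tag */
    }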