
AES x86 ASM: new assembly

Added new x86 assembly for AES.
AES-CBC decrypt handles only 4 blocks at a time (not 6 or 8) due to the
reduced register count on 32-bit x86.
GCM implementation for AVX2, AVX1 and AESNI only.
Disabled looking for other assembly files on x86.
Sean Parkinson committed 1 year ago (commit 66ce7635b9)
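
For reference, the C prototypes of the new 32-bit entry points, copied from the
comments in the aes_asm.S hunk below (callers in wolfcrypt/src/aes.c declare
them with XASM_LINK). All arguments are passed on the stack (cdecl) and only
XMM0-XMM7 are available, which is why CBC decrypt works on 4 blocks per
iteration rather than 6 or 8.

/* New 32-bit (WOLFSSL_X86_BUILD) AES-NI entry points; prototypes as given in
 * the aes_asm.S comments below. */
void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
                     unsigned char ivec[16], unsigned long length,
                     const unsigned char* KS, int nr);
void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
                         unsigned char ivec[16], unsigned long length,
                         const unsigned char* KS, int nr);
void AES_ECB_encrypt(const unsigned char* in, unsigned char* out,
                     unsigned long length, const unsigned char* KS, int nr);
void AES_ECB_decrypt(const unsigned char* in, unsigned char* out,
                     unsigned long length, const unsigned char* KS, int nr);
void AES_128_Key_Expansion(const unsigned char* userkey,
                           unsigned char* key_schedule);
void AES_192_Key_Expansion(const unsigned char* userkey, unsigned char* key);
void AES_256_Key_Expansion(const unsigned char* userkey, unsigned char* key);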

+ 21 - 10
configure.ac

@@ -576,6 +576,15 @@ then
     ENABLED_FASTMATH="yes"
 fi
 
+if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
+then
+    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
+fi
+if test "$host_cpu" = "x86"
+then
+    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_BUILD"
+fi
+
 # if sp-math-all is not set, then enable fast math
 if test "x$ENABLED_FASTMATH" = "xyes" && test "$enable_sp_math_all" = "" && test "$enable_sp_math" = ""
 then
@@ -594,11 +603,6 @@ then
         ENABLED_HEAPMATH="no"
         ENABLED_SP_MATH_ALL="no"
     fi
-    if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
-    then
-        # Have settings.h set FP_MAX_BITS higher if user didn't set directly
-        AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
-    fi
     AS_IF([test "x$host_cpu" = "xaarch64"],[AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_AARCH64_BUILD"])
 
     if test "$ENABLED_SAKKE" = "yes" && test "$ENABLED_SAKKE_SMALL" != "yes"
@@ -2313,6 +2317,16 @@ then
         AM_CFLAGS="$AM_CFLAGS -DUSE_INTEL_SPEEDUP"
         ENABLED_AESNI=yes
     fi
+
+    if test "$host_cpu" = "x86_64" || test "$host_cpu" = "amd64"
+    then
+        AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_64_BUILD"
+    fi
+    if test "$host_cpu" = "x86"
+    then
+        AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_X86_BUILD"
+        ENABLED_X86_ASM=yes
+    fi
 fi
 
 AC_ARG_ENABLE([aligndata],
@@ -6732,7 +6746,7 @@ if test "$ENABLED_SP_MATH_ALL" = "yes" && test "$ENABLED_ASM" != "no"; then
 
   case $host_cpu in
   *x86_64* | *amd64*)
-    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64 -DWOLFSSL_X86_64_BUILD"
+    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64"
     ;;
   *x86*)
     AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86"
@@ -6817,10 +6831,6 @@ if test "$ENABLED_SP_ASM" = "yes" && test "$ENABLED_SP" = "yes"; then
     AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_SP_X86_64_ASM"
     AM_CCASFLAGS="$AM_CCASFLAGS -DWOLFSSL_SP_X86_64_ASM"
     ENABLED_SP_X86_64_ASM=yes
-    if test "x$ENABLED_FASTMATH" = "xno"
-    then
-      AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_X86_64_BUILD"
-    fi
     ;;
   *)
     AC_MSG_ERROR([ASM not available for CPU. Supported CPUs: x86_64, aarch64, arm])
@@ -8097,6 +8107,7 @@ AM_CONDITIONAL([BUILD_ARMASM_CRYPTO],[test "x$ENABLED_ARMASM_CRYPTO" = "xyes"])
 AM_CONDITIONAL([BUILD_XILINX],[test "x$ENABLED_XILINX" = "xyes"])
 AM_CONDITIONAL([BUILD_AESNI],[test "x$ENABLED_AESNI" = "xyes"])
 AM_CONDITIONAL([BUILD_INTELASM],[test "x$ENABLED_INTELASM" = "xyes"])
+AM_CONDITIONAL([BUILD_X86_ASM],[test "x$ENABLED_X86_ASM" = "xyes"])
 AM_CONDITIONAL([BUILD_AFALG],[test "x$ENABLED_AFALG" = "xyes"])
 AM_CONDITIONAL([BUILD_KCAPI],[test "x$ENABLED_KCAPI" = "xyes"])
 AM_CONDITIONAL([BUILD_DEVCRYPTO],[test "x$ENABLED_DEVCRYPTO" = "xyes"])
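
In short: configure.ac now defines WOLFSSL_X86_64_BUILD for x86_64/amd64 hosts
and WOLFSSL_X86_BUILD for 32-bit x86 hosts unconditionally (previously
WOLFSSL_X86_64_BUILD was only set from within the math configuration blocks),
mirrors them into AM_CCASFLAGS in the AES-NI/intelasm block, and exposes the
32-bit case to Automake as BUILD_X86_ASM. A small, hypothetical compile-time
probe for the resulting macros (not part of the commit):

/* probe_x86_build.c - hypothetical snippet; build it with the AM_CFLAGS that
 * the new configure.ac logic produces to see which macro is in effect. */
#include <stdio.h>

int main(void)
{
#if defined(WOLFSSL_X86_64_BUILD)
    puts("x86_64 build: AES-NI code comes from aes_asm.S and aes_gcm_asm.S");
#elif defined(WOLFSSL_X86_BUILD)
    puts("x86 (32-bit) build: AES-NI code comes from aes_asm.S and aes_gcm_x86_asm.S");
#else
    puts("no x86 build macro defined (non-x86 host)");
#endif
    return 0;
}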

+ 12 - 0
src/include.am

@@ -118,8 +118,12 @@ endif
 
 if BUILD_AESNI
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
+if BUILD_X86_ASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+endif
 
 if BUILD_DES3
 src_libwolfssl_la_SOURCES += wolfcrypt/src/des3.c
@@ -210,8 +214,12 @@ endif
 
 if BUILD_AESNI
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
+if BUILD_X86_ASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
+endif
 
 if BUILD_SHA
 src_libwolfssl_la_SOURCES += wolfcrypt/src/sha.c
@@ -552,9 +560,13 @@ endif
 if !BUILD_FIPS_CURRENT
 if BUILD_AESNI
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_asm.S
+if BUILD_X86_ASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_x86_asm.S
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/aes_gcm_asm.S
 endif
 endif
+endif
 
 if BUILD_CAMELLIA
 src_libwolfssl_la_SOURCES += wolfcrypt/src/camellia.c

+ 5 - 5
wolfcrypt/src/aes.c

@@ -738,7 +738,7 @@ block cipher mechanism that uses n-bit binary string parameter key with 128-bits
                              XASM_LINK("AES_CBC_encrypt");
 
         #ifdef HAVE_AES_DECRYPT
-            #if defined(WOLFSSL_AESNI_BY4)
+            #if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
                 void AES_CBC_decrypt_by4(const unsigned char* in, unsigned char* out,
                                          unsigned char* ivec, unsigned long length,
                                          const unsigned char* KS, int nr)
@@ -4191,7 +4191,7 @@ int wc_AesSetIV(Aes* aes, const byte* iv)
             /* if input and output same will overwrite input iv */
             XMEMCPY(aes->tmp, in + sz - AES_BLOCK_SIZE, AES_BLOCK_SIZE);
             SAVE_VECTOR_REGISTERS(return _svr_ret;);
-            #if defined(WOLFSSL_AESNI_BY4)
+            #if defined(WOLFSSL_AESNI_BY4) || defined(WOLFSSL_X86_BUILD)
             AES_CBC_decrypt_by4(in, out, (byte*)aes->reg, sz, (byte*)aes->key,
                             aes->rounds);
             #elif defined(WOLFSSL_AESNI_BY6)
@@ -7867,7 +7867,7 @@ int wc_AesGcmEncrypt(Aes* aes, byte* out, const byte* in, word32 sz,
     }
     else
     #endif
-    #ifdef HAVE_INTEL_AVX1
+    #if defined(HAVE_INTEL_AVX1)
     if (IS_INTEL_AVX1(intel_flags)) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);
         AES_GCM_encrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -8414,7 +8414,7 @@ int wc_AesGcmDecrypt(Aes* aes, byte* out, const byte* in, word32 sz,
     }
     else
     #endif
-    #ifdef HAVE_INTEL_AVX1
+    #if defined(HAVE_INTEL_AVX1)
     if (IS_INTEL_AVX1(intel_flags)) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);
         AES_GCM_decrypt_avx1(in, out, authIn, iv, authTag, sz, authInSz, ivSz,
@@ -9035,7 +9035,7 @@ static WARN_UNUSED_RESULT int AesGcmEncryptFinal_aesni(
     extern "C" {
 #endif
 
-/* Assembly code implementations in: aes_gcm_asm.S */
+/* Assembly code implementations in: aes_gcm_asm.S and aes_gcm_x86_asm.S */
 #ifdef HAVE_INTEL_AVX2
 extern void AES_GCM_decrypt_update_avx2(const unsigned char* key, int nr,
     unsigned char* out, const unsigned char* in, unsigned int nbytes,

+ 888 - 0
wolfcrypt/src/aes_asm.S

@@ -27,6 +27,7 @@
  * by Intel Mobility Group, Israel Development Center, Israel Shay Gueron
  */
 
+#ifdef WOLFSSL_X86_64_BUILD
 
 /*
 AES_CBC_encrypt (const unsigned char *in,
@@ -1333,6 +1334,893 @@ pxor  %xmm4, %xmm3
 pxor   %xmm2, %xmm3
 ret
 
+#elif defined WOLFSSL_X86_BUILD
+
+/*
+AES_CBC_encrypt (const unsigned char *in,
+	unsigned char *out,
+	unsigned char ivec[16],
+	unsigned long length,
+	const unsigned char *KS,
+	int nr)
+*/
+#ifndef __APPLE__
+.globl AES_CBC_encrypt
+AES_CBC_encrypt:
+#else
+.globl _AES_CBC_encrypt
+_AES_CBC_encrypt:
+#endif
+        # parameter 1: stack[4] => %edi
+        # parameter 2: stack[8] => %esi
+        # parameter 3: stack[12] => %edx
+        # parameter 4: stack[16] => %ecx
+        # parameter 5: stack[20] => %eax
+        # parameter 6: stack[24] => %ebx
+        push	%edi
+        push	%esi
+        push	%ebx
+        push	%ebp
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        movl	32(%esp), %ecx
+        movl	36(%esp), %eax
+        movl	40(%esp), %ebx
+
+        movl	%ecx, %ebp
+        shrl	$4, %ecx
+        shll	$60, %ebp
+        je	NO_PARTS
+        addl	$1, %ecx
+        NO_PARTS:
+        subl	$16, %esi
+        movdqa	(%edx), %xmm1
+        LOOP:
+        pxor	(%edi), %xmm1
+        pxor	(%eax), %xmm1
+        addl	$16,%esi
+        addl	$16,%edi
+        cmpl	$12, %ebx
+        aesenc	16(%eax),%xmm1
+        aesenc	32(%eax),%xmm1
+        aesenc	48(%eax),%xmm1
+        aesenc	64(%eax),%xmm1
+        aesenc	80(%eax),%xmm1
+        aesenc	96(%eax),%xmm1
+        aesenc	112(%eax),%xmm1
+        aesenc	128(%eax),%xmm1
+        aesenc	144(%eax),%xmm1
+        movdqa	160(%eax),%xmm2
+        jb	LAST
+        cmpl	$14, %ebx
+
+        aesenc	160(%eax),%xmm1
+        aesenc	176(%eax),%xmm1
+        movdqa	192(%eax),%xmm2
+        jb	LAST
+        aesenc	192(%eax),%xmm1
+        aesenc	208(%eax),%xmm1
+        movdqa	224(%eax),%xmm2
+        LAST:
+        decl	%ecx
+        aesenclast %xmm2,%xmm1
+        movdqu	%xmm1,(%esi)
+        jne	LOOP
+
+        pop	%ebp
+        pop	%ebx
+        pop	%esi
+        pop	%edi
+        ret
+
+
+/*
+AES_CBC_decrypt_by4 (const unsigned char *in,
+  unsigned char *out,
+  unsigned char ivec[16],
+  unsigned long length,
+  const unsigned char *KS,
+  int nr)
+*/
+#ifndef __APPLE__
+.globl AES_CBC_decrypt_by4
+AES_CBC_decrypt_by4:
+#else
+.globl _AES_CBC_decrypt_by4
+_AES_CBC_decrypt_by4:
+#endif
+# parameter 1: stack[4] => %edi
+# parameter 2: stack[8] => %esi
+# parameter 3: stack[12] => %edx
+# parameter 4: stack[16] => %ecx
+# parameter 5: stack[20] => %eax
+# parameter 6: stack[24] => %ebx
+        push	%edi
+        push	%esi
+        push	%ebx
+        push	%ebp
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        movl	32(%esp), %ecx
+        movl	36(%esp), %eax
+        movl	40(%esp), %ebx
+        subl	$16, %esp
+
+        movdqu      (%edx), %xmm0
+        movl        %ecx, %ebp
+        shrl        $4, %ecx
+        shll        $60, %ebp
+        movdqu      %xmm0, (%esp)
+        je          DNO_PARTS_4
+        addl        $1, %ecx
+DNO_PARTS_4:
+        movl        %ecx, %ebp
+        shll        $62, %ebp
+        shrl        $62, %ebp
+        shrl        $2, %ecx
+        je          DREMAINDER_4
+        subl        $64, %esi
+DLOOP_4:
+        movdqu      (%edi), %xmm1
+        movdqu      16(%edi), %xmm2
+        movdqu      32(%edi), %xmm3
+        movdqu      48(%edi), %xmm4
+        movdqa      (%eax), %xmm5
+        movdqa      16(%eax), %xmm6
+        movdqa      32(%eax), %xmm7
+        movdqa      48(%eax), %xmm0
+        pxor        %xmm5, %xmm1
+        pxor        %xmm5, %xmm2
+        pxor        %xmm5, %xmm3
+        pxor        %xmm5, %xmm4
+        aesdec      %xmm6, %xmm1
+        aesdec      %xmm6, %xmm2
+        aesdec      %xmm6, %xmm3
+        aesdec      %xmm6, %xmm4
+        aesdec      %xmm7, %xmm1
+        aesdec      %xmm7, %xmm2
+        aesdec      %xmm7, %xmm3
+        aesdec      %xmm7, %xmm4
+        aesdec      %xmm0, %xmm1
+        aesdec      %xmm0, %xmm2
+        aesdec      %xmm0, %xmm3
+        aesdec      %xmm0, %xmm4
+        movdqa      64(%eax), %xmm5
+        movdqa      80(%eax), %xmm6
+        movdqa      96(%eax), %xmm7
+        movdqa      112(%eax), %xmm0
+        aesdec      %xmm5, %xmm1
+        aesdec      %xmm5, %xmm2
+        aesdec      %xmm5, %xmm3
+        aesdec      %xmm5, %xmm4
+        aesdec      %xmm6, %xmm1
+        aesdec      %xmm6, %xmm2
+        aesdec      %xmm6, %xmm3
+        aesdec      %xmm6, %xmm4
+        aesdec      %xmm7, %xmm1
+        aesdec      %xmm7, %xmm2
+        aesdec      %xmm7, %xmm3
+        aesdec      %xmm7, %xmm4
+        aesdec      %xmm0, %xmm1
+        aesdec      %xmm0, %xmm2
+        aesdec      %xmm0, %xmm3
+        aesdec      %xmm0, %xmm4
+        movdqa      128(%eax), %xmm5
+        movdqa      144(%eax), %xmm6
+        movdqa      160(%eax), %xmm7
+        cmpl        $12, %ebx
+        aesdec      %xmm5, %xmm1
+        aesdec      %xmm5, %xmm2
+        aesdec      %xmm5, %xmm3
+        aesdec      %xmm5, %xmm4
+        aesdec      %xmm6, %xmm1
+        aesdec      %xmm6, %xmm2
+        aesdec      %xmm6, %xmm3
+        aesdec      %xmm6, %xmm4
+        jb          DLAST_4
+        movdqa      160(%eax), %xmm5
+        movdqa      176(%eax), %xmm6
+        movdqa      192(%eax), %xmm7
+        cmpl        $14, %ebx
+        aesdec      %xmm5, %xmm1
+        aesdec      %xmm5, %xmm2
+        aesdec      %xmm5, %xmm3
+        aesdec      %xmm5, %xmm4
+        aesdec      %xmm6, %xmm1
+        aesdec      %xmm6, %xmm2
+        aesdec      %xmm6, %xmm3
+        aesdec      %xmm6, %xmm4
+        jb          DLAST_4
+        movdqa      192(%eax), %xmm5
+        movdqa      208(%eax), %xmm6
+        movdqa      224(%eax), %xmm7
+        aesdec      %xmm5, %xmm1
+        aesdec      %xmm5, %xmm2
+        aesdec      %xmm5, %xmm3
+        aesdec      %xmm5, %xmm4
+        aesdec      %xmm6, %xmm1
+        aesdec      %xmm6, %xmm2
+        aesdec      %xmm6, %xmm3
+        aesdec      %xmm6, %xmm4
+DLAST_4:
+        addl        $64, %esi
+        aesdeclast  %xmm7, %xmm1
+        aesdeclast  %xmm7, %xmm2
+        aesdeclast  %xmm7, %xmm3
+        aesdeclast  %xmm7, %xmm4
+        movdqu      (%esp), %xmm0
+        movdqu      (%edi), %xmm5
+        movdqu      16(%edi), %xmm6
+        movdqu      32(%edi), %xmm7
+        pxor        %xmm0, %xmm1
+        pxor        %xmm5, %xmm2
+        pxor        %xmm6, %xmm3
+        pxor        %xmm7, %xmm4
+        movdqu      48(%edi), %xmm0
+        movdqu      %xmm1, (%esi)
+        movdqu      %xmm2, 16(%esi)
+        movdqu      %xmm3, 32(%esi)
+        movdqu      %xmm4, 48(%esi)
+        movdqu      %xmm0, (%esp)
+        addl        $64, %edi
+        decl        %ecx
+        jne         DLOOP_4
+        addl        $64, %esi
+DREMAINDER_4:
+        cmpl        $0, %ebp
+        je          DEND_4
+DLOOP_4_2:
+        movdqu      (%edi), %xmm1
+        movdqa      %xmm1, %xmm5
+        addl        $16, %edi
+        pxor        (%eax), %xmm1
+        movdqu      160(%eax), %xmm2
+        cmpl        $12, %ebx
+        aesdec      16(%eax), %xmm1
+        aesdec      32(%eax), %xmm1
+        aesdec      48(%eax), %xmm1
+        aesdec      64(%eax), %xmm1
+        aesdec      80(%eax), %xmm1
+        aesdec      96(%eax), %xmm1
+        aesdec      112(%eax), %xmm1
+        aesdec      128(%eax), %xmm1
+        aesdec      144(%eax), %xmm1
+        jb          DLAST_4_2
+        movdqu      192(%eax), %xmm2
+        cmpl        $14, %ebx
+        aesdec      160(%eax), %xmm1
+        aesdec      176(%eax), %xmm1
+        jb          DLAST_4_2
+        movdqu      224(%eax), %xmm2
+        aesdec      192(%eax), %xmm1
+        aesdec      208(%eax), %xmm1
+DLAST_4_2:
+        aesdeclast  %xmm2, %xmm1
+        pxor        %xmm0, %xmm1
+        movdqa      %xmm5, %xmm0
+        movdqu      %xmm1, (%esi)
+        addl        $16, %esi
+        decl        %ebp
+        jne         DLOOP_4_2
+DEND_4:
+
+        addl	$16, %esp
+        pop	%ebp
+        pop	%ebx
+        pop	%esi
+        pop	%edi
+        ret
+
+/*
+AES_ECB_encrypt (const unsigned char *in,
+	unsigned char *out,
+	unsigned long length,
+	const unsigned char *KS,
+	int nr)
+*/
+#ifndef __APPLE__
+.globl AES_ECB_encrypt
+AES_ECB_encrypt:
+#else
+.globl _AES_ECB_encrypt
+_AES_ECB_encrypt:
+#endif
+# parameter 1: stack[4] => %edi
+# parameter 2: stack[8] => %esi
+# parameter 3: stack[12] => %edx
+# parameter 4: stack[16] => %ecx
+# parameter 5: stack[20] => %eax
+        push	%edi
+        push	%esi
+        push	%ebx
+        movl	16(%esp), %edi
+        movl	20(%esp), %esi
+        movl	24(%esp), %edx
+        movl	28(%esp), %ecx
+        movl	32(%esp), %eax
+
+        movl    %edx, %ebx
+        shrl    $4, %edx
+        shll    $60, %ebx
+        je      EECB_NO_PARTS_4
+        addl    $1, %edx
+EECB_NO_PARTS_4:
+        movl    %edx, %ebx
+        shll    $62, %ebx
+        shrl    $62, %ebx
+        shrl    $2, %edx
+        je      EECB_REMAINDER_4
+        subl    $64, %esi
+EECB_LOOP_4:
+        movdqu  (%edi), %xmm1
+        movdqu  16(%edi), %xmm2
+        movdqu  32(%edi), %xmm3
+        movdqu  48(%edi), %xmm4
+        movdqa  (%ecx), %xmm5
+        movdqa  16(%ecx), %xmm6
+        movdqa  32(%ecx), %xmm7
+        movdqa  48(%ecx), %xmm0
+        pxor    %xmm5, %xmm1
+        pxor    %xmm5, %xmm2
+        pxor    %xmm5, %xmm3
+        pxor    %xmm5, %xmm4
+        aesenc  %xmm6, %xmm1
+        aesenc  %xmm6, %xmm2
+        aesenc  %xmm6, %xmm3
+        aesenc  %xmm6, %xmm4
+        aesenc  %xmm7, %xmm1
+        aesenc  %xmm7, %xmm2
+        aesenc  %xmm7, %xmm3
+        aesenc  %xmm7, %xmm4
+        aesenc  %xmm0, %xmm1
+        aesenc  %xmm0, %xmm2
+        aesenc  %xmm0, %xmm3
+        aesenc  %xmm0, %xmm4
+        movdqa  64(%ecx), %xmm5
+        movdqa  80(%ecx), %xmm6
+        movdqa  96(%ecx), %xmm7
+        movdqa  112(%ecx), %xmm0
+        aesenc  %xmm5, %xmm1
+        aesenc  %xmm5, %xmm2
+        aesenc  %xmm5, %xmm3
+        aesenc  %xmm5, %xmm4
+        aesenc  %xmm6, %xmm1
+        aesenc  %xmm6, %xmm2
+        aesenc  %xmm6, %xmm3
+        aesenc  %xmm6, %xmm4
+        aesenc  %xmm7, %xmm1
+        aesenc  %xmm7, %xmm2
+        aesenc  %xmm7, %xmm3
+        aesenc  %xmm7, %xmm4
+        aesenc  %xmm0, %xmm1
+        aesenc  %xmm0, %xmm2
+        aesenc  %xmm0, %xmm3
+        aesenc  %xmm0, %xmm4
+        movdqa  128(%ecx), %xmm5
+        movdqa  144(%ecx), %xmm6
+        movdqa  160(%ecx), %xmm7
+        cmpl    $12, %eax
+        aesenc  %xmm5, %xmm1
+        aesenc  %xmm5, %xmm2
+        aesenc  %xmm5, %xmm3
+        aesenc  %xmm5, %xmm4
+        aesenc  %xmm6, %xmm1
+        aesenc  %xmm6, %xmm2
+        aesenc  %xmm6, %xmm3
+        aesenc  %xmm6, %xmm4
+        jb      EECB_LAST_4
+        movdqa  160(%ecx), %xmm5
+        movdqa  176(%ecx), %xmm6
+        movdqa  192(%ecx), %xmm7
+        cmpl    $14, %eax
+        aesenc  %xmm5, %xmm1
+        aesenc  %xmm5, %xmm2
+        aesenc  %xmm5, %xmm3
+        aesenc  %xmm5, %xmm4
+        aesenc  %xmm6, %xmm1
+        aesenc  %xmm6, %xmm2
+        aesenc  %xmm6, %xmm3
+        aesenc  %xmm6, %xmm4
+        jb      EECB_LAST_4
+        movdqa  192(%ecx), %xmm5
+        movdqa  208(%ecx), %xmm6
+        movdqa  224(%ecx), %xmm7
+        aesenc  %xmm5, %xmm1
+        aesenc  %xmm5, %xmm2
+        aesenc  %xmm5, %xmm3
+        aesenc  %xmm5, %xmm4
+        aesenc  %xmm6, %xmm1
+        aesenc  %xmm6, %xmm2
+        aesenc  %xmm6, %xmm3
+        aesenc  %xmm6, %xmm4
+EECB_LAST_4:
+        addl    $64, %edi
+        addl    $64, %esi
+        decl    %edx
+        aesenclast %xmm7, %xmm1
+        aesenclast %xmm7, %xmm2
+        aesenclast %xmm7, %xmm3
+        aesenclast %xmm7, %xmm4
+        movdqu  %xmm1, (%esi)
+        movdqu  %xmm2, 16(%esi)
+        movdqu  %xmm3, 32(%esi)
+        movdqu  %xmm4, 48(%esi)
+        jne     EECB_LOOP_4
+        addl    $64, %esi
+EECB_REMAINDER_4:
+        cmpl    $0, %ebx
+        je      EECB_END_4
+EECB_LOOP_4_2:
+        movdqu  (%edi), %xmm1
+        addl    $16, %edi
+        pxor    (%ecx), %xmm1
+        movdqu  160(%ecx), %xmm2
+        aesenc  16(%ecx), %xmm1
+        aesenc  32(%ecx), %xmm1
+        aesenc  48(%ecx), %xmm1
+        aesenc  64(%ecx), %xmm1
+        aesenc  80(%ecx), %xmm1
+        aesenc  96(%ecx), %xmm1
+        aesenc  112(%ecx), %xmm1
+        aesenc  128(%ecx), %xmm1
+        aesenc  144(%ecx), %xmm1
+        cmpl    $12, %eax
+        jb      EECB_LAST_4_2
+        movdqu  192(%ecx), %xmm2
+        aesenc  160(%ecx), %xmm1
+        aesenc  176(%ecx), %xmm1
+        cmpl    $14, %eax
+        jb      EECB_LAST_4_2
+        movdqu  224(%ecx), %xmm2
+        aesenc  192(%ecx), %xmm1
+        aesenc  208(%ecx), %xmm1
+EECB_LAST_4_2:
+        aesenclast %xmm2, %xmm1
+        movdqu  %xmm1, (%esi)
+        addl    $16, %esi
+        decl    %ebx
+        jne     EECB_LOOP_4_2
+EECB_END_4:
+
+        pop	%ebx
+        pop	%esi
+        pop	%edi
+        ret
+
+
+/*
+AES_ECB_decrypt (const unsigned char *in,
+  unsigned char *out,
+  unsigned long length,
+  const unsigned char *KS,
+  int nr)
+*/
+#ifndef __APPLE__
+.globl AES_ECB_decrypt
+AES_ECB_decrypt:
+#else
+.globl _AES_ECB_decrypt
+_AES_ECB_decrypt:
+#endif
+# parameter 1: stack[4] => %edi
+# parameter 2: stack[8] => %esi
+# parameter 3: stack[12] => %edx
+# parameter 4: stack[16] => %ecx
+# parameter 5: stack[20] => %eax
+        push	%edi
+        push	%esi
+        push	%ebx
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        movl	32(%esp), %ecx
+        movl	36(%esp), %eax
+
+
+        movl    %edx, %ebx
+        shrl    $4, %edx
+        shll    $60, %ebx
+        je      DECB_NO_PARTS_4
+        addl    $1, %edx
+DECB_NO_PARTS_4:
+        movl    %edx, %ebx
+        shll    $62, %ebx
+        shrl    $62, %ebx
+        shrl    $2, %edx
+        je      DECB_REMAINDER_4
+        subl    $64, %esi
+DECB_LOOP_4:
+        movdqu  (%edi), %xmm1
+        movdqu  16(%edi), %xmm2
+        movdqu  32(%edi), %xmm3
+        movdqu  48(%edi), %xmm4
+        movdqa  (%ecx), %xmm5
+        movdqa  16(%ecx), %xmm6
+        movdqa  32(%ecx), %xmm7
+        movdqa  48(%ecx), %xmm0
+        pxor    %xmm5, %xmm1
+        pxor    %xmm5, %xmm2
+        pxor    %xmm5, %xmm3
+        pxor    %xmm5, %xmm4
+        aesdec  %xmm6, %xmm1
+        aesdec  %xmm6, %xmm2
+        aesdec  %xmm6, %xmm3
+        aesdec  %xmm6, %xmm4
+        aesdec  %xmm7, %xmm1
+        aesdec  %xmm7, %xmm2
+        aesdec  %xmm7, %xmm3
+        aesdec  %xmm7, %xmm4
+        aesdec  %xmm0, %xmm1
+        aesdec  %xmm0, %xmm2
+        aesdec  %xmm0, %xmm3
+        aesdec  %xmm0, %xmm4
+        movdqa  64(%ecx), %xmm5
+        movdqa  80(%ecx), %xmm6
+        movdqa  96(%ecx), %xmm7
+        movdqa  112(%ecx), %xmm0
+        aesdec  %xmm5, %xmm1
+        aesdec  %xmm5, %xmm2
+        aesdec  %xmm5, %xmm3
+        aesdec  %xmm5, %xmm4
+        aesdec  %xmm6, %xmm1
+        aesdec  %xmm6, %xmm2
+        aesdec  %xmm6, %xmm3
+        aesdec  %xmm6, %xmm4
+        aesdec  %xmm7, %xmm1
+        aesdec  %xmm7, %xmm2
+        aesdec  %xmm7, %xmm3
+        aesdec  %xmm7, %xmm4
+        aesdec  %xmm0, %xmm1
+        aesdec  %xmm0, %xmm2
+        aesdec  %xmm0, %xmm3
+        aesdec  %xmm0, %xmm4
+        movdqa  128(%ecx), %xmm5
+        movdqa  144(%ecx), %xmm6
+        movdqa  160(%ecx), %xmm7
+        cmpl    $12, %eax
+        aesdec  %xmm5, %xmm1
+        aesdec  %xmm5, %xmm2
+        aesdec  %xmm5, %xmm3
+        aesdec  %xmm5, %xmm4
+        aesdec  %xmm6, %xmm1
+        aesdec  %xmm6, %xmm2
+        aesdec  %xmm6, %xmm3
+        aesdec  %xmm6, %xmm4
+        jb      DECB_LAST_4
+        movdqa  160(%ecx), %xmm5
+        movdqa  176(%ecx), %xmm6
+        movdqa  192(%ecx), %xmm7
+        cmpl    $14, %eax
+        aesdec  %xmm5, %xmm1
+        aesdec  %xmm5, %xmm2
+        aesdec  %xmm5, %xmm3
+        aesdec  %xmm5, %xmm4
+        aesdec  %xmm6, %xmm1
+        aesdec  %xmm6, %xmm2
+        aesdec  %xmm6, %xmm3
+        aesdec  %xmm6, %xmm4
+        jb      DECB_LAST_4
+        movdqa  192(%ecx), %xmm5
+        movdqa  208(%ecx), %xmm6
+        movdqa  224(%ecx), %xmm7
+        aesdec  %xmm5, %xmm1
+        aesdec  %xmm5, %xmm2
+        aesdec  %xmm5, %xmm3
+        aesdec  %xmm5, %xmm4
+        aesdec  %xmm6, %xmm1
+        aesdec  %xmm6, %xmm2
+        aesdec  %xmm6, %xmm3
+        aesdec  %xmm6, %xmm4
+DECB_LAST_4:
+        addl    $64, %edi
+        addl    $64, %esi
+        decl    %edx
+        aesdeclast %xmm7, %xmm1
+        aesdeclast %xmm7, %xmm2
+        aesdeclast %xmm7, %xmm3
+        aesdeclast %xmm7, %xmm4
+        movdqu  %xmm1, (%esi)
+        movdqu  %xmm2, 16(%esi)
+        movdqu  %xmm3, 32(%esi)
+        movdqu  %xmm4, 48(%esi)
+        jne     DECB_LOOP_4
+        addl    $64, %esi
+DECB_REMAINDER_4:
+        cmpl    $0, %ebx
+        je      DECB_END_4
+DECB_LOOP_4_2:
+        movdqu  (%edi), %xmm1
+        addl    $16, %edi
+        pxor    (%ecx), %xmm1
+        movdqu  160(%ecx), %xmm2
+        cmpl    $12, %eax
+        aesdec  16(%ecx), %xmm1
+        aesdec  32(%ecx), %xmm1
+        aesdec  48(%ecx), %xmm1
+        aesdec  64(%ecx), %xmm1
+        aesdec  80(%ecx), %xmm1
+        aesdec  96(%ecx), %xmm1
+        aesdec  112(%ecx), %xmm1
+        aesdec  128(%ecx), %xmm1
+        aesdec  144(%ecx), %xmm1
+        jb      DECB_LAST_4_2
+        cmpl    $14, %eax
+        movdqu  192(%ecx), %xmm2
+        aesdec  160(%ecx), %xmm1
+        aesdec  176(%ecx), %xmm1
+        jb      DECB_LAST_4_2
+        movdqu  224(%ecx), %xmm2
+        aesdec  192(%ecx), %xmm1
+        aesdec  208(%ecx), %xmm1
+DECB_LAST_4_2:
+        aesdeclast %xmm2, %xmm1
+        movdqu  %xmm1, (%esi)
+        addl    $16, %esi
+        decl    %ebx
+        jne     DECB_LOOP_4_2
+DECB_END_4:
+        pop	%ebx
+        pop	%esi
+        pop	%edi
+        ret
+
+
+
+/*
+void AES_128_Key_Expansion(const unsigned char* userkey,
+   unsigned char* key_schedule);
+*/
+.align  16,0x90
+#ifndef __APPLE__
+.globl AES_128_Key_Expansion
+AES_128_Key_Expansion:
+#else
+.globl _AES_128_Key_Expansion
+_AES_128_Key_Expansion:
+#endif
+        # parameter 1: stack[4] => %eax
+        # parameter 2: stack[8] => %edx
+        movl	4(%esp), %eax
+        movl	8(%esp), %edx
+
+        movl    $10, 240(%edx)
+
+        movdqu  (%eax), %xmm1
+        movdqa    %xmm1, (%edx)
+
+
+ASSISTS:
+        aeskeygenassist $1, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 16(%edx)
+        aeskeygenassist $2, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 32(%edx)
+        aeskeygenassist $4, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 48(%edx)
+        aeskeygenassist $8, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 64(%edx)
+        aeskeygenassist $16, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 80(%edx)
+        aeskeygenassist $32, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 96(%edx)
+        aeskeygenassist $64, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 112(%edx)
+        aeskeygenassist $0x80, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 128(%edx)
+        aeskeygenassist $0x1b, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 144(%edx)
+        aeskeygenassist $0x36, %xmm1, %xmm2
+        call PREPARE_ROUNDKEY_128
+        movdqa %xmm1, 160(%edx)
+        ret
+
+PREPARE_ROUNDKEY_128:
+        pshufd $255, %xmm2, %xmm2
+        movdqa %xmm1, %xmm3
+        pslldq $4, %xmm3
+        pxor %xmm3, %xmm1
+        pslldq $4, %xmm3
+        pxor %xmm3, %xmm1
+        pslldq $4, %xmm3
+        pxor %xmm3, %xmm1
+        pxor %xmm2, %xmm1
+        ret
+
+
+/*
+void AES_192_Key_Expansion (const unsigned char *userkey,
+  unsigned char *key)
+*/
+#ifndef __APPLE__
+.globl AES_192_Key_Expansion
+AES_192_Key_Expansion:
+#else
+.globl _AES_192_Key_Expansion
+_AES_192_Key_Expansion:
+#endif
+        # parameter 1: stack[4] => %eax
+        # parameter 2: stack[8] => %edx
+        movl	4(%esp), %eax
+        movl	8(%esp), %edx
+
+        movdqu (%eax), %xmm1
+        movq 16(%eax), %xmm3
+        movdqa %xmm1, (%edx)
+        movdqa %xmm3, %xmm5
+
+        aeskeygenassist $0x1, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        shufpd $0, %xmm1, %xmm5
+        movdqa %xmm5, 16(%edx)
+        movdqa %xmm1, %xmm6
+        shufpd $1, %xmm3, %xmm6
+        movdqa %xmm6, 32(%edx)
+
+        aeskeygenassist $0x2, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        movdqa %xmm1, 48(%edx)
+        movdqa %xmm3, %xmm5
+
+        aeskeygenassist $0x4, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        shufpd $0, %xmm1, %xmm5
+        movdqa %xmm5, 64(%edx)
+        movdqa %xmm1, %xmm6
+        shufpd $1, %xmm3, %xmm6
+        movdqa %xmm6, 80(%edx)
+
+        aeskeygenassist $0x8, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        movdqa %xmm1, 96(%edx)
+        movdqa %xmm3, %xmm5
+
+        aeskeygenassist $0x10, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        shufpd $0, %xmm1, %xmm5
+        movdqa %xmm5, 112(%edx)
+        movdqa %xmm1, %xmm6
+        shufpd $1, %xmm3, %xmm6
+        movdqa %xmm6, 128(%edx)
+
+        aeskeygenassist $0x20, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        movdqa %xmm1, 144(%edx)
+        movdqa %xmm3, %xmm5
+
+        aeskeygenassist $0x40, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        shufpd $0, %xmm1, %xmm5
+        movdqa %xmm5, 160(%edx)
+        movdqa %xmm1, %xmm6
+        shufpd $1, %xmm3, %xmm6
+        movdqa %xmm6, 176(%edx)
+
+        aeskeygenassist $0x80, %xmm3, %xmm2
+        call PREPARE_ROUNDKEY_192
+        movdqa %xmm1, 192(%edx)
+        movdqa %xmm3, 208(%edx)
+        ret
+
+PREPARE_ROUNDKEY_192:
+        pshufd $0x55, %xmm2, %xmm2
+        movdqu %xmm1, %xmm4
+        pslldq $4, %xmm4
+        pxor   %xmm4, %xmm1
+
+        pslldq $4, %xmm4
+        pxor   %xmm4, %xmm1
+        pslldq $4, %xmm4
+        pxor  %xmm4, %xmm1
+        pxor   %xmm2, %xmm1
+        pshufd $0xff, %xmm1, %xmm2
+        movdqu %xmm3, %xmm4
+        pslldq $4, %xmm4
+        pxor   %xmm4, %xmm3
+        pxor   %xmm2, %xmm3
+        ret
+
+
+/*
+void AES_256_Key_Expansion (const unsigned char *userkey,
+  unsigned char *key)
+*/
+#ifndef __APPLE__
+.globl AES_256_Key_Expansion
+AES_256_Key_Expansion:
+#else
+.globl _AES_256_Key_Expansion
+_AES_256_Key_Expansion:
+#endif
+        # parameter 1: stack[4] => %eax
+        # parameter 2: stack[8] => %edx
+        movl	4(%esp), %eax
+        movl	8(%esp), %edx
+
+        movdqu (%eax), %xmm1
+        movdqu 16(%eax), %xmm3
+        movdqa %xmm1, (%edx)
+        movdqa %xmm3, 16(%edx)
+
+        aeskeygenassist $0x1, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 32(%edx)
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 48(%edx)
+        aeskeygenassist $0x2, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 64(%edx)
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 80(%edx)
+        aeskeygenassist $0x4, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 96(%edx)
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 112(%edx)
+        aeskeygenassist $0x8, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 128(%edx)
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 144(%edx)
+        aeskeygenassist $0x10, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 160(%edx)
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 176(%edx)
+        aeskeygenassist $0x20, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 192(%edx)
+
+        aeskeygenassist $0x0, %xmm1, %xmm2
+        call MAKE_RK256_b
+        movdqa %xmm3, 208(%edx)
+        aeskeygenassist $0x40, %xmm3, %xmm2
+        call MAKE_RK256_a
+        movdqa %xmm1, 224(%edx)
+
+        ret
+
+MAKE_RK256_a:
+        pshufd $0xff, %xmm2, %xmm2
+        movdqa %xmm1, %xmm4
+        pslldq $4, %xmm4
+        pxor   %xmm4, %xmm1
+        pslldq $4, %xmm4
+        pxor  %xmm4, %xmm1
+        pslldq $4, %xmm4
+        pxor  %xmm4, %xmm1
+        pxor   %xmm2, %xmm1
+        ret
+
+MAKE_RK256_b:
+        pshufd $0xaa, %xmm2, %xmm2
+        movdqa %xmm3, %xmm4
+        pslldq $4, %xmm4
+        pxor   %xmm4, %xmm3
+        pslldq $4, %xmm4
+        pxor  %xmm4, %xmm3
+        pslldq $4, %xmm4
+        pxor  %xmm4, %xmm3
+        pxor   %xmm2, %xmm3
+        ret
+
+#endif /* WOLFSSL_X86_64_BUILD */
+
 #if defined(__linux__) && defined(__ELF__)
     .section .note.GNU-stack,"",%progbits
 #endif
+
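
A hedged usage sketch for the 32-bit routines above (in wolfSSL these are only
reached through wc_AesSetKey/wc_AesCbcEncrypt in wolfcrypt/src/aes.c; direct
calls are shown here purely to illustrate the calling convention and the
alignment requirement):

/* Illustrative only: expand an AES-128 key and CBC-encrypt a buffer with the
 * new 32-bit AES-NI routines.  The key schedule must be 16-byte aligned
 * because the assembly stores and loads round keys with aligned SSE
 * instructions (movdqa). */
extern void AES_128_Key_Expansion(const unsigned char* userkey,
                                  unsigned char* key_schedule);
extern void AES_CBC_encrypt(const unsigned char* in, unsigned char* out,
                            unsigned char ivec[16], unsigned long length,
                            const unsigned char* KS, int nr);

void aes128_cbc_encrypt_sketch(const unsigned char key[16],
                               unsigned char iv[16],
                               const unsigned char* in, unsigned char* out,
                               unsigned long len) /* len: multiple of 16 */
{
    /* 11 round keys = 176 bytes; AES_128_Key_Expansion also writes the round
     * count (10) at offset 240, so reserve 256 bytes. */
    unsigned char ks[256] __attribute__((aligned(16)));

    AES_128_Key_Expansion(key, ks);
    AES_CBC_encrypt(in, out, iv, len, ks, 10 /* rounds for AES-128 */);
}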

+ 2 - 0
wolfcrypt/src/aes_gcm_asm.S

@@ -30,6 +30,7 @@
 #define HAVE_INTEL_AVX2
 #endif /* NO_AVX2_SUPPORT */
 
+#ifdef WOLFSSL_X86_64_BUILD
 #ifndef __APPLE__
 .data
 #else
@@ -15833,6 +15834,7 @@ L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
 #endif /* __APPLE__ */
 #endif /* WOLFSSL_AESGCM_STREAM */
 #endif /* HAVE_INTEL_AVX2 */
+#endif /* WOLFSSL_X86_64_BUILD */
 
 #if defined(__linux__) && defined(__ELF__)
 .section	.note.GNU-stack,"",%progbits

+ 12962 - 0
wolfcrypt/src/aes_gcm_x86_asm.S

@@ -0,0 +1,12962 @@
+/* aes_gcm_x86_asm
+ *
+ * Copyright (C) 2006-2022 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#ifdef WOLFSSL_USER_SETTINGS
+#include "wolfssl/wolfcrypt/settings.h"
+#endif
+
+#ifndef HAVE_INTEL_AVX1
+#define HAVE_INTEL_AVX1
+#endif /* HAVE_INTEL_AVX1 */
+#ifndef NO_AVX2_SUPPORT
+#define HAVE_INTEL_AVX2
+#endif /* NO_AVX2_SUPPORT */
+
+.type	data, @object
+L_aes_gcm_one:
+.long	0x0,0x0,0x1,0x0
+.type	data, @object
+L_aes_gcm_two:
+.long	0x0,0x0,0x2,0x0
+.type	data, @object
+L_aes_gcm_three:
+.long	0x0,0x0,0x3,0x0
+.type	data, @object
+L_aes_gcm_four:
+.long	0x0,0x0,0x4,0x0
+.type	data, @object
+L_aes_gcm_bswap_epi64:
+.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.type	data, @object
+L_aes_gcm_bswap_mask:
+.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.type	data, @object
+L_aes_gcm_mod2_128:
+.long	0x1,0x0,0x0,0xc2000000
+.type	data, @object
+L_aes_gcm_avx1_one:
+.long	0x0,0x0,0x1,0x0
+.type	data, @object
+L_aes_gcm_avx1_two:
+.long	0x0,0x0,0x2,0x0
+.type	data, @object
+L_aes_gcm_avx1_three:
+.long	0x0,0x0,0x3,0x0
+.type	data, @object
+L_aes_gcm_avx1_four:
+.long	0x0,0x0,0x4,0x0
+.type	data, @object
+L_aes_gcm_avx1_bswap_epi64:
+.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.type	data, @object
+L_aes_gcm_avx1_bswap_mask:
+.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.type	data, @object
+L_aes_gcm_avx1_mod2_128:
+.long	0x1,0x0,0x0,0xc2000000
+.type	data, @object
+L_aes_gcm_avx2_one:
+.long	0x0,0x0,0x1,0x0
+.type	data, @object
+L_aes_gcm_avx2_two:
+.long	0x0,0x0,0x2,0x0
+.type	data, @object
+L_aes_gcm_avx2_three:
+.long	0x0,0x0,0x3,0x0
+.type	data, @object
+L_aes_gcm_avx2_four:
+.long	0x0,0x0,0x4,0x0
+.type	data, @object
+L_avx2_aes_gcm_bswap_one:
+.long	0x0,0x0,0x0,0x1000000
+.type	data, @object
+L_aes_gcm_avx2_bswap_epi64:
+.long	0x4050607,0x10203,0xc0d0e0f,0x8090a0b
+.type	data, @object
+L_aes_gcm_avx2_bswap_mask:
+.long	0xc0d0e0f,0x8090a0b,0x4050607,0x10203
+.type	data, @object
+L_aes_gcm_avx2_mod2_128:
+.long	0x1,0x0,0x0,0xc2000000
+.text
+.globl	AES_GCM_encrypt
+.type	AES_GCM_encrypt,@function
+.align	16
+AES_GCM_encrypt:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x70, %esp
+        movl	144(%esp), %esi
+        movl	168(%esp), %ebp
+        movl	160(%esp), %edx
+        pxor	%xmm0, %xmm0
+        pxor	%xmm2, %xmm2
+        cmpl	$12, %edx
+        jne	L_AES_GCM_encrypt_iv_not_12
+        # # Calculate values when IV is 12 bytes
+        # Set counter based on IV
+        movl	$0x1000000, %ecx
+        pinsrd	$0x00, (%esi), %xmm0
+        pinsrd	$0x01, 4(%esi), %xmm0
+        pinsrd	$2, 8(%esi), %xmm0
+        pinsrd	$3, %ecx, %xmm0
+        # H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	%xmm0, %xmm5
+        movdqa	(%ebp), %xmm1
+        pxor	%xmm1, %xmm5
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_calc_iv_12_last
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_calc_iv_12_last
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_calc_iv_12_last:
+        aesenclast	%xmm3, %xmm1
+        aesenclast	%xmm3, %xmm5
+        pshufb	L_aes_gcm_bswap_mask, %xmm1
+        movdqu	%xmm5, 80(%esp)
+        jmp	L_AES_GCM_encrypt_iv_done
+L_AES_GCM_encrypt_iv_not_12:
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        movdqa	(%ebp), %xmm1
+        aesenc	16(%ebp), %xmm1
+        aesenc	32(%ebp), %xmm1
+        aesenc	48(%ebp), %xmm1
+        aesenc	64(%ebp), %xmm1
+        aesenc	80(%ebp), %xmm1
+        aesenc	96(%ebp), %xmm1
+        aesenc	112(%ebp), %xmm1
+        aesenc	128(%ebp), %xmm1
+        aesenc	144(%ebp), %xmm1
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm5, %xmm1
+        aesenc	176(%ebp), %xmm1
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm5, %xmm1
+        aesenc	208(%ebp), %xmm1
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_calc_iv_1_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm1
+        pshufb	L_aes_gcm_bswap_mask, %xmm1
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_encrypt_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_calc_iv_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_calc_iv_16_loop
+        movl	160(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_calc_iv_done
+L_AES_GCM_encrypt_calc_iv_lt16:
+        subl	$16, %esp
+        pxor	%xmm4, %xmm4
+        xorl	%ebx, %ebx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_calc_iv_loop
+        movdqu	(%esp), %xmm4
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+L_AES_GCM_encrypt_calc_iv_done:
+        # T = Encrypt counter
+        pxor	%xmm4, %xmm4
+        shll	$3, %edx
+        pinsrd	$0x00, %edx, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        #   Encrypt counter
+        movdqa	(%ebp), %xmm4
+        pxor	%xmm0, %xmm4
+        aesenc	16(%ebp), %xmm4
+        aesenc	32(%ebp), %xmm4
+        aesenc	48(%ebp), %xmm4
+        aesenc	64(%ebp), %xmm4
+        aesenc	80(%ebp), %xmm4
+        aesenc	96(%ebp), %xmm4
+        aesenc	112(%ebp), %xmm4
+        aesenc	128(%ebp), %xmm4
+        aesenc	144(%ebp), %xmm4
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	176(%ebp), %xmm4
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	208(%ebp), %xmm4
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_calc_iv_2_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm4
+        movdqu	%xmm4, 80(%esp)
+L_AES_GCM_encrypt_iv_done:
+        movl	140(%esp), %esi
+        # Additional authentication data
+        movl	156(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_encrypt_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_calc_aad_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm2, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm2, %xmm7
+        pclmulqdq	$0x00, %xmm2, %xmm4
+        pxor	%xmm2, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm2, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm2
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm2
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm2
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_calc_aad_16_loop
+        movl	156(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_calc_aad_done
+L_AES_GCM_encrypt_calc_aad_lt16:
+        subl	$16, %esp
+        pxor	%xmm4, %xmm4
+        xorl	%ebx, %ebx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_calc_aad_loop
+        movdqu	(%esp), %xmm4
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm2, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm2, %xmm7
+        pclmulqdq	$0x00, %xmm2, %xmm4
+        pxor	%xmm2, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm2, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm2
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm2
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm2
+L_AES_GCM_encrypt_calc_aad_done:
+        movdqu	%xmm2, 96(%esp)
+        movl	132(%esp), %esi
+        movl	136(%esp), %edi
+        # Calculate counter and H
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        movdqa	%xmm1, %xmm5
+        paddd	L_aes_gcm_one, %xmm0
+        movdqa	%xmm1, %xmm4
+        movdqu	%xmm0, 64(%esp)
+        psrlq	$63, %xmm5
+        psllq	$0x01, %xmm4
+        pslldq	$8, %xmm5
+        por	%xmm5, %xmm4
+        pshufd	$0xff, %xmm1, %xmm1
+        psrad	$31, %xmm1
+        pand	L_aes_gcm_mod2_128, %xmm1
+        pxor	%xmm4, %xmm1
+        xorl	%ebx, %ebx
+        movl	152(%esp), %eax
+        cmpl	$0x40, %eax
+        jl	L_AES_GCM_encrypt_done_64
+        andl	$0xffffffc0, %eax
+        movdqa	%xmm2, %xmm6
+        # H ^ 1
+        movdqu	%xmm1, (%esp)
+        # H ^ 2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm0
+        movdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm0, %xmm6
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm0, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm0, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm3
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm3, 32(%esp)
+        # H ^ 4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm0, %xmm6
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm0, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm0, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm3
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm3, 48(%esp)
+        # First 64 bytes of input
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm4
+        movdqa	L_aes_gcm_bswap_epi64, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pshufb	%xmm3, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pshufb	%xmm3, %xmm5
+        paddd	L_aes_gcm_two, %xmm6
+        pshufb	%xmm3, %xmm6
+        paddd	L_aes_gcm_three, %xmm7
+        pshufb	%xmm3, %xmm7
+        movdqu	64(%esp), %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
+        movdqa	(%ebp), %xmm3
+        pxor	%xmm3, %xmm4
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm3, %xmm7
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_enc_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_enc_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_enc_done:
+        aesenclast	%xmm3, %xmm4
+        aesenclast	%xmm3, %xmm5
+        movdqu	(%esi), %xmm0
+        movdqu	16(%esi), %xmm1
+        pxor	%xmm0, %xmm4
+        pxor	%xmm1, %xmm5
+        movdqu	%xmm4, (%edi)
+        movdqu	%xmm5, 16(%edi)
+        aesenclast	%xmm3, %xmm6
+        aesenclast	%xmm3, %xmm7
+        movdqu	32(%esi), %xmm0
+        movdqu	48(%esi), %xmm1
+        pxor	%xmm0, %xmm6
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm6, 32(%edi)
+        movdqu	%xmm7, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
+        jle	L_AES_GCM_encrypt_end_64
+        # More 64 bytes of input
+L_AES_GCM_encrypt_ghash_64:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm4
+        movdqa	L_aes_gcm_bswap_epi64, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pshufb	%xmm3, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pshufb	%xmm3, %xmm5
+        paddd	L_aes_gcm_two, %xmm6
+        pshufb	%xmm3, %xmm6
+        paddd	L_aes_gcm_three, %xmm7
+        pshufb	%xmm3, %xmm7
+        movdqu	64(%esp), %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
+        movdqa	(%ebp), %xmm3
+        pxor	%xmm3, %xmm4
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm3, %xmm7
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm3, %xmm4
+        aesenclast	%xmm3, %xmm5
+        movdqu	(%ecx), %xmm0
+        movdqu	16(%ecx), %xmm1
+        pxor	%xmm0, %xmm4
+        pxor	%xmm1, %xmm5
+        movdqu	%xmm4, (%edx)
+        movdqu	%xmm5, 16(%edx)
+        aesenclast	%xmm3, %xmm6
+        aesenclast	%xmm3, %xmm7
+        movdqu	32(%ecx), %xmm0
+        movdqu	48(%ecx), %xmm1
+        pxor	%xmm0, %xmm6
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm6, 32(%edx)
+        movdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
+        movdqu	96(%esp), %xmm6
+        movdqu	48(%esp), %xmm3
+        movdqu	-64(%edx), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm6, %xmm4
+        pshufd	$0x4e, %xmm3, %xmm5
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm3, %xmm5
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm7
+        pclmulqdq	$0x11, %xmm3, %xmm7
+        movdqa	%xmm4, %xmm6
+        pclmulqdq	$0x00, %xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm1, %xmm5
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqu	32(%esp), %xmm3
+        movdqu	-48(%edx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	16(%esp), %xmm3
+        movdqu	-32(%edx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	(%esp), %xmm3
+        movdqu	-16(%edx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm5, %xmm1
+        psrldq	$8, %xmm5
+        pslldq	$8, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm5, %xmm7
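+        # Reduce the 256-bit GHASH product modulo the GCM polynomial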
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        movdqa	%xmm6, %xmm1
+        pslld	$31, %xmm3
+        pslld	$30, %xmm0
+        pslld	$25, %xmm1
+        pxor	%xmm0, %xmm3
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm3, %xmm0
+        pslldq	$12, %xmm3
+        psrldq	$4, %xmm0
+        pxor	%xmm3, %xmm6
+        movdqa	%xmm6, %xmm1
+        movdqa	%xmm6, %xmm5
+        movdqa	%xmm6, %xmm4
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm7, %xmm6
+        movdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_ghash_64
+L_AES_GCM_encrypt_end_64:
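+        # GHASH the last 64 bytes of ciphertext using H^4..H^1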
+        movdqu	96(%esp), %xmm2
+        # Block 1
+        movdqa	L_aes_gcm_bswap_mask, %xmm4
+        movdqu	(%edx), %xmm1
+        pshufb	%xmm4, %xmm1
+        movdqu	48(%esp), %xmm3
+        pxor	%xmm2, %xmm1
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm0
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm0
+        pxor	%xmm5, %xmm2
+        # Block 2
+        movdqa	L_aes_gcm_bswap_mask, %xmm4
+        movdqu	16(%edx), %xmm1
+        pshufb	%xmm4, %xmm1
+        movdqu	32(%esp), %xmm3
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        pxor	%xmm4, %xmm0
+        pxor	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm0
+        pxor	%xmm5, %xmm2
+        # Block 3
+        movdqa	L_aes_gcm_bswap_mask, %xmm4
+        movdqu	32(%edx), %xmm1
+        pshufb	%xmm4, %xmm1
+        movdqu	16(%esp), %xmm3
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        pxor	%xmm4, %xmm0
+        pxor	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm0
+        pxor	%xmm5, %xmm2
+        # Block 4
+        movdqa	L_aes_gcm_bswap_mask, %xmm4
+        movdqu	48(%edx), %xmm1
+        pshufb	%xmm4, %xmm1
+        movdqu	(%esp), %xmm3
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        pxor	%xmm4, %xmm0
+        pxor	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm0
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm0, %xmm4
+        movdqa	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm0
+        movdqa	%xmm0, %xmm6
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm0, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm0, %xmm6
+        pxor	%xmm6, %xmm2
+        movdqu	(%esp), %xmm1
+L_AES_GCM_encrypt_done_64:
+        movl	152(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_encrypt_done_enc
+        movl	152(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_last_block_done
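+        # Encrypt the first remaining full block; its GHASH multiply is deferred to the next block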
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	64(%esp), %xmm4
+        movdqa	%xmm4, %xmm5
+        pshufb	L_aes_gcm_bswap_epi64, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pxor	(%ebp), %xmm4
+        movdqu	%xmm5, 64(%esp)
+        aesenc	16(%ebp), %xmm4
+        aesenc	32(%ebp), %xmm4
+        aesenc	48(%ebp), %xmm4
+        aesenc	64(%ebp), %xmm4
+        aesenc	80(%ebp), %xmm4
+        aesenc	96(%ebp), %xmm4
+        aesenc	112(%ebp), %xmm4
+        aesenc	128(%ebp), %xmm4
+        aesenc	144(%ebp), %xmm4
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	176(%ebp), %xmm4
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	208(%ebp), %xmm4
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_aesenc_block_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm4
+        movdqu	(%ecx), %xmm5
+        pxor	%xmm5, %xmm4
+        movdqu	%xmm4, (%edx)
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_last_block_ghash
+L_AES_GCM_encrypt_last_block_start:
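+        # Encrypt the next block with the GHASH of the previous ciphertext block interleaved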
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	64(%esp), %xmm4
+        movdqa	%xmm4, %xmm5
+        pshufb	L_aes_gcm_bswap_epi64, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pxor	(%ebp), %xmm4
+        movdqu	%xmm5, 64(%esp)
+        movdqu	%xmm2, %xmm0
+        pclmulqdq	$16, %xmm1, %xmm0
+        aesenc	16(%ebp), %xmm4
+        aesenc	32(%ebp), %xmm4
+        movdqu	%xmm2, %xmm3
+        pclmulqdq	$0x01, %xmm1, %xmm3
+        aesenc	48(%ebp), %xmm4
+        aesenc	64(%ebp), %xmm4
+        aesenc	80(%ebp), %xmm4
+        movdqu	%xmm2, %xmm5
+        pclmulqdq	$0x11, %xmm1, %xmm5
+        aesenc	96(%ebp), %xmm4
+        pxor	%xmm3, %xmm0
+        movdqa	%xmm0, %xmm6
+        psrldq	$8, %xmm0
+        pslldq	$8, %xmm6
+        aesenc	112(%ebp), %xmm4
+        movdqu	%xmm2, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm3
+        pxor	%xmm3, %xmm6
+        pxor	%xmm0, %xmm5
+        movdqa	L_aes_gcm_mod2_128, %xmm7
+        movdqa	%xmm6, %xmm3
+        pclmulqdq	$16, %xmm7, %xmm3
+        aesenc	128(%ebp), %xmm4
+        pshufd	$0x4e, %xmm6, %xmm0
+        pxor	%xmm3, %xmm0
+        movdqa	%xmm0, %xmm3
+        pclmulqdq	$16, %xmm7, %xmm3
+        aesenc	144(%ebp), %xmm4
+        pshufd	$0x4e, %xmm0, %xmm2
+        pxor	%xmm3, %xmm2
+        pxor	%xmm5, %xmm2
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
+        aesenc	%xmm5, %xmm4
+        aesenc	176(%ebp), %xmm4
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_gfmul_last
+        aesenc	%xmm5, %xmm4
+        aesenc	208(%ebp), %xmm4
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_aesenc_gfmul_last:
+        aesenclast	%xmm5, %xmm4
+        movdqu	(%ecx), %xmm5
+        pxor	%xmm5, %xmm4
+        movdqu	%xmm4, (%edx)
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_last_block_start
+L_AES_GCM_encrypt_last_block_ghash:
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm2, %xmm6
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm2, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+L_AES_GCM_encrypt_last_block_done:
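+        # Encrypt any final partial block (1-15 bytes) and GHASH the zero-padded ciphertext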
+        movl	152(%esp), %ecx
+        movl	%ecx, %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_encrypt_aesenc_last15_enc_avx_done
+        movdqu	64(%esp), %xmm0
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        pxor	(%ebp), %xmm0
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        aesenc	96(%ebp), %xmm0
+        aesenc	112(%ebp), %xmm0
+        aesenc	128(%ebp), %xmm0
+        aesenc	144(%ebp), %xmm0
+        cmpl	$11, 172(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
+        aesenc	%xmm5, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 172(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last
+        aesenc	%xmm5, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm0
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        movdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, (%edi,%ebx,1)
+        movb	%al, (%esp,%ecx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_loop
+        xorl	%eax, %eax
+        cmpl	$16, %ecx
+        je	L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop:
+        movb	%al, (%esp,%ecx,1)
+        incl	%ecx
+        cmpl	$16, %ecx
+        jl	L_AES_GCM_encrypt_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_finish_enc:
+        movdqu	(%esp), %xmm0
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm2, %xmm6
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm2, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+L_AES_GCM_encrypt_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_done_enc:
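+        # GHASH the 128-bit block holding the AAD and plaintext bit lengths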
+        movl	148(%esp), %edi
+        movl	164(%esp), %ebx
+        movl	152(%esp), %edx
+        movl	156(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        pinsrd	$0x00, %edx, %xmm4
+        pinsrd	$2, %ecx, %xmm4
+        movl	152(%esp), %edx
+        movl	156(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        pinsrd	$0x01, %edx, %xmm4
+        pinsrd	$3, %ecx, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm2, %xmm6
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm2, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
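+        # Byte-swap the GHASH result and XOR with the encrypted initial counter to form the tag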
+        pshufb	L_aes_gcm_bswap_mask, %xmm2
+        movdqu	80(%esp), %xmm4
+        pxor	%xmm2, %xmm4
+        cmpl	$16, %ebx
+        je	L_AES_GCM_encrypt_store_tag_16
+        xorl	%ecx, %ecx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	%ebx, %ecx
+        jne	L_AES_GCM_encrypt_store_tag_loop
+        jmp	L_AES_GCM_encrypt_store_tag_done
+L_AES_GCM_encrypt_store_tag_16:
+        movdqu	%xmm4, (%edi)
+L_AES_GCM_encrypt_store_tag_done:
+        addl	$0x70, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt,.-AES_GCM_encrypt
+.text
+.globl	AES_GCM_decrypt
+.type	AES_GCM_decrypt,@function
+.align	16
+AES_GCM_decrypt:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xb0, %esp
+        movl	208(%esp), %esi
+        movl	232(%esp), %ebp
+        movl	224(%esp), %edx
+        pxor	%xmm0, %xmm0
+        pxor	%xmm2, %xmm2
+        cmpl	$12, %edx
+        jne	L_AES_GCM_decrypt_iv_not_12
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
+        movl	$0x1000000, %ecx
+        pinsrd	$0x00, (%esi), %xmm0
+        pinsrd	$0x01, 4(%esi), %xmm0
+        pinsrd	$2, 8(%esi), %xmm0
+        pinsrd	$3, %ecx, %xmm0
+        # H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	%xmm0, %xmm5
+        movdqa	(%ebp), %xmm1
+        pxor	%xmm1, %xmm5
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_calc_iv_12_last
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_calc_iv_12_last
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm1
+        aesenc	%xmm3, %xmm5
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_decrypt_calc_iv_12_last:
+        aesenclast	%xmm3, %xmm1
+        aesenclast	%xmm3, %xmm5
+        pshufb	L_aes_gcm_bswap_mask, %xmm1
+        movdqu	%xmm5, 80(%esp)
+        jmp	L_AES_GCM_decrypt_iv_done
+L_AES_GCM_decrypt_iv_not_12:
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        movdqa	(%ebp), %xmm1
+        aesenc	16(%ebp), %xmm1
+        aesenc	32(%ebp), %xmm1
+        aesenc	48(%ebp), %xmm1
+        aesenc	64(%ebp), %xmm1
+        aesenc	80(%ebp), %xmm1
+        aesenc	96(%ebp), %xmm1
+        aesenc	112(%ebp), %xmm1
+        aesenc	128(%ebp), %xmm1
+        aesenc	144(%ebp), %xmm1
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm5, %xmm1
+        aesenc	176(%ebp), %xmm1
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm5, %xmm1
+        aesenc	208(%ebp), %xmm1
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_calc_iv_1_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm1
+        pshufb	L_aes_gcm_bswap_mask, %xmm1
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_decrypt_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_calc_iv_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_calc_iv_16_loop
+        movl	224(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_calc_iv_done
+L_AES_GCM_decrypt_calc_iv_lt16:
+        subl	$16, %esp
+        pxor	%xmm4, %xmm4
+        xorl	%ebx, %ebx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_calc_iv_loop
+        movdqu	(%esp), %xmm4
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+L_AES_GCM_decrypt_calc_iv_done:
+        # T = Encrypt counter
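+        # GHASH in the IV bit length, then encrypt the resulting pre-counter block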
+        pxor	%xmm4, %xmm4
+        shll	$3, %edx
+        pinsrd	$0x00, %edx, %xmm4
+        pxor	%xmm4, %xmm0
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm0, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm0
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm0
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm0
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        #   Encrypt counter
+        movdqa	(%ebp), %xmm4
+        pxor	%xmm0, %xmm4
+        aesenc	16(%ebp), %xmm4
+        aesenc	32(%ebp), %xmm4
+        aesenc	48(%ebp), %xmm4
+        aesenc	64(%ebp), %xmm4
+        aesenc	80(%ebp), %xmm4
+        aesenc	96(%ebp), %xmm4
+        aesenc	112(%ebp), %xmm4
+        aesenc	128(%ebp), %xmm4
+        aesenc	144(%ebp), %xmm4
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	176(%ebp), %xmm4
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm5, %xmm4
+        aesenc	208(%ebp), %xmm4
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_calc_iv_2_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm4
+        movdqu	%xmm4, 80(%esp)
+L_AES_GCM_decrypt_iv_done:
+        movl	204(%esp), %esi
+        # Additional authentication data
+        movl	220(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_decrypt_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_calc_aad_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm2, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm2, %xmm7
+        pclmulqdq	$0x00, %xmm2, %xmm4
+        pxor	%xmm2, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm2, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm2
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm2
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm2
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_calc_aad_16_loop
+        movl	220(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_calc_aad_done
+L_AES_GCM_decrypt_calc_aad_lt16:
+        subl	$16, %esp
+        pxor	%xmm4, %xmm4
+        xorl	%ebx, %ebx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_calc_aad_loop
+        movdqu	(%esp), %xmm4
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm2, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm2, %xmm7
+        pclmulqdq	$0x00, %xmm2, %xmm4
+        pxor	%xmm2, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm3
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm2, %xmm5
+        psrld	$31, %xmm4
+        psrld	$31, %xmm5
+        pslld	$0x01, %xmm3
+        pslld	$0x01, %xmm2
+        movdqa	%xmm4, %xmm6
+        pslldq	$4, %xmm4
+        psrldq	$12, %xmm6
+        pslldq	$4, %xmm5
+        por	%xmm6, %xmm2
+        por	%xmm4, %xmm3
+        por	%xmm5, %xmm2
+        movdqa	%xmm3, %xmm4
+        movdqa	%xmm3, %xmm5
+        movdqa	%xmm3, %xmm6
+        pslld	$31, %xmm4
+        pslld	$30, %xmm5
+        pslld	$25, %xmm6
+        pxor	%xmm5, %xmm4
+        pxor	%xmm6, %xmm4
+        movdqa	%xmm4, %xmm5
+        psrldq	$4, %xmm5
+        pslldq	$12, %xmm4
+        pxor	%xmm4, %xmm3
+        movdqa	%xmm3, %xmm6
+        movdqa	%xmm3, %xmm7
+        movdqa	%xmm3, %xmm4
+        psrld	$0x01, %xmm6
+        psrld	$2, %xmm7
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm6
+        pxor	%xmm4, %xmm6
+        pxor	%xmm5, %xmm6
+        pxor	%xmm3, %xmm6
+        pxor	%xmm6, %xmm2
+L_AES_GCM_decrypt_calc_aad_done:
+        movdqu	%xmm2, 96(%esp)
+        movl	196(%esp), %esi
+        movl	200(%esp), %edi
+        # Calculate counter and H
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        movdqa	%xmm1, %xmm5
+        paddd	L_aes_gcm_one, %xmm0
+        movdqa	%xmm1, %xmm4
+        movdqu	%xmm0, 64(%esp)
+        psrlq	$63, %xmm5
+        psllq	$0x01, %xmm4
+        pslldq	$8, %xmm5
+        por	%xmm5, %xmm4
+        pshufd	$0xff, %xmm1, %xmm1
+        psrad	$31, %xmm1
+        pand	L_aes_gcm_mod2_128, %xmm1
+        pxor	%xmm4, %xmm1
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 216(%esp)
+        movl	216(%esp), %eax
+        jl	L_AES_GCM_decrypt_done_64
+        andl	$0xffffffc0, %eax
+        movdqa	%xmm2, %xmm6
+        # H ^ 1
+        movdqu	%xmm1, (%esp)
+        # H ^ 2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm1, %xmm6
+        movdqa	%xmm1, %xmm7
+        movdqa	%xmm1, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm1, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm0
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm0
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm0
+        movdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm0, %xmm6
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm0, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm0, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm3
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm3, 32(%esp)
+        # H ^ 4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pshufd	$0x4e, %xmm0, %xmm6
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm0, %xmm4
+        pclmulqdq	$0x11, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm0, %xmm4
+        pxor	%xmm0, %xmm5
+        pxor	%xmm0, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm3
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm3, 48(%esp)
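+        # Use the in-place path when the input and output buffers are the same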
+        cmpl	%esi, %edi
+        jne	L_AES_GCM_decrypt_ghash_64
+L_AES_GCM_decrypt_ghash_64_inplace:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm4
+        movdqa	L_aes_gcm_bswap_epi64, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pshufb	%xmm3, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pshufb	%xmm3, %xmm5
+        paddd	L_aes_gcm_two, %xmm6
+        pshufb	%xmm3, %xmm6
+        paddd	L_aes_gcm_three, %xmm7
+        pshufb	%xmm3, %xmm7
+        movdqu	64(%esp), %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
+        movdqa	(%ebp), %xmm3
+        pxor	%xmm3, %xmm4
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm3, %xmm7
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_decryptinplace_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm3, %xmm4
+        aesenclast	%xmm3, %xmm5
+        movdqu	(%ecx), %xmm0
+        movdqu	16(%ecx), %xmm1
+        pxor	%xmm0, %xmm4
+        pxor	%xmm1, %xmm5
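+        # Save the original ciphertext so it can be GHASHed after the output is written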
+        movdqu	%xmm0, 112(%esp)
+        movdqu	%xmm1, 128(%esp)
+        movdqu	%xmm4, (%edx)
+        movdqu	%xmm5, 16(%edx)
+        aesenclast	%xmm3, %xmm6
+        aesenclast	%xmm3, %xmm7
+        movdqu	32(%ecx), %xmm0
+        movdqu	48(%ecx), %xmm1
+        pxor	%xmm0, %xmm6
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm0, 144(%esp)
+        movdqu	%xmm1, 160(%esp)
+        movdqu	%xmm6, 32(%edx)
+        movdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
+        movdqu	96(%esp), %xmm6
+        movdqu	48(%esp), %xmm3
+        movdqu	112(%esp), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm6, %xmm4
+        pshufd	$0x4e, %xmm3, %xmm5
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm3, %xmm5
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm7
+        pclmulqdq	$0x11, %xmm3, %xmm7
+        movdqa	%xmm4, %xmm6
+        pclmulqdq	$0x00, %xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm1, %xmm5
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqu	32(%esp), %xmm3
+        movdqu	128(%esp), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	16(%esp), %xmm3
+        movdqu	144(%esp), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	(%esp), %xmm3
+        movdqu	160(%esp), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm5, %xmm1
+        psrldq	$8, %xmm5
+        pslldq	$8, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm5, %xmm7
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        movdqa	%xmm6, %xmm1
+        pslld	$31, %xmm3
+        pslld	$30, %xmm0
+        pslld	$25, %xmm1
+        pxor	%xmm0, %xmm3
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm3, %xmm0
+        pslldq	$12, %xmm3
+        psrldq	$4, %xmm0
+        pxor	%xmm3, %xmm6
+        movdqa	%xmm6, %xmm1
+        movdqa	%xmm6, %xmm5
+        movdqa	%xmm6, %xmm4
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm7, %xmm6
+        movdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_ghash_64_done
+L_AES_GCM_decrypt_ghash_64:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm4
+        movdqa	L_aes_gcm_bswap_epi64, %xmm3
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pshufb	%xmm3, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pshufb	%xmm3, %xmm5
+        paddd	L_aes_gcm_two, %xmm6
+        pshufb	%xmm3, %xmm6
+        paddd	L_aes_gcm_three, %xmm7
+        pshufb	%xmm3, %xmm7
+        movdqu	64(%esp), %xmm3
+        paddd	L_aes_gcm_four, %xmm3
+        movdqu	%xmm3, 64(%esp)
+        movdqa	(%ebp), %xmm3
+        pxor	%xmm3, %xmm4
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm3, %xmm7
+        movdqa	16(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	32(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	48(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	64(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	80(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	96(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	112(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	128(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	144(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	176(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_aesenc_64_ghash_avx_done
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	208(%ebp), %xmm3
+        aesenc	%xmm3, %xmm4
+        aesenc	%xmm3, %xmm5
+        aesenc	%xmm3, %xmm6
+        aesenc	%xmm3, %xmm7
+        movdqa	224(%ebp), %xmm3
+L_AES_GCM_decrypt_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm3, %xmm4
+        aesenclast	%xmm3, %xmm5
+        movdqu	(%ecx), %xmm0
+        movdqu	16(%ecx), %xmm1
+        pxor	%xmm0, %xmm4
+        pxor	%xmm1, %xmm5
+        movdqu	%xmm0, (%ecx)
+        movdqu	%xmm1, 16(%ecx)
+        movdqu	%xmm4, (%edx)
+        movdqu	%xmm5, 16(%edx)
+        aesenclast	%xmm3, %xmm6
+        aesenclast	%xmm3, %xmm7
+        movdqu	32(%ecx), %xmm0
+        movdqu	48(%ecx), %xmm1
+        pxor	%xmm0, %xmm6
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm0, 32(%ecx)
+        movdqu	%xmm1, 48(%ecx)
+        movdqu	%xmm6, 32(%edx)
+        movdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
+        movdqu	96(%esp), %xmm6
+        movdqu	48(%esp), %xmm3
+        movdqu	(%ecx), %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm6, %xmm4
+        pshufd	$0x4e, %xmm3, %xmm5
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm3, %xmm5
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm7
+        pclmulqdq	$0x11, %xmm3, %xmm7
+        movdqa	%xmm4, %xmm6
+        pclmulqdq	$0x00, %xmm3, %xmm6
+        pclmulqdq	$0x00, %xmm1, %xmm5
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqu	32(%esp), %xmm3
+        movdqu	16(%ecx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	16(%esp), %xmm3
+        movdqu	32(%ecx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqu	(%esp), %xmm3
+        movdqu	48(%ecx), %xmm4
+        pshufd	$0x4e, %xmm3, %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        pxor	%xmm3, %xmm0
+        pshufd	$0x4e, %xmm4, %xmm1
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pclmulqdq	$0x11, %xmm3, %xmm2
+        pclmulqdq	$0x00, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm0
+        pxor	%xmm3, %xmm5
+        pxor	%xmm3, %xmm6
+        pxor	%xmm2, %xmm5
+        pxor	%xmm2, %xmm7
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm5, %xmm1
+        psrldq	$8, %xmm5
+        pslldq	$8, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm5, %xmm7
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        movdqa	%xmm6, %xmm1
+        pslld	$31, %xmm3
+        pslld	$30, %xmm0
+        pslld	$25, %xmm1
+        pxor	%xmm0, %xmm3
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm3, %xmm0
+        pslldq	$12, %xmm3
+        psrldq	$4, %xmm0
+        pxor	%xmm3, %xmm6
+        movdqa	%xmm6, %xmm1
+        movdqa	%xmm6, %xmm5
+        movdqa	%xmm6, %xmm4
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm6
+        pxor	%xmm7, %xmm6
+        movdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_ghash_64
+L_AES_GCM_decrypt_ghash_64_done:
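+        # Move the GHASH state into xmm2 and reload H into xmm1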
+        movdqa	%xmm6, %xmm2
+        movdqu	(%esp), %xmm1
+L_AES_GCM_decrypt_done_64:
+        movl	216(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_decrypt_done_dec
+        movl	216(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_last_block_done
+L_AES_GCM_decrypt_last_block_start:
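+        # Decrypt a block, GHASH-ing its ciphertext with the AES rounds interleaved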
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	(%ecx), %xmm5
+        pshufb	L_aes_gcm_bswap_mask, %xmm5
+        pxor	%xmm2, %xmm5
+        movdqu	%xmm5, (%esp)
+        movdqu	64(%esp), %xmm4
+        movdqa	%xmm4, %xmm5
+        pshufb	L_aes_gcm_bswap_epi64, %xmm4
+        paddd	L_aes_gcm_one, %xmm5
+        pxor	(%ebp), %xmm4
+        movdqu	%xmm5, 64(%esp)
+        movdqu	(%esp), %xmm0
+        pclmulqdq	$16, %xmm1, %xmm0
+        aesenc	16(%ebp), %xmm4
+        aesenc	32(%ebp), %xmm4
+        movdqu	(%esp), %xmm3
+        pclmulqdq	$0x01, %xmm1, %xmm3
+        aesenc	48(%ebp), %xmm4
+        aesenc	64(%ebp), %xmm4
+        aesenc	80(%ebp), %xmm4
+        movdqu	(%esp), %xmm5
+        pclmulqdq	$0x11, %xmm1, %xmm5
+        aesenc	96(%ebp), %xmm4
+        pxor	%xmm3, %xmm0
+        movdqa	%xmm0, %xmm6
+        psrldq	$8, %xmm0
+        pslldq	$8, %xmm6
+        aesenc	112(%ebp), %xmm4
+        movdqu	(%esp), %xmm3
+        pclmulqdq	$0x00, %xmm1, %xmm3
+        pxor	%xmm3, %xmm6
+        pxor	%xmm0, %xmm5
+        movdqa	L_aes_gcm_mod2_128, %xmm7
+        movdqa	%xmm6, %xmm3
+        pclmulqdq	$16, %xmm7, %xmm3
+        aesenc	128(%ebp), %xmm4
+        pshufd	$0x4e, %xmm6, %xmm0
+        pxor	%xmm3, %xmm0
+        movdqa	%xmm0, %xmm3
+        pclmulqdq	$16, %xmm7, %xmm3
+        aesenc	144(%ebp), %xmm4
+        pshufd	$0x4e, %xmm0, %xmm2
+        pxor	%xmm3, %xmm2
+        pxor	%xmm5, %xmm2
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
+        aesenc	%xmm5, %xmm4
+        aesenc	176(%ebp), %xmm4
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_aesenc_gfmul_last
+        aesenc	%xmm5, %xmm4
+        aesenc	208(%ebp), %xmm4
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_aesenc_gfmul_last:
+        aesenclast	%xmm5, %xmm4
+        movdqu	(%ecx), %xmm5
+        pxor	%xmm5, %xmm4
+        movdqu	%xmm4, (%edx)
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_last_block_start
+L_AES_GCM_decrypt_last_block_done:
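+        # Decrypt any final partial block (1-15 bytes) and GHASH the zero-padded ciphertext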
+        movl	216(%esp), %ecx
+        movl	%ecx, %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_decrypt_aesenc_last15_dec_avx_done
+        movdqu	64(%esp), %xmm0
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        pxor	(%ebp), %xmm0
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        aesenc	96(%ebp), %xmm0
+        aesenc	112(%ebp), %xmm0
+        aesenc	128(%ebp), %xmm0
+        aesenc	144(%ebp), %xmm0
+        cmpl	$11, 236(%esp)
+        movdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
+        aesenc	%xmm5, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 236(%esp)
+        movdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last
+        aesenc	%xmm5, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_aesenc_avx_last:
+        aesenclast	%xmm5, %xmm0
+        subl	$32, %esp
+        xorl	%ecx, %ecx
+        movdqu	%xmm0, (%esp)
+        pxor	%xmm4, %xmm4
+        movdqu	%xmm4, 16(%esp)
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        movb	%al, 16(%esp,%ecx,1)
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, (%edi,%ebx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_decrypt_aesenc_last15_dec_avx_loop
+        movdqu	16(%esp), %xmm0
+        addl	$32, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm2, %xmm6
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm2, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+L_AES_GCM_decrypt_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_done_dec:
+        movl	212(%esp), %esi
+        movl	228(%esp), %ebp
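+        # GHASH the AAD and ciphertext bit lengths, then xor with the
+        # encrypted first counter block to form the expected tag.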
+        movl	216(%esp), %edx
+        movl	220(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        pinsrd	$0x00, %edx, %xmm4
+        pinsrd	$2, %ecx, %xmm4
+        movl	216(%esp), %edx
+        movl	220(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        pinsrd	$0x01, %edx, %xmm4
+        pinsrd	$3, %ecx, %xmm4
+        pxor	%xmm4, %xmm2
+        pshufd	$0x4e, %xmm1, %xmm5
+        pshufd	$0x4e, %xmm2, %xmm6
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        pclmulqdq	$0x11, %xmm1, %xmm7
+        pclmulqdq	$0x00, %xmm1, %xmm4
+        pxor	%xmm1, %xmm5
+        pxor	%xmm2, %xmm6
+        pclmulqdq	$0x00, %xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm6
+        movdqa	%xmm7, %xmm2
+        pslldq	$8, %xmm6
+        psrldq	$8, %xmm5
+        pxor	%xmm6, %xmm4
+        pxor	%xmm5, %xmm2
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        movdqa	%xmm4, %xmm7
+        pslld	$31, %xmm5
+        pslld	$30, %xmm6
+        pslld	$25, %xmm7
+        pxor	%xmm6, %xmm5
+        pxor	%xmm7, %xmm5
+        movdqa	%xmm5, %xmm7
+        psrldq	$4, %xmm7
+        pslldq	$12, %xmm5
+        pxor	%xmm5, %xmm4
+        movdqa	%xmm4, %xmm5
+        movdqa	%xmm4, %xmm6
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm6
+        pxor	%xmm6, %xmm5
+        pxor	%xmm4, %xmm5
+        psrld	$7, %xmm4
+        pxor	%xmm7, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+        pshufb	L_aes_gcm_bswap_mask, %xmm2
+        movdqu	80(%esp), %xmm4
+        pxor	%xmm2, %xmm4
+        movl	240(%esp), %edi
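+        # Compare the computed tag with the caller's tag without data-dependent
+        # branches; store 1 at the result pointer on match, 0 otherwise.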
+        cmpl	$16, %ebp
+        je	L_AES_GCM_decrypt_cmp_tag_16
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        xorl	%ebx, %ebx
+        movdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_cmp_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        xorb	(%esi,%ecx,1), %al
+        orb	%al, %bl
+        incl	%ecx
+        cmpl	%ebp, %ecx
+        jne	L_AES_GCM_decrypt_cmp_tag_loop
+        cmpb	$0x00, %bl
+        sete	%bl
+        addl	$16, %esp
+        xorl	%ecx, %ecx
+        jmp	L_AES_GCM_decrypt_cmp_tag_done
+L_AES_GCM_decrypt_cmp_tag_16:
+        movdqu	(%esi), %xmm5
+        pcmpeqb	%xmm5, %xmm4
+        pmovmskb	%xmm4, %edx
+        # if %edx == 0xFFFF then return 1, else return 0
+        xorl	%ebx, %ebx
+        cmpl	$0xffff, %edx
+        sete	%bl
+L_AES_GCM_decrypt_cmp_tag_done:
+        movl	%ebx, (%edi)
+        addl	$0xb0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt,.-AES_GCM_decrypt
+#ifdef WOLFSSL_AESGCM_STREAM
+.text
+.globl	AES_GCM_init_aesni
+.type	AES_GCM_init_aesni,@function
+.align	16
+AES_GCM_init_aesni:
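+        # Compute the hash key H = E_K(0) and the initial counter block from
+        # the IV; output H, the incremented counter and the encrypted first
+        # counter block used later for the tag.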
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	36(%esp), %ebp
+        movl	44(%esp), %esi
+        movl	60(%esp), %edi
+        pxor	%xmm4, %xmm4
+        movl	48(%esp), %edx
+        cmpl	$12, %edx
+        jne	L_AES_GCM_init_aesni_iv_not_12
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
+        movl	$0x1000000, %ecx
+        pinsrd	$0x00, (%esi), %xmm4
+        pinsrd	$0x01, 4(%esi), %xmm4
+        pinsrd	$2, 8(%esi), %xmm4
+        pinsrd	$3, %ecx, %xmm4
+        # H = Encrypt X(=0) and T = Encrypt counter
+        movdqa	%xmm4, %xmm1
+        movdqa	(%ebp), %xmm5
+        pxor	%xmm5, %xmm1
+        movdqa	16(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	32(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	48(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	64(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	80(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	96(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	112(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	128(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	144(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        cmpl	$11, 40(%esp)
+        movdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_init_aesni_calc_iv_12_last
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	176(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        cmpl	$13, 40(%esp)
+        movdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_init_aesni_calc_iv_12_last
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	208(%ebp), %xmm7
+        aesenc	%xmm7, %xmm5
+        aesenc	%xmm7, %xmm1
+        movdqa	224(%ebp), %xmm7
+L_AES_GCM_init_aesni_calc_iv_12_last:
+        aesenclast	%xmm7, %xmm5
+        aesenclast	%xmm7, %xmm1
+        pshufb	L_aes_gcm_bswap_mask, %xmm5
+        movdqu	%xmm1, (%edi)
+        jmp	L_AES_GCM_init_aesni_iv_done
+L_AES_GCM_init_aesni_iv_not_12:
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        movdqa	(%ebp), %xmm5
+        aesenc	16(%ebp), %xmm5
+        aesenc	32(%ebp), %xmm5
+        aesenc	48(%ebp), %xmm5
+        aesenc	64(%ebp), %xmm5
+        aesenc	80(%ebp), %xmm5
+        aesenc	96(%ebp), %xmm5
+        aesenc	112(%ebp), %xmm5
+        aesenc	128(%ebp), %xmm5
+        aesenc	144(%ebp), %xmm5
+        cmpl	$11, 40(%esp)
+        movdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm1, %xmm5
+        aesenc	176(%ebp), %xmm5
+        cmpl	$13, 40(%esp)
+        movdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last
+        aesenc	%xmm1, %xmm5
+        aesenc	208(%ebp), %xmm5
+        movdqa	224(%ebp), %xmm1
+L_AES_GCM_init_aesni_calc_iv_1_aesenc_avx_last:
+        aesenclast	%xmm1, %xmm5
+        pshufb	L_aes_gcm_bswap_mask, %xmm5
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_init_aesni_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_init_aesni_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_init_aesni_calc_iv_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm7
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm4, %xmm1
+        psrld	$31, %xmm0
+        psrld	$31, %xmm1
+        pslld	$0x01, %xmm7
+        pslld	$0x01, %xmm4
+        movdqa	%xmm0, %xmm2
+        pslldq	$4, %xmm0
+        psrldq	$12, %xmm2
+        pslldq	$4, %xmm1
+        por	%xmm2, %xmm4
+        por	%xmm0, %xmm7
+        por	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm7, %xmm1
+        movdqa	%xmm7, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm7
+        movdqa	%xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm7, %xmm2
+        pxor	%xmm2, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_aesni_calc_iv_16_loop
+        movl	48(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_init_aesni_calc_iv_done
+L_AES_GCM_init_aesni_calc_iv_lt16:
+        subl	$16, %esp
+        pxor	%xmm0, %xmm0
+        xorl	%ebx, %ebx
+        movdqu	%xmm0, (%esp)
+L_AES_GCM_init_aesni_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_aesni_calc_iv_loop
+        movdqu	(%esp), %xmm0
+        addl	$16, %esp
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm7
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm4, %xmm1
+        psrld	$31, %xmm0
+        psrld	$31, %xmm1
+        pslld	$0x01, %xmm7
+        pslld	$0x01, %xmm4
+        movdqa	%xmm0, %xmm2
+        pslldq	$4, %xmm0
+        psrldq	$12, %xmm2
+        pslldq	$4, %xmm1
+        por	%xmm2, %xmm4
+        por	%xmm0, %xmm7
+        por	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm7, %xmm1
+        movdqa	%xmm7, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm7
+        movdqa	%xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm7, %xmm2
+        pxor	%xmm2, %xmm4
+L_AES_GCM_init_aesni_calc_iv_done:
+        # T = Encrypt counter
+        pxor	%xmm0, %xmm0
+        shll	$3, %edx
+        pinsrd	$0x00, %edx, %xmm0
+        pxor	%xmm0, %xmm4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm7
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm7
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm4, %xmm1
+        psrld	$31, %xmm0
+        psrld	$31, %xmm1
+        pslld	$0x01, %xmm7
+        pslld	$0x01, %xmm4
+        movdqa	%xmm0, %xmm2
+        pslldq	$4, %xmm0
+        psrldq	$12, %xmm2
+        pslldq	$4, %xmm1
+        por	%xmm2, %xmm4
+        por	%xmm0, %xmm7
+        por	%xmm1, %xmm4
+        movdqa	%xmm7, %xmm0
+        movdqa	%xmm7, %xmm1
+        movdqa	%xmm7, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm7
+        movdqa	%xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm7, %xmm2
+        pxor	%xmm2, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        #   Encrypt counter
+        movdqa	(%ebp), %xmm0
+        pxor	%xmm4, %xmm0
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        aesenc	96(%ebp), %xmm0
+        aesenc	112(%ebp), %xmm0
+        aesenc	128(%ebp), %xmm0
+        aesenc	144(%ebp), %xmm0
+        cmpl	$11, 40(%esp)
+        movdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 40(%esp)
+        movdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm1
+L_AES_GCM_init_aesni_calc_iv_2_aesenc_avx_last:
+        aesenclast	%xmm1, %xmm0
+        movdqu	%xmm0, (%edi)
+L_AES_GCM_init_aesni_iv_done:
+        movl	52(%esp), %ebp
+        movl	56(%esp), %edi
+        pshufb	L_aes_gcm_bswap_epi64, %xmm4
+        paddd	L_aes_gcm_one, %xmm4
+        movdqa	%xmm5, (%ebp)
+        movdqa	%xmm4, (%edi)
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_init_aesni,.-AES_GCM_init_aesni
+.text
+.globl	AES_GCM_aad_update_aesni
+.type	AES_GCM_aad_update_aesni,@function
+.align	16
+AES_GCM_aad_update_aesni:
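+        # Fold the additional authenticated data into the GHASH state,
+        # 16 bytes per iteration.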
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %esi
+        movl	16(%esp), %edx
+        movl	20(%esp), %edi
+        movl	24(%esp), %eax
+        movdqa	(%edi), %xmm5
+        movdqa	(%eax), %xmm6
+        xorl	%ecx, %ecx
+L_AES_GCM_aad_update_aesni_16_loop:
+        movdqu	(%esi,%ecx,1), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm5
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm6, %xmm2
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm6, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm4
+        movdqa	%xmm3, %xmm5
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm4
+        pxor	%xmm1, %xmm5
+        movdqa	%xmm4, %xmm0
+        movdqa	%xmm5, %xmm1
+        psrld	$31, %xmm0
+        psrld	$31, %xmm1
+        pslld	$0x01, %xmm4
+        pslld	$0x01, %xmm5
+        movdqa	%xmm0, %xmm2
+        pslldq	$4, %xmm0
+        psrldq	$12, %xmm2
+        pslldq	$4, %xmm1
+        por	%xmm2, %xmm5
+        por	%xmm0, %xmm4
+        por	%xmm1, %xmm5
+        movdqa	%xmm4, %xmm0
+        movdqa	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm4
+        movdqa	%xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm4, %xmm2
+        pxor	%xmm2, %xmm5
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_aad_update_aesni_16_loop
+        movdqa	%xmm5, (%edi)
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_aad_update_aesni,.-AES_GCM_aad_update_aesni
+.text
+.globl	AES_GCM_encrypt_block_aesni
+.type	AES_GCM_encrypt_block_aesni,@function
+.align	16
+AES_GCM_encrypt_block_aesni:
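+        # Encrypt a single 16-byte block: output = E_K(counter) xor input,
+        # then advance the counter.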
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %ecx
+        movl	16(%esp), %eax
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        movdqu	(%edx), %xmm0
+        movdqa	%xmm0, %xmm1
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pxor	(%ecx), %xmm0
+        movdqu	%xmm1, (%edx)
+        aesenc	16(%ecx), %xmm0
+        aesenc	32(%ecx), %xmm0
+        aesenc	48(%ecx), %xmm0
+        aesenc	64(%ecx), %xmm0
+        aesenc	80(%ecx), %xmm0
+        aesenc	96(%ecx), %xmm0
+        aesenc	112(%ecx), %xmm0
+        aesenc	128(%ecx), %xmm0
+        aesenc	144(%ecx), %xmm0
+        cmpl	$11, %eax
+        movdqa	160(%ecx), %xmm1
+        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	176(%ecx), %xmm0
+        cmpl	$13, %eax
+        movdqa	192(%ecx), %xmm1
+        jl	L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	208(%ecx), %xmm0
+        movdqa	224(%ecx), %xmm1
+L_AES_GCM_encrypt_block_aesni_aesenc_block_aesenc_avx_last:
+        aesenclast	%xmm1, %xmm0
+        movdqu	(%esi), %xmm1
+        pxor	%xmm1, %xmm0
+        movdqu	%xmm0, (%edi)
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_block_aesni,.-AES_GCM_encrypt_block_aesni
+.text
+.globl	AES_GCM_ghash_block_aesni
+.type	AES_GCM_ghash_block_aesni,@function
+.align	16
+AES_GCM_ghash_block_aesni:
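+        # Fold one 16-byte block into the GHASH state: X = (X xor block) * H.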
+        movl	4(%esp), %edx
+        movl	8(%esp), %eax
+        movl	12(%esp), %ecx
+        movdqa	(%eax), %xmm4
+        movdqa	(%ecx), %xmm5
+        movdqu	(%edx), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm6
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm6
+        pxor	%xmm1, %xmm4
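+        # Shift the 256-bit product left by one bit, then reduce it modulo
+        # the GHASH polynomial.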
+        movdqa	%xmm6, %xmm0
+        movdqa	%xmm4, %xmm1
+        psrld	$31, %xmm0
+        psrld	$31, %xmm1
+        pslld	$0x01, %xmm6
+        pslld	$0x01, %xmm4
+        movdqa	%xmm0, %xmm2
+        pslldq	$4, %xmm0
+        psrldq	$12, %xmm2
+        pslldq	$4, %xmm1
+        por	%xmm2, %xmm4
+        por	%xmm0, %xmm6
+        por	%xmm1, %xmm4
+        movdqa	%xmm6, %xmm0
+        movdqa	%xmm6, %xmm1
+        movdqa	%xmm6, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm6
+        movdqa	%xmm6, %xmm2
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm6, %xmm2
+        pxor	%xmm2, %xmm4
+        movdqa	%xmm4, (%eax)
+        ret
+.size	AES_GCM_ghash_block_aesni,.-AES_GCM_ghash_block_aesni
+.text
+.globl	AES_GCM_encrypt_update_aesni
+.type	AES_GCM_encrypt_update_aesni,@function
+.align	16
+AES_GCM_encrypt_update_aesni:
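+        # CTR-encrypt the plaintext and GHASH the ciphertext: 64 bytes at a
+        # time using precomputed H^1..H^4, then any remaining 16-byte blocks.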
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x60, %esp
+        movl	144(%esp), %esi
+        movdqa	(%esi), %xmm4
+        movdqu	%xmm4, 64(%esp)
+        movl	136(%esp), %esi
+        movl	140(%esp), %ebp
+        movdqa	(%esi), %xmm6
+        movdqa	(%ebp), %xmm5
+        movdqu	%xmm6, 80(%esp)
+        movl	116(%esp), %ebp
+        movl	124(%esp), %edi
+        movl	128(%esp), %esi
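+        # Pre-shift the hash key: H <<= 1, xor-ing in L_aes_gcm_mod2_128 when
+        # the top bit was set.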
+        movdqa	%xmm5, %xmm1
+        movdqa	%xmm5, %xmm0
+        psrlq	$63, %xmm1
+        psllq	$0x01, %xmm0
+        pslldq	$8, %xmm1
+        por	%xmm1, %xmm0
+        pshufd	$0xff, %xmm5, %xmm5
+        psrad	$31, %xmm5
+        pand	L_aes_gcm_mod2_128, %xmm5
+        pxor	%xmm0, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 132(%esp)
+        movl	132(%esp), %eax
+        jl	L_AES_GCM_encrypt_update_aesni_done_64
+        andl	$0xffffffc0, %eax
+        movdqa	%xmm6, %xmm2
+        # H ^ 1
+        movdqu	%xmm5, (%esp)
+        # H ^ 2
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm4
+        movdqu	%xmm4, 16(%esp)
+        # H ^ 3
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm7
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm7, 32(%esp)
+        # H ^ 4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm4, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm7
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm7, 48(%esp)
+        # First 64 bytes of input
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm0
+        movdqa	L_aes_gcm_bswap_epi64, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pshufb	%xmm7, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pshufb	%xmm7, %xmm1
+        paddd	L_aes_gcm_two, %xmm2
+        pshufb	%xmm7, %xmm2
+        paddd	L_aes_gcm_three, %xmm3
+        pshufb	%xmm7, %xmm3
+        movdqu	64(%esp), %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
+        movdqa	(%ebp), %xmm7
+        pxor	%xmm7, %xmm0
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm7, %xmm3
+        movdqa	16(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	32(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	48(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	64(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	80(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	96(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	112(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	128(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	144(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$11, 120(%esp)
+        movdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_aesni_enc_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	176(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$13, 120(%esp)
+        movdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_aesni_enc_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	208(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_aesni_enc_done:
+        aesenclast	%xmm7, %xmm0
+        aesenclast	%xmm7, %xmm1
+        movdqu	(%esi), %xmm4
+        movdqu	16(%esi), %xmm5
+        pxor	%xmm4, %xmm0
+        pxor	%xmm5, %xmm1
+        movdqu	%xmm0, (%edi)
+        movdqu	%xmm1, 16(%edi)
+        aesenclast	%xmm7, %xmm2
+        aesenclast	%xmm7, %xmm3
+        movdqu	32(%esi), %xmm4
+        movdqu	48(%esi), %xmm5
+        pxor	%xmm4, %xmm2
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm2, 32(%edi)
+        movdqu	%xmm3, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        jle	L_AES_GCM_encrypt_update_aesni_end_64
+        # More 64 bytes of input
+L_AES_GCM_encrypt_update_aesni_ghash_64:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm0
+        movdqa	L_aes_gcm_bswap_epi64, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pshufb	%xmm7, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pshufb	%xmm7, %xmm1
+        paddd	L_aes_gcm_two, %xmm2
+        pshufb	%xmm7, %xmm2
+        paddd	L_aes_gcm_three, %xmm3
+        pshufb	%xmm7, %xmm3
+        movdqu	64(%esp), %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
+        movdqa	(%ebp), %xmm7
+        pxor	%xmm7, %xmm0
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm7, %xmm3
+        movdqa	16(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	32(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	48(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	64(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	80(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	96(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	112(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	128(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	144(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$11, 120(%esp)
+        movdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	176(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$13, 120(%esp)
+        movdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	208(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_aesni_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm7, %xmm0
+        aesenclast	%xmm7, %xmm1
+        movdqu	(%ecx), %xmm4
+        movdqu	16(%ecx), %xmm5
+        pxor	%xmm4, %xmm0
+        pxor	%xmm5, %xmm1
+        movdqu	%xmm0, (%edx)
+        movdqu	%xmm1, 16(%edx)
+        aesenclast	%xmm7, %xmm2
+        aesenclast	%xmm7, %xmm3
+        movdqu	32(%ecx), %xmm4
+        movdqu	48(%ecx), %xmm5
+        pxor	%xmm4, %xmm2
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm2, 32(%edx)
+        movdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
+        movdqu	80(%esp), %xmm2
+        movdqu	48(%esp), %xmm7
+        movdqu	-64(%edx), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm2, %xmm0
+        pshufd	$0x4e, %xmm7, %xmm1
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm7, %xmm1
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm3
+        pclmulqdq	$0x11, %xmm7, %xmm3
+        movdqa	%xmm0, %xmm2
+        pclmulqdq	$0x00, %xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm5, %xmm1
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqu	32(%esp), %xmm7
+        movdqu	-48(%edx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	16(%esp), %xmm7
+        movdqu	-32(%edx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	(%esp), %xmm7
+        movdqu	-16(%edx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm1, %xmm5
+        psrldq	$8, %xmm1
+        pslldq	$8, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        movdqa	%xmm2, %xmm5
+        pslld	$31, %xmm7
+        pslld	$30, %xmm4
+        pslld	$25, %xmm5
+        pxor	%xmm4, %xmm7
+        pxor	%xmm5, %xmm7
+        movdqa	%xmm7, %xmm4
+        pslldq	$12, %xmm7
+        psrldq	$4, %xmm4
+        pxor	%xmm7, %xmm2
+        movdqa	%xmm2, %xmm5
+        movdqa	%xmm2, %xmm1
+        movdqa	%xmm2, %xmm0
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm1, %xmm5
+        pxor	%xmm0, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm3, %xmm2
+        movdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_aesni_ghash_64
+L_AES_GCM_encrypt_update_aesni_end_64:
+        movdqu	80(%esp), %xmm6
+        # Block 1
+        movdqa	L_aes_gcm_bswap_mask, %xmm0
+        movdqu	(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        movdqu	48(%esp), %xmm7
+        pxor	%xmm6, %xmm5
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm0, %xmm4
+        movdqa	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm4
+        pxor	%xmm1, %xmm6
+        # Block 2
+        movdqa	L_aes_gcm_bswap_mask, %xmm0
+        movdqu	16(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        movdqu	32(%esp), %xmm7
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        pxor	%xmm0, %xmm4
+        pxor	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm4
+        pxor	%xmm1, %xmm6
+        # Block 3
+        movdqa	L_aes_gcm_bswap_mask, %xmm0
+        movdqu	32(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        movdqu	16(%esp), %xmm7
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        pxor	%xmm0, %xmm4
+        pxor	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm4
+        pxor	%xmm1, %xmm6
+        # Block 4
+        movdqa	L_aes_gcm_bswap_mask, %xmm0
+        movdqu	48(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        movdqu	(%esp), %xmm7
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm7, %xmm2
+        movdqa	%xmm7, %xmm3
+        movdqa	%xmm7, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        pxor	%xmm0, %xmm4
+        pxor	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm4
+        pxor	%xmm1, %xmm6
+        movdqa	%xmm4, %xmm0
+        movdqa	%xmm4, %xmm1
+        movdqa	%xmm4, %xmm2
+        pslld	$31, %xmm0
+        pslld	$30, %xmm1
+        pslld	$25, %xmm2
+        pxor	%xmm1, %xmm0
+        pxor	%xmm2, %xmm0
+        movdqa	%xmm0, %xmm1
+        psrldq	$4, %xmm1
+        pslldq	$12, %xmm0
+        pxor	%xmm0, %xmm4
+        movdqa	%xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        psrld	$0x01, %xmm2
+        psrld	$2, %xmm3
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm2
+        pxor	%xmm0, %xmm2
+        pxor	%xmm1, %xmm2
+        pxor	%xmm4, %xmm2
+        pxor	%xmm2, %xmm6
+        movdqu	(%esp), %xmm5
+L_AES_GCM_encrypt_update_aesni_done_64:
+        movl	132(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_encrypt_update_aesni_done_enc
+        movl	132(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_aesni_last_block_done
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	64(%esp), %xmm0
+        movdqa	%xmm0, %xmm1
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pxor	(%ebp), %xmm0
+        movdqu	%xmm1, 64(%esp)
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        aesenc	96(%ebp), %xmm0
+        aesenc	112(%ebp), %xmm0
+        aesenc	128(%ebp), %xmm0
+        aesenc	144(%ebp), %xmm0
+        cmpl	$11, 120(%esp)
+        movdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 120(%esp)
+        movdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last
+        aesenc	%xmm1, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm1
+L_AES_GCM_encrypt_update_aesni_aesenc_block_aesenc_avx_last:
+        aesenclast	%xmm1, %xmm0
+        movdqu	(%ecx), %xmm1
+        pxor	%xmm1, %xmm0
+        movdqu	%xmm0, (%edx)
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm6
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_aesni_last_block_ghash
+L_AES_GCM_encrypt_update_aesni_last_block_start:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	64(%esp), %xmm0
+        movdqa	%xmm0, %xmm1
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pxor	(%ebp), %xmm0
+        movdqu	%xmm1, 64(%esp)
+        movdqu	%xmm6, %xmm4
+        pclmulqdq	$16, %xmm5, %xmm4
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        movdqu	%xmm6, %xmm7
+        pclmulqdq	$0x01, %xmm5, %xmm7
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        movdqu	%xmm6, %xmm1
+        pclmulqdq	$0x11, %xmm5, %xmm1
+        aesenc	96(%ebp), %xmm0
+        pxor	%xmm7, %xmm4
+        movdqa	%xmm4, %xmm2
+        psrldq	$8, %xmm4
+        pslldq	$8, %xmm2
+        aesenc	112(%ebp), %xmm0
+        movdqu	%xmm6, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm7
+        pxor	%xmm7, %xmm2
+        pxor	%xmm4, %xmm1
+        movdqa	L_aes_gcm_mod2_128, %xmm3
+        movdqa	%xmm2, %xmm7
+        pclmulqdq	$16, %xmm3, %xmm7
+        aesenc	128(%ebp), %xmm0
+        pshufd	$0x4e, %xmm2, %xmm4
+        pxor	%xmm7, %xmm4
+        movdqa	%xmm4, %xmm7
+        pclmulqdq	$16, %xmm3, %xmm7
+        aesenc	144(%ebp), %xmm0
+        pshufd	$0x4e, %xmm4, %xmm6
+        pxor	%xmm7, %xmm6
+        pxor	%xmm1, %xmm6
+        cmpl	$11, 120(%esp)
+        movdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+        aesenc	%xmm1, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 120(%esp)
+        movdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last
+        aesenc	%xmm1, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm1
+L_AES_GCM_encrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast	%xmm1, %xmm0
+        movdqu	(%ecx), %xmm1
+        pxor	%xmm1, %xmm0
+        movdqu	%xmm0, (%edx)
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm0, %xmm6
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_aesni_last_block_start
+L_AES_GCM_encrypt_update_aesni_last_block_ghash:
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm6, %xmm2
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm6, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm6
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm6
+L_AES_GCM_encrypt_update_aesni_last_block_done:
+L_AES_GCM_encrypt_update_aesni_done_enc:
+        movl	136(%esp), %esi
+        movl	144(%esp), %edi
+        movdqu	64(%esp), %xmm4
+        movdqa	%xmm6, (%esi)
+        movdqu	%xmm4, (%edi)
+        addl	$0x60, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt_update_aesni,.-AES_GCM_encrypt_update_aesni
+.text
+.globl	AES_GCM_encrypt_final_aesni
+.type	AES_GCM_encrypt_final_aesni,@function
+.align	16
+AES_GCM_encrypt_final_aesni:
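+        # Complete GHASH with the AAD and message bit lengths, xor with the
+        # encrypted first counter block and write out the tag.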
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	32(%esp), %ebp
+        movl	52(%esp), %esi
+        movl	56(%esp), %edi
+        movdqa	(%ebp), %xmm4
+        movdqa	(%esi), %xmm5
+        movdqa	(%edi), %xmm6
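+        # Pre-shift the hash key: H <<= 1, xor-ing in L_aes_gcm_mod2_128 when
+        # the top bit was set.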
+        movdqa	%xmm5, %xmm1
+        movdqa	%xmm5, %xmm0
+        psrlq	$63, %xmm1
+        psllq	$0x01, %xmm0
+        pslldq	$8, %xmm1
+        por	%xmm1, %xmm0
+        pshufd	$0xff, %xmm5, %xmm5
+        psrad	$31, %xmm5
+        pand	L_aes_gcm_mod2_128, %xmm5
+        pxor	%xmm0, %xmm5
+        movl	44(%esp), %edx
+        movl	48(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        pinsrd	$0x00, %edx, %xmm0
+        pinsrd	$2, %ecx, %xmm0
+        movl	44(%esp), %edx
+        movl	48(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        pinsrd	$0x01, %edx, %xmm0
+        pinsrd	$3, %ecx, %xmm0
+        pxor	%xmm0, %xmm4
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm4
+        movdqu	%xmm6, %xmm0
+        pxor	%xmm4, %xmm0
+        movl	36(%esp), %edi
+        cmpl	$16, 40(%esp)
+        je	L_AES_GCM_encrypt_final_aesni_store_tag_16
+        xorl	%ecx, %ecx
+        movdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_final_aesni_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	40(%esp), %ecx
+        jne	L_AES_GCM_encrypt_final_aesni_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_aesni_store_tag_done
+L_AES_GCM_encrypt_final_aesni_store_tag_16:
+        movdqu	%xmm0, (%edi)
+L_AES_GCM_encrypt_final_aesni_store_tag_done:
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_final_aesni,.-AES_GCM_encrypt_final_aesni
+.text
+.globl	AES_GCM_decrypt_update_aesni
+.type	AES_GCM_decrypt_update_aesni,@function
+.align	16
+AES_GCM_decrypt_update_aesni:
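+        # CTR-decrypt the ciphertext and GHASH it, 64 bytes at a time, with
+        # the code path chosen by whether the output buffer aliases the input.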
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xa0, %esp
+        movl	208(%esp), %esi
+        movdqa	(%esi), %xmm4
+        movdqu	%xmm4, 64(%esp)
+        movl	200(%esp), %esi
+        movl	204(%esp), %ebp
+        movdqa	(%esi), %xmm6
+        movdqa	(%ebp), %xmm5
+        movdqu	%xmm6, 80(%esp)
+        movl	180(%esp), %ebp
+        movl	188(%esp), %edi
+        movl	192(%esp), %esi
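+        # Pre-shift the hash key: H <<= 1, xor-ing in L_aes_gcm_mod2_128 when
+        # the top bit was set.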
+        movdqa	%xmm5, %xmm1
+        movdqa	%xmm5, %xmm0
+        psrlq	$63, %xmm1
+        psllq	$0x01, %xmm0
+        pslldq	$8, %xmm1
+        por	%xmm1, %xmm0
+        pshufd	$0xff, %xmm5, %xmm5
+        psrad	$31, %xmm5
+        pand	L_aes_gcm_mod2_128, %xmm5
+        pxor	%xmm0, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 196(%esp)
+        movl	196(%esp), %eax
+        jl	L_AES_GCM_decrypt_update_aesni_done_64
+        andl	$0xffffffc0, %eax
+        movdqa	%xmm6, %xmm2
+        # H ^ 1
+        movdqu	%xmm5, (%esp)
+        # H ^ 2
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm5, %xmm2
+        movdqa	%xmm5, %xmm3
+        movdqa	%xmm5, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm5, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm4
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm4
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm4
+        movdqu	%xmm4, 16(%esp)
+        # H ^ 3
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm4, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm7
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm7, 32(%esp)
+        # H ^ 4
+        pshufd	$0x4e, %xmm4, %xmm1
+        pshufd	$0x4e, %xmm4, %xmm2
+        movdqa	%xmm4, %xmm3
+        movdqa	%xmm4, %xmm0
+        pclmulqdq	$0x11, %xmm4, %xmm3
+        pclmulqdq	$0x00, %xmm4, %xmm0
+        pxor	%xmm4, %xmm1
+        pxor	%xmm4, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm7
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm7
+        movdqu	%xmm7, 48(%esp)
+        cmpl	%esi, %edi
+        je	L_AES_GCM_decrypt_update_aesni_ghash_64
+L_AES_GCM_decrypt_update_aesni_ghash_64_inplace:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm0
+        movdqa	L_aes_gcm_bswap_epi64, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pshufb	%xmm7, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pshufb	%xmm7, %xmm1
+        paddd	L_aes_gcm_two, %xmm2
+        pshufb	%xmm7, %xmm2
+        paddd	L_aes_gcm_three, %xmm3
+        pshufb	%xmm7, %xmm3
+        movdqu	64(%esp), %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
+        movdqa	(%ebp), %xmm7
+        pxor	%xmm7, %xmm0
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm7, %xmm3
+        movdqa	16(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	32(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	48(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	64(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	80(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	96(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	112(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	128(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	144(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$11, 184(%esp)
+        movdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	176(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$13, 184(%esp)
+        movdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	208(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_aesniinplace_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm7, %xmm0
+        aesenclast	%xmm7, %xmm1
+        movdqu	(%ecx), %xmm4
+        movdqu	16(%ecx), %xmm5
+        pxor	%xmm4, %xmm0
+        pxor	%xmm5, %xmm1
+        movdqu	%xmm4, 96(%esp)
+        movdqu	%xmm5, 112(%esp)
+        movdqu	%xmm0, (%edx)
+        movdqu	%xmm1, 16(%edx)
+        aesenclast	%xmm7, %xmm2
+        aesenclast	%xmm7, %xmm3
+        movdqu	32(%ecx), %xmm4
+        movdqu	48(%ecx), %xmm5
+        pxor	%xmm4, %xmm2
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm4, 128(%esp)
+        movdqu	%xmm5, 144(%esp)
+        movdqu	%xmm2, 32(%edx)
+        movdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
+        movdqu	80(%esp), %xmm2
+        movdqu	48(%esp), %xmm7
+        movdqu	96(%esp), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm2, %xmm0
+        pshufd	$0x4e, %xmm7, %xmm1
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm7, %xmm1
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm3
+        pclmulqdq	$0x11, %xmm7, %xmm3
+        movdqa	%xmm0, %xmm2
+        pclmulqdq	$0x00, %xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm5, %xmm1
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqu	32(%esp), %xmm7
+        movdqu	112(%esp), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	16(%esp), %xmm7
+        movdqu	128(%esp), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	(%esp), %xmm7
+        movdqu	144(%esp), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm1, %xmm5
+        psrldq	$8, %xmm1
+        pslldq	$8, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        movdqa	%xmm2, %xmm5
+        pslld	$31, %xmm7
+        pslld	$30, %xmm4
+        pslld	$25, %xmm5
+        pxor	%xmm4, %xmm7
+        pxor	%xmm5, %xmm7
+        movdqa	%xmm7, %xmm4
+        pslldq	$12, %xmm7
+        psrldq	$4, %xmm4
+        pxor	%xmm7, %xmm2
+        movdqa	%xmm2, %xmm5
+        movdqa	%xmm2, %xmm1
+        movdqa	%xmm2, %xmm0
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm1, %xmm5
+        pxor	%xmm0, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm3, %xmm2
+        movdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_aesni_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_update_aesni_ghash_64_done
+L_AES_GCM_decrypt_update_aesni_ghash_64:
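+        # Out-of-place path: decrypt 64 bytes per iteration; GHASH reads the ciphertext directly from the input buffer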
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # Encrypt 64 bytes of counter
+        movdqu	64(%esp), %xmm0
+        movdqa	L_aes_gcm_bswap_epi64, %xmm7
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pshufb	%xmm7, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pshufb	%xmm7, %xmm1
+        paddd	L_aes_gcm_two, %xmm2
+        pshufb	%xmm7, %xmm2
+        paddd	L_aes_gcm_three, %xmm3
+        pshufb	%xmm7, %xmm3
+        movdqu	64(%esp), %xmm7
+        paddd	L_aes_gcm_four, %xmm7
+        movdqu	%xmm7, 64(%esp)
+        movdqa	(%ebp), %xmm7
+        pxor	%xmm7, %xmm0
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm7, %xmm3
+        movdqa	16(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	32(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	48(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	64(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	80(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	96(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	112(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	128(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	144(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$11, 184(%esp)
+        movdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	176(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        cmpl	$13, 184(%esp)
+        movdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	208(%ebp), %xmm7
+        aesenc	%xmm7, %xmm0
+        aesenc	%xmm7, %xmm1
+        aesenc	%xmm7, %xmm2
+        aesenc	%xmm7, %xmm3
+        movdqa	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_aesni_aesenc_64_ghash_avx_done:
+        aesenclast	%xmm7, %xmm0
+        aesenclast	%xmm7, %xmm1
+        movdqu	(%ecx), %xmm4
+        movdqu	16(%ecx), %xmm5
+        pxor	%xmm4, %xmm0
+        pxor	%xmm5, %xmm1
+        movdqu	%xmm4, (%ecx)
+        movdqu	%xmm5, 16(%ecx)
+        movdqu	%xmm0, (%edx)
+        movdqu	%xmm1, 16(%edx)
+        aesenclast	%xmm7, %xmm2
+        aesenclast	%xmm7, %xmm3
+        movdqu	32(%ecx), %xmm4
+        movdqu	48(%ecx), %xmm5
+        pxor	%xmm4, %xmm2
+        pxor	%xmm5, %xmm3
+        movdqu	%xmm4, 32(%ecx)
+        movdqu	%xmm5, 48(%ecx)
+        movdqu	%xmm2, 32(%edx)
+        movdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
+        movdqu	80(%esp), %xmm2
+        movdqu	48(%esp), %xmm7
+        movdqu	(%ecx), %xmm0
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm2, %xmm0
+        pshufd	$0x4e, %xmm7, %xmm1
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm7, %xmm1
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm3
+        pclmulqdq	$0x11, %xmm7, %xmm3
+        movdqa	%xmm0, %xmm2
+        pclmulqdq	$0x00, %xmm7, %xmm2
+        pclmulqdq	$0x00, %xmm5, %xmm1
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqu	32(%esp), %xmm7
+        movdqu	16(%ecx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	16(%esp), %xmm7
+        movdqu	32(%ecx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqu	(%esp), %xmm7
+        movdqu	48(%ecx), %xmm0
+        pshufd	$0x4e, %xmm7, %xmm4
+        pshufb	L_aes_gcm_bswap_mask, %xmm0
+        pxor	%xmm7, %xmm4
+        pshufd	$0x4e, %xmm0, %xmm5
+        pxor	%xmm0, %xmm5
+        movdqa	%xmm0, %xmm6
+        pclmulqdq	$0x11, %xmm7, %xmm6
+        pclmulqdq	$0x00, %xmm0, %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm4
+        pxor	%xmm7, %xmm1
+        pxor	%xmm7, %xmm2
+        pxor	%xmm6, %xmm1
+        pxor	%xmm6, %xmm3
+        pxor	%xmm4, %xmm1
+        movdqa	%xmm1, %xmm5
+        psrldq	$8, %xmm1
+        pslldq	$8, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm1, %xmm3
+        movdqa	%xmm2, %xmm7
+        movdqa	%xmm2, %xmm4
+        movdqa	%xmm2, %xmm5
+        pslld	$31, %xmm7
+        pslld	$30, %xmm4
+        pslld	$25, %xmm5
+        pxor	%xmm4, %xmm7
+        pxor	%xmm5, %xmm7
+        movdqa	%xmm7, %xmm4
+        pslldq	$12, %xmm7
+        psrldq	$4, %xmm4
+        pxor	%xmm7, %xmm2
+        movdqa	%xmm2, %xmm5
+        movdqa	%xmm2, %xmm1
+        movdqa	%xmm2, %xmm0
+        psrld	$0x01, %xmm5
+        psrld	$2, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm1, %xmm5
+        pxor	%xmm0, %xmm5
+        pxor	%xmm4, %xmm5
+        pxor	%xmm5, %xmm2
+        pxor	%xmm3, %xmm2
+        movdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_aesni_ghash_64
+L_AES_GCM_decrypt_update_aesni_ghash_64_done:
+        movdqa	%xmm2, %xmm6
+        movdqu	(%esp), %xmm5
+L_AES_GCM_decrypt_update_aesni_done_64:
+        movl	196(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_decrypt_update_aesni_done_dec
+        movl	196(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_update_aesni_last_block_done
+L_AES_GCM_decrypt_update_aesni_last_block_start:
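+        # Remaining full blocks: the AES rounds for the counter block are interleaved with the GHASH multiply of this ciphertext block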
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        movdqu	(%ecx), %xmm1
+        pshufb	L_aes_gcm_bswap_mask, %xmm1
+        pxor	%xmm6, %xmm1
+        movdqu	%xmm1, (%esp)
+        movdqu	64(%esp), %xmm0
+        movdqa	%xmm0, %xmm1
+        pshufb	L_aes_gcm_bswap_epi64, %xmm0
+        paddd	L_aes_gcm_one, %xmm1
+        pxor	(%ebp), %xmm0
+        movdqu	%xmm1, 64(%esp)
+        movdqu	(%esp), %xmm4
+        pclmulqdq	$16, %xmm5, %xmm4
+        aesenc	16(%ebp), %xmm0
+        aesenc	32(%ebp), %xmm0
+        movdqu	(%esp), %xmm7
+        pclmulqdq	$0x01, %xmm5, %xmm7
+        aesenc	48(%ebp), %xmm0
+        aesenc	64(%ebp), %xmm0
+        aesenc	80(%ebp), %xmm0
+        movdqu	(%esp), %xmm1
+        pclmulqdq	$0x11, %xmm5, %xmm1
+        aesenc	96(%ebp), %xmm0
+        pxor	%xmm7, %xmm4
+        movdqa	%xmm4, %xmm2
+        psrldq	$8, %xmm4
+        pslldq	$8, %xmm2
+        aesenc	112(%ebp), %xmm0
+        movdqu	(%esp), %xmm7
+        pclmulqdq	$0x00, %xmm5, %xmm7
+        pxor	%xmm7, %xmm2
+        pxor	%xmm4, %xmm1
+        movdqa	L_aes_gcm_mod2_128, %xmm3
+        movdqa	%xmm2, %xmm7
+        pclmulqdq	$16, %xmm3, %xmm7
+        aesenc	128(%ebp), %xmm0
+        pshufd	$0x4e, %xmm2, %xmm4
+        pxor	%xmm7, %xmm4
+        movdqa	%xmm4, %xmm7
+        pclmulqdq	$16, %xmm3, %xmm7
+        aesenc	144(%ebp), %xmm0
+        pshufd	$0x4e, %xmm4, %xmm6
+        pxor	%xmm7, %xmm6
+        pxor	%xmm1, %xmm6
+        cmpl	$11, 184(%esp)
+        movdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc	%xmm1, %xmm0
+        aesenc	176(%ebp), %xmm0
+        cmpl	$13, 184(%esp)
+        movdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last
+        aesenc	%xmm1, %xmm0
+        aesenc	208(%ebp), %xmm0
+        movdqa	224(%ebp), %xmm1
+L_AES_GCM_decrypt_update_aesni_aesenc_gfmul_last:
+        aesenclast	%xmm1, %xmm0
+        movdqu	(%ecx), %xmm1
+        pxor	%xmm1, %xmm0
+        movdqu	%xmm0, (%edx)
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_aesni_last_block_start
+L_AES_GCM_decrypt_update_aesni_last_block_done:
+L_AES_GCM_decrypt_update_aesni_done_dec:
+        movl	200(%esp), %esi
+        movl	208(%esp), %edi
+        movdqu	64(%esp), %xmm4
+        movdqa	%xmm6, (%esi)
+        movdqu	%xmm4, (%edi)
+        addl	$0xa0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_update_aesni,.-AES_GCM_decrypt_update_aesni
+.text
+.globl	AES_GCM_decrypt_final_aesni
+.type	AES_GCM_decrypt_final_aesni,@function
+.align	16
+AES_GCM_decrypt_final_aesni:
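+        # Fold the 64-bit AAD and message bit lengths into GHASH, reduce, XOR with the
+        # encrypted initial counter block, then compare with the supplied tag in constant time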
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	36(%esp), %ebp
+        movl	56(%esp), %esi
+        movl	60(%esp), %edi
+        movdqa	(%ebp), %xmm6
+        movdqa	(%esi), %xmm5
+        movdqa	(%edi), %xmm7
+        movdqa	%xmm5, %xmm1
+        movdqa	%xmm5, %xmm0
+        psrlq	$63, %xmm1
+        psllq	$0x01, %xmm0
+        pslldq	$8, %xmm1
+        por	%xmm1, %xmm0
+        pshufd	$0xff, %xmm5, %xmm5
+        psrad	$31, %xmm5
+        pand	L_aes_gcm_mod2_128, %xmm5
+        pxor	%xmm0, %xmm5
+        movl	48(%esp), %edx
+        movl	52(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        pinsrd	$0x00, %edx, %xmm0
+        pinsrd	$2, %ecx, %xmm0
+        movl	48(%esp), %edx
+        movl	52(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        pinsrd	$0x01, %edx, %xmm0
+        pinsrd	$3, %ecx, %xmm0
+        pxor	%xmm0, %xmm6
+        pshufd	$0x4e, %xmm5, %xmm1
+        pshufd	$0x4e, %xmm6, %xmm2
+        movdqa	%xmm6, %xmm3
+        movdqa	%xmm6, %xmm0
+        pclmulqdq	$0x11, %xmm5, %xmm3
+        pclmulqdq	$0x00, %xmm5, %xmm0
+        pxor	%xmm5, %xmm1
+        pxor	%xmm6, %xmm2
+        pclmulqdq	$0x00, %xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm2
+        movdqa	%xmm3, %xmm6
+        pslldq	$8, %xmm2
+        psrldq	$8, %xmm1
+        pxor	%xmm2, %xmm0
+        pxor	%xmm1, %xmm6
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        movdqa	%xmm0, %xmm3
+        pslld	$31, %xmm1
+        pslld	$30, %xmm2
+        pslld	$25, %xmm3
+        pxor	%xmm2, %xmm1
+        pxor	%xmm3, %xmm1
+        movdqa	%xmm1, %xmm3
+        psrldq	$4, %xmm3
+        pslldq	$12, %xmm1
+        pxor	%xmm1, %xmm0
+        movdqa	%xmm0, %xmm1
+        movdqa	%xmm0, %xmm2
+        psrld	$0x01, %xmm1
+        psrld	$2, %xmm2
+        pxor	%xmm2, %xmm1
+        pxor	%xmm0, %xmm1
+        psrld	$7, %xmm0
+        pxor	%xmm3, %xmm1
+        pxor	%xmm0, %xmm1
+        pxor	%xmm1, %xmm6
+        pshufb	L_aes_gcm_bswap_mask, %xmm6
+        movdqu	%xmm7, %xmm0
+        pxor	%xmm6, %xmm0
+        movl	40(%esp), %esi
+        movl	64(%esp), %edi
+        cmpl	$16, 44(%esp)
+        je	L_AES_GCM_decrypt_final_aesni_cmp_tag_16
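+        # Partial-length tag: OR together the byte differences so the compare time does not depend on the tag contents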
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        xorl	%ebx, %ebx
+        movdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_final_aesni_cmp_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        xorb	(%esi,%ecx,1), %al
+        orb	%al, %bl
+        incl	%ecx
+        cmpl	44(%esp), %ecx
+        jne	L_AES_GCM_decrypt_final_aesni_cmp_tag_loop
+        cmpb	$0x00, %bl
+        sete	%bl
+        addl	$16, %esp
+        xorl	%ecx, %ecx
+        jmp	L_AES_GCM_decrypt_final_aesni_cmp_tag_done
+L_AES_GCM_decrypt_final_aesni_cmp_tag_16:
+        movdqu	(%esi), %xmm1
+        pcmpeqb	%xmm1, %xmm0
+        pmovmskb	%xmm0, %edx
+        # if %edx == 0xFFFF then return 1, else return 0
+        xorl	%ebx, %ebx
+        cmpl	$0xffff, %edx
+        sete	%bl
+L_AES_GCM_decrypt_final_aesni_cmp_tag_done:
+        movl	%ebx, (%edi)
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_final_aesni,.-AES_GCM_decrypt_final_aesni
+#endif /* WOLFSSL_AESGCM_STREAM */
+#ifdef HAVE_INTEL_AVX1
+.text
+.globl	AES_GCM_encrypt_avx1
+.type	AES_GCM_encrypt_avx1,@function
+.align	16
+AES_GCM_encrypt_avx1:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x70, %esp
+        movl	144(%esp), %esi
+        movl	168(%esp), %ebp
+        movl	160(%esp), %edx
+        vpxor	%xmm0, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm2, %xmm2
+        cmpl	$12, %edx
+        jne	L_AES_GCM_encrypt_avx1_iv_not_12
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
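+        # 96-bit IV: counter J0 = IV || 0x00000001 (the constant 0x1000000 stores as bytes 00 00 00 01)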
+        movl	$0x1000000, %ecx
+        vpinsrd	$0x00, (%esi), %xmm0, %xmm0
+        vpinsrd	$0x01, 4(%esi), %xmm0, %xmm0
+        vpinsrd	$2, 8(%esi), %xmm0, %xmm0
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	(%ebp), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm5
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_12_last
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_avx1_calc_iv_12_last:
+        vaesenclast	%xmm3, %xmm1, %xmm1
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
+        vmovdqu	%xmm5, 80(%esp)
+        jmp	L_AES_GCM_encrypt_avx1_iv_done
+L_AES_GCM_encrypt_avx1_iv_not_12:
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        vmovdqa	(%ebp), %xmm1
+        vaesenc	16(%ebp), %xmm1, %xmm1
+        vaesenc	32(%ebp), %xmm1, %xmm1
+        vaesenc	48(%ebp), %xmm1, %xmm1
+        vaesenc	64(%ebp), %xmm1, %xmm1
+        vaesenc	80(%ebp), %xmm1, %xmm1
+        vaesenc	96(%ebp), %xmm1, %xmm1
+        vaesenc	112(%ebp), %xmm1, %xmm1
+        vaesenc	128(%ebp), %xmm1, %xmm1
+        vaesenc	144(%ebp), %xmm1, %xmm1
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm5, %xmm1, %xmm1
+        vaesenc	176(%ebp), %xmm1, %xmm1
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm5, %xmm1, %xmm1
+        vaesenc	208(%ebp), %xmm1, %xmm1
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm1, %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
+        # Calc counter
+        # Initialization vector
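+        # Other IV lengths: J0 = GHASH(IV zero-padded to a 16-byte multiple || 64-bit IV bit length)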
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_encrypt_avx1_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_avx1_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_16_loop
+        movl	160(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_avx1_calc_iv_done
+L_AES_GCM_encrypt_avx1_calc_iv_lt16:
+        subl	$16, %esp
+        vpxor	%xmm4, %xmm4, %xmm4
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_avx1_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_loop
+        vmovdqu	(%esp), %xmm4
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+L_AES_GCM_encrypt_avx1_calc_iv_done:
+        # T = Encrypt counter
+        vpxor	%xmm4, %xmm4, %xmm4
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        #   Encrypt counter
+        vmovdqa	(%ebp), %xmm4
+        vpxor	%xmm0, %xmm4, %xmm4
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm4, %xmm4
+        vmovdqu	%xmm4, 80(%esp)
+L_AES_GCM_encrypt_avx1_iv_done:
+        movl	140(%esp), %esi
+        # Additional authentication data
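+        # GHASH the AAD 16 bytes at a time; a short final chunk is zero-padded on the stack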
+        movl	156(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_encrypt_avx1_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_avx1_calc_aad_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm2, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm2, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm2, %xmm2
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm2, %xmm2
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm2, %xmm2
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm2, %xmm2
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_16_loop
+        movl	156(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_avx1_calc_aad_done
+L_AES_GCM_encrypt_avx1_calc_aad_lt16:
+        subl	$16, %esp
+        vpxor	%xmm4, %xmm4, %xmm4
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_avx1_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx1_calc_aad_loop
+        vmovdqu	(%esp), %xmm4
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm2, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm2, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm2, %xmm2
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm2, %xmm2
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm2, %xmm2
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm2, %xmm2
+L_AES_GCM_encrypt_avx1_calc_aad_done:
+        vmovdqu	%xmm2, 96(%esp)
+        movl	132(%esp), %esi
+        movl	136(%esp), %edi
+        # Calculate counter and H
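+        # Shift H left one bit across 128 bits, conditionally XORing in the reduction polynomial
+        # (doubling in GF(2^128)); byte-swap and increment the counter for the first data block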
+        vpsrlq	$63, %xmm1, %xmm5
+        vpsllq	$0x01, %xmm1, %xmm4
+        vpslldq	$8, %xmm5, %xmm5
+        vpor	%xmm5, %xmm4, %xmm4
+        vpshufd	$0xff, %xmm1, %xmm1
+        vpsrad	$31, %xmm1, %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, 64(%esp)
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 152(%esp)
+        movl	152(%esp), %eax
+        jl	L_AES_GCM_encrypt_avx1_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqa	%xmm2, %xmm6
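+        # Precompute H^1..H^4 at (%esp)..48(%esp) so four blocks are GHASHed per 64-byte iteration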
+        # H ^ 1
+        vmovdqu	%xmm1, (%esp)
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm0
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm0, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm0, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm3
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm3, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm3, 48(%esp)
+        # First 64 bytes of input
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
+        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
+        vpshufb	%xmm3, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
+        vpshufb	%xmm3, %xmm6, %xmm6
+        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
+        vpshufb	%xmm3, %xmm7, %xmm7
+        vpshufb	%xmm3, %xmm4, %xmm4
+        vmovdqu	64(%esp), %xmm3
+        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
+        vmovdqa	(%ebp), %xmm3
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_avx1_aesenc_64_enc_done:
+        vaesenclast	%xmm3, %xmm4, %xmm4
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vmovdqu	(%esi), %xmm0
+        vmovdqu	16(%esi), %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	%xmm0, (%esi)
+        vmovdqu	%xmm1, 16(%esi)
+        vmovdqu	%xmm4, (%edi)
+        vmovdqu	%xmm5, 16(%edi)
+        vaesenclast	%xmm3, %xmm6, %xmm6
+        vaesenclast	%xmm3, %xmm7, %xmm7
+        vmovdqu	32(%esi), %xmm0
+        vmovdqu	48(%esi), %xmm1
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm0, 32(%esi)
+        vmovdqu	%xmm1, 48(%esi)
+        vmovdqu	%xmm6, 32(%edi)
+        vmovdqu	%xmm7, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
+        jle	L_AES_GCM_encrypt_avx1_end_64
+        # More 64 bytes of input
+L_AES_GCM_encrypt_avx1_ghash_64:
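+        # Each iteration encrypts the next four counter blocks, then GHASHes the four ciphertext blocks written by the previous iteration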
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
+        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
+        vpshufb	%xmm3, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
+        vpshufb	%xmm3, %xmm6, %xmm6
+        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
+        vpshufb	%xmm3, %xmm7, %xmm7
+        vpshufb	%xmm3, %xmm4, %xmm4
+        vmovdqu	64(%esp), %xmm3
+        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
+        vmovdqa	(%ebp), %xmm3
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_encrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm3, %xmm4, %xmm4
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vmovdqu	(%ecx), %xmm0
+        vmovdqu	16(%ecx), %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	%xmm4, (%edx)
+        vmovdqu	%xmm5, 16(%edx)
+        vaesenclast	%xmm3, %xmm6, %xmm6
+        vaesenclast	%xmm3, %xmm7, %xmm7
+        vmovdqu	32(%ecx), %xmm0
+        vmovdqu	48(%ecx), %xmm1
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm6, 32(%edx)
+        vmovdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
+        vmovdqu	96(%esp), %xmm6
+        vmovdqu	48(%esp), %xmm3
+        vmovdqu	-64(%edx), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm5
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
+        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqu	32(%esp), %xmm3
+        vmovdqu	-48(%edx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	16(%esp), %xmm3
+        vmovdqu	-32(%edx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	(%esp), %xmm3
+        vmovdqu	-16(%edx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpslld	$31, %xmm6, %xmm3
+        vpslld	$30, %xmm6, %xmm0
+        vpslld	$25, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm3, %xmm3
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpsrldq	$4, %xmm3, %xmm0
+        vpslldq	$12, %xmm3, %xmm3
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpsrld	$0x01, %xmm6, %xmm1
+        vpsrld	$2, %xmm6, %xmm5
+        vpsrld	$7, %xmm6, %xmm4
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        vmovdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_avx1_ghash_64
+L_AES_GCM_encrypt_avx1_end_64:
+        vmovdqu	96(%esp), %xmm2
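+        # GHASH the last 64 bytes of ciphertext from above: accumulate the four products, then reduce once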
+        # Block 1
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
+        vmovdqa	(%edx), %xmm1
+        vpshufb	%xmm4, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm3, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm0
+        vmovdqa	%xmm7, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm2, %xmm2
+        # Block 2
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
+        vmovdqa	16(%edx), %xmm1
+        vpshufb	%xmm4, %xmm1, %xmm1
+        vmovdqu	32(%esp), %xmm3
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm3, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm2, %xmm2
+        # Block 3
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
+        vmovdqa	32(%edx), %xmm1
+        vpshufb	%xmm4, %xmm1, %xmm1
+        vmovdqu	16(%esp), %xmm3
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm3, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm2, %xmm2
+        # Block 4
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm4
+        vmovdqa	48(%edx), %xmm1
+        vpshufb	%xmm4, %xmm1, %xmm1
+        vmovdqu	(%esp), %xmm3
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm3, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm3, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm3, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpslld	$31, %xmm0, %xmm4
+        vpslld	$30, %xmm0, %xmm5
+        vpslld	$25, %xmm0, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm6
+        vpsrld	$2, %xmm0, %xmm7
+        vpsrld	$7, %xmm0, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm2, %xmm2
+        vmovdqu	(%esp), %xmm1
+L_AES_GCM_encrypt_avx1_done_64:
+        movl	152(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_encrypt_avx1_done_enc
+        movl	152(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_avx1_last_block_done
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm5
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
+        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
+        vmovdqu	%xmm5, 64(%esp)
+        vpxor	(%ebp), %xmm4, %xmm4
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_avx1_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm4, %xmm4
+        vmovdqu	(%ecx), %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vmovdqu	%xmm4, (%edx)
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_avx1_last_block_ghash
+L_AES_GCM_encrypt_avx1_last_block_start:
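+        # Remaining full blocks: the AES rounds are interleaved with the GHASH multiply of the running hash by H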
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm5
+        vmovdqu	%xmm2, %xmm7
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
+        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
+        vmovdqu	%xmm5, 64(%esp)
+        vpxor	(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$16, %xmm1, %xmm7, %xmm0
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x01, %xmm1, %xmm7, %xmm3
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x11, %xmm1, %xmm7, %xmm5
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpslldq	$8, %xmm0, %xmm6
+        vpsrldq	$8, %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x00, %xmm1, %xmm7, %xmm3
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm7
+        vpclmulqdq	$16, %xmm7, %xmm6, %xmm3
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm6, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpclmulqdq	$16, %xmm7, %xmm0, %xmm3
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm2, %xmm2
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_gfmul_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_avx1_aesenc_gfmul_last:
+        vaesenclast	%xmm5, %xmm4, %xmm4
+        vmovdqu	(%ecx), %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vmovdqu	%xmm4, (%edx)
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        addl	$16, %ebx
+        vpxor	%xmm4, %xmm2, %xmm2
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_avx1_last_block_start
+L_AES_GCM_encrypt_avx1_last_block_ghash:
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm2, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm2
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+L_AES_GCM_encrypt_avx1_last_block_done:
+        movl	152(%esp), %ecx
+        movl	%ecx, %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done
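+        # Partial final block: encrypt one more counter block, XOR only the remaining bytes,
+        # zero-pad the stack copy and fold it into GHASH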
+        vmovdqu	64(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
+        vpxor	(%ebp), %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 172(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	%xmm5, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 172(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	%xmm5, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm0, %xmm0
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, (%edi,%ebx,1)
+        movb	%al, (%esp,%ecx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_loop
+        xorl	%eax, %eax
+        cmpl	$16, %ecx
+        je	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop:
+        movb	%al, (%esp,%ecx,1)
+        incl	%ecx
+        cmpl	$16, %ecx
+        jl	L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_byte_loop
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_finish_enc:
+        vmovdqu	(%esp), %xmm0
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm2, %xmm2
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm2, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm2
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+L_AES_GCM_encrypt_avx1_aesenc_last15_enc_avx_done:
+L_AES_GCM_encrypt_avx1_done_enc:
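+        # Fold the 64-bit AAD and plaintext bit lengths into GHASH, reduce, XOR with the encrypted
+        # initial counter saved at 80(%esp), then store the requested number of tag bytes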
+        movl	148(%esp), %edi
+        movl	164(%esp), %ebx
+        movl	152(%esp), %edx
+        movl	156(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %edx, %xmm4, %xmm4
+        vpinsrd	$2, %ecx, %xmm4, %xmm4
+        movl	152(%esp), %edx
+        movl	156(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %edx, %xmm4, %xmm4
+        vpinsrd	$3, %ecx, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm2, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm2
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2
+        vpxor	80(%esp), %xmm2, %xmm4
+        cmpl	$16, %ebx
+        je	L_AES_GCM_encrypt_avx1_store_tag_16
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_encrypt_avx1_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	%ebx, %ecx
+        jne	L_AES_GCM_encrypt_avx1_store_tag_loop
+        jmp	L_AES_GCM_encrypt_avx1_store_tag_done
+L_AES_GCM_encrypt_avx1_store_tag_16:
+        vmovdqu	%xmm4, (%edi)
+L_AES_GCM_encrypt_avx1_store_tag_done:
+        addl	$0x70, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt_avx1,.-AES_GCM_encrypt_avx1
+.text
+.globl	AES_GCM_decrypt_avx1
+.type	AES_GCM_decrypt_avx1,@function
+.align	16
+AES_GCM_decrypt_avx1:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xb0, %esp
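+        # Stack layout (cdecl, after four pushes and 0xb0 of locals) -
+        # the argument offsets appear to be: 196 in, 200 out, 204 aad,
+        # 208 iv, 212 tag, 216 sz, 220 aadSz, 224 ivSz, 228 tagSz,
+        # 232 key schedule, 236 rounds, 240 tag-check result.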
+        movl	208(%esp), %esi
+        movl	232(%esp), %ebp
+        movl	224(%esp), %edx
+        vpxor	%xmm0, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm2, %xmm2
+        cmpl	$12, %edx
+        jne	L_AES_GCM_decrypt_avx1_iv_not_12
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
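+        # A 96-bit IV is used directly: 0x01000000 in the top lane is the
+        # big-endian 32-bit value 1, giving the counter block IV || 0x00000001.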
+        movl	$0x1000000, %ecx
+        vpinsrd	$0x00, (%esi), %xmm0, %xmm0
+        vpinsrd	$0x01, 4(%esi), %xmm0, %xmm0
+        vpinsrd	$2, 8(%esi), %xmm0, %xmm0
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	(%ebp), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm5
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
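+        # Dispatch on the round count at 236(%esp): 10 rounds (AES-128)
+        # takes the jump below, 12 (AES-192) the next one, and 14 (AES-256)
+        # falls through to the last round key at 224(%ebp).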
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_12_last
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm1, %xmm1
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_decrypt_avx1_calc_iv_12_last:
+        vaesenclast	%xmm3, %xmm1, %xmm1
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
+        vmovdqu	%xmm5, 80(%esp)
+        jmp	L_AES_GCM_decrypt_avx1_iv_done
+L_AES_GCM_decrypt_avx1_iv_not_12:
+        # Calculate values when IV is not 12 bytes
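+        # For other IV lengths, J0 = GHASH(IV zero-padded to a 16-byte
+        # boundary, followed by a block holding the 64-bit IV bit length),
+        # so H = E(K, 0) is computed first and the IV blocks hashed below.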
+        # H = Encrypt X(=0)
+        vmovdqa	(%ebp), %xmm1
+        vaesenc	16(%ebp), %xmm1, %xmm1
+        vaesenc	32(%ebp), %xmm1, %xmm1
+        vaesenc	48(%ebp), %xmm1, %xmm1
+        vaesenc	64(%ebp), %xmm1, %xmm1
+        vaesenc	80(%ebp), %xmm1, %xmm1
+        vaesenc	96(%ebp), %xmm1, %xmm1
+        vaesenc	112(%ebp), %xmm1, %xmm1
+        vaesenc	128(%ebp), %xmm1, %xmm1
+        vaesenc	144(%ebp), %xmm1, %xmm1
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm5, %xmm1, %xmm1
+        vaesenc	176(%ebp), %xmm1, %xmm1
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm5, %xmm1, %xmm1
+        vaesenc	208(%ebp), %xmm1, %xmm1
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm1, %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_decrypt_avx1_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_avx1_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
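+        # Same Karatsuba multiply as ghash_gfmul_red_avx, but the 256-bit
+        # product is also shifted left one bit (the vpsrld/vpslld/vpor run)
+        # before reduction, since H has not been pre-doubled in this phase.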
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_16_loop
+        movl	224(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_avx1_calc_iv_done
+L_AES_GCM_decrypt_avx1_calc_iv_lt16:
+        subl	$16, %esp
+        vpxor	%xmm4, %xmm4, %xmm4
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_avx1_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_loop
+        vmovdqu	(%esp), %xmm4
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+L_AES_GCM_decrypt_avx1_calc_iv_done:
+        # T = Encrypt counter
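+        # Fold the IV bit length (ivSz << 3) into the hash to finish J0,
+        # then encrypt it; T = E(K, J0) is kept at 80(%esp) for the tag.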
+        vpxor	%xmm4, %xmm4, %xmm4
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm0
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm0, %xmm0
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm0, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm0, %xmm0
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm0, %xmm0
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm0, %xmm0
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        #   Encrypt counter
+        vmovdqa	(%ebp), %xmm4
+        vpxor	%xmm0, %xmm4, %xmm4
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm4, %xmm4
+        vmovdqu	%xmm4, 80(%esp)
+L_AES_GCM_decrypt_avx1_iv_done:
+        movl	204(%esp), %esi
+        # Additional authentication data
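+        # GHASH the AAD into xmm2: whole 16-byte blocks first, then any
+        # remainder via a zero-padded 16-byte buffer on the stack.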
+        movl	220(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_decrypt_avx1_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_avx1_calc_aad_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm2, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm2, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm2, %xmm2
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm2, %xmm2
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm2, %xmm2
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm2, %xmm2
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_16_loop
+        movl	220(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_avx1_calc_aad_done
+L_AES_GCM_decrypt_avx1_calc_aad_lt16:
+        subl	$16, %esp
+        vpxor	%xmm4, %xmm4, %xmm4
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_avx1_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx1_calc_aad_loop
+        vmovdqu	(%esp), %xmm4
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm2, %xmm5
+        vpshufd	$0x4e, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm4
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqa	%xmm4, %xmm3
+        vmovdqa	%xmm7, %xmm2
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpsrld	$31, %xmm3, %xmm4
+        vpsrld	$31, %xmm2, %xmm5
+        vpslld	$0x01, %xmm3, %xmm3
+        vpslld	$0x01, %xmm2, %xmm2
+        vpsrldq	$12, %xmm4, %xmm6
+        vpslldq	$4, %xmm4, %xmm4
+        vpslldq	$4, %xmm5, %xmm5
+        vpor	%xmm6, %xmm2, %xmm2
+        vpor	%xmm4, %xmm3, %xmm3
+        vpor	%xmm5, %xmm2, %xmm2
+        vpslld	$31, %xmm3, %xmm4
+        vpslld	$30, %xmm3, %xmm5
+        vpslld	$25, %xmm3, %xmm6
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vmovdqa	%xmm4, %xmm5
+        vpsrldq	$4, %xmm5, %xmm5
+        vpslldq	$12, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm3, %xmm3
+        vpsrld	$0x01, %xmm3, %xmm6
+        vpsrld	$2, %xmm3, %xmm7
+        vpsrld	$7, %xmm3, %xmm4
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm6, %xmm2, %xmm2
+L_AES_GCM_decrypt_avx1_calc_aad_done:
+        vmovdqu	%xmm2, 96(%esp)
+        movl	196(%esp), %esi
+        movl	200(%esp), %edi
+        # Calculate counter and H
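+        # Pre-double the hash key: H = H << 1, XORing in the reduction
+        # constant when the top bit falls out, so the bulk-loop multiplies
+        # can skip the per-block doubling.  The counter is also stepped
+        # past J0 and cached at 64(%esp).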
+        vpsrlq	$63, %xmm1, %xmm5
+        vpsllq	$0x01, %xmm1, %xmm4
+        vpslldq	$8, %xmm5, %xmm5
+        vpor	%xmm5, %xmm4, %xmm4
+        vpshufd	$0xff, %xmm1, %xmm1
+        vpsrad	$31, %xmm1, %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, 64(%esp)
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 216(%esp)
+        movl	216(%esp), %eax
+        jl	L_AES_GCM_decrypt_avx1_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqa	%xmm2, %xmm6
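+        # Cache H^1..H^4 at (%esp)..48(%esp) so the 64-byte loop can hash
+        # four ciphertext blocks per reduction.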
+        # H ^ 1
+        vmovdqu	%xmm1, (%esp)
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm1, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm1, %xmm1, %xmm0
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm0, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm0, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm3
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm3, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm3
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm3, 48(%esp)
+        cmpl	%esi, %edi
+        jne	L_AES_GCM_decrypt_avx1_ghash_64
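+        # In-place decrypt: each group of four ciphertext blocks is copied
+        # to 112..160(%esp) before being overwritten, and GHASHed from the
+        # copies; the out != in path below reads the input buffer directly.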
+L_AES_GCM_decrypt_avx1_ghash_64_inplace:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
+        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
+        vpshufb	%xmm3, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
+        vpshufb	%xmm3, %xmm6, %xmm6
+        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
+        vpshufb	%xmm3, %xmm7, %xmm7
+        vpshufb	%xmm3, %xmm4, %xmm4
+        vmovdqu	64(%esp), %xmm3
+        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
+        vmovdqa	(%ebp), %xmm3
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_decrypt_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm3, %xmm4, %xmm4
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vmovdqu	(%ecx), %xmm0
+        vmovdqu	16(%ecx), %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	%xmm0, 112(%esp)
+        vmovdqu	%xmm1, 128(%esp)
+        vmovdqu	%xmm4, (%edx)
+        vmovdqu	%xmm5, 16(%edx)
+        vaesenclast	%xmm3, %xmm6, %xmm6
+        vaesenclast	%xmm3, %xmm7, %xmm7
+        vmovdqu	32(%ecx), %xmm0
+        vmovdqu	48(%ecx), %xmm1
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm0, 144(%esp)
+        vmovdqu	%xmm1, 160(%esp)
+        vmovdqu	%xmm6, 32(%edx)
+        vmovdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
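+        # Aggregated GHASH of the four blocks just decrypted:
+        # X = (X ^ C0)*H^4 ^ C1*H^3 ^ C2*H^2 ^ C3*H, with the partial
+        # products accumulated and reduced once at the end.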
+        vmovdqu	96(%esp), %xmm6
+        vmovdqu	48(%esp), %xmm3
+        vmovdqu	112(%esp), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm5
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
+        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqu	32(%esp), %xmm3
+        vmovdqu	128(%esp), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	16(%esp), %xmm3
+        vmovdqu	144(%esp), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	(%esp), %xmm3
+        vmovdqu	160(%esp), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpslld	$31, %xmm6, %xmm3
+        vpslld	$30, %xmm6, %xmm0
+        vpslld	$25, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm3, %xmm3
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpsrldq	$4, %xmm3, %xmm0
+        vpslldq	$12, %xmm3, %xmm3
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpsrld	$0x01, %xmm6, %xmm1
+        vpsrld	$2, %xmm6, %xmm5
+        vpsrld	$7, %xmm6, %xmm4
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        vmovdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx1_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_avx1_ghash_64_done
+L_AES_GCM_decrypt_avx1_ghash_64:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm3
+        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm5
+        vpshufb	%xmm3, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx1_two, %xmm4, %xmm6
+        vpshufb	%xmm3, %xmm6, %xmm6
+        vpaddd	L_aes_gcm_avx1_three, %xmm4, %xmm7
+        vpshufb	%xmm3, %xmm7, %xmm7
+        vpshufb	%xmm3, %xmm4, %xmm4
+        vmovdqu	64(%esp), %xmm3
+        vpaddd	L_aes_gcm_avx1_four, %xmm3, %xmm3
+        vmovdqu	%xmm3, 64(%esp)
+        vmovdqa	(%ebp), %xmm3
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqa	16(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	32(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	48(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	64(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	80(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	96(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	112(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	128(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	144(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	176(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm3
+        jl	L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	208(%ebp), %xmm3
+        vaesenc	%xmm3, %xmm4, %xmm4
+        vaesenc	%xmm3, %xmm5, %xmm5
+        vaesenc	%xmm3, %xmm6, %xmm6
+        vaesenc	%xmm3, %xmm7, %xmm7
+        vmovdqa	224(%ebp), %xmm3
+L_AES_GCM_decrypt_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm3, %xmm4, %xmm4
+        vaesenclast	%xmm3, %xmm5, %xmm5
+        vmovdqu	(%ecx), %xmm0
+        vmovdqu	16(%ecx), %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	%xmm4, (%edx)
+        vmovdqu	%xmm5, 16(%edx)
+        vaesenclast	%xmm3, %xmm6, %xmm6
+        vaesenclast	%xmm3, %xmm7, %xmm7
+        vmovdqu	32(%ecx), %xmm0
+        vmovdqu	48(%ecx), %xmm1
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm6, 32(%edx)
+        vmovdqu	%xmm7, 48(%edx)
+        # ghash encrypted counter
+        vmovdqu	96(%esp), %xmm6
+        vmovdqu	48(%esp), %xmm3
+        vmovdqu	(%ecx), %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm5
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm7
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm6
+        vpclmulqdq	$0x00, %xmm1, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vmovdqu	32(%esp), %xmm3
+        vmovdqu	16(%ecx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	16(%esp), %xmm3
+        vmovdqu	32(%ecx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqu	(%esp), %xmm3
+        vmovdqu	48(%ecx), %xmm4
+        vpshufd	$0x4e, %xmm3, %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm3, %xmm4, %xmm2
+        vpclmulqdq	$0x00, %xmm3, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm1, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpslld	$31, %xmm6, %xmm3
+        vpslld	$30, %xmm6, %xmm0
+        vpslld	$25, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm3, %xmm3
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpsrldq	$4, %xmm3, %xmm0
+        vpslldq	$12, %xmm3, %xmm3
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpsrld	$0x01, %xmm6, %xmm1
+        vpsrld	$2, %xmm6, %xmm5
+        vpsrld	$7, %xmm6, %xmm4
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        vmovdqu	%xmm6, 96(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx1_ghash_64
+L_AES_GCM_decrypt_avx1_ghash_64_done:
+        vmovdqa	%xmm6, %xmm2
+        vmovdqu	(%esp), %xmm1
+L_AES_GCM_decrypt_avx1_done_64:
+        movl	216(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_decrypt_avx1_done_dec
+        movl	216(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_avx1_last_block_done
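+        # Remaining whole blocks, one at a time: the GHASH multiply of the
+        # ciphertext block (reduced with L_aes_gcm_avx1_mod2_128) is
+        # interleaved with the AES rounds of the next counter block.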
+L_AES_GCM_decrypt_avx1_last_block_start:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	(%ecx), %xmm7
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm7, %xmm7
+        vpxor	%xmm2, %xmm7, %xmm7
+        vmovdqu	64(%esp), %xmm5
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm5, %xmm4
+        vpaddd	L_aes_gcm_avx1_one, %xmm5, %xmm5
+        vmovdqu	%xmm5, 64(%esp)
+        vpxor	(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$16, %xmm1, %xmm7, %xmm0
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x01, %xmm1, %xmm7, %xmm3
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x11, %xmm1, %xmm7, %xmm5
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpslldq	$8, %xmm0, %xmm6
+        vpsrldq	$8, %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vpclmulqdq	$0x00, %xmm1, %xmm7, %xmm3
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm5, %xmm5
+        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm7
+        vpclmulqdq	$16, %xmm7, %xmm6, %xmm3
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm6, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpclmulqdq	$16, %xmm7, %xmm0, %xmm3
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm2, %xmm2
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_aesenc_gfmul_last
+        vaesenc	%xmm5, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_avx1_aesenc_gfmul_last:
+        vaesenclast	%xmm5, %xmm4, %xmm4
+        vmovdqu	(%ecx), %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vmovdqu	%xmm4, (%edx)
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx1_last_block_start
+L_AES_GCM_decrypt_avx1_last_block_done:
+        movl	216(%esp), %ecx
+        movl	%ecx, %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done
+        vmovdqu	64(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm0, %xmm0
+        vpxor	(%ebp), %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 236(%esp)
+        vmovdqa	160(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	%xmm5, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 236(%esp)
+        vmovdqa	192(%ebp), %xmm5
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	%xmm5, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm5
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_aesenc_avx_last:
+        vaesenclast	%xmm5, %xmm0, %xmm0
+        subl	$32, %esp
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+        vpxor	%xmm4, %xmm4, %xmm4
+        vmovdqu	%xmm4, 16(%esp)
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        movb	%al, 16(%esp,%ecx,1)
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, (%edi,%ebx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_loop
+        vmovdqu	16(%esp), %xmm0
+        addl	$32, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm2, %xmm2
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm2, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm2
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+L_AES_GCM_decrypt_avx1_aesenc_last15_dec_avx_done:
+L_AES_GCM_decrypt_avx1_done_dec:
+        movl	212(%esp), %esi
+        movl	228(%esp), %ebp
+        movl	216(%esp), %edx
+        movl	220(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %edx, %xmm4, %xmm4
+        vpinsrd	$2, %ecx, %xmm4, %xmm4
+        movl	216(%esp), %edx
+        movl	220(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %edx, %xmm4, %xmm4
+        vpinsrd	$3, %ecx, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm1, %xmm5
+        vpshufd	$0x4e, %xmm2, %xmm6
+        vpclmulqdq	$0x11, %xmm1, %xmm2, %xmm7
+        vpclmulqdq	$0x00, %xmm1, %xmm2, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm6
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm7, %xmm2
+        vpslld	$31, %xmm4, %xmm5
+        vpslld	$30, %xmm4, %xmm6
+        vpslld	$25, %xmm4, %xmm7
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpsrldq	$4, %xmm5, %xmm7
+        vpslldq	$12, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm5
+        vpsrld	$2, %xmm4, %xmm6
+        vpxor	%xmm6, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpsrld	$7, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm2, %xmm2
+        vpxor	80(%esp), %xmm2, %xmm4
+        movl	240(%esp), %edi
+        cmpl	$16, %ebp
+        je	L_AES_GCM_decrypt_avx1_cmp_tag_16
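+        # Short tags are compared byte-by-byte, OR-ing the differences into
+        # %bl so the time taken does not depend on where a mismatch occurs.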
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm4, (%esp)
+L_AES_GCM_decrypt_avx1_cmp_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        xorb	(%esi,%ecx,1), %al
+        orb	%al, %bl
+        incl	%ecx
+        cmpl	%ebp, %ecx
+        jne	L_AES_GCM_decrypt_avx1_cmp_tag_loop
+        cmpb	$0x00, %bl
+        sete	%bl
+        addl	$16, %esp
+        xorl	%ecx, %ecx
+        jmp	L_AES_GCM_decrypt_avx1_cmp_tag_done
+L_AES_GCM_decrypt_avx1_cmp_tag_16:
+        vmovdqu	(%esi), %xmm5
+        vpcmpeqb	%xmm5, %xmm4, %xmm4
+        vpmovmskb	%xmm4, %edx
+        # if %edx == 0xFFFF then return 1 else return 0
+        xorl	%ebx, %ebx
+        cmpl	$0xffff, %edx
+        sete	%bl
+L_AES_GCM_decrypt_avx1_cmp_tag_done:
+        movl	%ebx, (%edi)
+        addl	$0xb0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_avx1,.-AES_GCM_decrypt_avx1
+#ifdef WOLFSSL_AESGCM_STREAM
+.text
+.globl	AES_GCM_init_avx1
+.type	AES_GCM_init_avx1,@function
+.align	16
+AES_GCM_init_avx1:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
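+        # Argument offsets appear to be: 36 key schedule, 40 rounds, 44 iv,
+        # 48 ivSz, 52 H output, 56 counter output, 60 E(K, J0) output.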
+        movl	36(%esp), %ebp
+        movl	44(%esp), %esi
+        movl	60(%esp), %edi
+        vpxor	%xmm4, %xmm4, %xmm4
+        movl	48(%esp), %edx
+        cmpl	$12, %edx
+        jne	L_AES_GCM_init_avx1_iv_not_12
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
+        movl	$0x1000000, %ecx
+        vpinsrd	$0x00, (%esi), %xmm4, %xmm4
+        vpinsrd	$0x01, 4(%esi), %xmm4, %xmm4
+        vpinsrd	$2, 8(%esi), %xmm4, %xmm4
+        vpinsrd	$3, %ecx, %xmm4, %xmm4
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqa	(%ebp), %xmm5
+        vpxor	%xmm5, %xmm4, %xmm1
+        vmovdqa	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        cmpl	$11, 40(%esp)
+        vmovdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_init_avx1_calc_iv_12_last
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        cmpl	$13, 40(%esp)
+        vmovdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_init_avx1_calc_iv_12_last
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vmovdqa	224(%ebp), %xmm7
+L_AES_GCM_init_avx1_calc_iv_12_last:
+        vaesenclast	%xmm7, %xmm5, %xmm5
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5
+        vmovdqu	%xmm1, (%edi)
+        jmp	L_AES_GCM_init_avx1_iv_done
+L_AES_GCM_init_avx1_iv_not_12:
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        vmovdqa	(%ebp), %xmm5
+        vaesenc	16(%ebp), %xmm5, %xmm5
+        vaesenc	32(%ebp), %xmm5, %xmm5
+        vaesenc	48(%ebp), %xmm5, %xmm5
+        vaesenc	64(%ebp), %xmm5, %xmm5
+        vaesenc	80(%ebp), %xmm5, %xmm5
+        vaesenc	96(%ebp), %xmm5, %xmm5
+        vaesenc	112(%ebp), %xmm5, %xmm5
+        vaesenc	128(%ebp), %xmm5, %xmm5
+        vaesenc	144(%ebp), %xmm5, %xmm5
+        cmpl	$11, 40(%esp)
+        vmovdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm1, %xmm5, %xmm5
+        vaesenc	176(%ebp), %xmm5, %xmm5
+        cmpl	$13, 40(%esp)
+        vmovdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm1, %xmm5, %xmm5
+        vaesenc	208(%ebp), %xmm5, %xmm5
+        vmovdqa	224(%ebp), %xmm1
+L_AES_GCM_init_avx1_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm1, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm5, %xmm5
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_init_avx1_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_init_avx1_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_init_avx1_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpshufd	$0x4e, %xmm5, %xmm2
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm7
+        vmovdqa	%xmm3, %xmm4
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        vpslld	$31, %xmm7, %xmm0
+        vpslld	$30, %xmm7, %xmm1
+        vpslld	$25, %xmm7, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm7, %xmm7
+        vpsrld	$0x01, %xmm7, %xmm2
+        vpsrld	$2, %xmm7, %xmm3
+        vpsrld	$7, %xmm7, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm4, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_avx1_calc_iv_16_loop
+        movl	48(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_init_avx1_calc_iv_done
+L_AES_GCM_init_avx1_calc_iv_lt16:
+        subl	$16, %esp
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_init_avx1_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_avx1_calc_iv_loop
+        vmovdqu	(%esp), %xmm0
+        addl	$16, %esp
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpshufd	$0x4e, %xmm5, %xmm2
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm7
+        vmovdqa	%xmm3, %xmm4
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        vpslld	$31, %xmm7, %xmm0
+        vpslld	$30, %xmm7, %xmm1
+        vpslld	$25, %xmm7, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm7, %xmm7
+        vpsrld	$0x01, %xmm7, %xmm2
+        vpsrld	$2, %xmm7, %xmm3
+        vpsrld	$7, %xmm7, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm4, %xmm4
+L_AES_GCM_init_avx1_calc_iv_done:
+        # T = Encrypt counter
+        vpxor	%xmm0, %xmm0, %xmm0
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpshufd	$0x4e, %xmm5, %xmm2
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm7
+        vmovdqa	%xmm3, %xmm4
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm7, %xmm7
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        vpslld	$31, %xmm7, %xmm0
+        vpslld	$30, %xmm7, %xmm1
+        vpslld	$25, %xmm7, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm7, %xmm7
+        vpsrld	$0x01, %xmm7, %xmm2
+        vpsrld	$2, %xmm7, %xmm3
+        vpsrld	$7, %xmm7, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        #   Encrypt counter
+        vmovdqa	(%ebp), %xmm0
+        vpxor	%xmm4, %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 40(%esp)
+        vmovdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 40(%esp)
+        vmovdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm1
+L_AES_GCM_init_avx1_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edi)
+L_AES_GCM_init_avx1_iv_done:
+        movl	52(%esp), %ebp
+        movl	56(%esp), %edi
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm4, %xmm4
+        vpaddd	L_aes_gcm_avx1_one, %xmm4, %xmm4
+        vmovdqa	%xmm5, (%ebp)
+        vmovdqa	%xmm4, (%edi)
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_init_avx1,.-AES_GCM_init_avx1
+.text
+.globl	AES_GCM_aad_update_avx1
+.type	AES_GCM_aad_update_avx1,@function
+.align	16
+AES_GCM_aad_update_avx1:
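+        # Folds whole 16-byte AAD blocks into the GHASH state (third
+        # argument) using the hash key (fourth argument); the byte count
+        # appears to be assumed a multiple of 16 here.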
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %esi
+        movl	16(%esp), %edx
+        movl	20(%esp), %edi
+        movl	24(%esp), %eax
+        vmovdqa	(%edi), %xmm5
+        vmovdqa	(%eax), %xmm6
+        xorl	%ecx, %ecx
+L_AES_GCM_aad_update_avx1_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm5, %xmm5
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm6, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm4
+        vmovdqa	%xmm3, %xmm5
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpsrld	$31, %xmm4, %xmm0
+        vpsrld	$31, %xmm5, %xmm1
+        vpslld	$0x01, %xmm4, %xmm4
+        vpslld	$0x01, %xmm5, %xmm5
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm5, %xmm5
+        vpor	%xmm0, %xmm4, %xmm4
+        vpor	%xmm1, %xmm5, %xmm5
+        vpslld	$31, %xmm4, %xmm0
+        vpslld	$30, %xmm4, %xmm1
+        vpslld	$25, %xmm4, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm2
+        vpsrld	$2, %xmm4, %xmm3
+        vpsrld	$7, %xmm4, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm5, %xmm5
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_aad_update_avx1_16_loop
+        vmovdqa	%xmm5, (%edi)
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_aad_update_avx1,.-AES_GCM_aad_update_avx1
+.text
+.globl	AES_GCM_encrypt_block_avx1
+.type	AES_GCM_encrypt_block_avx1,@function
+.align	16
+AES_GCM_encrypt_block_avx1:
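+        # Encrypts one 16-byte block in counter mode: the counter (fifth
+        # argument) is byte-swapped, incremented and enciphered, then XORed
+        # with the input block; the ciphertext is not GHASHed here.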
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %ecx
+        movl	16(%esp), %eax
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        vmovdqu	(%edx), %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
+        vmovdqu	%xmm1, (%edx)
+        vpxor	(%ecx), %xmm0, %xmm0
+        vaesenc	16(%ecx), %xmm0, %xmm0
+        vaesenc	32(%ecx), %xmm0, %xmm0
+        vaesenc	48(%ecx), %xmm0, %xmm0
+        vaesenc	64(%ecx), %xmm0, %xmm0
+        vaesenc	80(%ecx), %xmm0, %xmm0
+        vaesenc	96(%ecx), %xmm0, %xmm0
+        vaesenc	112(%ecx), %xmm0, %xmm0
+        vaesenc	128(%ecx), %xmm0, %xmm0
+        vaesenc	144(%ecx), %xmm0, %xmm0
+        cmpl	$11, %eax
+        vmovdqa	160(%ecx), %xmm1
+        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	176(%ecx), %xmm0, %xmm0
+        cmpl	$13, %eax
+        vmovdqa	192(%ecx), %xmm1
+        jl	L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	208(%ecx), %xmm0, %xmm0
+        vmovdqa	224(%ecx), %xmm1
+L_AES_GCM_encrypt_block_avx1_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm1, %xmm0, %xmm0
+        vmovdqu	(%esi), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edi)
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_block_avx1,.-AES_GCM_encrypt_block_avx1
+.text
+.globl	AES_GCM_ghash_block_avx1
+.type	AES_GCM_ghash_block_avx1,@function
+.align	16
+AES_GCM_ghash_block_avx1:
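+        # Fold one 16-byte block into the GHASH state: byte-swap, XOR into
+        # the accumulator, multiply by the hash key and reduce modulo the
+        # GHASH polynomial x^128 + x^7 + x^2 + x + 1.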
+        movl	4(%esp), %edx
+        movl	8(%esp), %eax
+        movl	12(%esp), %ecx
+        vmovdqa	(%eax), %xmm4
+        vmovdqa	(%ecx), %xmm5
+        vmovdqu	(%edx), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpshufd	$0x4e, %xmm5, %xmm2
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm6
+        vmovdqa	%xmm3, %xmm4
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        vpslld	$31, %xmm6, %xmm0
+        vpslld	$30, %xmm6, %xmm1
+        vpslld	$25, %xmm6, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        vpsrld	$0x01, %xmm6, %xmm2
+        vpsrld	$2, %xmm6, %xmm3
+        vpsrld	$7, %xmm6, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm4, %xmm4
+        vmovdqa	%xmm4, (%eax)
+        ret
+.size	AES_GCM_ghash_block_avx1,.-AES_GCM_ghash_block_avx1
+.text
+.globl	AES_GCM_encrypt_update_avx1
+.type	AES_GCM_encrypt_update_avx1,@function
+.align	16
+AES_GCM_encrypt_update_avx1:
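+        # Streaming encrypt: 64 bytes (four AES-CTR blocks) are processed per
+        # iteration of the main loop.  GHASH of each 64-byte chunk is
+        # deferred one iteration so the four blocks can be combined with
+        # H^1..H^4 and reduced once.  H is first multiplied by x in
+        # GF(2^128) (one-bit shift with conditional reduction) before its
+        # powers are derived.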
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x60, %esp
+        movl	144(%esp), %esi
+        vmovdqa	(%esi), %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        movl	136(%esp), %esi
+        movl	140(%esp), %ebp
+        vmovdqa	(%esi), %xmm6
+        vmovdqa	(%ebp), %xmm5
+        vmovdqu	%xmm6, 80(%esp)
+        movl	116(%esp), %ebp
+        movl	124(%esp), %edi
+        movl	128(%esp), %esi
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 132(%esp)
+        movl	132(%esp), %eax
+        jl	L_AES_GCM_encrypt_update_avx1_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqa	%xmm6, %xmm2
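+        # Precompute H^1..H^4 and store them at (%esp)..48(%esp) for the
+        # four-block aggregated GHASH used in the 64-byte loop.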
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm4
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vmovdqu	%xmm4, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm4, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpxor	%xmm1, %xmm3, %xmm7
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm7, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm4, %xmm4, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm4, %xmm7
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm7, 48(%esp)
+        # First 64 bytes of input
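+        # Build four counter blocks (ctr, ctr+1, ctr+2, ctr+3), shuffle them
+        # with the bswap_epi64 mask, advance the saved counter by 4, then run
+        # the AES rounds on all four blocks in parallel.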
+        vmovdqu	64(%esp), %xmm0
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
+        vpshufb	%xmm7, %xmm3, %xmm3
+        vpshufb	%xmm7, %xmm0, %xmm0
+        vmovdqu	64(%esp), %xmm7
+        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
+        vmovdqa	(%ebp), %xmm7
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqa	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 120(%esp)
+        vmovdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 120(%esp)
+        vmovdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_avx1_aesenc_64_enc_done:
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vmovdqu	(%esi), %xmm4
+        vmovdqu	16(%esi), %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vmovdqu	%xmm4, (%esi)
+        vmovdqu	%xmm5, 16(%esi)
+        vmovdqu	%xmm0, (%edi)
+        vmovdqu	%xmm1, 16(%edi)
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%esi), %xmm4
+        vmovdqu	48(%esi), %xmm5
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm4, 32(%esi)
+        vmovdqu	%xmm5, 48(%esi)
+        vmovdqu	%xmm2, 32(%edi)
+        vmovdqu	%xmm3, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
+        jle	L_AES_GCM_encrypt_update_avx1_end_64
+        # More 64 bytes of input
+L_AES_GCM_encrypt_update_avx1_ghash_64:
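+        # Steady state: encrypt the next four counter blocks, then GHASH the
+        # previous 64 bytes of ciphertext (below) with the precomputed
+        # powers of H.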
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm0
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
+        vpshufb	%xmm7, %xmm3, %xmm3
+        vpshufb	%xmm7, %xmm0, %xmm0
+        vmovdqu	64(%esp), %xmm7
+        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
+        vmovdqa	(%ebp), %xmm7
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqa	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 120(%esp)
+        vmovdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 120(%esp)
+        vmovdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vmovdqu	(%ecx), %xmm4
+        vmovdqu	16(%ecx), %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ecx), %xmm4
+        vmovdqu	48(%ecx), %xmm5
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
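+        # The previous GHASH state is XORed into the first of the previous
+        # four ciphertext blocks (at -64..-16 from the current output
+        # pointer); the blocks are multiplied by H^4, H^3, H^2 and H^1
+        # respectively, the partial products summed and reduced once, and the
+        # result becomes the new GHASH state at 80(%esp).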
+        vmovdqu	80(%esp), %xmm2
+        vmovdqu	48(%esp), %xmm7
+        vmovdqu	-64(%edx), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
+        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqu	32(%esp), %xmm7
+        vmovdqu	-48(%edx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	16(%esp), %xmm7
+        vmovdqu	-32(%edx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	(%esp), %xmm7
+        vmovdqu	-16(%edx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm5
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpslld	$31, %xmm2, %xmm7
+        vpslld	$30, %xmm2, %xmm4
+        vpslld	$25, %xmm2, %xmm5
+        vpxor	%xmm4, %xmm7, %xmm7
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpsrldq	$4, %xmm7, %xmm4
+        vpslldq	$12, %xmm7, %xmm7
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpsrld	$0x01, %xmm2, %xmm5
+        vpsrld	$2, %xmm2, %xmm1
+        vpsrld	$7, %xmm2, %xmm0
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm2, %xmm2
+        vmovdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_avx1_ghash_64
+L_AES_GCM_encrypt_update_avx1_end_64:
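+        # GHASH the last (deferred) 64-byte chunk: Blocks 1-4 below are
+        # multiplied by H^4..H^1 and combined with a single reduction,
+        # leaving the GHASH state in xmm6 and H^1 reloaded into xmm5 for the
+        # tail blocks.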
+        movdqu	80(%esp), %xmm6
+        # Block 1
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
+        vmovdqu	(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        vmovdqu	48(%esp), %xmm7
+        pxor	%xmm6, %xmm5
+        # ghash_gfmul_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm7, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqa	%xmm0, %xmm4
+        vmovdqa	%xmm3, %xmm6
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm6, %xmm6
+        # Block 2
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
+        vmovdqu	16(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        vmovdqu	32(%esp), %xmm7
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm7, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm6, %xmm6
+        # Block 3
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
+        vmovdqu	32(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        vmovdqu	16(%esp), %xmm7
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm7, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm6, %xmm6
+        # Block 4
+        vmovdqa	L_aes_gcm_avx1_bswap_mask, %xmm0
+        vmovdqu	48(%edx), %xmm5
+        pshufb	%xmm0, %xmm5
+        vmovdqu	(%esp), %xmm7
+        # ghash_gfmul_xor_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm7, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm7, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm7, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm4, %xmm4
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpslld	$31, %xmm4, %xmm0
+        vpslld	$30, %xmm4, %xmm1
+        vpslld	$25, %xmm4, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vmovdqa	%xmm0, %xmm1
+        vpsrldq	$4, %xmm1, %xmm1
+        vpslldq	$12, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpsrld	$0x01, %xmm4, %xmm2
+        vpsrld	$2, %xmm4, %xmm3
+        vpsrld	$7, %xmm4, %xmm0
+        vpxor	%xmm3, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm2, %xmm6, %xmm6
+        vmovdqu	(%esp), %xmm5
+L_AES_GCM_encrypt_update_avx1_done_64:
+        movl	132(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_encrypt_update_avx1_done_enc
+        movl	132(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_avx1_last_block_done
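+        # Tail: encrypt any remaining whole 16-byte blocks one at a time.
+        # The first block is handled on its own; subsequent blocks (loop
+        # below) interleave the AES rounds with the GHASH multiply of the
+        # previous block.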
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
+        vmovdqu	%xmm1, 64(%esp)
+        vpxor	(%ebp), %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 120(%esp)
+        vmovdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 120(%esp)
+        vmovdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm1
+L_AES_GCM_encrypt_update_avx1_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm1, %xmm0, %xmm0
+        vmovdqu	(%ecx), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edx)
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_avx1_last_block_ghash
+L_AES_GCM_encrypt_update_avx1_last_block_start:
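+        # AES rounds interleaved with the carry-less multiplies for the
+        # previous block's GHASH; the reduction uses the mod2_128 constant in
+        # two folding steps.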
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm1
+        vmovdqu	%xmm6, %xmm3
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
+        vmovdqu	%xmm1, 64(%esp)
+        vpxor	(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$16, %xmm5, %xmm3, %xmm4
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x01, %xmm5, %xmm3, %xmm7
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x11, %xmm5, %xmm3, %xmm1
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpslldq	$8, %xmm4, %xmm2
+        vpsrldq	$8, %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x00, %xmm5, %xmm3, %xmm7
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm2, %xmm7
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm2, %xmm4
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpclmulqdq	$16, %xmm3, %xmm4, %xmm7
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        cmpl	$11, 120(%esp)
+        vmovdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 120(%esp)
+        vmovdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm1
+L_AES_GCM_encrypt_update_avx1_aesenc_gfmul_last:
+        vaesenclast	%xmm1, %xmm0, %xmm0
+        vmovdqu	(%ecx), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edx)
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        addl	$16, %ebx
+        vpxor	%xmm0, %xmm6, %xmm6
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_avx1_last_block_start
+L_AES_GCM_encrypt_update_avx1_last_block_ghash:
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm6, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpxor	%xmm1, %xmm3, %xmm6
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+L_AES_GCM_encrypt_update_avx1_last_block_done:
+L_AES_GCM_encrypt_update_avx1_done_enc:
+        movl	136(%esp), %esi
+        movl	144(%esp), %edi
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	%xmm6, (%esi)
+        vmovdqu	%xmm4, (%edi)
+        addl	$0x60, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt_update_avx1,.-AES_GCM_encrypt_update_avx1
+.text
+.globl	AES_GCM_encrypt_final_avx1
+.type	AES_GCM_encrypt_final_avx1,@function
+.align	16
+AES_GCM_encrypt_final_avx1:
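+        # Compute the final tag: fold the AAD and ciphertext lengths (in
+        # bits) into the GHASH state, do one last multiply by the hash key,
+        # byte-swap, and XOR with the encrypted initial counter block.  The
+        # tag is stored in full (16 bytes) or byte-by-byte for shorter tag
+        # sizes.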
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	32(%esp), %ebp
+        movl	52(%esp), %esi
+        movl	56(%esp), %edi
+        vmovdqa	(%ebp), %xmm4
+        vmovdqa	(%esi), %xmm5
+        vmovdqa	(%edi), %xmm6
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        movl	44(%esp), %edx
+        movl	48(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	44(%esp), %edx
+        movl	48(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %edx, %xmm0, %xmm0
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm4, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpxor	%xmm1, %xmm3, %xmm4
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm6, %xmm4, %xmm0
+        movl	36(%esp), %edi
+        cmpl	$16, 40(%esp)
+        je	L_AES_GCM_encrypt_final_avx1_store_tag_16
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_final_avx1_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	40(%esp), %ecx
+        jne	L_AES_GCM_encrypt_final_avx1_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_avx1_store_tag_done
+L_AES_GCM_encrypt_final_avx1_store_tag_16:
+        vmovdqu	%xmm0, (%edi)
+L_AES_GCM_encrypt_final_avx1_store_tag_done:
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_final_avx1,.-AES_GCM_encrypt_final_avx1
+.text
+.globl	AES_GCM_decrypt_update_avx1
+.type	AES_GCM_decrypt_update_avx1,@function
+.align	16
+AES_GCM_decrypt_update_avx1:
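+        # Streaming decrypt: same 64-bytes-at-a-time structure as the encrypt
+        # path, but GHASH is computed over the incoming ciphertext.  A
+        # separate in-place loop is used when the input and output pointers
+        # are equal.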
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xa0, %esp
+        movl	208(%esp), %esi
+        vmovdqa	(%esi), %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        movl	200(%esp), %esi
+        movl	204(%esp), %ebp
+        vmovdqa	(%esi), %xmm6
+        vmovdqa	(%ebp), %xmm5
+        vmovdqu	%xmm6, 80(%esp)
+        movl	180(%esp), %ebp
+        movl	188(%esp), %edi
+        movl	192(%esp), %esi
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 196(%esp)
+        movl	196(%esp), %eax
+        jl	L_AES_GCM_decrypt_update_avx1_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqa	%xmm6, %xmm2
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm5, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm5, %xmm5, %xmm4
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vmovdqu	%xmm4, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm4, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpxor	%xmm1, %xmm3, %xmm7
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm7, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm4, %xmm4, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm4, %xmm7
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        vmovdqu	%xmm7, 48(%esp)
+        cmpl	%esi, %edi
+        jne	L_AES_GCM_decrypt_update_avx1_ghash_64
+L_AES_GCM_decrypt_update_avx1_ghash_64_inplace:
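+        # In-place path: copy each group of four ciphertext blocks to
+        # 96..144(%esp) before the plaintext overwrites them, then GHASH the
+        # saved copies below.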
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm0
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
+        vpshufb	%xmm7, %xmm3, %xmm3
+        vpshufb	%xmm7, %xmm0, %xmm0
+        vmovdqu	64(%esp), %xmm7
+        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
+        vmovdqa	(%ebp), %xmm7
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqa	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 184(%esp)
+        vmovdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 184(%esp)
+        vmovdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_avx1inplace_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vmovdqu	(%ecx), %xmm4
+        vmovdqu	16(%ecx), %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vmovdqu	%xmm4, 96(%esp)
+        vmovdqu	%xmm5, 112(%esp)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ecx), %xmm4
+        vmovdqu	48(%ecx), %xmm5
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm4, 128(%esp)
+        vmovdqu	%xmm5, 144(%esp)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
+        vmovdqu	80(%esp), %xmm2
+        vmovdqu	48(%esp), %xmm7
+        vmovdqu	96(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
+        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqu	32(%esp), %xmm7
+        vmovdqu	112(%esp), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	16(%esp), %xmm7
+        vmovdqu	128(%esp), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	(%esp), %xmm7
+        vmovdqu	144(%esp), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm5
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpslld	$31, %xmm2, %xmm7
+        vpslld	$30, %xmm2, %xmm4
+        vpslld	$25, %xmm2, %xmm5
+        vpxor	%xmm4, %xmm7, %xmm7
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpsrldq	$4, %xmm7, %xmm4
+        vpslldq	$12, %xmm7, %xmm7
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpsrld	$0x01, %xmm2, %xmm5
+        vpsrld	$2, %xmm2, %xmm1
+        vpsrld	$7, %xmm2, %xmm0
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm2, %xmm2
+        vmovdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx1_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_update_avx1_ghash_64_done
+L_AES_GCM_decrypt_update_avx1_ghash_64:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	64(%esp), %xmm0
+        vmovdqa	L_aes_gcm_avx1_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx1_one, %xmm0, %xmm1
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx1_two, %xmm0, %xmm2
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx1_three, %xmm0, %xmm3
+        vpshufb	%xmm7, %xmm3, %xmm3
+        vpshufb	%xmm7, %xmm0, %xmm0
+        vmovdqu	64(%esp), %xmm7
+        vpaddd	L_aes_gcm_avx1_four, %xmm7, %xmm7
+        vmovdqu	%xmm7, 64(%esp)
+        vmovdqa	(%ebp), %xmm7
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqa	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 184(%esp)
+        vmovdqa	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 184(%esp)
+        vmovdqa	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqa	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_avx1_aesenc_64_ghash_avx_aesenc_64_enc_done:
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vmovdqu	(%ecx), %xmm4
+        vmovdqu	16(%ecx), %xmm5
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vmovdqu	%xmm4, (%ecx)
+        vmovdqu	%xmm5, 16(%ecx)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ecx), %xmm4
+        vmovdqu	48(%ecx), %xmm5
+        vpxor	%xmm4, %xmm2, %xmm2
+        vpxor	%xmm5, %xmm3, %xmm3
+        vmovdqu	%xmm4, 32(%ecx)
+        vmovdqu	%xmm5, 48(%ecx)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # ghash encrypted counter
+        vmovdqu	80(%esp), %xmm2
+        vmovdqu	48(%esp), %xmm7
+        vmovdqu	(%ecx), %xmm0
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm2
+        vpclmulqdq	$0x00, %xmm5, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vmovdqu	32(%esp), %xmm7
+        vmovdqu	16(%ecx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	16(%esp), %xmm7
+        vmovdqu	32(%ecx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	(%esp), %xmm7
+        vmovdqu	48(%ecx), %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm4
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpshufd	$0x4e, %xmm0, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm7, %xmm0, %xmm6
+        vpclmulqdq	$0x00, %xmm7, %xmm0, %xmm7
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm4
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm5
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm3, %xmm3
+        vpslld	$31, %xmm2, %xmm7
+        vpslld	$30, %xmm2, %xmm4
+        vpslld	$25, %xmm2, %xmm5
+        vpxor	%xmm4, %xmm7, %xmm7
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpsrldq	$4, %xmm7, %xmm4
+        vpslldq	$12, %xmm7, %xmm7
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpsrld	$0x01, %xmm2, %xmm5
+        vpsrld	$2, %xmm2, %xmm1
+        vpsrld	$7, %xmm2, %xmm0
+        vpxor	%xmm1, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm2, %xmm2
+        vmovdqu	%xmm2, 80(%esp)
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx1_ghash_64
+L_AES_GCM_decrypt_update_avx1_ghash_64_done:
+        vmovdqa	%xmm2, %xmm6
+        vmovdqu	(%esp), %xmm5
+L_AES_GCM_decrypt_update_avx1_done_64:
+        movl	196(%esp), %edx
+        cmpl	%edx, %ebx
+        jge	L_AES_GCM_decrypt_update_avx1_done_dec
+        movl	196(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_update_avx1_last_block_done
+L_AES_GCM_decrypt_update_avx1_last_block_start:
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        vmovdqu	(%ecx), %xmm1
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm1, %xmm1
+        vmovdqu	%xmm1, (%esp)
+        vmovdqu	64(%esp), %xmm1
+        vmovdqu	(%esp), %xmm3
+        vpshufb	L_aes_gcm_avx1_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx1_one, %xmm1, %xmm1
+        vmovdqu	%xmm1, 64(%esp)
+        vpxor	(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$16, %xmm5, %xmm3, %xmm4
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x01, %xmm5, %xmm3, %xmm7
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x11, %xmm5, %xmm3, %xmm1
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpslldq	$8, %xmm4, %xmm2
+        vpsrldq	$8, %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vpclmulqdq	$0x00, %xmm5, %xmm3, %xmm7
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqa	L_aes_gcm_avx1_mod2_128, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm2, %xmm7
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm2, %xmm4
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpclmulqdq	$16, %xmm3, %xmm4, %xmm7
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        cmpl	$11, 184(%esp)
+        vmovdqa	160(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 184(%esp)
+        vmovdqa	192(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last
+        vaesenc	%xmm1, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqa	224(%ebp), %xmm1
+L_AES_GCM_decrypt_update_avx1_aesenc_gfmul_last:
+        vaesenclast	%xmm1, %xmm0, %xmm0
+        vmovdqu	(%ecx), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edx)
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx1_last_block_start
+L_AES_GCM_decrypt_update_avx1_last_block_done:
+L_AES_GCM_decrypt_update_avx1_done_dec:
+        movl	200(%esp), %esi
+        movl	208(%esp), %edi
+        vmovdqu	64(%esp), %xmm4
+        vmovdqa	%xmm6, (%esi)
+        vmovdqu	%xmm4, (%edi)
+        addl	$0xa0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_update_avx1,.-AES_GCM_decrypt_update_avx1
+.text
+.globl	AES_GCM_decrypt_final_avx1
+.type	AES_GCM_decrypt_final_avx1,@function
+.align	16
+AES_GCM_decrypt_final_avx1:
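+        # Recompute the expected tag the same way as the encrypt final
+        # routine, then compare it with the supplied tag: partial tags are
+        # compared byte-by-byte with an OR-accumulated difference, full
+        # 16-byte tags with vpcmpeqb/vpmovmskb.  The 0/1 result is written to
+        # the output flag.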
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	36(%esp), %ebp
+        movl	56(%esp), %esi
+        movl	60(%esp), %edi
+        vmovdqa	(%ebp), %xmm6
+        vmovdqa	(%esi), %xmm5
+        vmovdqa	(%edi), %xmm7
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx1_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        movl	48(%esp), %edx
+        movl	52(%esp), %ecx
+        shll	$3, %edx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	48(%esp), %edx
+        movl	52(%esp), %ecx
+        shrl	$29, %edx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %edx, %xmm0, %xmm0
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        # ghash_gfmul_red_avx
+        vpshufd	$0x4e, %xmm5, %xmm1
+        vpshufd	$0x4e, %xmm6, %xmm2
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm6, %xmm2, %xmm2
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpslldq	$8, %xmm1, %xmm2
+        vpsrldq	$8, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpxor	%xmm1, %xmm3, %xmm6
+        vpslld	$31, %xmm0, %xmm1
+        vpslld	$30, %xmm0, %xmm2
+        vpslld	$25, %xmm0, %xmm3
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpsrldq	$4, %xmm1, %xmm3
+        vpslldq	$12, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vpsrld	$0x01, %xmm0, %xmm1
+        vpsrld	$2, %xmm0, %xmm2
+        vpxor	%xmm2, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpsrld	$7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpshufb	L_aes_gcm_avx1_bswap_mask, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm0
+        movl	40(%esp), %esi
+        movl	64(%esp), %edi
+        cmpl	$16, 44(%esp)
+        je	L_AES_GCM_decrypt_final_avx1_cmp_tag_16
+        subl	$16, %esp
+        xorl	%ecx, %ecx
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_final_avx1_cmp_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        xorb	(%esi,%ecx,1), %al
+        orb	%al, %bl
+        incl	%ecx
+        cmpl	44(%esp), %ecx
+        jne	L_AES_GCM_decrypt_final_avx1_cmp_tag_loop
+        cmpb	$0x00, %bl
+        sete	%bl
+        addl	$16, %esp
+        xorl	%ecx, %ecx
+        jmp	L_AES_GCM_decrypt_final_avx1_cmp_tag_done
+L_AES_GCM_decrypt_final_avx1_cmp_tag_16:
+        vmovdqu	(%esi), %xmm1
+        vpcmpeqb	%xmm1, %xmm0, %xmm0
+        vpmovmskb	%xmm0, %edx
+        # if %edx == 0xFFFF then return 1, else return 0
+        xorl	%ebx, %ebx
+        cmpl	$0xffff, %edx
+        sete	%bl
+L_AES_GCM_decrypt_final_avx1_cmp_tag_done:
+        movl	%ebx, (%edi)
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_final_avx1,.-AES_GCM_decrypt_final_avx1
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_AVX1 */
+#ifdef HAVE_INTEL_AVX2
+.text
+.globl	AES_GCM_encrypt_avx2
+.type	AES_GCM_encrypt_avx2,@function
+.align	16
+AES_GCM_encrypt_avx2:
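+        # One-shot AVX2 encrypt.  A 12-byte IV is handled separately (branch
+        # to L_AES_GCM_encrypt_avx2_iv_12); otherwise H is derived by
+        # encrypting the zero block and the initial counter is computed by
+        # GHASHing the IV, including its length in bits.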
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x70, %esp
+        movl	144(%esp), %esi
+        movl	168(%esp), %ebp
+        movl	160(%esp), %edx
+        vpxor	%xmm4, %xmm4, %xmm4
+        cmpl	$12, %edx
+        je	L_AES_GCM_encrypt_avx2_iv_12
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        vmovdqu	(%ebp), %xmm5
+        vaesenc	16(%ebp), %xmm5, %xmm5
+        vaesenc	32(%ebp), %xmm5, %xmm5
+        vaesenc	48(%ebp), %xmm5, %xmm5
+        vaesenc	64(%ebp), %xmm5, %xmm5
+        vaesenc	80(%ebp), %xmm5, %xmm5
+        vaesenc	96(%ebp), %xmm5, %xmm5
+        vaesenc	112(%ebp), %xmm5, %xmm5
+        vaesenc	128(%ebp), %xmm5, %xmm5
+        vaesenc	144(%ebp), %xmm5, %xmm5
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	176(%ebp), %xmm5, %xmm5
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	208(%ebp), %xmm5, %xmm5
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_encrypt_avx2_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_avx2_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
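+        # AVX2 variant of the GF(2^128) multiply: four vpclmulqdq results
+        # form the 256-bit product, ghash_mid shifts it left by one bit and
+        # ghash_red folds it twice with the mod2_128 constant.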
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
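+        # Shift the 256-bit product left by one bit (GHASH uses a
+        # bit-reflected representation).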
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
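+        # Reduce modulo the GHASH polynomial by folding the low 128 bits
+        # twice with the L_aes_gcm_avx2_mod2_128 constant.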
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_16_loop
+        movl	160(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_avx2_calc_iv_done
+L_AES_GCM_encrypt_avx2_calc_iv_lt16:
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_avx2_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_loop
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+L_AES_GCM_encrypt_avx2_calc_iv_done:
+        # T = Encrypt counter
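+        # Fold in the 64-bit IV bit length and finish the GHASH to form the
+        # initial counter block J0, then encrypt J0 (result kept in xmm6).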
+        vpxor	%xmm0, %xmm0, %xmm0
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
+        #   Encrypt counter
+        vmovdqu	(%ebp), %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vaesenc	16(%ebp), %xmm6, %xmm6
+        vaesenc	32(%ebp), %xmm6, %xmm6
+        vaesenc	48(%ebp), %xmm6, %xmm6
+        vaesenc	64(%ebp), %xmm6, %xmm6
+        vaesenc	80(%ebp), %xmm6, %xmm6
+        vaesenc	96(%ebp), %xmm6, %xmm6
+        vaesenc	112(%ebp), %xmm6, %xmm6
+        vaesenc	128(%ebp), %xmm6, %xmm6
+        vaesenc	144(%ebp), %xmm6, %xmm6
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vaesenc	176(%ebp), %xmm6, %xmm6
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vaesenc	208(%ebp), %xmm6, %xmm6
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm6, %xmm6
+        jmp	L_AES_GCM_encrypt_avx2_iv_done
+L_AES_GCM_encrypt_avx2_iv_12:
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
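+        # Counter J0 = IV || 0x00000001 (big endian); H = E(K, 0) and
+        # E(K, J0) are computed together, sharing the round key loads.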
+        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
+        vmovdqu	(%ebp), %xmm5
+        vpblendd	$7, (%esi), %xmm4, %xmm4
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	16(%ebp), %xmm7
+        vpxor	%xmm5, %xmm4, %xmm6
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm6, %xmm6
+        vmovdqu	32(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	48(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	64(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	80(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	96(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	112(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	128(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	144(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	176(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	208(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_avx2_calc_iv_12_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vaesenclast	%xmm0, %xmm6, %xmm6
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+L_AES_GCM_encrypt_avx2_iv_done:
+        vmovdqu	%xmm6, 80(%esp)
+        vpxor	%xmm6, %xmm6, %xmm6
+        movl	140(%esp), %esi
+        # Additional authentication data
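+        # Hash the AAD into xmm6, 16 bytes at a time; a trailing partial
+        # block is zero padded on the stack first.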
+        movl	156(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_encrypt_avx2_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_encrypt_avx2_calc_aad_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm6
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm6, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm6, %xmm6
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm6, %xmm6
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_16_loop
+        movl	156(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_encrypt_avx2_calc_aad_done
+L_AES_GCM_encrypt_avx2_calc_aad_lt16:
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_avx2_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_encrypt_avx2_calc_aad_loop
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm6
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm6, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm6, %xmm6
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm6, %xmm6
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+L_AES_GCM_encrypt_avx2_calc_aad_done:
+        movl	132(%esp), %esi
+        movl	136(%esp), %edi
+        # Calculate counter and H
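+        # Byte-swap and increment the counter, and compute H' = H * x in
+        # GF(2^128) so the multiplies below can skip the one-bit shift step.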
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 152(%esp)
+        movl	152(%esp), %eax
+        jl	L_AES_GCM_encrypt_avx2_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqu	%xmm4, 64(%esp)
+        vmovdqu	%xmm6, 96(%esp)
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
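+        # Precompute H'^1..H'^4 at (%esp)..48(%esp); the 64-byte loop hashes
+        # four ciphertext blocks per iteration with a single reduction.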
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        vmovdqu	%xmm5, %xmm2
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
+        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
+        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
+        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpslldq	$8, %xmm6, %xmm5
+        vpsrldq	$8, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm1, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm2
+        vmovdqu	%xmm2, 48(%esp)
+        vmovdqu	96(%esp), %xmm6
+        # First 64 bytes of input
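+        # Encrypt the first four counter blocks and XOR with the input;
+        # this ciphertext is hashed one iteration later.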
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_avx2_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_encrypt_avx2_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%esi), %xmm7
+        vmovdqu	16(%esi), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, (%edi)
+        vmovdqu	%xmm1, 16(%edi)
+        vmovdqu	32(%esi), %xmm7
+        vmovdqu	48(%esi), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm2, 32(%edi)
+        vmovdqu	%xmm3, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
+        jle	L_AES_GCM_encrypt_avx2_end_64
+        # More 64 bytes of input
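+        # Main loop: CTR-encrypt the next 64 bytes, then GHASH the previous
+        # iteration's 64 bytes of ciphertext using H'^4..H'^1 with one
+        # aggregated reduction.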
+L_AES_GCM_encrypt_avx2_ghash_64:
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_encrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
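+        # Aggregated GHASH of the previous four ciphertext blocks: the oldest
+        # block (XORed with the accumulator) * H'^4 down to the newest * H'^1.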
+        vmovdqu	-64(%edx), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	-48(%edx), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	-32(%edx), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	-16(%edx), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_avx2_ghash_64
+L_AES_GCM_encrypt_avx2_end_64:
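+        # Hash the final 64 bytes of ciphertext written by the loop, reduce,
+        # and reload H' and the counter.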
+        vmovdqu	%xmm6, 96(%esp)
+        vmovdqu	48(%edx), %xmm3
+        vmovdqu	(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm5
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm4
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm6
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	32(%edx), %xmm3
+        vmovdqu	16(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vmovdqu	16(%edx), %xmm3
+        vmovdqu	32(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vmovdqu	96(%esp), %xmm0
+        vmovdqu	(%edx), %xmm3
+        vmovdqu	48(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpxor	%xmm0, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpslldq	$8, %xmm5, %xmm7
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vmovdqu	(%esp), %xmm5
+        vmovdqu	64(%esp), %xmm4
+L_AES_GCM_encrypt_avx2_done_64:
+        cmpl	152(%esp), %ebx
+        je	L_AES_GCM_encrypt_avx2_done_enc
+        movl	152(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_avx2_last_block_done
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_block
+        vmovdqu	%xmm4, %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
+        vpxor	(%ebp), %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm2
+        jl	L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm2
+        jl	L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqu	224(%ebp), %xmm2
+L_AES_GCM_encrypt_avx2_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm2, %xmm0, %xmm0
+        vmovdqu	%xmm1, %xmm4
+        vmovdqu	(%ecx), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edx)
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_avx2_last_block_ghash
+L_AES_GCM_encrypt_avx2_last_block_start:
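+        # Remaining full blocks: interleave the AES rounds for the next
+        # counter block with the GHASH multiply of the accumulator by H',
+        # then fold the new ciphertext block into the accumulator.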
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        # aesenc_gfmul_sb
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm4
+        vpxor	(%ebp), %xmm7, %xmm7
+        vaesenc	16(%ebp), %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpslldq	$8, %xmm3, %xmm2
+        vpsrldq	$8, %xmm3, %xmm3
+        vaesenc	32(%ebp), %xmm7, %xmm7
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	48(%ebp), %xmm7, %xmm7
+        vaesenc	64(%ebp), %xmm7, %xmm7
+        vaesenc	80(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	96(%ebp), %xmm7, %xmm7
+        vaesenc	112(%ebp), %xmm7, %xmm7
+        vaesenc	128(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vaesenc	144(%ebp), %xmm7, %xmm7
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        vmovdqu	160(%ebp), %xmm0
+        cmpl	$11, 172(%esp)
+        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	176(%ebp), %xmm7, %xmm7
+        vmovdqu	192(%ebp), %xmm0
+        cmpl	$13, 172(%esp)
+        jl	L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	208(%ebp), %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        vmovdqu	(%esi,%ebx,1), %xmm3
+        vpxor	%xmm1, %xmm2, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqu	%xmm7, (%edi,%ebx,1)
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7
+        vpxor	%xmm7, %xmm6, %xmm6
+        vmovdqu	64(%esp), %xmm4
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_avx2_last_block_start
+L_AES_GCM_encrypt_avx2_last_block_ghash:
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm6, %xmm6
+L_AES_GCM_encrypt_avx2_last_block_done:
+        movl	152(%esp), %ecx
+        movl	152(%esp), %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_encrypt_avx2_done_enc
+        # aesenc_last15_enc
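+        # 1..15 trailing bytes: encrypt one more counter block, XOR the input
+        # byte-by-byte, and keep a zero-padded copy of the ciphertext at
+        # 16(%esp) for the final GHASH update.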
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
+        vpxor	(%ebp), %xmm4, %xmm4
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        cmpl	$11, 172(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	%xmm0, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 172(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last
+        vaesenc	%xmm0, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm4, %xmm4
+        xorl	%ecx, %ecx
+        vpxor	%xmm0, %xmm0, %xmm0
+        vmovdqu	%xmm4, (%esp)
+        vmovdqu	%xmm0, 16(%esp)
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, 16(%esp,%ecx,1)
+        movb	%al, (%edi,%ebx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_loop
+L_AES_GCM_encrypt_avx2_aesenc_last15_enc_avx_finish_enc:
+        vmovdqu	16(%esp), %xmm4
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm6, %xmm6
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm6, %xmm6
+L_AES_GCM_encrypt_avx2_done_enc:
+        vmovdqu	80(%esp), %xmm7
+        # calc_tag
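+        # Build the lengths block (64-bit bit lengths of the plaintext and
+        # AAD), do the final GHASH multiply, byte-swap and XOR with the
+        # encrypted initial counter saved at 80(%esp) to form the tag.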
+        movl	152(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
+        movl	156(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	152(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
+        movl	156(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm6, %xmm0, %xmm0
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
+        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpslldq	$8, %xmm4, %xmm3
+        vpsrldq	$8, %xmm4, %xmm4
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm0, %xmm0
+        movl	148(%esp), %edi
+        movl	164(%esp), %ebx
+        # store_tag
+        cmpl	$16, %ebx
+        je	L_AES_GCM_encrypt_avx2_store_tag_16
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_avx2_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	%ebx, %ecx
+        jne	L_AES_GCM_encrypt_avx2_store_tag_loop
+        jmp	L_AES_GCM_encrypt_avx2_store_tag_done
+L_AES_GCM_encrypt_avx2_store_tag_16:
+        vmovdqu	%xmm0, (%edi)
+L_AES_GCM_encrypt_avx2_store_tag_done:
+        addl	$0x70, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt_avx2,.-AES_GCM_encrypt_avx2
+.text
+.globl	AES_GCM_decrypt_avx2
+.type	AES_GCM_decrypt_avx2,@function
+.align	16
+AES_GCM_decrypt_avx2:
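+        # AES-GCM decrypt using AES-NI and AVX2, 32-bit build.
+        # Mirrors AES_GCM_encrypt_avx2 but hashes the ciphertext before it is
+        # decrypted; the larger 0xb0 byte frame holds a copy of each 64-byte
+        # chunk for the in-place (in == out) path.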
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xb0, %esp
+        movl	208(%esp), %esi
+        movl	232(%esp), %ebp
+        vpxor	%xmm4, %xmm4, %xmm4
+        movl	224(%esp), %edx
+        cmpl	$12, %edx
+        je	L_AES_GCM_decrypt_avx2_iv_12
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        vmovdqu	(%ebp), %xmm5
+        vaesenc	16(%ebp), %xmm5, %xmm5
+        vaesenc	32(%ebp), %xmm5, %xmm5
+        vaesenc	48(%ebp), %xmm5, %xmm5
+        vaesenc	64(%ebp), %xmm5, %xmm5
+        vaesenc	80(%ebp), %xmm5, %xmm5
+        vaesenc	96(%ebp), %xmm5, %xmm5
+        vaesenc	112(%ebp), %xmm5, %xmm5
+        vaesenc	128(%ebp), %xmm5, %xmm5
+        vaesenc	144(%ebp), %xmm5, %xmm5
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	176(%ebp), %xmm5, %xmm5
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	208(%ebp), %xmm5, %xmm5
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_decrypt_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_decrypt_avx2_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_avx2_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_16_loop
+        movl	224(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_avx2_calc_iv_done
+L_AES_GCM_decrypt_avx2_calc_iv_lt16:
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_avx2_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_loop
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+L_AES_GCM_decrypt_avx2_calc_iv_done:
+        # T = Encrypt counter
+        vpxor	%xmm0, %xmm0, %xmm0
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
+        #   Encrypt counter
+        vmovdqu	(%ebp), %xmm6
+        vpxor	%xmm4, %xmm6, %xmm6
+        vaesenc	16(%ebp), %xmm6, %xmm6
+        vaesenc	32(%ebp), %xmm6, %xmm6
+        vaesenc	48(%ebp), %xmm6, %xmm6
+        vaesenc	64(%ebp), %xmm6, %xmm6
+        vaesenc	80(%ebp), %xmm6, %xmm6
+        vaesenc	96(%ebp), %xmm6, %xmm6
+        vaesenc	112(%ebp), %xmm6, %xmm6
+        vaesenc	128(%ebp), %xmm6, %xmm6
+        vaesenc	144(%ebp), %xmm6, %xmm6
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vaesenc	176(%ebp), %xmm6, %xmm6
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vaesenc	208(%ebp), %xmm6, %xmm6
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_decrypt_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm6, %xmm6
+        jmp	L_AES_GCM_decrypt_avx2_iv_done
+L_AES_GCM_decrypt_avx2_iv_12:
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
+        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
+        vmovdqu	(%ebp), %xmm5
+        vpblendd	$7, (%esi), %xmm4, %xmm4
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	16(%ebp), %xmm7
+        vpxor	%xmm5, %xmm4, %xmm6
+        vaesenc	%xmm7, %xmm5, %xmm5
+        vaesenc	%xmm7, %xmm6, %xmm6
+        vmovdqu	32(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	48(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	64(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	80(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	96(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	112(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	128(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	144(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	176(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_decrypt_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	208(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm6, %xmm6
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_decrypt_avx2_calc_iv_12_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vaesenclast	%xmm0, %xmm6, %xmm6
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+L_AES_GCM_decrypt_avx2_iv_done:
+        vmovdqu	%xmm6, 80(%esp)
+        vpxor	%xmm6, %xmm6, %xmm6
+        movl	204(%esp), %esi
+        # Additional authentication data
+        movl	220(%esp), %edx
+        cmpl	$0x00, %edx
+        je	L_AES_GCM_decrypt_avx2_calc_aad_done
+        xorl	%ecx, %ecx
+        cmpl	$16, %edx
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_decrypt_avx2_calc_aad_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm6
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm6, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm6, %xmm6
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm6, %xmm6
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_16_loop
+        movl	220(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_decrypt_avx2_calc_aad_done
+L_AES_GCM_decrypt_avx2_calc_aad_lt16:
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_avx2_calc_aad_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_decrypt_avx2_calc_aad_loop
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm6, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm6, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm6, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm6, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm6
+        # ghash_mid
+        vpsrld	$31, %xmm7, %xmm0
+        vpsrld	$31, %xmm6, %xmm1
+        vpslld	$0x01, %xmm7, %xmm7
+        vpslld	$0x01, %xmm6, %xmm6
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm6, %xmm6
+        vpor	%xmm0, %xmm7, %xmm7
+        vpor	%xmm1, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm7, %xmm0
+        vpshufd	$0x4e, %xmm7, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+L_AES_GCM_decrypt_avx2_calc_aad_done:
+        movl	196(%esp), %esi
+        movl	200(%esp), %edi
+        # Calculate counter and H
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 216(%esp)
+        movl	216(%esp), %eax
+        jl	L_AES_GCM_decrypt_avx2_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqu	%xmm4, 64(%esp)
+        vmovdqu	%xmm6, 96(%esp)
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        vmovdqu	%xmm5, %xmm2
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
+        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
+        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
+        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpslldq	$8, %xmm6, %xmm5
+        vpsrldq	$8, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm1, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm2
+        vmovdqu	%xmm2, 48(%esp)
+        vmovdqu	96(%esp), %xmm6
+        cmpl	%esi, %edi
+        jne	L_AES_GCM_decrypt_avx2_ghash_64
+L_AES_GCM_decrypt_avx2_ghash_64_inplace:
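+        # In-place path: the ciphertext is about to be overwritten, so each
+        # 64-byte chunk is saved to 112(%esp)..160(%esp) before decryption
+        # and hashed from there.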
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_decrypt_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm7, 112(%esp)
+        vmovdqu	%xmm4, 128(%esp)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm7, 144(%esp)
+        vmovdqu	%xmm4, 160(%esp)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
+        vmovdqu	112(%esp), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	128(%esp), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	144(%esp), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	160(%esp), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx2_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_avx2_ghash_64_done
+L_AES_GCM_decrypt_avx2_ghash_64:
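+        # Out-of-place path: same four-block CTR decryption; the ciphertext
+        # can be hashed straight from the source buffer.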
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_decrypt_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm7, (%ecx)
+        vmovdqu	%xmm4, 16(%ecx)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm7, 32(%ecx)
+        vmovdqu	%xmm4, 48(%ecx)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
+        vmovdqu	(%ecx), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	16(%ecx), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	32(%ecx), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	48(%ecx), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx2_ghash_64
+L_AES_GCM_decrypt_avx2_ghash_64_done:
+        vmovdqu	(%esp), %xmm5
+        vmovdqu	64(%esp), %xmm4
+L_AES_GCM_decrypt_avx2_done_64:
+        cmpl	216(%esp), %ebx
+        jge	L_AES_GCM_decrypt_avx2_done_dec
+        movl	216(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_avx2_last_block_done
+L_AES_GCM_decrypt_avx2_last_block_start:
+        vmovdqu	(%esi,%ebx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm6, %xmm0, %xmm4
+        # aesenc_gfmul_sb
+        vpclmulqdq	$0x01, %xmm5, %xmm4, %xmm2
+        vpclmulqdq	$16, %xmm5, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm4
+        vpxor	(%ebp), %xmm7, %xmm7
+        vaesenc	16(%ebp), %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpslldq	$8, %xmm3, %xmm2
+        vpsrldq	$8, %xmm3, %xmm3
+        vaesenc	32(%ebp), %xmm7, %xmm7
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	48(%ebp), %xmm7, %xmm7
+        vaesenc	64(%ebp), %xmm7, %xmm7
+        vaesenc	80(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	96(%ebp), %xmm7, %xmm7
+        vaesenc	112(%ebp), %xmm7, %xmm7
+        vaesenc	128(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vaesenc	144(%ebp), %xmm7, %xmm7
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        vmovdqu	160(%ebp), %xmm0
+        cmpl	$11, 236(%esp)
+        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	176(%ebp), %xmm7, %xmm7
+        vmovdqu	192(%ebp), %xmm0
+        cmpl	$13, 236(%esp)
+        jl	L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	208(%ebp), %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_decrypt_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        vmovdqu	(%esi,%ebx,1), %xmm3
+        vpxor	%xmm1, %xmm2, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqu	%xmm7, (%edi,%ebx,1)
+        vmovdqu	64(%esp), %xmm4
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_avx2_last_block_start
+L_AES_GCM_decrypt_avx2_last_block_done:
+        movl	216(%esp), %ecx
+        movl	216(%esp), %edx
+        andl	$15, %ecx
+        jz	L_AES_GCM_decrypt_avx2_done_dec
+        # aesenc_last15_dec
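+        # Final partial block: encrypt the counter, XOR the remaining input
+        # bytes one at a time, and keep a zero-padded copy of the ciphertext
+        # bytes at 16(%esp) for the closing GHASH below.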
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
+        vpxor	(%ebp), %xmm4, %xmm4
+        vaesenc	16(%ebp), %xmm4, %xmm4
+        vaesenc	32(%ebp), %xmm4, %xmm4
+        vaesenc	48(%ebp), %xmm4, %xmm4
+        vaesenc	64(%ebp), %xmm4, %xmm4
+        vaesenc	80(%ebp), %xmm4, %xmm4
+        vaesenc	96(%ebp), %xmm4, %xmm4
+        vaesenc	112(%ebp), %xmm4, %xmm4
+        vaesenc	128(%ebp), %xmm4, %xmm4
+        vaesenc	144(%ebp), %xmm4, %xmm4
+        cmpl	$11, 236(%esp)
+        vmovdqu	160(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	%xmm1, %xmm4, %xmm4
+        vaesenc	176(%ebp), %xmm4, %xmm4
+        cmpl	$13, 236(%esp)
+        vmovdqu	192(%ebp), %xmm1
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last
+        vaesenc	%xmm1, %xmm4, %xmm4
+        vaesenc	208(%ebp), %xmm4, %xmm4
+        vmovdqu	224(%ebp), %xmm1
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_aesenc_avx_last:
+        vaesenclast	%xmm1, %xmm4, %xmm4
+        xorl	%ecx, %ecx
+        vpxor	%xmm0, %xmm0, %xmm0
+        vmovdqu	%xmm4, (%esp)
+        vmovdqu	%xmm0, 16(%esp)
+L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop:
+        movzbl	(%esi,%ebx,1), %eax
+        movb	%al, 16(%esp,%ecx,1)
+        xorb	(%esp,%ecx,1), %al
+        movb	%al, (%edi,%ebx,1)
+        incl	%ebx
+        incl	%ecx
+        cmpl	%edx, %ebx
+        jl	L_AES_GCM_decrypt_avx2_aesenc_last15_dec_avx_loop
+        vmovdqu	16(%esp), %xmm4
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm6, %xmm6
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm6, %xmm6
+L_AES_GCM_decrypt_avx2_done_dec:
+        vmovdqu	80(%esp), %xmm7
+        # calc_tag
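+        # Build the 128-bit length block from the two byte counts at
+        # 216(%esp) and 220(%esp): shll $3 gives the low 32 bits of the
+        # bit count, shrl $29 the high 32 bits.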
+        movl	216(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
+        movl	220(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	216(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
+        movl	220(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm6, %xmm0, %xmm0
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm0, %xmm4
+        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpslldq	$8, %xmm4, %xmm3
+        vpsrldq	$8, %xmm4, %xmm4
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm4, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm0, %xmm0
+        movl	212(%esp), %edi
+        movl	228(%esp), %ebx
+        movl	240(%esp), %ebp
+        # cmp_tag
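+        # Partial tags are compared byte-by-byte, OR-ing the differences so
+        # the loop runs in constant time; 16-byte tags use
+        # vpcmpeqb/vpmovmskb. The result (1 = match, 0 = mismatch) is
+        # written through the pointer loaded from 240(%esp).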
+        cmpl	$16, %ebx
+        je	L_AES_GCM_decrypt_avx2_cmp_tag_16
+        xorl	%edx, %edx
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_avx2_cmp_tag_loop:
+        movzbl	(%esp,%edx,1), %eax
+        xorb	(%edi,%edx,1), %al
+        orb	%al, %cl
+        incl	%edx
+        cmpl	%ebx, %edx
+        jne	L_AES_GCM_decrypt_avx2_cmp_tag_loop
+        cmpb	$0x00, %cl
+        sete	%cl
+        jmp	L_AES_GCM_decrypt_avx2_cmp_tag_done
+L_AES_GCM_decrypt_avx2_cmp_tag_16:
+        vmovdqu	(%edi), %xmm1
+        vpcmpeqb	%xmm1, %xmm0, %xmm0
+        vpmovmskb	%xmm0, %edx
+        # if %edx == 0xFFFF then return 1 else return 0
+        xorl	%ecx, %ecx
+        cmpl	$0xffff, %edx
+        sete	%cl
+L_AES_GCM_decrypt_avx2_cmp_tag_done:
+        movl	%ecx, (%ebp)
+        addl	$0xb0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_avx2,.-AES_GCM_decrypt_avx2
+#ifdef WOLFSSL_AESGCM_STREAM
+.text
+.globl	AES_GCM_init_avx2
+.type	AES_GCM_init_avx2,@function
+.align	16
+AES_GCM_init_avx2:
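+        # Computes H = E_K(0), the initial counter block Y0 (a blend for
+        # 12-byte IVs, a GHASH of the IV otherwise), E_K(Y0) for the final
+        # tag, and Y0+1 as the first data counter; each result is written
+        # through one of the output pointer arguments.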
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$32, %esp
+        movl	52(%esp), %ebp
+        movl	60(%esp), %esi
+        movl	76(%esp), %edi
+        vpxor	%xmm4, %xmm4, %xmm4
+        movl	64(%esp), %edx
+        cmpl	$12, %edx
+        je	L_AES_GCM_init_avx2_iv_12
+        # Calculate values when IV is not 12 bytes
+        # H = Encrypt X(=0)
+        vmovdqu	(%ebp), %xmm5
+        vaesenc	16(%ebp), %xmm5, %xmm5
+        vaesenc	32(%ebp), %xmm5, %xmm5
+        vaesenc	48(%ebp), %xmm5, %xmm5
+        vaesenc	64(%ebp), %xmm5, %xmm5
+        vaesenc	80(%ebp), %xmm5, %xmm5
+        vaesenc	96(%ebp), %xmm5, %xmm5
+        vaesenc	112(%ebp), %xmm5, %xmm5
+        vaesenc	128(%ebp), %xmm5, %xmm5
+        vaesenc	144(%ebp), %xmm5, %xmm5
+        cmpl	$11, 56(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	176(%ebp), %xmm5, %xmm5
+        cmpl	$13, 56(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	208(%ebp), %xmm5, %xmm5
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_init_avx2_calc_iv_1_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+        # Calc counter
+        # Initialization vector
+        cmpl	$0x00, %edx
+        movl	$0x00, %ecx
+        je	L_AES_GCM_init_avx2_calc_iv_done
+        cmpl	$16, %edx
+        jl	L_AES_GCM_init_avx2_calc_iv_lt16
+        andl	$0xfffffff0, %edx
+L_AES_GCM_init_avx2_calc_iv_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm6
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
+        vpshufd	$0x4e, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_avx2_calc_iv_16_loop
+        movl	64(%esp), %edx
+        cmpl	%edx, %ecx
+        je	L_AES_GCM_init_avx2_calc_iv_done
+L_AES_GCM_init_avx2_calc_iv_lt16:
+        vpxor	%xmm0, %xmm0, %xmm0
+        xorl	%ebx, %ebx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_init_avx2_calc_iv_loop:
+        movzbl	(%esi,%ecx,1), %eax
+        movb	%al, (%esp,%ebx,1)
+        incl	%ecx
+        incl	%ebx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_init_avx2_calc_iv_loop
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm6
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
+        vpshufd	$0x4e, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+L_AES_GCM_init_avx2_calc_iv_done:
+        # T = Encrypt counter
+        vpxor	%xmm0, %xmm0, %xmm0
+        shll	$3, %edx
+        vpinsrd	$0x00, %edx, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm6
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
+        vpshufd	$0x4e, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm4, %xmm4
+        #   Encrypt counter
+        vmovdqu	(%ebp), %xmm7
+        vpxor	%xmm4, %xmm7, %xmm7
+        vaesenc	16(%ebp), %xmm7, %xmm7
+        vaesenc	32(%ebp), %xmm7, %xmm7
+        vaesenc	48(%ebp), %xmm7, %xmm7
+        vaesenc	64(%ebp), %xmm7, %xmm7
+        vaesenc	80(%ebp), %xmm7, %xmm7
+        vaesenc	96(%ebp), %xmm7, %xmm7
+        vaesenc	112(%ebp), %xmm7, %xmm7
+        vaesenc	128(%ebp), %xmm7, %xmm7
+        vaesenc	144(%ebp), %xmm7, %xmm7
+        cmpl	$11, 56(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	176(%ebp), %xmm7, %xmm7
+        cmpl	$13, 56(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	208(%ebp), %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_init_avx2_calc_iv_2_aesenc_avx_last:
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        jmp	L_AES_GCM_init_avx2_iv_done
+L_AES_GCM_init_avx2_iv_12:
+        # Calculate values when IV is 12 bytes
+        # Set counter based on IV
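+        # The 12-byte IV is blended into the low three dwords of
+        # L_avx2_aes_gcm_bswap_one (vpblendd $7) to form the counter block.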
+        vmovdqu	L_avx2_aes_gcm_bswap_one, %xmm4
+        vmovdqu	(%ebp), %xmm5
+        vpblendd	$7, (%esi), %xmm4, %xmm4
+        # H = Encrypt X(=0) and T = Encrypt counter
+        vmovdqu	16(%ebp), %xmm6
+        vpxor	%xmm5, %xmm4, %xmm7
+        vaesenc	%xmm6, %xmm5, %xmm5
+        vaesenc	%xmm6, %xmm7, %xmm7
+        vmovdqu	32(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	48(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	64(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	80(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	96(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	112(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	128(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	144(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        cmpl	$11, 56(%esp)
+        vmovdqu	160(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	176(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        cmpl	$13, 56(%esp)
+        vmovdqu	192(%ebp), %xmm0
+        jl	L_AES_GCM_init_avx2_calc_iv_12_last
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	208(%ebp), %xmm0
+        vaesenc	%xmm0, %xmm5, %xmm5
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_init_avx2_calc_iv_12_last:
+        vaesenclast	%xmm0, %xmm5, %xmm5
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm5, %xmm5
+L_AES_GCM_init_avx2_iv_done:
+        vmovdqu	%xmm7, (%edi)
+        movl	68(%esp), %ebp
+        movl	72(%esp), %edi
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm4
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vmovdqu	%xmm5, (%ebp)
+        vmovdqu	%xmm4, (%edi)
+        addl	$32, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_init_avx2,.-AES_GCM_init_avx2
+.text
+.globl	AES_GCM_aad_update_avx2
+.type	AES_GCM_aad_update_avx2,@function
+.align	16
+AES_GCM_aad_update_avx2:
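+        # Folds full 16-byte blocks of AAD into the GHASH state at 20(%esp):
+        # state = (state ^ bswap(block)) * H, reduced, for each block.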
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %esi
+        movl	16(%esp), %edx
+        movl	20(%esp), %edi
+        movl	24(%esp), %eax
+        vmovdqu	(%edi), %xmm4
+        vmovdqu	(%eax), %xmm5
+        xorl	%ecx, %ecx
+L_AES_GCM_aad_update_avx2_16_loop:
+        vmovdqu	(%esi,%ecx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm6
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
+        vpshufd	$0x4e, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        addl	$16, %ecx
+        cmpl	%edx, %ecx
+        jl	L_AES_GCM_aad_update_avx2_16_loop
+        vmovdqu	%xmm4, (%edi)
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_aad_update_avx2,.-AES_GCM_aad_update_avx2
+.text
+.globl	AES_GCM_encrypt_block_avx2
+.type	AES_GCM_encrypt_block_avx2,@function
+.align	16
+AES_GCM_encrypt_block_avx2:
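+        # Encrypts a single 16-byte block in CTR mode: out = in ^ E_K(ctr),
+        # then the incremented counter is written back through the counter
+        # pointer at 28(%esp).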
+        pushl	%esi
+        pushl	%edi
+        movl	12(%esp), %ecx
+        movl	16(%esp), %eax
+        movl	20(%esp), %edi
+        movl	24(%esp), %esi
+        movl	28(%esp), %edx
+        vmovdqu	(%edx), %xmm3
+        # aesenc_block
+        vmovdqu	%xmm3, %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
+        vpxor	(%ecx), %xmm0, %xmm0
+        vaesenc	16(%ecx), %xmm0, %xmm0
+        vaesenc	32(%ecx), %xmm0, %xmm0
+        vaesenc	48(%ecx), %xmm0, %xmm0
+        vaesenc	64(%ecx), %xmm0, %xmm0
+        vaesenc	80(%ecx), %xmm0, %xmm0
+        vaesenc	96(%ecx), %xmm0, %xmm0
+        vaesenc	112(%ecx), %xmm0, %xmm0
+        vaesenc	128(%ecx), %xmm0, %xmm0
+        vaesenc	144(%ecx), %xmm0, %xmm0
+        cmpl	$11, %eax
+        vmovdqu	160(%ecx), %xmm2
+        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	176(%ecx), %xmm0, %xmm0
+        cmpl	$13, %eax
+        vmovdqu	192(%ecx), %xmm2
+        jl	L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	208(%ecx), %xmm0, %xmm0
+        vmovdqu	224(%ecx), %xmm2
+L_AES_GCM_encrypt_block_avx2_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm2, %xmm0, %xmm0
+        vmovdqu	%xmm1, %xmm3
+        vmovdqu	(%esi), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edi)
+        vmovdqu	%xmm3, (%edx)
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_block_avx2,.-AES_GCM_encrypt_block_avx2
+.text
+.globl	AES_GCM_ghash_block_avx2
+.type	AES_GCM_ghash_block_avx2,@function
+.align	16
+AES_GCM_ghash_block_avx2:
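+        # One GHASH step: state = (state ^ bswap(block)) * H, reduced modulo
+        # the GCM polynomial, then written back to the state pointer.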
+        movl	4(%esp), %edx
+        movl	8(%esp), %eax
+        movl	12(%esp), %ecx
+        vmovdqu	(%eax), %xmm4
+        vmovdqu	(%ecx), %xmm5
+        vmovdqu	(%edx), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm4, %xmm4
+        # ghash_gfmul_avx
+        vpclmulqdq	$16, %xmm4, %xmm5, %xmm2
+        vpclmulqdq	$0x01, %xmm4, %xmm5, %xmm1
+        vpclmulqdq	$0x00, %xmm4, %xmm5, %xmm0
+        vpclmulqdq	$0x11, %xmm4, %xmm5, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm0, %xmm6
+        vpxor	%xmm2, %xmm3, %xmm4
+        # ghash_mid
+        vpsrld	$31, %xmm6, %xmm0
+        vpsrld	$31, %xmm4, %xmm1
+        vpslld	$0x01, %xmm6, %xmm6
+        vpslld	$0x01, %xmm4, %xmm4
+        vpsrldq	$12, %xmm0, %xmm2
+        vpslldq	$4, %xmm0, %xmm0
+        vpslldq	$4, %xmm1, %xmm1
+        vpor	%xmm2, %xmm4, %xmm4
+        vpor	%xmm0, %xmm6, %xmm6
+        vpor	%xmm1, %xmm4, %xmm4
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm6, %xmm0
+        vpshufd	$0x4e, %xmm6, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm4, %xmm4
+        vmovdqu	%xmm4, (%eax)
+        ret
+.size	AES_GCM_ghash_block_avx2,.-AES_GCM_ghash_block_avx2
+.text
+.globl	AES_GCM_encrypt_update_avx2
+.type	AES_GCM_encrypt_update_avx2,@function
+.align	16
+AES_GCM_encrypt_update_avx2:
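+        # Streaming encrypt: the main loop does four CTR blocks (64 bytes)
+        # per pass while GHASHing the previous 64 bytes of ciphertext, then
+        # remaining full 16-byte blocks are handled one at a time; trailing
+        # bytes (len % 16) are not consumed here.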
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0x60, %esp
+        movl	144(%esp), %esi
+        vmovdqu	(%esi), %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        movl	136(%esp), %esi
+        movl	140(%esp), %ebp
+        vmovdqu	(%esi), %xmm6
+        vmovdqu	(%ebp), %xmm5
+        vmovdqu	%xmm6, 80(%esp)
+        movl	116(%esp), %ebp
+        movl	124(%esp), %edi
+        movl	128(%esp), %esi
+        # Calculate H
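+        # The stored hash key is doubled in GF(2^128): shifted left one bit
+        # and XORed with the reduction constant when the top bit was set, to
+        # suit the pclmulqdq-based reduction below.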
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 132(%esp)
+        movl	132(%esp), %eax
+        jl	L_AES_GCM_encrypt_update_avx2_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqu	%xmm4, 64(%esp)
+        vmovdqu	%xmm6, 80(%esp)
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        vmovdqu	%xmm5, %xmm2
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
+        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
+        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
+        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpslldq	$8, %xmm6, %xmm5
+        vpsrldq	$8, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm1, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm2
+        vmovdqu	%xmm2, 48(%esp)
+        vmovdqu	80(%esp), %xmm6
+        # First 64 bytes of input
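+        # These four blocks are only encrypted here; their GHASH is deferred
+        # to the next loop pass (which hashes the previous 64 bytes of
+        # output) or to the end_64 tail.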
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 120(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 120(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_avx2_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%esi), %xmm7
+        vmovdqu	16(%esi), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, (%edi)
+        vmovdqu	%xmm1, 16(%edi)
+        vmovdqu	32(%esi), %xmm7
+        vmovdqu	48(%esi), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm2, 32(%edi)
+        vmovdqu	%xmm3, 48(%edi)
+        cmpl	$0x40, %eax
+        movl	$0x40, %ebx
+        movl	%esi, %ecx
+        movl	%edi, %edx
+        jle	L_AES_GCM_encrypt_update_avx2_end_64
+        # Process the remaining input 64 bytes at a time
+L_AES_GCM_encrypt_update_avx2_ghash_64:
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 120(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 120(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_encrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
+        vmovdqu	-64(%edx), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	-48(%edx), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	-32(%edx), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	-16(%edx), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_avx2_ghash_64
+L_AES_GCM_encrypt_update_avx2_end_64:
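+        # GHASH the final 64 bytes of ciphertext written above using the
+        # precomputed powers H^1..H^4 held on the stack.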
+        vmovdqu	%xmm6, 80(%esp)
+        vmovdqu	48(%edx), %xmm3
+        vmovdqu	(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm5
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm4
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm6
+        vpxor	%xmm1, %xmm5, %xmm5
+        vmovdqu	32(%edx), %xmm3
+        vmovdqu	16(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vmovdqu	16(%edx), %xmm3
+        vmovdqu	32(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vmovdqu	80(%esp), %xmm0
+        vmovdqu	(%edx), %xmm3
+        vmovdqu	48(%esp), %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm3, %xmm3
+        vpxor	%xmm0, %xmm3, %xmm3
+        vpclmulqdq	$16, %xmm3, %xmm7, %xmm2
+        vpclmulqdq	$0x01, %xmm3, %xmm7, %xmm1
+        vpclmulqdq	$0x00, %xmm3, %xmm7, %xmm0
+        vpclmulqdq	$0x11, %xmm3, %xmm7, %xmm3
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm4, %xmm4
+        vpslldq	$8, %xmm5, %xmm7
+        vpsrldq	$8, %xmm5, %xmm5
+        vpxor	%xmm7, %xmm4, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        # ghash_red
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm2
+        vpclmulqdq	$16, %xmm2, %xmm4, %xmm0
+        vpshufd	$0x4e, %xmm4, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm6, %xmm6
+        vmovdqu	(%esp), %xmm5
+        vmovdqu	64(%esp), %xmm4
+L_AES_GCM_encrypt_update_avx2_done_64:
+        cmpl	132(%esp), %ebx
+        je	L_AES_GCM_encrypt_update_avx2_done_enc
+        movl	132(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_avx2_last_block_done
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_block
+        vmovdqu	%xmm4, %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm1, %xmm0
+        vpaddd	L_aes_gcm_avx2_one, %xmm1, %xmm1
+        vpxor	(%ebp), %xmm0, %xmm0
+        vaesenc	16(%ebp), %xmm0, %xmm0
+        vaesenc	32(%ebp), %xmm0, %xmm0
+        vaesenc	48(%ebp), %xmm0, %xmm0
+        vaesenc	64(%ebp), %xmm0, %xmm0
+        vaesenc	80(%ebp), %xmm0, %xmm0
+        vaesenc	96(%ebp), %xmm0, %xmm0
+        vaesenc	112(%ebp), %xmm0, %xmm0
+        vaesenc	128(%ebp), %xmm0, %xmm0
+        vaesenc	144(%ebp), %xmm0, %xmm0
+        cmpl	$11, 120(%esp)
+        vmovdqu	160(%ebp), %xmm2
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	176(%ebp), %xmm0, %xmm0
+        cmpl	$13, 120(%esp)
+        vmovdqu	192(%ebp), %xmm2
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last
+        vaesenc	%xmm2, %xmm0, %xmm0
+        vaesenc	208(%ebp), %xmm0, %xmm0
+        vmovdqu	224(%ebp), %xmm2
+L_AES_GCM_encrypt_update_avx2_aesenc_block_aesenc_avx_last:
+        vaesenclast	%xmm2, %xmm0, %xmm0
+        vmovdqu	%xmm1, %xmm4
+        vmovdqu	(%ecx), %xmm1
+        vpxor	%xmm1, %xmm0, %xmm0
+        vmovdqu	%xmm0, (%edx)
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm0, %xmm6, %xmm6
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_encrypt_update_avx2_last_block_ghash
+L_AES_GCM_encrypt_update_avx2_last_block_start:
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        # aesenc_gfmul_sb
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm4
+        vpxor	(%ebp), %xmm7, %xmm7
+        vaesenc	16(%ebp), %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpslldq	$8, %xmm3, %xmm2
+        vpsrldq	$8, %xmm3, %xmm3
+        vaesenc	32(%ebp), %xmm7, %xmm7
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	48(%ebp), %xmm7, %xmm7
+        vaesenc	64(%ebp), %xmm7, %xmm7
+        vaesenc	80(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	96(%ebp), %xmm7, %xmm7
+        vaesenc	112(%ebp), %xmm7, %xmm7
+        vaesenc	128(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vaesenc	144(%ebp), %xmm7, %xmm7
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        vmovdqu	160(%ebp), %xmm0
+        cmpl	$11, 120(%esp)
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	176(%ebp), %xmm7, %xmm7
+        vmovdqu	192(%ebp), %xmm0
+        cmpl	$13, 120(%esp)
+        jl	L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	208(%ebp), %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_encrypt_update_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        vmovdqu	(%esi,%ebx,1), %xmm3
+        vpxor	%xmm1, %xmm2, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqu	%xmm7, (%edi,%ebx,1)
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm7, %xmm7
+        vpxor	%xmm7, %xmm6, %xmm6
+        vmovdqu	64(%esp), %xmm4
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_encrypt_update_avx2_last_block_start
+L_AES_GCM_encrypt_update_avx2_last_block_ghash:
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm6, %xmm2
+        vpclmulqdq	$0x01, %xmm5, %xmm6, %xmm1
+        vpclmulqdq	$0x00, %xmm5, %xmm6, %xmm0
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpslldq	$8, %xmm2, %xmm1
+        vpsrldq	$8, %xmm2, %xmm2
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm6, %xmm6
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm0, %xmm1, %xmm1
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm1, %xmm0
+        vpshufd	$0x4e, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm6, %xmm6
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm0, %xmm6, %xmm6
+L_AES_GCM_encrypt_update_avx2_last_block_done:
+L_AES_GCM_encrypt_update_avx2_done_enc:
+        movl	136(%esp), %esi
+        movl	144(%esp), %edi
+        vmovdqu	%xmm6, (%esi)
+        vmovdqu	%xmm4, (%edi)
+        addl	$0x60, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_encrypt_update_avx2,.-AES_GCM_encrypt_update_avx2
+.text
+.globl	AES_GCM_encrypt_final_avx2
+.type	AES_GCM_encrypt_final_avx2,@function
+.align	16
+AES_GCM_encrypt_final_avx2:
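+        # Finalizes the tag: the GCM length block is folded into the GHASH
+        # state, the result is byte-swapped and XORed with the encrypted
+        # initial counter, and the requested number of tag bytes is stored.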
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	32(%esp), %ebp
+        movl	52(%esp), %esi
+        movl	56(%esp), %edi
+        vmovdqu	(%ebp), %xmm4
+        vmovdqu	(%esi), %xmm5
+        vmovdqu	(%edi), %xmm6
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        # calc_tag
+        movl	44(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
+        movl	48(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	44(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
+        movl	48(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
+        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm7, %xmm7
+        vpslldq	$8, %xmm7, %xmm3
+        vpsrldq	$8, %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm6, %xmm0, %xmm0
+        movl	36(%esp), %edi
+        # store_tag
+        cmpl	$16, 40(%esp)
+        je	L_AES_GCM_encrypt_final_avx2_store_tag_16
+        xorl	%ecx, %ecx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_encrypt_final_avx2_store_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        movb	%al, (%edi,%ecx,1)
+        incl	%ecx
+        cmpl	40(%esp), %ecx
+        jne	L_AES_GCM_encrypt_final_avx2_store_tag_loop
+        jmp	L_AES_GCM_encrypt_final_avx2_store_tag_done
+L_AES_GCM_encrypt_final_avx2_store_tag_16:
+        vmovdqu	%xmm0, (%edi)
+L_AES_GCM_encrypt_final_avx2_store_tag_done:
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        ret
+.size	AES_GCM_encrypt_final_avx2,.-AES_GCM_encrypt_final_avx2
+.text
+.globl	AES_GCM_decrypt_update_avx2
+.type	AES_GCM_decrypt_update_avx2,@function
+.align	16
+AES_GCM_decrypt_update_avx2:
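+        # Streaming decrypt: same 4-block structure as encrypt_update, but
+        # GHASH runs over the input ciphertext; in-place operation
+        # (in == out) takes the separate stashing path below.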
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$0xa0, %esp
+        movl	208(%esp), %esi
+        vmovdqu	(%esi), %xmm4
+        movl	200(%esp), %esi
+        movl	204(%esp), %ebp
+        vmovdqu	(%esi), %xmm6
+        vmovdqu	(%ebp), %xmm5
+        movl	180(%esp), %ebp
+        movl	188(%esp), %edi
+        movl	192(%esp), %esi
+        # Calculate H
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        xorl	%ebx, %ebx
+        cmpl	$0x40, 196(%esp)
+        movl	196(%esp), %eax
+        jl	L_AES_GCM_decrypt_update_avx2_done_64
+        andl	$0xffffffc0, %eax
+        vmovdqu	%xmm4, 64(%esp)
+        vmovdqu	%xmm6, 80(%esp)
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm3
+        # H ^ 1
+        vmovdqu	%xmm5, (%esp)
+        vmovdqu	%xmm5, %xmm2
+        # H ^ 2
+        vpclmulqdq	$0x00, %xmm2, %xmm2, %xmm5
+        vpclmulqdq	$0x11, %xmm2, %xmm2, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm0
+        vmovdqu	%xmm0, 16(%esp)
+        # H ^ 3
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm0, %xmm2, %xmm6
+        vpclmulqdq	$0x01, %xmm0, %xmm2, %xmm5
+        vpclmulqdq	$0x00, %xmm0, %xmm2, %xmm4
+        vpxor	%xmm5, %xmm6, %xmm6
+        vpslldq	$8, %xmm6, %xmm5
+        vpsrldq	$8, %xmm6, %xmm6
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm2, %xmm1
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpxor	%xmm5, %xmm1, %xmm1
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm1, 32(%esp)
+        # H ^ 4
+        vpclmulqdq	$0x00, %xmm0, %xmm0, %xmm5
+        vpclmulqdq	$0x11, %xmm0, %xmm0, %xmm6
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm3, %xmm5, %xmm4
+        vpshufd	$0x4e, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm5, %xmm5
+        vpxor	%xmm5, %xmm6, %xmm2
+        vmovdqu	%xmm2, 48(%esp)
+        vmovdqu	80(%esp), %xmm6
+        cmpl	%esi, %edi
+        jne	L_AES_GCM_decrypt_update_avx2_ghash_64
+L_AES_GCM_decrypt_update_avx2_ghash_64_inplace:
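+        # In-place path: the four ciphertext blocks are saved at
+        # 96..144(%esp) before decryption overwrites them, then re-read
+        # from there for GHASH.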
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
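+        # 184(%esp) holds the round count: 10 rounds (AES-128) skips ahead,
+        # while 12 and 14 rounds (AES-192/256) take the extra rounds below.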
+        cmpl	$11, 184(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 184(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_avx2_inplace_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm7, 96(%esp)
+        vmovdqu	%xmm4, 112(%esp)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm7, 128(%esp)
+        vmovdqu	%xmm4, 144(%esp)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
+        vmovdqu	96(%esp), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	112(%esp), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	128(%esp), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	144(%esp), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx2_ghash_64_inplace
+        jmp	L_AES_GCM_decrypt_update_avx2_ghash_64_done
+L_AES_GCM_decrypt_update_avx2_ghash_64:
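+        # Out-of-place path (out != in): the input buffer still holds the
+        # ciphertext, so GHASH reads the blocks back from (%ecx).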
+        # aesenc_64_ghash
+        leal	(%esi,%ebx,1), %ecx
+        leal	(%edi,%ebx,1), %edx
+        # aesenc_64
+        # aesenc_ctr
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	L_aes_gcm_avx2_bswap_epi64, %xmm7
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm1
+        vpshufb	%xmm7, %xmm4, %xmm0
+        vpaddd	L_aes_gcm_avx2_two, %xmm4, %xmm2
+        vpshufb	%xmm7, %xmm1, %xmm1
+        vpaddd	L_aes_gcm_avx2_three, %xmm4, %xmm3
+        vpshufb	%xmm7, %xmm2, %xmm2
+        vpaddd	L_aes_gcm_avx2_four, %xmm4, %xmm4
+        vpshufb	%xmm7, %xmm3, %xmm3
+        # aesenc_xor
+        vmovdqu	(%ebp), %xmm7
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm7, %xmm1, %xmm1
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm7, %xmm3, %xmm3
+        vmovdqu	16(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	32(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	48(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	64(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	80(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	96(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	112(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	128(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	144(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$11, 184(%esp)
+        vmovdqu	160(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	176(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        cmpl	$13, 184(%esp)
+        vmovdqu	192(%ebp), %xmm7
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	208(%ebp), %xmm7
+        vaesenc	%xmm7, %xmm0, %xmm0
+        vaesenc	%xmm7, %xmm1, %xmm1
+        vaesenc	%xmm7, %xmm2, %xmm2
+        vaesenc	%xmm7, %xmm3, %xmm3
+        vmovdqu	224(%ebp), %xmm7
+L_AES_GCM_decrypt_update_avx2_aesenc_64_ghash_aesenc_64_enc_done:
+        # aesenc_last
+        vaesenclast	%xmm7, %xmm0, %xmm0
+        vaesenclast	%xmm7, %xmm1, %xmm1
+        vaesenclast	%xmm7, %xmm2, %xmm2
+        vaesenclast	%xmm7, %xmm3, %xmm3
+        vmovdqu	(%ecx), %xmm7
+        vmovdqu	16(%ecx), %xmm4
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm1, %xmm1
+        vmovdqu	%xmm7, (%ecx)
+        vmovdqu	%xmm4, 16(%ecx)
+        vmovdqu	%xmm0, (%edx)
+        vmovdqu	%xmm1, 16(%edx)
+        vmovdqu	32(%ecx), %xmm7
+        vmovdqu	48(%ecx), %xmm4
+        vpxor	%xmm7, %xmm2, %xmm2
+        vpxor	%xmm4, %xmm3, %xmm3
+        vmovdqu	%xmm7, 32(%ecx)
+        vmovdqu	%xmm4, 48(%ecx)
+        vmovdqu	%xmm2, 32(%edx)
+        vmovdqu	%xmm3, 48(%edx)
+        # pclmul_1
+        vmovdqu	(%ecx), %xmm1
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vmovdqu	48(%esp), %xmm2
+        vpxor	%xmm6, %xmm1, %xmm1
+        vpclmulqdq	$16, %xmm2, %xmm1, %xmm5
+        vpclmulqdq	$0x01, %xmm2, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm2, %xmm1, %xmm6
+        vpclmulqdq	$0x11, %xmm2, %xmm1, %xmm7
+        # pclmul_2
+        vmovdqu	16(%ecx), %xmm1
+        vmovdqu	32(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	32(%ecx), %xmm1
+        vmovdqu	16(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # pclmul_n
+        vmovdqu	48(%ecx), %xmm1
+        vmovdqu	(%esp), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm1, %xmm1
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpclmulqdq	$16, %xmm0, %xmm1, %xmm2
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpclmulqdq	$0x01, %xmm0, %xmm1, %xmm3
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpclmulqdq	$0x00, %xmm0, %xmm1, %xmm4
+        vpclmulqdq	$0x11, %xmm0, %xmm1, %xmm1
+        vpxor	%xmm1, %xmm7, %xmm7
+        # aesenc_pclmul_l
+        vpxor	%xmm2, %xmm5, %xmm5
+        vpxor	%xmm4, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm5, %xmm5
+        vpslldq	$8, %xmm5, %xmm1
+        vpsrldq	$8, %xmm5, %xmm5
+        vmovdqu	L_aes_gcm_avx2_mod2_128, %xmm0
+        vpxor	%xmm1, %xmm6, %xmm6
+        vpxor	%xmm5, %xmm7, %xmm7
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpclmulqdq	$16, %xmm0, %xmm6, %xmm3
+        vpshufd	$0x4e, %xmm6, %xmm6
+        vpxor	%xmm3, %xmm6, %xmm6
+        vpxor	%xmm7, %xmm6, %xmm6
+        # aesenc_64_ghash - end
+        addl	$0x40, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx2_ghash_64
+L_AES_GCM_decrypt_update_avx2_ghash_64_done:
+        vmovdqu	(%esp), %xmm5
+        vmovdqu	64(%esp), %xmm4
+L_AES_GCM_decrypt_update_avx2_done_64:
+        cmpl	196(%esp), %ebx
+        jge	L_AES_GCM_decrypt_update_avx2_done_dec
+        movl	196(%esp), %eax
+        andl	$0xfffffff0, %eax
+        cmpl	%eax, %ebx
+        jge	L_AES_GCM_decrypt_update_avx2_last_block_done
+L_AES_GCM_decrypt_update_avx2_last_block_start:
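+        # Decrypt remaining full 16-byte blocks one at a time, interleaving
+        # the AES-CTR rounds with the GHASH multiply and reduction.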
+        vmovdqu	(%esi,%ebx,1), %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_epi64, %xmm4, %xmm7
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpaddd	L_aes_gcm_avx2_one, %xmm4, %xmm4
+        vmovdqu	%xmm4, 64(%esp)
+        vpxor	%xmm6, %xmm0, %xmm4
+        # aesenc_gfmul_sb
+        vpclmulqdq	$0x01, %xmm5, %xmm4, %xmm2
+        vpclmulqdq	$16, %xmm5, %xmm4, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm4, %xmm1
+        vpclmulqdq	$0x11, %xmm5, %xmm4, %xmm4
+        vpxor	(%ebp), %xmm7, %xmm7
+        vaesenc	16(%ebp), %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpslldq	$8, %xmm3, %xmm2
+        vpsrldq	$8, %xmm3, %xmm3
+        vaesenc	32(%ebp), %xmm7, %xmm7
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	48(%ebp), %xmm7, %xmm7
+        vaesenc	64(%ebp), %xmm7, %xmm7
+        vaesenc	80(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vpxor	%xmm1, %xmm2, %xmm2
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm2, %xmm1
+        vaesenc	96(%ebp), %xmm7, %xmm7
+        vaesenc	112(%ebp), %xmm7, %xmm7
+        vaesenc	128(%ebp), %xmm7, %xmm7
+        vpshufd	$0x4e, %xmm2, %xmm2
+        vaesenc	144(%ebp), %xmm7, %xmm7
+        vpxor	%xmm3, %xmm4, %xmm4
+        vpxor	%xmm4, %xmm2, %xmm2
+        vmovdqu	160(%ebp), %xmm0
+        cmpl	$11, 184(%esp)
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	176(%ebp), %xmm7, %xmm7
+        vmovdqu	192(%ebp), %xmm0
+        cmpl	$13, 184(%esp)
+        jl	L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last
+        vaesenc	%xmm0, %xmm7, %xmm7
+        vaesenc	208(%ebp), %xmm7, %xmm7
+        vmovdqu	224(%ebp), %xmm0
+L_AES_GCM_decrypt_update_avx2_aesenc_gfmul_sb_last:
+        vaesenclast	%xmm0, %xmm7, %xmm7
+        vmovdqu	(%esi,%ebx,1), %xmm3
+        vpxor	%xmm1, %xmm2, %xmm6
+        vpxor	%xmm3, %xmm7, %xmm7
+        vmovdqu	%xmm7, (%edi,%ebx,1)
+        vmovdqu	64(%esp), %xmm4
+        addl	$16, %ebx
+        cmpl	%eax, %ebx
+        jl	L_AES_GCM_decrypt_update_avx2_last_block_start
+L_AES_GCM_decrypt_update_avx2_last_block_done:
+L_AES_GCM_decrypt_update_avx2_done_dec:
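+        # Write the running GHASH value and the counter block back out.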
+        movl	200(%esp), %esi
+        movl	208(%esp), %edi
+        vmovdqu	64(%esp), %xmm4
+        vmovdqu	%xmm6, (%esi)
+        vmovdqu	%xmm4, (%edi)
+        addl	$0xa0, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_update_avx2,.-AES_GCM_decrypt_update_avx2
+.text
+.globl	AES_GCM_decrypt_final_avx2
+.type	AES_GCM_decrypt_final_avx2,@function
+.align	16
+AES_GCM_decrypt_final_avx2:
+        pushl	%ebx
+        pushl	%esi
+        pushl	%edi
+        pushl	%ebp
+        subl	$16, %esp
+        movl	36(%esp), %ebp
+        movl	56(%esp), %esi
+        movl	60(%esp), %edi
+        vmovdqu	(%ebp), %xmm4
+        vmovdqu	(%esi), %xmm5
+        vmovdqu	(%edi), %xmm6
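+        # Shift the stored hash key left by one bit across 128 bits,
+        # folding in the reduction polynomial when the top bit carries out.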
+        vpsrlq	$63, %xmm5, %xmm1
+        vpsllq	$0x01, %xmm5, %xmm0
+        vpslldq	$8, %xmm1, %xmm1
+        vpor	%xmm1, %xmm0, %xmm0
+        vpshufd	$0xff, %xmm5, %xmm5
+        vpsrad	$31, %xmm5, %xmm5
+        vpand	L_aes_gcm_avx2_mod2_128, %xmm5, %xmm5
+        vpxor	%xmm0, %xmm5, %xmm5
+        # calc_tag
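+        # Assemble the 128-bit lengths block (byte counts at 48/52(%esp)
+        # scaled to bit counts) and XOR it into the accumulated GHASH state.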
+        movl	48(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$0x00, %ecx, %xmm0, %xmm0
+        movl	52(%esp), %ecx
+        shll	$3, %ecx
+        vpinsrd	$2, %ecx, %xmm0, %xmm0
+        movl	48(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$0x01, %ecx, %xmm0, %xmm0
+        movl	52(%esp), %ecx
+        shrl	$29, %ecx
+        vpinsrd	$3, %ecx, %xmm0, %xmm0
+        vpxor	%xmm4, %xmm0, %xmm0
+        # ghash_gfmul_red
+        vpclmulqdq	$16, %xmm5, %xmm0, %xmm7
+        vpclmulqdq	$0x01, %xmm5, %xmm0, %xmm3
+        vpclmulqdq	$0x00, %xmm5, %xmm0, %xmm2
+        vpxor	%xmm3, %xmm7, %xmm7
+        vpslldq	$8, %xmm7, %xmm3
+        vpsrldq	$8, %xmm7, %xmm7
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$0x11, %xmm5, %xmm0, %xmm0
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm2, %xmm3, %xmm3
+        vpclmulqdq	$16, L_aes_gcm_avx2_mod2_128, %xmm3, %xmm2
+        vpshufd	$0x4e, %xmm3, %xmm3
+        vpxor	%xmm7, %xmm0, %xmm0
+        vpxor	%xmm3, %xmm0, %xmm0
+        vpxor	%xmm2, %xmm0, %xmm0
+        vpshufb	L_aes_gcm_avx2_bswap_mask, %xmm0, %xmm0
+        vpxor	%xmm6, %xmm0, %xmm0
+        movl	40(%esp), %esi
+        movl	64(%esp), %edi
+        # cmp_tag
+        cmpl	$16, 44(%esp)
+        je	L_AES_GCM_decrypt_final_avx2_cmp_tag_16
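+        # Partial tag: compare byte-by-byte, OR-accumulating the differences
+        # so the comparison does not exit early on a mismatch.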
+        xorl	%ecx, %ecx
+        xorl	%edx, %edx
+        vmovdqu	%xmm0, (%esp)
+L_AES_GCM_decrypt_final_avx2_cmp_tag_loop:
+        movzbl	(%esp,%ecx,1), %eax
+        xorb	(%esi,%ecx,1), %al
+        orb	%al, %dl
+        incl	%ecx
+        cmpl	44(%esp), %ecx
+        jne	L_AES_GCM_decrypt_final_avx2_cmp_tag_loop
+        cmpb	$0x00, %dl
+        sete	%dl
+        jmp	L_AES_GCM_decrypt_final_avx2_cmp_tag_done
+L_AES_GCM_decrypt_final_avx2_cmp_tag_16:
+        vmovdqu	(%esi), %xmm1
+        vpcmpeqb	%xmm1, %xmm0, %xmm0
+        vpmovmskb	%xmm0, %ecx
+        # set %edx to 1 when %ecx == 0xffff (all 16 tag bytes match), else 0
+        xorl	%edx, %edx
+        cmpl	$0xffff, %ecx
+        sete	%dl
+L_AES_GCM_decrypt_final_avx2_cmp_tag_done:
+        movl	%edx, (%edi)
+        addl	$16, %esp
+        popl	%ebp
+        popl	%edi
+        popl	%esi
+        popl	%ebx
+        ret
+.size	AES_GCM_decrypt_final_avx2,.-AES_GCM_decrypt_final_avx2
+#endif /* WOLFSSL_AESGCM_STREAM */
+#endif /* HAVE_INTEL_AVX2 */
+
+#if defined(__linux__) && defined(__ELF__)
+.section	.note.GNU-stack,"",%progbits
+#endif

+ 2 - 0
wolfcrypt/src/chacha_asm.S

@@ -30,6 +30,7 @@
 #define HAVE_INTEL_AVX2
 #endif /* NO_AVX2_SUPPORT */
 
+#ifdef WOLFSSL_X86_64_BUILD
 #ifndef __APPLE__
 .text
 .globl	chacha_encrypt_x64
@@ -1430,6 +1431,7 @@ L_chacha20_avx2_end256:
 .size	chacha_encrypt_avx2,.-chacha_encrypt_avx2
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
+#endif /* WOLFSSL_X86_64_BUILD */
 
 #if defined(__linux__) && defined(__ELF__)
 .section	.note.GNU-stack,"",%progbits

+ 13 - 11
wolfcrypt/src/poly1305.c

@@ -55,7 +55,7 @@ and Daniel J. Bernstein
     #pragma warning(disable: 4127)
 #endif
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     #include <emmintrin.h>
     #include <immintrin.h>
 
@@ -77,12 +77,13 @@ and Daniel J. Bernstein
     #endif
 #endif
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
 static word32 intel_flags = 0;
 static word32 cpu_flags_set = 0;
 #endif
 
-#if defined(USE_INTEL_SPEEDUP) || defined(POLY130564)
+#if (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)) || \
+        defined(POLY130564)
     #if defined(_MSC_VER)
         #define POLY1305_NOINLINE __declspec(noinline)
     #elif defined(__GNUC__)
@@ -122,7 +123,7 @@ static word32 cpu_flags_set = 0;
     #endif
 #endif
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
 #ifdef __cplusplus
     extern "C" {
 #endif
@@ -265,7 +266,7 @@ with a given ctx pointer to a Poly1305 structure.
 static int poly1305_blocks(Poly1305* ctx, const unsigned char *m,
                      size_t bytes)
 {
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     /* AVX2 is handled in wc_Poly1305Update. */
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     poly1305_blocks_avx(ctx, m, bytes);
@@ -399,7 +400,7 @@ number of bytes is less than the block size.
 */
 static int poly1305_block(Poly1305* ctx, const unsigned char *m)
 {
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     /* No call to poly1305_block when AVX2, AVX2 does 4 blocks at a time. */
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     poly1305_block_avx(ctx, m);
@@ -414,7 +415,8 @@ static int poly1305_block(Poly1305* ctx, const unsigned char *m)
 #if !defined(WOLFSSL_ARMASM) || !defined(__aarch64__)
 int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
 {
-#if defined(POLY130564) && !defined(USE_INTEL_SPEEDUP)
+#if defined(POLY130564) && \
+    !(defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP))
     word64 t0,t1;
 #endif
 
@@ -435,7 +437,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
     if (keySz != 32 || ctx == NULL)
         return BAD_FUNC_ARG;
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     if (!cpu_flags_set) {
         intel_flags = cpuid_get_flags();
         cpu_flags_set = 1;
@@ -502,7 +504,7 @@ int wc_Poly1305SetKey(Poly1305* ctx, const byte* key, word32 keySz)
 
 int wc_Poly1305Final(Poly1305* ctx, byte* mac)
 {
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
 #elif defined(POLY130564)
 
     word64 h0,h1,h2,c;
@@ -521,7 +523,7 @@ int wc_Poly1305Final(Poly1305* ctx, byte* mac)
     if (ctx == NULL || mac == NULL)
         return BAD_FUNC_ARG;
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     SAVE_VECTOR_REGISTERS(return _svr_ret;);
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags))
@@ -707,7 +709,7 @@ int wc_Poly1305Update(Poly1305* ctx, const byte* m, word32 bytes)
     printf("\n");
 #endif
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     #ifdef HAVE_INTEL_AVX2
     if (IS_INTEL_AVX2(intel_flags)) {
         SAVE_VECTOR_REGISTERS(return _svr_ret;);

+ 2 - 0
wolfcrypt/src/poly1305_asm.S

@@ -30,6 +30,7 @@
 #define HAVE_INTEL_AVX2
 #endif /* NO_AVX2_SUPPORT */
 
+#ifdef WOLFSSL_X86_64_BUILD
 #ifdef HAVE_INTEL_AVX1
 #ifndef __APPLE__
 .text
@@ -1107,6 +1108,7 @@ L_poly1305_avx2_final_cmp_copy:
 .size	poly1305_final_avx2,.-poly1305_final_avx2
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
+#endif /* WOLFSSL_X86_64_BUILD */
 
 #if defined(__linux__) && defined(__ELF__)
 .section	.note.GNU-stack,"",%progbits

+ 21 - 16
wolfcrypt/src/sha256.c

@@ -174,7 +174,7 @@ on the specific device platform.
 #endif
 
 
-#if defined(USE_INTEL_SPEEDUP)
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     #if defined(__GNUC__) && ((__GNUC__ < 4) || \
                               (__GNUC__ == 4 && __GNUC_MINOR__ <= 8))
         #undef  NO_AVX2_SUPPORT
@@ -194,7 +194,7 @@ on the specific device platform.
 #else
     #undef HAVE_INTEL_AVX1
     #undef HAVE_INTEL_AVX2
-#endif /* USE_INTEL_SPEEDUP */
+#endif /* WOLFSSL_X86_64_BUILD && USE_INTEL_SPEEDUP */
 
 #if defined(HAVE_INTEL_AVX2)
     #define HAVE_INTEL_RORX
@@ -253,8 +253,8 @@ static int InitSha256(wc_Sha256* sha256)
 
 
 /* Hardware Acceleration */
-#if defined(USE_INTEL_SPEEDUP) && (defined(HAVE_INTEL_AVX1) || \
-                                                       defined(HAVE_INTEL_AVX2))
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
+                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
 
     /* in case intel instructions aren't available, plus we need the K[] global */
     #define NEED_SOFT_SHA256
@@ -1072,7 +1072,8 @@ static int InitSha256(wc_Sha256* sha256)
 
             if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
             #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-                #if defined(USE_INTEL_SPEEDUP) && \
+                #if defined(WOLFSSL_X86_64_BUILD) && \
+                          defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
                 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
                 #endif
@@ -1107,7 +1108,7 @@ static int InitSha256(wc_Sha256* sha256)
 
         /* process blocks */
     #ifdef XTRANSFORM_LEN
-        #if defined(USE_INTEL_SPEEDUP) && \
+        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
         if (Transform_Sha256_Len_p != NULL)
         #endif
@@ -1123,13 +1124,14 @@ static int InitSha256(wc_Sha256* sha256)
                 len  -= blocksLen;
             }
         }
-        #if defined(USE_INTEL_SPEEDUP) && \
+        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
         else
         #endif
     #endif /* XTRANSFORM_LEN */
-    #if !defined(XTRANSFORM_LEN) || (defined(USE_INTEL_SPEEDUP) && \
-                         (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
+    #if !defined(XTRANSFORM_LEN) || \
+        (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
+         (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
         {
             while (len >= WC_SHA256_BLOCK_SIZE) {
                 word32* local32 = sha256->buffer;
@@ -1137,7 +1139,8 @@ static int InitSha256(wc_Sha256* sha256)
                 /* Intel transform function requires use of sha256->buffer */
                 /* Little Endian requires byte swap, so can't use data directly */
             #if defined(WC_HASH_DATA_ALIGNMENT) && !defined(LITTLE_ENDIAN_ORDER) && \
-                !(defined(USE_INTEL_SPEEDUP) && \
+                !(defined(WOLFSSL_X86_64_BUILD) && \
+                         defined(USE_INTEL_SPEEDUP) && \
                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
                 if (((wc_ptr_t)data % WC_HASH_DATA_ALIGNMENT) == 0) {
                     local32 = (word32*)data;
@@ -1152,7 +1155,8 @@ static int InitSha256(wc_Sha256* sha256)
                 len  -= WC_SHA256_BLOCK_SIZE;
 
             #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-                #if defined(USE_INTEL_SPEEDUP) && \
+                #if defined(WOLFSSL_X86_64_BUILD) && \
+                          defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
                 if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
                 #endif
@@ -1245,7 +1249,7 @@ static int InitSha256(wc_Sha256* sha256)
             sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
 
         #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-            #if defined(USE_INTEL_SPEEDUP) && \
+            #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
             if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
             #endif
@@ -1283,7 +1287,7 @@ static int InitSha256(wc_Sha256* sha256)
 
         /* store lengths */
     #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-        #if defined(USE_INTEL_SPEEDUP) && \
+        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
         if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
         #endif
@@ -1297,10 +1301,11 @@ static int InitSha256(wc_Sha256* sha256)
         XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
                 sizeof(word32));
 
-    #if defined(FREESCALE_MMCAU_SHA) || (defined(USE_INTEL_SPEEDUP) && \
+    #if defined(FREESCALE_MMCAU_SHA) || \
+        (defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2)))
         /* Kinetis requires only these bytes reversed */
-        #if defined(USE_INTEL_SPEEDUP) && \
+        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
         if (IS_INTEL_AVX1(intel_flags) || IS_INTEL_AVX2(intel_flags))
         #endif
@@ -1532,7 +1537,7 @@ static int InitSha256(wc_Sha256* sha256)
         sha224->loLen   = 0;
         sha224->hiLen   = 0;
 
-    #if defined(USE_INTEL_SPEEDUP) && \
+    #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
                           (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
         /* choose best Transform function under this runtime environment */
         Sha256_SetTransform();

+ 2 - 0
wolfcrypt/src/sha256_asm.S

@@ -30,6 +30,7 @@
 #define HAVE_INTEL_AVX2
 #endif /* NO_AVX2_SUPPORT */
 
+#ifdef WOLFSSL_X86_64_BUILD
 #ifdef HAVE_INTEL_AVX1
 #ifndef __APPLE__
 .data
@@ -22655,6 +22656,7 @@ L_sha256_len_avx2_rorx_done:
 .size	Transform_Sha256_AVX2_RORX_Len,.-Transform_Sha256_AVX2_RORX_Len
 #endif /* __APPLE__ */
 #endif /* HAVE_INTEL_AVX2 */
+#endif /* WOLFSSL_X86_64_BUILD */
 
 #if defined(__linux__) && defined(__ELF__)
 .section	.note.GNU-stack,"",%progbits

+ 27 - 0
wolfcrypt/test/test.c

@@ -11325,6 +11325,33 @@ WOLFSSL_TEST_SUBROUTINE int aesgcm_test(void)
             ERROR_OUT(-6394, out);
     }
 #endif /* HAVE_AES_DECRYPT */
+#ifdef BENCH_AESGCM_LARGE
+    /* large buffer streaming encrypt/decrypt (Init/Update/Final) */
+    result = wc_AesGcmEncryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
+    if (result != 0)
+        ERROR_OUT(-6360, out);
+    result = wc_AesGcmEncryptUpdate(enc, large_output, large_input,
+                                    BENCH_AESGCM_LARGE, a, sizeof(a));
+    if (result != 0)
+        ERROR_OUT(-6361, out);
+    result = wc_AesGcmEncryptFinal(enc, resultT, sizeof(t1));
+    if (result != 0)
+        ERROR_OUT(-6362, out);
+#ifdef HAVE_AES_DECRYPT
+    result = wc_AesGcmDecryptInit(enc, k1, sizeof(k1), iv1, sizeof(iv1));
+    if (result != 0)
+        ERROR_OUT(-6363, out);
+    result = wc_AesGcmDecryptUpdate(enc, large_outdec, large_output,
+                                    BENCH_AESGCM_LARGE, a, sizeof(a));
+    if (result != 0)
+        ERROR_OUT(-6364, out);
+    result = wc_AesGcmDecryptFinal(enc, resultT, sizeof(t1));
+    if (result != 0)
+        ERROR_OUT(-6365, out);
+    if (XMEMCMP(large_input, large_outdec, BENCH_AESGCM_LARGE))
+        ERROR_OUT(-6366, out);
+#endif /* HAVE_AES_DECRYPT */
+#endif /* BENCH_AESGCM_LARGE */
 #endif /* WOLFSSL_AESGCM_STREAM */
 #endif /* WOLFSSL_AES_256 */
 #endif /* !WOLFSSL_AFALG_XILINX_AES && !WOLFSSL_XILINX_CRYPT */

+ 2 - 2
wolfssl/wolfcrypt/poly1305.h

@@ -48,7 +48,7 @@
 #define WC_HAS_GCC_4_4_64BIT
 #endif
 
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
 #elif (defined(WC_HAS_SIZEOF_INT128_64BIT) || defined(WC_HAS_MSVC_64BIT) ||  \
        defined(WC_HAS_GCC_4_4_64BIT))
 #define POLY130564
@@ -67,7 +67,7 @@ enum {
 
 /* Poly1305 state */
 typedef struct Poly1305 {
-#ifdef USE_INTEL_SPEEDUP
+#if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
     word64 r[3];
     word64 h[3];
     word64 pad[2];