Chacha20 ARM optimization

Juliusz Sosinowicz · 4 years ago · commit a48981c3c6

+ 5 - 0
.gitignore

@@ -321,3 +321,8 @@ doc/pdf
 
 # XCODE Index
 IDE/XCODE/Index
+
+# ARM DS-5
+\.settings/
+\.cproject
+\.project

+ 10 - 9
IDE/LPCXPRESSO/README.md

@@ -2,15 +2,16 @@
 
 To use, install the NXP LPCXpresso IDE and import the projects in a new workspace.
 
-1. Run LPCXpresso and choose a workspace location.
-2. Right click in the project exporer window and choose Inport.
-3. Under General choose "Existing Projects into Workspace".
-4. Under "Select root directory" click browse and select the wolfSSL root.
-5. Check the "Search for nested projects" box.
-5. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:".
-6. Click finish.
-7. Download the board and chip LPCOpen package for your platform.
-8. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37.
+1. Change names of `LPCExpresso.project` and `LPCExpresso.cproject` files to `.project` and `.cproject`
+2. Run LPCXpresso and choose a workspace location.
+3. Right click in the project explorer window and choose Import.
+4. Under General choose "Existing Projects into Workspace".
+5. Under "Select root directory" click browse and select the wolfSSL root.
+6. Check the "Search for nested projects" box.
+7. Make sure "wolfssl" and "wolfssl_example" are checked under "Projects:".
+8. Click finish.
+9. Download the board and chip LPCOpen package for your platform.
+10. Import the projects. For example "lpc_board_nxp_lpcxpresso_1837" and "lpc_chip_18xx" are the ones for the LPC18S37.
 
 To set up this example to work with different boards/chips you will need to locate the LPCOpen sources for LPCXpresso on the NXP website and import the board and chip projects. Then you will need to update the "wolfssl_example" project properties to reference these projects (C/C++ General -> Paths and Symbols -> References). See the [LPCOpen v2.xx LPCXpresso quickstart guide for all platforms](https://www.lpcware.com/content/project/lpcopen-platform-nxp-lpc-microcontrollers/lpcopen-v200-quickstart-guides/lpcopen-1) for additional information.
 

+ 0 - 0
.cproject → LPCExpresso.cproject


+ 0 - 0
.project → LPCExpresso.project


+ 4 - 0
src/include.am

@@ -340,10 +340,14 @@ src_libwolfssl_la_SOURCES += wolfcrypt/src/rabbit.c
 endif
 
 if BUILD_CHACHA
+if BUILD_ARMASM
+src_libwolfssl_la_SOURCES += wolfcrypt/src/port/arm/armv8-chacha.c
+else
 src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha.c
 if BUILD_INTELASM
 src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha_asm.S
 endif
+endif
 if BUILD_POLY1305
 src_libwolfssl_la_SOURCES += wolfcrypt/src/chacha20_poly1305.c
 endif

+ 5 - 0
wolfcrypt/src/chacha.c

@@ -27,6 +27,10 @@
  */
 
 
+#ifdef WOLFSSL_ARMASM
+    /* implementation is located in wolfcrypt/src/port/arm/armv8-chacha.c */
+
+#else
 #ifdef HAVE_CONFIG_H
     #include <config.h>
 #endif
@@ -316,3 +320,4 @@ int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
 
 #endif /* HAVE_CHACHA*/
 
+#endif /* WOLFSSL_ARMASM */

+ 3 - 0
wolfcrypt/src/include.am

@@ -48,6 +48,9 @@ EXTRA_DIST += wolfcrypt/src/port/ti/ti-aes.c \
               wolfcrypt/src/port/ti/ti-ccm.c \
               wolfcrypt/src/port/pic32/pic32mz-crypt.c \
               wolfcrypt/src/port/nrf51.c \
+              wolfcrypt/src/port/arm/armv8-aes.c \
+              wolfcrypt/src/port/arm/armv8-sha256.c \
+              wolfcrypt/src/port/arm/armv8-chacha.c \
               wolfcrypt/src/port/arm/armv8-curve25519.c \
               wolfcrypt/src/port/arm/armv7-curve25519.c \
               wolfcrypt/src/port/arm/armv8-sha512-asm.c \

+ 2858 - 0
wolfcrypt/src/port/arm/armv8-chacha.c

@@ -0,0 +1,2858 @@
+/* armv8-chacha.c
+ *
+ * Copyright (C) 2006-2019 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ *
+ */
+
+/*  The techniques from the paper "NEON crypto" by Daniel J. Bernstein and Peter Schwabe were used to optimize for ARM:
+ *  https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+ */
+
+#ifdef WOLFSSL_ARMASM
+
+#ifdef HAVE_CONFIG_H
+    #include <config.h>
+#endif
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef HAVE_CHACHA
+
+#include <wolfssl/wolfcrypt/chacha.h>
+#include <wolfssl/wolfcrypt/error-crypt.h>
+#include <wolfssl/wolfcrypt/logging.h>
+#include <wolfssl/wolfcrypt/cpuid.h>
+#ifdef NO_INLINE
+    #include <wolfssl/wolfcrypt/misc.h>
+#else
+    #define WOLFSSL_MISC_INCLUDED
+    #include <wolfcrypt/src/misc.c>
+#endif
+
+#ifdef CHACHA_AEAD_TEST
+    #include <stdio.h>
+#endif
+
+#ifdef CHACHA_TEST
+    #include <stdio.h>
+#endif
+
+#ifdef BIG_ENDIAN_ORDER
+    #define LITTLE32(x) ByteReverseWord32(x)
+#else
+    #define LITTLE32(x) (x)
+#endif
+
+/* Number of rounds */
+#define ROUNDS  20
+
+#define U32C(v) (v##U)
+#define U32V(v) ((word32)(v) & U32C(0xFFFFFFFF))
+#define U8TO32_LITTLE(p) LITTLE32(((word32*)(p))[0])
+
+#define PLUS(v,w)   (U32V((v) + (w)))
+#define PLUSONE(v)  (PLUS((v),1))
+
+#define ARM_SIMD_LEN_BYTES 16
+
+/**
+  * Set up the IV (nonce). Earlier versions used a 64-bit nonce; this version
+  * uses the typical AEAD 96-bit nonce and can handle record sizes up to 256 GB.
+  */
+int wc_Chacha_SetIV(ChaCha* ctx, const byte* inIv, word32 counter)
+{
+    word32 temp[CHACHA_IV_WORDS];/* used for alignment of memory */
+
+#ifdef CHACHA_AEAD_TEST
+    word32 i;
+    printf("NONCE : ");
+    for (i = 0; i < CHACHA_IV_BYTES; i++) {
+        printf("%02x", inIv[i]);
+    }
+    printf("\n\n");
+#endif
+
+    if (ctx == NULL)
+        return BAD_FUNC_ARG;
+
+    XMEMCPY(temp, inIv, CHACHA_IV_BYTES);
+
+    ctx->X[CHACHA_IV_BYTES+0] = counter;           /* block counter */
+    ctx->X[CHACHA_IV_BYTES+1] = LITTLE32(temp[0]); /* fixed variable from nonce */
+    ctx->X[CHACHA_IV_BYTES+2] = LITTLE32(temp[1]); /* counter from nonce */
+    ctx->X[CHACHA_IV_BYTES+3] = LITTLE32(temp[2]); /* counter from nonce */
+
+    return 0;
+}
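
For orientation, the 4x4 ChaCha state that wc_Chacha_SetIV and wc_Chacha_SetKey fill in, and that all of the assembly below operates on, is laid out per RFC 7539:

```c
/* ChaCha state: 16 little-endian 32-bit words (RFC 7539)
 *
 *   X[ 0.. 3]  constants: "expand 32-byte k" (sigma) or "expand 16-byte k" (tau)
 *   X[ 4..11]  256-bit key (a 128-bit key is loaded twice)
 *   X[12]      32-bit block counter
 *   X[13..15]  96-bit nonce
 */
```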
+
+/* "expand 32-byte k" as unsigned 32 byte */
+static const word32 sigma[4] = {0x61707865, 0x3320646e, 0x79622d32, 0x6b206574};
+/* "expand 16-byte k" as unsigned 16 byte */
+static const word32 tau[4] = {0x61707865, 0x3120646e, 0x79622d36, 0x6b206574};
+
+/**
+  * Key setup. Loads the 16- or 32-byte key into state words 4-11.
+  */
+int wc_Chacha_SetKey(ChaCha* ctx, const byte* key, word32 keySz)
+{
+    const word32* constants;
+    const byte*   k;
+
+#ifdef XSTREAM_ALIGN
+    word32 alignKey[8];
+#endif
+
+    if (ctx == NULL)
+        return BAD_FUNC_ARG;
+
+    if (keySz != (CHACHA_MAX_KEY_SZ/2) && keySz != CHACHA_MAX_KEY_SZ)
+        return BAD_FUNC_ARG;
+
+#ifdef XSTREAM_ALIGN
+    if ((wolfssl_word)key % 4) {
+        WOLFSSL_MSG("wc_ChachaSetKey unaligned key");
+        XMEMCPY(alignKey, key, keySz);
+        k = (byte*)alignKey;
+    }
+    else {
+        k = key;
+    }
+#else
+    k = key;
+#endif /* XSTREAM_ALIGN */
+
+#ifdef CHACHA_AEAD_TEST
+    word32 i;
+    printf("ChaCha key used :\n");
+    for (i = 0; i < keySz; i++) {
+        printf("%02x", key[i]);
+        if ((i + 1) % 8 == 0)
+           printf("\n");
+    }
+    printf("\n\n");
+#endif
+
+    ctx->X[4] = U8TO32_LITTLE(k +  0);
+    ctx->X[5] = U8TO32_LITTLE(k +  4);
+    ctx->X[6] = U8TO32_LITTLE(k +  8);
+    ctx->X[7] = U8TO32_LITTLE(k + 12);
+    if (keySz == CHACHA_MAX_KEY_SZ) {
+        k += 16;
+        constants = sigma;
+    }
+    else {
+        constants = tau;
+    }
+    ctx->X[ 8] = U8TO32_LITTLE(k +  0);
+    ctx->X[ 9] = U8TO32_LITTLE(k +  4);
+    ctx->X[10] = U8TO32_LITTLE(k +  8);
+    ctx->X[11] = U8TO32_LITTLE(k + 12);
+    ctx->X[ 0] = constants[0];
+    ctx->X[ 1] = constants[1];
+    ctx->X[ 2] = constants[2];
+    ctx->X[ 3] = constants[3];
+
+    return 0;
+}
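
The public API is unchanged by this port. A minimal usage sketch (the wrapper name encrypt_with_chacha is illustrative; error handling abbreviated):

```c
#include <wolfssl/wolfcrypt/chacha.h>

/* Illustrative wrapper: 32-byte key, 96-bit nonce, stream-encrypt sz bytes. */
static int encrypt_with_chacha(const byte key[CHACHA_MAX_KEY_SZ],
                               const byte nonce[CHACHA_IV_BYTES],
                               const byte* plain, byte* cipher, word32 sz)
{
    ChaCha ctx;
    int ret = wc_Chacha_SetKey(&ctx, key, CHACHA_MAX_KEY_SZ);
    if (ret == 0)
        ret = wc_Chacha_SetIV(&ctx, nonce, 0);   /* start at block counter 0 */
    if (ret == 0)
        ret = wc_Chacha_Process(&ctx, cipher, plain, sz);
    return ret;
}
```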
+
+static const word32 L_chacha20_neon_inc_first_word[] = {
+    0x1,
+    0x0,
+    0x0,
+    0x0,
+};
+
+#ifdef __aarch64__
+
+static const word32 L_chacha20_neon_add_all_counters[] = {
+    0x0,
+    0x1,
+    0x2,
+    0x3,
+};
+
+static const word32 L_chacha20_neon_rol8[] = {
+    0x2010003,
+    0x6050407,
+    0xa09080b,
+    0xe0d0c0f,
+};
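
For reference, the quarter round that all of the assembly below implements, as plain C. The NEON code runs four of these at a time across vector lanes, and the rol8 table above lets a TBL byte shuffle perform the rotate-left-by-8 (on a little-endian word, that rotation just moves byte 3 down to position 0 and shifts the other bytes up):

```c
/* Scalar reference for one ChaCha quarter round; mirrors the
 * "a += b; d ^= a; d <<<= n" comments in the assembly below. */
#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

#define QUARTERROUND(a, b, c, d)        \
    a += b; d ^= a; d = ROTL32(d, 16);  \
    c += d; b ^= c; b = ROTL32(b, 12);  \
    a += b; d ^= a; d = ROTL32(d,  8);  \
    c += d; b ^= c; b = ROTL32(b,  7);
```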
+
+static WC_INLINE void wc_Chacha_encrypt_320(const word32* input, const byte* m, byte* c, word32 bytes)
+{
+#ifdef CHACHA_TEST
+    printf("Entering wc_Chacha_encrypt_320 with %d bytes\n", bytes);
+#endif /*CHACHA_TEST */
+
+    __asm__ __volatile__ (
+        /*
+         * The layout of used registers is:
+         * ARM
+         * w4-w19: these registers hold the fifth ChaCha block, computed in regular ARM registers
+         * w20: loop counter for how many even-odd rounds need to be executed
+         * w21: the counter offset for the block in ARM registers
+         * NEON
+         * v0-v15: the i'th register holds the i'th word of four blocks during the quarter rounds.
+         *         these registers are later transposed to make ADDing the input and XORing the message easier.
+         * v16-v19: helper registers used as temporary locations to store data
+         * v20-v23: hold the next message block
+         * v24-v27: the 64-byte initial ChaCha block
+         * v28: vector to increment the counter words of each block
+         * v29: vector of 5's to increment counters between L_chacha20_arm64_outer_%= loops
+         * v30: table lookup indices to rotate values by 8
+         */
+
+        /* Load counter-add values for each block */
+        "LD1    {v28.4s}, [%[L_chacha20_neon_add_all_counters]] \n\t"
+        /* Load index look-up for rotating left 8 bits */
+        "LD1    {v30.16b}, [%[L_chacha20_neon_rol8]] \n\t"
+        /* For adding 5 to each counter-add for next 320-byte chunk */
+        "MOVI   v29.4s, #5 \n\t"
+        /* Counter for 5th block in regular registers */
+        "MOV    w21, #4 \n\t"
+        /* Load state to encrypt */
+        "LD1    {v24.4s-v27.4s}, [%[input]] \n\t"
+        "\n"
+    "L_chacha20_arm64_outer_%=: \n\t"
+        /* Move state into regular registers */
+        "MOV    x4, v24.d[0] \n\t"
+        "MOV    x6, v24.d[1] \n\t"
+        "MOV    x8, v25.d[0] \n\t"
+        "MOV    x10, v25.d[1] \n\t"
+        "MOV    x12, v26.d[0] \n\t"
+        "MOV    x14, v26.d[1] \n\t"
+        "MOV    x16, v27.d[0] \n\t"
+        "MOV    x18, v27.d[1] \n\t"
+        /* Move state into vector registers (x4: each state word duplicated across the four lanes/blocks) */
+        "DUP    v0.4s, v24.s[0] \n\t"
+        "DUP    v1.4s, v24.s[1] \n\t"
+        "LSR    x5, x4, #32 \n\t"
+        "DUP    v2.4s, v24.s[2] \n\t"
+        "DUP    v3.4s, v24.s[3] \n\t"
+        "LSR    x7, x6, #32 \n\t"
+        "DUP    v4.4s, v25.s[0] \n\t"
+        "DUP    v5.4s, v25.s[1] \n\t"
+        "LSR    x9, x8, #32 \n\t"
+        "DUP    v6.4s, v25.s[2] \n\t"
+        "DUP    v7.4s, v25.s[3] \n\t"
+        "LSR    x11, x10, #32 \n\t"
+        "DUP    v8.4s, v26.s[0] \n\t"
+        "DUP    v9.4s, v26.s[1] \n\t"
+        "LSR    x13, x12, #32 \n\t"
+        "DUP    v10.4s, v26.s[2] \n\t"
+        "DUP    v11.4s, v26.s[3] \n\t"
+        "LSR    x15, x14, #32 \n\t"
+        "DUP    v12.4s, v27.s[0] \n\t"
+        "DUP    v13.4s, v27.s[1] \n\t"
+        "LSR    x17, x16, #32 \n\t"
+        "DUP    v14.4s, v27.s[2] \n\t"
+        "DUP    v15.4s, v27.s[3] \n\t"
+        "LSR    x19, x18, #32 \n\t"
+        /* Add to counter word */
+        "ADD    v12.4s, v12.4s, v28.4s \n\t"
+        "ADD    w16, w16, w21 \n\t"
+        /* Set number of odd+even rounds to perform */
+        "MOV    w20, #10 \n\t"
+        "\n"
+    "L_chacha20_arm64_inner_%=: \n\t"
+        "SUBS   w20, w20, #1 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4s, v0.4s, v4.4s \n\t"
+        "ADD    w4, w4, w8 \n\t"
+        "ADD    v1.4s, v1.4s, v5.4s \n\t"
+        "ADD    w5, w5, w9 \n\t"
+        "ADD    v2.4s, v2.4s, v6.4s \n\t"
+        "ADD    w6, w6, w10 \n\t"
+        "ADD    v3.4s, v3.4s, v7.4s \n\t"
+        "ADD    w7, w7, w11 \n\t"
+        "EOR    v12.16b, v12.16b, v0.16b \n\t"
+        "EOR    w16, w16, w4 \n\t"
+        "EOR    v13.16b, v13.16b, v1.16b \n\t"
+        "EOR    w17, w17, w5 \n\t"
+        "EOR    v14.16b, v14.16b, v2.16b \n\t"
+        "EOR    w18, w18, w6 \n\t"
+        "EOR    v15.16b, v15.16b, v3.16b \n\t"
+        "EOR    w19, w19, w7 \n\t"
+        "REV32  v12.8h, v12.8h \n\t"
+        "ROR    w16, w16, #16 \n\t"
+        "REV32  v13.8h, v13.8h \n\t"
+        "ROR    w17, w17, #16 \n\t"
+        "REV32  v14.8h, v14.8h \n\t"
+        "ROR    w18, w18, #16 \n\t"
+        "REV32  v15.8h, v15.8h \n\t"
+        "ROR    w19, w19, #16 \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v8.4s, v8.4s, v12.4s \n\t"
+        "ADD    w12, w12, w16 \n\t"
+        "ADD    v9.4s, v9.4s, v13.4s \n\t"
+        "ADD    w13, w13, w17 \n\t"
+        "ADD    v10.4s, v10.4s, v14.4s \n\t"
+        "ADD    w14, w14, w18 \n\t"
+        "ADD    v11.4s, v11.4s, v15.4s \n\t"
+        "ADD    w15, w15, w19 \n\t"
+        "EOR    v16.16b, v4.16b, v8.16b \n\t"
+        "EOR    w8, w8, w12 \n\t"
+        "EOR    v17.16b, v5.16b, v9.16b \n\t"
+        "EOR    w9, w9, w13 \n\t"
+        "EOR    v18.16b, v6.16b, v10.16b \n\t"
+        "EOR    w10, w10, w14 \n\t"
+        "EOR    v19.16b, v7.16b, v11.16b \n\t"
+        "EOR    w11, w11, w15 \n\t"
+        "SHL    v4.4s, v16.4s, #12 \n\t"
+        "ROR    w8, w8, #20 \n\t"
+        "SHL    v5.4s, v17.4s, #12 \n\t"
+        "ROR    w9, w9, #20 \n\t"
+        "SHL    v6.4s, v18.4s, #12 \n\t"
+        "ROR    w10, w10, #20 \n\t"
+        "SHL    v7.4s, v19.4s, #12 \n\t"
+        "ROR    w11, w11, #20 \n\t"
+        "SRI    v4.4s, v16.4s, #20 \n\t"
+        "SRI    v5.4s, v17.4s, #20 \n\t"
+        "SRI    v6.4s, v18.4s, #20 \n\t"
+        "SRI    v7.4s, v19.4s, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4s, v0.4s, v4.4s \n\t"
+        "ADD    w4, w4, w8 \n\t"
+        "ADD    v1.4s, v1.4s, v5.4s \n\t"
+        "ADD    w5, w5, w9 \n\t"
+        "ADD    v2.4s, v2.4s, v6.4s \n\t"
+        "ADD    w6, w6, w10 \n\t"
+        "ADD    v3.4s, v3.4s, v7.4s \n\t"
+        "ADD    w7, w7, w11 \n\t"
+        "EOR    v12.16b, v12.16b, v0.16b \n\t"
+        "EOR    w16, w16, w4 \n\t"
+        "EOR    v13.16b, v13.16b, v1.16b \n\t"
+        "EOR    w17, w17, w5 \n\t"
+        "EOR    v14.16b, v14.16b, v2.16b \n\t"
+        "EOR    w18, w18, w6 \n\t"
+        "EOR    v15.16b, v15.16b, v3.16b \n\t"
+        "EOR    w19, w19, w7 \n\t"
+        "TBL    v12.16b, { v12.16b }, v30.16b \n\t"
+        "ROR    w16, w16, #24 \n\t"
+        "TBL    v13.16b, { v13.16b }, v30.16b \n\t"
+        "ROR    w17, w17, #24 \n\t"
+        "TBL    v14.16b, { v14.16b }, v30.16b \n\t"
+        "ROR    w18, w18, #24 \n\t"
+        "TBL    v15.16b, { v15.16b }, v30.16b \n\t"
+        "ROR    w19, w19, #24 \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v8.4s, v8.4s, v12.4s \n\t"
+        "ADD    w12, w12, w16 \n\t"
+        "ADD    v9.4s, v9.4s, v13.4s \n\t"
+        "ADD    w13, w13, w17 \n\t"
+        "ADD    v10.4s, v10.4s, v14.4s \n\t"
+        "ADD    w14, w14, w18 \n\t"
+        "ADD    v11.4s, v11.4s, v15.4s \n\t"
+        "ADD    w15, w15, w19 \n\t"
+        "EOR    v16.16b, v4.16b, v8.16b \n\t"
+        "EOR    w8, w8, w12 \n\t"
+        "EOR    v17.16b, v5.16b, v9.16b \n\t"
+        "EOR    w9, w9, w13 \n\t"
+        "EOR    v18.16b, v6.16b, v10.16b \n\t"
+        "EOR    w10, w10, w14 \n\t"
+        "EOR    v19.16b, v7.16b, v11.16b \n\t"
+        "EOR    w11, w11, w15 \n\t"
+        "SHL    v4.4s, v16.4s, #7 \n\t"
+        "ROR    w8, w8, #25 \n\t"
+        "SHL    v5.4s, v17.4s, #7 \n\t"
+        "ROR    w9, w9, #25 \n\t"
+        "SHL    v6.4s, v18.4s, #7 \n\t"
+        "ROR    w10, w10, #25 \n\t"
+        "SHL    v7.4s, v19.4s, #7 \n\t"
+        "ROR    w11, w11, #25 \n\t"
+        "SRI    v4.4s, v16.4s, #25 \n\t"
+        "SRI    v5.4s, v17.4s, #25 \n\t"
+        "SRI    v6.4s, v18.4s, #25 \n\t"
+        "SRI    v7.4s, v19.4s, #25 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4s, v0.4s, v5.4s \n\t"
+        "ADD    w4, w4, w9 \n\t"
+        "ADD    v1.4s, v1.4s, v6.4s \n\t"
+        "ADD    w5, w5, w10 \n\t"
+        "ADD    v2.4s, v2.4s, v7.4s \n\t"
+        "ADD    w6, w6, w11 \n\t"
+        "ADD    v3.4s, v3.4s, v4.4s \n\t"
+        "ADD    w7, w7, w8 \n\t"
+        "EOR    v15.16b, v15.16b, v0.16b \n\t"
+        "EOR    w19, w19, w4 \n\t"
+        "EOR    v12.16b, v12.16b, v1.16b \n\t"
+        "EOR    w16, w16, w5 \n\t"
+        "EOR    v13.16b, v13.16b, v2.16b \n\t"
+        "EOR    w17, w17, w6 \n\t"
+        "EOR    v14.16b, v14.16b, v3.16b \n\t"
+        "EOR    w18, w18, w7 \n\t"
+        "REV32  v15.8h, v15.8h \n\t"
+        "ROR    w19, w19, #16 \n\t"
+        "REV32  v12.8h, v12.8h \n\t"
+        "ROR    w16, w16, #16 \n\t"
+        "REV32  v13.8h, v13.8h \n\t"
+        "ROR    w17, w17, #16 \n\t"
+        "REV32  v14.8h, v14.8h \n\t"
+        "ROR    w18, w18, #16 \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v10.4s, v10.4s, v15.4s \n\t"
+        "ADD    w14, w14, w19 \n\t"
+        "ADD    v11.4s, v11.4s, v12.4s \n\t"
+        "ADD    w15, w15, w16 \n\t"
+        "ADD    v8.4s, v8.4s, v13.4s \n\t"
+        "ADD    w12, w12, w17 \n\t"
+        "ADD    v9.4s, v9.4s, v14.4s \n\t"
+        "ADD    w13, w13, w18 \n\t"
+        "EOR    v16.16b, v5.16b, v10.16b \n\t"
+        "EOR    w9, w9, w14 \n\t"
+        "EOR    v17.16b, v6.16b, v11.16b \n\t"
+        "EOR    w10, w10, w15 \n\t"
+        "EOR    v18.16b, v7.16b, v8.16b \n\t"
+        "EOR    w11, w11, w12 \n\t"
+        "EOR    v19.16b, v4.16b, v9.16b \n\t"
+        "EOR    w8, w8, w13 \n\t"
+        "SHL    v5.4s, v16.4s, #12 \n\t"
+        "ROR    w9, w9, #20 \n\t"
+        "SHL    v6.4s, v17.4s, #12 \n\t"
+        "ROR    w10, w10, #20 \n\t"
+        "SHL    v7.4s, v18.4s, #12 \n\t"
+        "ROR    w11, w11, #20 \n\t"
+        "SHL    v4.4s, v19.4s, #12 \n\t"
+        "ROR    w8, w8, #20 \n\t"
+        "SRI    v5.4s, v16.4s, #20 \n\t"
+        "SRI    v6.4s, v17.4s, #20 \n\t"
+        "SRI    v7.4s, v18.4s, #20 \n\t"
+        "SRI    v4.4s, v19.4s, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4s, v0.4s, v5.4s \n\t"
+        "ADD    w4, w4, w9 \n\t"
+        "ADD    v1.4s, v1.4s, v6.4s \n\t"
+        "ADD    w5, w5, w10 \n\t"
+        "ADD    v2.4s, v2.4s, v7.4s \n\t"
+        "ADD    w6, w6, w11 \n\t"
+        "ADD    v3.4s, v3.4s, v4.4s \n\t"
+        "ADD    w7, w7, w8 \n\t"
+        "EOR    v15.16b, v15.16b, v0.16b \n\t"
+        "EOR    w19, w19, w4 \n\t"
+        "EOR    v12.16b, v12.16b, v1.16b \n\t"
+        "EOR    w16, w16, w5 \n\t"
+        "EOR    v13.16b, v13.16b, v2.16b \n\t"
+        "EOR    w17, w17, w6 \n\t"
+        "EOR    v14.16b, v14.16b, v3.16b \n\t"
+        "EOR    w18, w18, w7 \n\t"
+        "TBL    v15.16b, { v15.16b }, v30.16b \n\t"
+        "ROR    w19, w19, #24 \n\t"
+        "TBL    v12.16b, { v12.16b }, v30.16b \n\t"
+        "ROR    w16, w16, #24 \n\t"
+        "TBL    v13.16b, { v13.16b }, v30.16b \n\t"
+        "ROR    w17, w17, #24 \n\t"
+        "TBL    v14.16b, { v14.16b }, v30.16b \n\t"
+        "ROR    w18, w18, #24 \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v10.4s, v10.4s, v15.4s \n\t"
+        "ADD    w14, w14, w19 \n\t"
+        "ADD    v11.4s, v11.4s, v12.4s \n\t"
+        "ADD    w15, w15, w16 \n\t"
+        "ADD    v8.4s, v8.4s, v13.4s \n\t"
+        "ADD    w12, w12, w17 \n\t"
+        "ADD    v9.4s, v9.4s, v14.4s \n\t"
+        "ADD    w13, w13, w18 \n\t"
+        "EOR    v16.16b, v5.16b, v10.16b \n\t"
+        "EOR    w9, w9, w14 \n\t"
+        "EOR    v17.16b, v6.16b, v11.16b \n\t"
+        "EOR    w10, w10, w15 \n\t"
+        "EOR    v18.16b, v7.16b, v8.16b \n\t"
+        "EOR    w11, w11, w12 \n\t"
+        "EOR    v19.16b, v4.16b, v9.16b \n\t"
+        "EOR    w8, w8, w13 \n\t"
+        "SHL    v5.4s, v16.4s, #7 \n\t"
+        "ROR    w9, w9, #25 \n\t"
+        "SHL    v6.4s, v17.4s, #7 \n\t"
+        "ROR    w10, w10, #25 \n\t"
+        "SHL    v7.4s, v18.4s, #7 \n\t"
+        "ROR    w11, w11, #25 \n\t"
+        "SHL    v4.4s, v19.4s, #7 \n\t"
+        "ROR    w8, w8, #25 \n\t"
+        "SRI    v5.4s, v16.4s, #25 \n\t"
+        "SRI    v6.4s, v17.4s, #25 \n\t"
+        "SRI    v7.4s, v18.4s, #25 \n\t"
+        "SRI    v4.4s, v19.4s, #25 \n\t"
+        "BNE    L_chacha20_arm64_inner_%= \n\t"
+        /* Add counter now rather than after transposed */
+        "ADD    v12.4s, v12.4s, v28.4s \n\t"
+        "ADD    w16, w16, w21 \n\t"
+        /* Load message */
+        "LD1    {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+        /* Transpose vectors */
+        "TRN1   v16.4s, v0.4s, v1.4s \n\t"
+        "TRN1   v18.4s, v2.4s, v3.4s \n\t"
+        "TRN2   v17.4s, v0.4s, v1.4s \n\t"
+        "TRN2   v19.4s, v2.4s, v3.4s \n\t"
+        "TRN1   v0.2d, v16.2d, v18.2d \n\t"
+        "TRN1   v1.2d, v17.2d, v19.2d \n\t"
+        "TRN2   v2.2d, v16.2d, v18.2d \n\t"
+        "TRN2   v3.2d, v17.2d, v19.2d \n\t"
+        "TRN1   v16.4s, v4.4s, v5.4s \n\t"
+        "TRN1   v18.4s, v6.4s, v7.4s \n\t"
+        "TRN2   v17.4s, v4.4s, v5.4s \n\t"
+        "TRN2   v19.4s, v6.4s, v7.4s \n\t"
+        "TRN1   v4.2d, v16.2d, v18.2d \n\t"
+        "TRN1   v5.2d, v17.2d, v19.2d \n\t"
+        "TRN2   v6.2d, v16.2d, v18.2d \n\t"
+        "TRN2   v7.2d, v17.2d, v19.2d \n\t"
+        "TRN1   v16.4s, v8.4s, v9.4s \n\t"
+        "TRN1   v18.4s, v10.4s, v11.4s \n\t"
+        "TRN2   v17.4s, v8.4s, v9.4s \n\t"
+        "TRN2   v19.4s, v10.4s, v11.4s \n\t"
+        "TRN1   v8.2d, v16.2d, v18.2d \n\t"
+        "TRN1   v9.2d, v17.2d, v19.2d \n\t"
+        "TRN2   v10.2d, v16.2d, v18.2d \n\t"
+        "TRN2   v11.2d, v17.2d, v19.2d \n\t"
+        "TRN1   v16.4s, v12.4s, v13.4s \n\t"
+        "TRN1   v18.4s, v14.4s, v15.4s \n\t"
+        "TRN2   v17.4s, v12.4s, v13.4s \n\t"
+        "TRN2   v19.4s, v14.4s, v15.4s \n\t"
+        "TRN1   v12.2d, v16.2d, v18.2d \n\t"
+        "TRN1   v13.2d, v17.2d, v19.2d \n\t"
+        "TRN2   v14.2d, v16.2d, v18.2d \n\t"
+        "TRN2   v15.2d, v17.2d, v19.2d \n\t"
+        /* Add back state, XOR in message and store (load next block) */
+        "ADD    v16.4s, v0.4s, v24.4s \n\t"
+        "ADD    v17.4s, v4.4s, v25.4s \n\t"
+        "ADD    v18.4s, v8.4s, v26.4s \n\t"
+        "ADD    v19.4s, v12.4s, v27.4s \n\t"
+        "EOR    v16.16b, v16.16b, v20.16b \n\t"
+        "EOR    v17.16b, v17.16b, v21.16b \n\t"
+        "EOR    v18.16b, v18.16b, v22.16b \n\t"
+        "EOR    v19.16b, v19.16b, v23.16b \n\t"
+        "LD1    {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+        "ST1    {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+        "ADD    v16.4s, v1.4s, v24.4s \n\t"
+        "ADD    v17.4s, v5.4s, v25.4s \n\t"
+        "ADD    v18.4s, v9.4s, v26.4s \n\t"
+        "ADD    v19.4s, v13.4s, v27.4s \n\t"
+        "EOR    v16.16b, v16.16b, v20.16b \n\t"
+        "EOR    v17.16b, v17.16b, v21.16b \n\t"
+        "EOR    v18.16b, v18.16b, v22.16b \n\t"
+        "EOR    v19.16b, v19.16b, v23.16b \n\t"
+        "LD1    {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+        "ST1    {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+        "ADD    v16.4s, v2.4s, v24.4s \n\t"
+        "ADD    v17.4s, v6.4s, v25.4s \n\t"
+        "ADD    v18.4s, v10.4s, v26.4s \n\t"
+        "ADD    v19.4s, v14.4s, v27.4s \n\t"
+        "EOR    v16.16b, v16.16b, v20.16b \n\t"
+        "EOR    v17.16b, v17.16b, v21.16b \n\t"
+        "EOR    v18.16b, v18.16b, v22.16b \n\t"
+        "EOR    v19.16b, v19.16b, v23.16b \n\t"
+        "LD1    {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+        "ST1    {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+        "ADD    v16.4s, v3.4s, v24.4s \n\t"
+        "ADD    v17.4s, v7.4s, v25.4s \n\t"
+        "ADD    v18.4s, v11.4s, v26.4s \n\t"
+        "ADD    v19.4s, v15.4s, v27.4s \n\t"
+        "EOR    v16.16b, v16.16b, v20.16b \n\t"
+        "EOR    v17.16b, v17.16b, v21.16b \n\t"
+        "EOR    v18.16b, v18.16b, v22.16b \n\t"
+        "EOR    v19.16b, v19.16b, v23.16b \n\t"
+        "LD1    {v20.4s-v23.4s}, [%[m]], #64 \n\t"
+        "ST1    {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+        /* Move regular registers into vector registers for adding and xor */
+        "ORR    x4, x4, x5, LSL #32 \n\t"
+        "ORR    x6, x6, x7, LSL #32 \n\t"
+        "ORR    x8, x8, x9, LSL #32 \n\t"
+        "MOV    v16.d[0], x4 \n\t"
+        "ORR    x10, x10, x11, LSL #32 \n\t"
+        "MOV    v16.d[1], x6 \n\t"
+        "ORR    x12, x12, x13, LSL #32 \n\t"
+        "MOV    v17.d[0], x8 \n\t"
+        "ORR    x14, x14, x15, LSL #32 \n\t"
+        "MOV    v17.d[1], x10 \n\t"
+        "ORR    x16, x16, x17, LSL #32 \n\t"
+        "MOV    v18.d[0], x12 \n\t"
+        "ORR    x18, x18, x19, LSL #32 \n\t"
+        "MOV    v18.d[1], x14 \n\t"
+        "MOV    v19.d[0], x16 \n\t"
+        "MOV    v19.d[1], x18 \n\t"
+        /* Add back state, XOR in message and store */
+        "ADD    v16.4s, v16.4s, v24.4s \n\t"
+        "ADD    v17.4s, v17.4s, v25.4s \n\t"
+        "ADD    v18.4s, v18.4s, v26.4s \n\t"
+        "ADD    v19.4s, v19.4s, v27.4s \n\t"
+        "EOR    v16.16b, v16.16b, v20.16b \n\t"
+        "EOR    v17.16b, v17.16b, v21.16b \n\t"
+        "EOR    v18.16b, v18.16b, v22.16b \n\t"
+        "EOR    v19.16b, v19.16b, v23.16b \n\t"
+        "ADD    w21, w21, #5 \n\t"
+        "ST1    {v16.4s-v19.4s}, [%[c]], #64 \n\t"
+        "SUBS   %[bytes], %[bytes], #320 \n\t"
+        "ADD    v28.4s, v28.4s, v29.4s \n\t"
+        "BNE    L_chacha20_arm64_outer_%= \n\t"
+        : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c),
+          [bytes] "+r" (bytes)
+        : [L_chacha20_neon_add_all_counters] "r" (L_chacha20_neon_add_all_counters),
+          [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
+        : "memory", "cc",
+          "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12",
+          "x13", "x14", "x15", "x16", "x17", "x18", "x19", "x20", "x21",
+          "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
+          "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18",
+          "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"
+    );
+}
+#endif /* __aarch64__ */
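
The function keeps five blocks in flight per iteration: four in NEON registers, whose counter words are offset by v28 = {0,1,2,3}, and a fifth in general-purpose registers at offset w21 (starting at 4); both offsets advance by 5 per 320-byte chunk. A standalone sketch of just that counter schedule (illustrative, not wolfSSL code):

```c
#include <stdio.h>

/* Prints which block-counter offset each of the five per-iteration
 * blocks of wc_Chacha_encrypt_320 uses, for three 320-byte chunks. */
int main(void)
{
    unsigned gpr = 4;                       /* w21: scalar block's offset */
    for (unsigned chunk = 0; chunk < 3; chunk++) {
        for (unsigned i = 0; i < 4; i++)    /* v28 = {0,1,2,3} + 5*chunk  */
            printf("NEON block: counter + %u\n", (gpr - 4) + i);
        printf("GPR  block: counter + %u\n", gpr);
        gpr += 5;                           /* ADD w21, w21, #5; v28 += v29 */
    }
    return 0;
}
```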
+
+/**
+  * Encrypts 256 bytes (four ChaCha blocks) at once: three blocks in NEON
+  * registers and one in general-purpose registers.
+  */
+static WC_INLINE int wc_Chacha_encrypt_256(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c)
+{
+#ifdef CHACHA_TEST
+    printf("Entering wc_Chacha_encrypt_256\n");
+#endif /*CHACHA_TEST */
+
+#ifdef __aarch64__
+    __asm__ __volatile__ (
+        // v0-v3 - first block
+        // v12 first block helper
+        // v4-v7 - second block
+        // v13 second block helper
+        // v8-v11 - third block
+        // v14 third block helper
+        // w4-w19 - fourth block
+
+        // v0  0  1  2  3
+        // v1  4  5  6  7
+        // v2  8  9 10 11
+        // v3 12 13 14 15
+        // load CHACHA state with indices placed as shown above
+        /* Load state to encrypt */
+        "LD1    {v20.4S-v23.4S}, [%[input]] \n\t"
+        /* Load index look-up for rotating left 8 bits */
+        "LD1    {v24.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+        /* Move state into regular registers */
+        "MOV    x4, v20.D[0] \n\t"
+        "MOV    x6, v20.D[1] \n\t"
+        "MOV    x8, v21.D[0] \n\t"
+        "MOV    x10, v21.D[1] \n\t"
+        "MOV    x12, v22.D[0] \n\t"
+        "MOV    x14, v22.D[1] \n\t"
+        "MOV    x16, v23.D[0] \n\t"
+        "MOV    x18, v23.D[1] \n\t"
+        /* Move state into vector registers (x3: one copy per NEON block) */
+        "MOV    v0.16B, v20.16B \n\t"
+        "MOV    v1.16B, v21.16B \n\t"
+        "LSR    x19, x18, #32 \n\t"
+        "MOV    v2.16B, v22.16B \n\t"
+        "ADD    w20, w16, #1 \n\t"
+        "MOV    v3.16B, v23.16B \n\t"
+        "LSR    x17, x16, #32 \n\t"
+        "MOV    v4.16B, v20.16B \n\t"
+        "MOV    v5.16B, v21.16B \n\t"
+        "LSR    x15, x14, #32 \n\t"
+        "MOV    v6.16B, v22.16B \n\t"
+        "ADD    w21, w16, #2 \n\t"
+        "MOV    v7.16B, v23.16B \n\t"
+        "LSR    x13, x12, #32 \n\t"
+        "MOV    v8.16B, v20.16B \n\t"
+        "MOV    v9.16B, v21.16B \n\t"
+        "LSR    x11, x10, #32 \n\t"
+        "MOV    v10.16B, v22.16B \n\t"
+        "ADD    w16, w16, #3 \n\t"
+        "MOV    v11.16B, v23.16B \n\t"
+        "LSR    x9, x8, #32 \n\t"
+        /* Set counter word */
+        "MOV    v7.S[0], w20 \n\t"
+        "LSR    x7, x6, #32 \n\t"
+        "MOV    v11.S[0], w21 \n\t"
+        "LSR    x5, x4, #32 \n\t"
+        /* Set number of odd+even rounds to perform */
+        "MOV    w3, #10 \n\t"
+        "\n"
+    "L_chacha20_arm64_256_loop_%=: \n\t"
+        "SUBS   w3, w3, #1 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    w4, w4, w8 \n\t"
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    w5, w5, w9 \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "ADD    w6, w6, w10 \n\t"
+        "ADD    v8.4S, v8.4S, v9.4S \n\t"
+        "ADD    w7, w7, w11 \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    w16, w16, w4 \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "EOR    w17, w17, w5 \n\t"
+        "EOR    v11.16B, v11.16B, v8.16B \n\t"
+        "EOR    w18, w18, w6 \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        "EOR    w19, w19, w7 \n\t"
+        "REV32  v7.8H, v7.8H \n\t"
+        "ROR    w16, w16, #16 \n\t"
+        "REV32  v11.8H, v11.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ROR    w17, w17, #16 \n\t"
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ROR    w18, w18, #16 \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "ROR    w19, w19, #16 \n\t"
+        "ADD    v10.4S, v10.4S, v11.4S \n\t"
+        "ADD    w12, w12, w16 \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "ADD    w13, w13, w17 \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "ADD    w14, w14, w18 \n\t"
+        "EOR    v14.16B, v9.16B, v10.16B \n\t"
+        "ADD    w15, w15, w19 \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "EOR    w8, w8, w12 \n\t"
+        "SHL    v5.4S, v13.4S, #12 \n\t"
+        "EOR    w9, w9, w13 \n\t"
+        "SHL    v9.4S, v14.4S, #12 \n\t"
+        "EOR    w10, w10, w14 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        "EOR    w11, w11, w15 \n\t"
+        "SRI    v5.4S, v13.4S, #20 \n\t"
+        "ROR    w8, w8, #20 \n\t"
+        "SRI    v9.4S, v14.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ROR    w9, w9, #20 \n\t"
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ROR    w10, w10, #20 \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "ROR    w11, w11, #20 \n\t"
+        "ADD    v8.4S, v8.4S, v9.4S \n\t"
+        "ADD    w4, w4, w8 \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "ADD    w5, w5, w9 \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "ADD    w6, w6, w10 \n\t"
+        "EOR    v11.16B, v11.16B, v8.16B \n\t"
+        "ADD    w7, w7, w11 \n\t"
+        "TBL    v3.16B, { v3.16B }, v24.16B \n\t"
+        "EOR    w16, w16, w4 \n\t"
+        "TBL    v7.16B, { v7.16B }, v24.16B \n\t"
+        "EOR    w17, w17, w5 \n\t"
+        "TBL    v11.16B, { v11.16B }, v24.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "EOR    w18, w18, w6 \n\t"
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    w19, w19, w7 \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "ROR    w16, w16, #24 \n\t"
+        "ADD    v10.4S, v10.4S, v11.4S \n\t"
+        "ROR    w17, w17, #24 \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "ROR    w18, w18, #24 \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "ROR    w19, w19, #24 \n\t"
+        "EOR    v14.16B, v9.16B, v10.16B \n\t"
+        "ADD    w12, w12, w16 \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "ADD    w13, w13, w17 \n\t"
+        "SHL    v5.4S, v13.4S, #7 \n\t"
+        "ADD    w14, w14, w18 \n\t"
+        "SHL    v9.4S, v14.4S, #7 \n\t"
+        "ADD    w15, w15, w19 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EOR    w8, w8, w12 \n\t"
+        "SRI    v5.4S, v13.4S, #25 \n\t"
+        "EOR    w9, w9, w13 \n\t"
+        "SRI    v9.4S, v14.4S, #25 \n\t"
+        "EOR    w10, w10, w14 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EOR    w11, w11, w15 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        "ROR    w8, w8, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "ROR    w9, w9, #25 \n\t"
+        "EXT    v5.16B, v5.16B, v5.16B, #4 \n\t"
+        "ROR    w10, w10, #25 \n\t"
+        "EXT    v6.16B, v6.16B, v6.16B, #8 \n\t"
+        "ROR    w11, w11, #25 \n\t"
+        "EXT    v7.16B, v7.16B, v7.16B, #12 \n\t"
+        "EXT    v9.16B, v9.16B, v9.16B, #4 \n\t"
+        "EXT    v10.16B, v10.16B, v10.16B, #8 \n\t"
+        "EXT    v11.16B, v11.16B, v11.16B, #12 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    w4, w4, w9 \n\t"
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    w5, w5, w10 \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "ADD    w6, w6, w11 \n\t"
+        "ADD    v8.4S, v8.4S, v9.4S \n\t"
+        "ADD    w7, w7, w8 \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    w19, w19, w4 \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "EOR    w16, w16, w5 \n\t"
+        "EOR    v11.16B, v11.16B, v8.16B \n\t"
+        "EOR    w17, w17, w6 \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        "EOR    w18, w18, w7 \n\t"
+        "REV32  v7.8H, v7.8H \n\t"
+        "ROR    w19, w19, #16 \n\t"
+        "REV32  v11.8H, v11.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ROR    w16, w16, #16 \n\t"
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ROR    w17, w17, #16 \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "ROR    w18, w18, #16 \n\t"
+        "ADD    v10.4S, v10.4S, v11.4S \n\t"
+        "ADD    w14, w14, w19 \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "ADD    w15, w15, w16 \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "ADD    w12, w12, w17 \n\t"
+        "EOR    v14.16B, v9.16B, v10.16B \n\t"
+        "ADD    w13, w13, w18 \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "EOR    w9, w9, w14 \n\t"
+        "SHL    v5.4S, v13.4S, #12 \n\t"
+        "EOR    w10, w10, w15 \n\t"
+        "SHL    v9.4S, v14.4S, #12 \n\t"
+        "EOR    w11, w11, w12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        "EOR    w8, w8, w13 \n\t"
+        "SRI    v5.4S, v13.4S, #20 \n\t"
+        "ROR    w9, w9, #20 \n\t"
+        "SRI    v9.4S, v14.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ROR    w10, w10, #20 \n\t"
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ROR    w11, w11, #20 \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "ROR    w8, w8, #20 \n\t"
+        "ADD    v8.4S, v8.4S, v9.4S \n\t"
+        "ADD    w4, w4, w9 \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "ADD    w5, w5, w10 \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "ADD    w6, w6, w11 \n\t"
+        "EOR    v11.16B, v11.16B, v8.16B \n\t"
+        "ADD    w7, w7, w8 \n\t"
+        "TBL    v3.16B, { v3.16B }, v24.16B \n\t"
+        "EOR    w19, w19, w4 \n\t"
+        "TBL    v7.16B, { v7.16B }, v24.16B \n\t"
+        "EOR    w16, w16, w5 \n\t"
+        "TBL    v11.16B, { v11.16B }, v24.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "EOR    w17, w17, w6 \n\t"
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    w18, w18, w7 \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "ROR    w19, w19, #24 \n\t"
+        "ADD    v10.4S, v10.4S, v11.4S \n\t"
+        "ROR    w16, w16, #24 \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "ROR    w17, w17, #24 \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "ROR    w18, w18, #24 \n\t"
+        "EOR    v14.16B, v9.16B, v10.16B \n\t"
+        "ADD    w14, w14, w19 \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "ADD    w15, w15, w16 \n\t"
+        "SHL    v5.4S, v13.4S, #7 \n\t"
+        "ADD    w12, w12, w17 \n\t"
+        "SHL    v9.4S, v14.4S, #7 \n\t"
+        "ADD    w13, w13, w18 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EOR    w9, w9, w14 \n\t"
+        "SRI    v5.4S, v13.4S, #25 \n\t"
+        "EOR    w10, w10, w15 \n\t"
+        "SRI    v9.4S, v14.4S, #25 \n\t"
+        "EOR    w11, w11, w12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EOR    w8, w8, w13 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        "ROR    w9, w9, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "ROR    w10, w10, #25 \n\t"
+        "EXT    v5.16B, v5.16B, v5.16B, #12 \n\t"
+        "ROR    w11, w11, #25 \n\t"
+        "EXT    v6.16B, v6.16B, v6.16B, #8 \n\t"
+        "ROR    w8, w8, #25 \n\t"
+        "EXT    v7.16B, v7.16B, v7.16B, #4 \n\t"
+        "EXT    v9.16B, v9.16B, v9.16B, #12 \n\t"
+        "EXT    v10.16B, v10.16B, v10.16B, #8 \n\t"
+        "EXT    v11.16B, v11.16B, v11.16B, #4 \n\t"
+        "BNE    L_chacha20_arm64_256_loop_%= \n\t"
+        /* Load message */
+        "LD1    {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+        /* Add one more (v23's counter lane already holds +2, giving the +3 this block needs) */
+        "ADD    w16, w16, #1 \n\t"
+        /* Add back state, XOR in message and store (load next block) */
+        "ADD    v0.4S, v0.4S, v20.4S \n\t"
+        "ADD    v1.4S, v1.4S, v21.4S \n\t"
+        "ADD    v2.4S, v2.4S, v22.4S \n\t"
+        "ADD    v3.4S, v3.4S, v23.4S \n\t"
+        "EOR    v0.16B, v0.16B, v16.16B \n\t"
+        "EOR    v1.16B, v1.16B, v17.16B \n\t"
+        "EOR    v2.16B, v2.16B, v18.16B \n\t"
+        "EOR    v3.16B, v3.16B, v19.16B \n\t"
+        "LD1    {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+        "ST1    {v0.4S-v3.4S}, [%[c]], #64 \n\t"
+        "MOV    v23.S[0], w20 \n\t"
+        "ADD    v4.4S, v4.4S, v20.4S \n\t"
+        "ADD    v5.4S, v5.4S, v21.4S \n\t"
+        "ADD    v6.4S, v6.4S, v22.4S \n\t"
+        "ADD    v7.4S, v7.4S, v23.4S \n\t"
+        "EOR    v4.16B, v4.16B, v16.16B \n\t"
+        "EOR    v5.16B, v5.16B, v17.16B \n\t"
+        "EOR    v6.16B, v6.16B, v18.16B \n\t"
+        "EOR    v7.16B, v7.16B, v19.16B \n\t"
+        "LD1    {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+        "ST1    {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+        "MOV    v23.S[0], w21 \n\t"
+        "ADD    v8.4S, v8.4S, v20.4S \n\t"
+        "ADD    v9.4S, v9.4S, v21.4S \n\t"
+        "ADD    v10.4S, v10.4S, v22.4S \n\t"
+        "ADD    v11.4S, v11.4S, v23.4S \n\t"
+        "EOR    v8.16B, v8.16B, v16.16B \n\t"
+        "EOR    v9.16B, v9.16B, v17.16B \n\t"
+        "EOR    v10.16B, v10.16B, v18.16B \n\t"
+        "EOR    v11.16B, v11.16B, v19.16B \n\t"
+        "LD1    {v16.4S-v19.4S}, [%[m]], #64 \n\t"
+        "ST1    {v8.4S-v11.4S}, [%[c]], #64 \n\t"
+        /* Move regular registers into vector registers for adding and xor */
+        "ORR    x4, x4, x5, lsl #32 \n\t"
+        "ORR    x6, x6, x7, lsl #32 \n\t"
+        "ORR    x8, x8, x9, lsl #32 \n\t"
+        "MOV    v12.D[0], x4 \n\t"
+        "ORR    x10, x10, x11, lsl #32 \n\t"
+        "MOV    v12.D[1], x6 \n\t"
+        "ORR    x12, x12, x13, lsl #32 \n\t"
+        "MOV    v13.D[0], x8 \n\t"
+        "ORR    x14, x14, x15, lsl #32 \n\t"
+        "MOV    v13.D[1], x10 \n\t"
+        "ORR    x16, x16, x17, lsl #32 \n\t"
+        "MOV    v14.D[0], x12 \n\t"
+        "ORR    x18, x18, x19, lsl #32 \n\t"
+        "MOV    v14.D[1], x14 \n\t"
+        "MOV    v15.D[0], x16 \n\t"
+        "MOV    v15.D[1], x18 \n\t"
+        /* Add back state, XOR in message and store */
+        "ADD    v12.4S, v12.4S, v20.4S \n\t"
+        "ADD    v13.4S, v13.4S, v21.4S \n\t"
+        "ADD    v14.4S, v14.4S, v22.4S \n\t"
+        "ADD    v15.4S, v15.4S, v23.4S \n\t"
+        "EOR    v12.16B, v12.16B, v16.16B \n\t"
+        "EOR    v13.16B, v13.16B, v17.16B \n\t"
+        "EOR    v14.16B, v14.16B, v18.16B \n\t"
+        "EOR    v15.16B, v15.16B, v19.16B \n\t"
+        "ST1    {v12.4S-v15.4S}, [%[c]], #64 \n\t"
+        : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c)
+        : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8)
+        : "memory", "x3", "x4", "x5", "x6", "x7", "x8", "x9",
+          "x10", "x11", "x12", "x13", "x14", "x15", "x16",
+          "x17", "x18", "x19", "x20", "x21", "v0", "v1",
+          "v2", "v3", "v4", "v5", "v6", "v7", "v8",
+          "v9", "v10", "v11", "v12", "v13", "v14",
+          "v15", "v16", "v17", "v18", "v19", "v20",
+          "v21", "v22", "v23"
+    );
+#else
+    word32 x[CHACHA_CHUNK_WORDS];
+    word32* x_addr = x;
+    __asm__ __volatile__ (
+        // The techniques from the paper "NEON crypto" by Daniel J. Bernstein and Peter Schwabe were used to optimize for ARM
+        // https://cryptojedi.org/papers/neoncrypto-20120320.pdf
+
+        "LDR r14, %[input] \n\t" // load input address
+        "MOV r11, #1 \n\t"
+
+        "LDM r14, { r0-r12 } \n\t"
+        "STRD r10, r11, %[x_10] \n\t"
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7  8  9  10  11  12
+        "VMOV d0, r0, r1 \n\t"
+        "VMOV d1, r2, r3 \n\t"
+        "VMOV d2, r4, r5 \n\t"
+        "VMOV d3, r6, r7 \n\t"
+        "VMOV d4, r8, r9 \n\t"
+        "VMOV d5, r10, r11 \n\t"
+        "VMOV q4, q0 \n\t"
+        "VMOV q5, q1 \n\t"
+        "VMOV q6, q2 \n\t"
+        "VMOV q8, q0 \n\t"
+        "VMOV q9, q1 \n\t"
+        "VMOV q10, q2 \n\t"
+        "LDRD r11, r10, [r14, #4*14] \n\t"
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7  8  9  15  14  12
+        "VMOV d7, r11, r10 \n\t"
+        "STR r10, %[x_15] \n\t"
+        "VMOV d15, r11, r10 \n\t"
+        "VMOV d23, r11, r10 \n\t"
+        "MOV r10, r12 \n\t"
+        "MOV r12, r11 \n\t"
+        "LDR r11, [r14, #4*13] \n\t"
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7  8  9  12  13  14
+
+        "MOV r14, %[rounds] \n\t"
+
+        "VMOV d6, r10, r11 \n\t"
+        "ADD r10, r10, #1 \n\t"
+        "VMOV d14, r10, r11 \n\t"
+        "ADD r10, r10, #1 \n\t"
+        "VMOV d22, r10, r11 \n\t"
+        "ADD r10, r10, #1 \n\t" // ARM calculates the fourth block (two was already added earlier)
+        "\n"
+    "L_chacha20_arm32_256_loop_%=: \n\t"
+        "SUBS r14, r14, #1 \n\t"
+
+        // 0, 4,  8, 12
+        // 1, 5,  9, 13
+
+        // ODD ROUND
+        "ADD r0, r0, r4 \n\t" // 0 0 4
+        "VADD.I32 q0, q0, q1 \n\t"
+        "ADD r1, r1, r5 \n\t" // 1 1 5
+        "VADD.I32 q4, q4, q5 \n\t"
+        "EOR r10, r10, r0 \n\t" // 12 12 0
+        "VADD.I32 q8, q8, q9 \n\t"
+        "EOR r11, r11, r1 \n\t" // 13 13 1
+        "VEOR q12, q3, q0 \n\t"
+        "ROR r10, r10, #16 \n\t" // 12 12
+        "VEOR q13, q7, q4 \n\t"
+        "ROR r11, r11, #16 \n\t" // 13 13
+        "VEOR q14, q11, q8 \n\t"
+        "ADD r8, r8, r10 \n\t" // 8 8 12
+        // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words
+        "VREV32.16 q3, q12 \n\t"
+        "ADD r9, r9, r11 \n\t" //  9 9 13
+        "VREV32.16 q7, q13 \n\t"
+        "EOR r4, r4, r8 \n\t" // 4 4 8
+        "VREV32.16 q11, q14 \n\t"
+
+        "EOR r5, r5, r9 \n\t" // 5 5 9
+        "VADD.I32 q2, q2, q3 \n\t"
+        "ROR r4, r4, #20 \n\t" // 4 4
+        "VADD.I32 q6, q6, q7 \n\t"
+        "ROR r5, r5, #20 \n\t" // 5 5
+        "VADD.I32 q10, q10, q11 \n\t"
+        "ADD r0, r0, r4 \n\t" // 0 0 4
+        "VEOR q12, q1, q2 \n\t"
+        "ADD r1, r1, r5 \n\t" // 1 1 5
+        "VEOR q13, q5, q6 \n\t"
+        "EOR r10, r10, r0 \n\t" // 12 12 0
+        "VEOR q14, q9, q10 \n\t"
+        "EOR r11, r11, r1 \n\t" // 13 13 1
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q1, q12, #12 \n\t"
+        "ROR r10, r10, #24 \n\t" // 12 12
+        "VSHL.I32 q5, q13, #12 \n\t"
+        "ROR r11, r11, #24 \n\t" // 13 13
+        "VSHL.I32 q9, q14, #12 \n\t"
+        "ADD r8, r8, r10 \n\t" // 8 8 12
+        "VSRI.I32 q1, q12, #20 \n\t"
+        "ADD r9, r9, r11 \n\t" // 9 9 13
+        "VSRI.I32 q5, q13, #20 \n\t"
+        "STR r11, %[x_13] \n\t"
+        "VSRI.I32 q9, q14, #20 \n\t"
+
+        "LDR r11, %[x_15] \n\t"
+        "VADD.I32 q0, q0, q1 \n\t"
+        "EOR r4, r4, r8 \n\t" // 4 4 8
+        "VADD.I32 q4, q4, q5 \n\t"
+        "STR r8, %[x_8] \n\t"
+        "VADD.I32 q8, q8, q9 \n\t"
+        "LDR r8, %[x_10] \n\t"
+        "VEOR q12, q3, q0 \n\t"
+        "EOR r5, r5, r9 \n\t" // 5 5 9
+        "VEOR q13, q7, q4 \n\t"
+        "STR r9, %[x_9] \n\t"
+        "VEOR q14, q11, q8 \n\t"
+        "LDR r9, %[x_11] \n\t"
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q3, q12, #8 \n\t"
+        "ROR r4, r4, #25 \n\t" // 4 4
+        "VSHL.I32 q7, q13, #8 \n\t"
+        "ROR r5, r5, #25 \n\t" // 5 5
+        "VSHL.I32 q11, q14, #8 \n\t"
+
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7 10 11  12  15  14
+
+        // 2, 6, 10, 14
+        // 3, 7, 11, 15
+
+        "ADD r2, r2, r6 \n\t" // 2 2 6
+        "VSRI.I32 q3, q12, #24 \n\t"
+        "ADD r3, r3, r7 \n\t" // 3 3 7
+        "VSRI.I32 q7, q13, #24 \n\t"
+        "EOR r12, r12, r2 \n\t" // 14 14 2
+        "VSRI.I32 q11, q14, #24 \n\t"
+
+        "EOR r11, r11, r3 \n\t" // 15 15 3
+        "VADD.I32 q2, q2, q3 \n\t"
+        "ROR r12, r12, #16 \n\t" // 14 14
+        "VADD.I32 q6, q6, q7 \n\t"
+        "ROR r11, r11, #16 \n\t" // 15 15
+        "VADD.I32 q10, q10, q11 \n\t"
+        "ADD r8, r8, r12 \n\t" // 10 10 14
+        "VEOR q12, q1, q2 \n\t"
+        "ADD r9, r9, r11 \n\t" // 11 11 15
+        "VEOR q13, q5, q6 \n\t"
+        "EOR r6, r6, r8 \n\t" // 6 6 10
+        "VEOR q14, q9, q10 \n\t"
+        "EOR r7, r7, r9 \n\t" // 7 7 11
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q1, q12, #7 \n\t"
+        "ROR r6, r6, #20 \n\t" // 6 6
+        "VSHL.I32 q5, q13, #7 \n\t"
+        "ROR r7, r7, #20 \n\t" // 7 7
+        "VSHL.I32 q9, q14, #7 \n\t"
+        "ADD r2, r2, r6 \n\t" // 2 2 6
+        "VSRI.I32 q1, q12, #25 \n\t"
+        "ADD r3, r3, r7 \n\t" // 3 3 7
+        "VSRI.I32 q5, q13, #25 \n\t"
+        "EOR r12, r12, r2 \n\t" // 14 14 2
+        "VSRI.I32 q9, q14, #25 \n\t"
+
+        // EVEN ROUND
+
+        "EOR r11, r11, r3 \n\t" // 15 15 3
+        "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one
+        "ROR r12, r12, #24 \n\t" // 14 14
+        "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+        "ROR r11, r11, #24 \n\t" // 15 15
+        "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three
+
+        "ADD r8, r8, r12 \n\t" // 10 10 14
+        "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one
+        "ADD r9, r9, r11 \n\t" // 11 11 15
+        "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+        "EOR r6, r6, r8 \n\t" // 6 6 10
+        "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three
+
+        "EOR r7, r7, r9 \n\t" // 7 7 11
+        "VEXT.8 q9, q9, q9, #4 \n\t" // permute elements left by one
+        "ROR r6, r6, #25 \n\t" // 6 6
+        "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two
+        "ROR r7, r7, #25 \n\t" // 7 7
+        "VEXT.8 q11, q11, q11, #12 \n\t" // permute elements left by three
+
+        // 0, 5, 10, 15
+        // 1, 6, 11, 12
+
+        "ADD r0, r0, r5 \n\t" // 0 0 5
+        "VADD.I32 q0, q0, q1 \n\t"
+        "ADD r1, r1, r6 \n\t" // 1 1 6
+        "VADD.I32 q4, q4, q5 \n\t"
+        "EOR r11, r11, r0 \n\t" // 15 15 0
+        "VADD.I32 q8, q8, q9 \n\t"
+        "EOR r10, r10, r1 \n\t" // 12 12 1
+        "VEOR q12, q3, q0 \n\t"
+        "ROR r11, r11, #16 \n\t" // 15 15
+        "VEOR q13, q7, q4 \n\t"
+        "ROR r10, r10, #16 \n\t" // 12 12
+        "VEOR q14, q11, q8 \n\t"
+        "ADD r8, r8, r11 \n\t" // 10 10 15
+        // rotation by 16 bits may be done by reversing the 16 bit elements in 32 bit words
+        "VREV32.16 q3, q12 \n\t"
+        "ADD r9, r9, r10 \n\t" // 11 11 12
+        "VREV32.16 q7, q13 \n\t"
+        "EOR r5, r5, r8 \n\t" // 5 5 10
+        "VREV32.16 q11, q14 \n\t"
+
+        "EOR r6, r6, r9 \n\t" // 6 6 11
+        "VADD.I32 q2, q2, q3 \n\t"
+        "ROR r5, r5, #20 \n\t" // 5 5
+        "VADD.I32 q6, q6, q7 \n\t"
+        "ROR r6, r6, #20 \n\t" // 6 6
+        "VADD.I32 q10, q10, q11 \n\t"
+        "ADD r0, r0, r5 \n\t" // 0 0 5
+        "VEOR q12, q1, q2 \n\t"
+        "ADD r1, r1, r6 \n\t" // 1 1 6
+        "VEOR q13, q5, q6 \n\t"
+        "EOR r11, r11, r0 \n\t" // 15 15 0
+        "VEOR q14, q9, q10 \n\t"
+        "EOR r10, r10, r1 \n\t" // 12 12 1
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q1, q12, #12 \n\t"
+        "ROR r11, r11, #24 \n\t" // 15 15
+        "VSHL.I32 q5, q13, #12 \n\t"
+        "ROR r10, r10, #24 \n\t" // 12 12
+        "VSHL.I32 q9, q14, #12 \n\t"
+        "ADD r8, r8, r11 \n\t" // 10 10 15
+        "VSRI.I32 q1, q12, #20 \n\t"
+        "STR r11, %[x_15] \n\t"
+        "VSRI.I32 q5, q13, #20 \n\t"
+        "LDR r11, %[x_13] \n\t"
+        "VSRI.I32 q9, q14, #20 \n\t"
+
+        "ADD r9, r9, r10 \n\t" // 11 11 12
+        "VADD.I32 q0, q0, q1 \n\t"
+        "EOR r5, r5, r8 \n\t" // 5 5 10
+        "VADD.I32 q4, q4, q5 \n\t"
+        "STR r8, %[x_10] \n\t"
+        "VADD.I32 q8, q8, q9 \n\t"
+        "LDR r8, %[x_8] \n\t"
+        "VEOR q12, q3, q0 \n\t"
+        "EOR r6, r6, r9 \n\t" // 6 6 11
+        "VEOR q13, q7, q4 \n\t"
+        "STR r9, %[x_11] \n\t"
+        "VEOR q14, q11, q8 \n\t"
+        "LDR r9, %[x_9] \n\t"
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q3, q12, #8 \n\t"
+        "ROR r5, r5, #25 \n\t" // 5 5
+        "VSHL.I32 q7, q13, #8 \n\t"
+        "ROR r6, r6, #25 \n\t" // 6 6
+        "VSHL.I32 q11, q14, #8 \n\t"
+
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7  8  9  12  13  14
+
+        // 2, 7,  8, 13
+        // 3, 4,  9, 14
+
+        "ADD r2, r2, r7 \n\t" // 2 2 7
+        "VSRI.I32 q3, q12, #24 \n\t"
+        "ADD r3, r3, r4 \n\t" // 3 3 4
+        "VSRI.I32 q7, q13, #24 \n\t"
+        "EOR r11, r11, r2 \n\t" // 13 13 2
+        "VSRI.I32 q11, q14, #24 \n\t"
+
+        "EOR r12, r12, r3 \n\t" // 14 14 3
+        "VADD.I32 q2, q2, q3 \n\t"
+        "ROR r11, r11, #16 \n\t" // 13 13
+        "VADD.I32 q6, q6, q7 \n\t"
+        "ROR r12, r12, #16 \n\t" // 14 14
+        "VADD.I32 q10, q10, q11 \n\t"
+        "ADD r8, r8, r11 \n\t" // 8 8 13
+        "VEOR q12, q1, q2 \n\t"
+        "ADD r9, r9, r12 \n\t" // 9 9 14
+        "VEOR q13, q5, q6 \n\t"
+        "EOR r7, r7, r8 \n\t" // 7 7 8
+        "VEOR q14, q9, q10 \n\t"
+        "EOR r4, r4, r9 \n\t" // 4 4 9
+        // SIMD instructions don't support rotation, so we have to cheat using shifts and a helper register
+        "VSHL.I32 q1, q12, #7 \n\t"
+        "ROR r7, r7, #20 \n\t" // 7 7
+        "VSHL.I32 q5, q13, #7 \n\t"
+        "ROR r4, r4, #20 \n\t" // 4 4
+        "VSHL.I32 q9, q14, #7 \n\t"
+        "ADD r2, r2, r7 \n\t" // 2 2 7
+        "VSRI.I32 q1, q12, #25 \n\t"
+        "ADD r3, r3, r4 \n\t" // 3 3 4
+        "VSRI.I32 q5, q13, #25 \n\t"
+        "EOR r11, r11, r2 \n\t" // 13 13 2
+        "VSRI.I32 q9, q14, #25 \n\t"
+
+        "EOR r12, r12, r3 \n\t" // 14 14 3
+        "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three
+        "ROR r11, r11, #24 \n\t" // 13 13
+        "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+        "ROR r12, r12, #24 \n\t" // 14 14
+        "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one
+
+        "ADD r8, r8, r11 \n\t" // 8 8 13
+        "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three
+        "ADD r9, r9, r12 \n\t" // 9 9 14
+        "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+        "EOR r7, r7, r8 \n\t" // 7 7 8
+        "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one
+
+        "EOR r4, r4, r9 \n\t" // 4 4 9
+        "VEXT.8 q9, q9, q9, #12 \n\t" // permute elements left by three
+        "ROR r7, r7, #25 \n\t" // 7 7
+        "VEXT.8 q10, q10, q10, #8 \n\t" // permute elements left by two
+        "ROR r4, r4, #25 \n\t" // 4 4
+        "VEXT.8 q11, q11, q11, #4 \n\t" // permute elements left by one
+
+        "BNE L_chacha20_arm32_256_loop_%= \n\t"
+
+        "LDR r14, %[x_addr] \n\t" // load address of x to r14
+        // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 r10 r11 r12
+        //  0  1  2  3  4  5  6  7  8  9  12  13  14
+        "ADD r10, r10, #3 \n\t" // add three here to make later NEON easier
+        "STM r14, { r0-r9 } \n\t"
+        "STRD r10, r11, [r14, #4*12] \n\t"
+        "LDR r9, %[input] \n\t" // load input address
+        "STR r12, [r14, #4*14] \n\t"
+        "LDR r10, %[c] \n\t" // load c address
+
+        "VLDM r9, { q12-q15 } \n\t"
+        "LDR r12, %[m] \n\t" // load m address
+
+        "VADD.I32 q0, q0, q12 \n\t"
+        "VADD.I32 q1, q1, q13 \n\t"
+        "VADD.I32 q2, q2, q14 \n\t"
+        "VADD.I32 q3, q3, q15 \n\t"
+
+        "VADD.I32 q4, q4, q12 \n\t"
+        "VADD.I32 q5, q5, q13 \n\t"
+        "VADD.I32 q6, q6, q14 \n\t"
+        "VADD.I32 q7, q7, q15 \n\t"
+
+        "MOV r11, #1 \n\t"
+
+        "VADD.I32 q8, q8, q12 \n\t"
+        "VMOV.I32 q12, #0 \n\t"
+        "VADD.I32 q9, q9, q13 \n\t"
+        "VMOV.I32 d24[0], r11 \n\t"
+        "VADD.I32 q10, q10, q14 \n\t"
+        "VADD.I32 q11, q11, q15 \n\t"
+
+        "VADD.I32 q11, q11, q12 \n\t" // add one to counter
+        "VADD.I32 q7, q7, q12 \n\t" // add one to counter
+        "VADD.I32 q11, q11, q12 \n\t" // add one to counter
+
+        "VLDM r12!, { q12-q15 } \n\t" // load m
+        "VEOR q0, q0, q12 \n\t"
+        "VEOR q1, q1, q13 \n\t"
+        "VEOR q2, q2, q14 \n\t"
+        "VEOR q3, q3, q15 \n\t"
+        "VSTM r10!, { q0-q3 } \n\t" // store to c
+
+        "VLDM r14, { q0-q3 } \n\t " // load final block from x
+
+        "VLDM r12!, { q12-q15 } \n\t" // load m
+        "VEOR q4, q4, q12 \n\t"
+        "VEOR q5, q5, q13 \n\t"
+        "VEOR q6, q6, q14 \n\t"
+        "VEOR q7, q7, q15 \n\t"
+        "VSTM r10!, { q4-q7 } \n\t" // store to c
+
+        "VLDM r9, { q4-q7 } \n\t" // load input
+
+        "VLDM r12!, { q12-q15 } \n\t" // load m
+        "VEOR q8, q8, q12 \n\t"
+        "VEOR q9, q9, q13 \n\t"
+        "VEOR q10, q10, q14 \n\t"
+        "VEOR q11, q11, q15 \n\t"
+        "VSTM r10!, { q8-q11 } \n\t" // store to c
+
+        "VLDM r12!, { q12-q15 } \n\t" // load m
+        "VADD.I32 q0, q0, q4 \n\t"
+        "VADD.I32 q1, q1, q5 \n\t"
+        "VADD.I32 q2, q2, q6 \n\t"
+        "VADD.I32 q3, q3, q7 \n\t" // three was added earlier
+        "VEOR q0, q0, q12 \n\t"
+        "VEOR q1, q1, q13 \n\t"
+        "VEOR q2, q2, q14 \n\t"
+        "VEOR q3, q3, q15 \n\t"
+        "VSTM r10!, { q0-q3 } \n\t" // store to c
+
+            : [c] "+m" (c),
+              [x_0] "=m" (x),
+              [x_8] "=m" (x[8]),
+              [x_9] "=m" (x[9]),
+              [x_10] "=m" (x[10]),
+              [x_11] "=m" (x[11]),
+              [x_13] "=m" (x[13]),
+              [x_15] "=m" (x[15])
+            : [rounds] "I" (ROUNDS/2), [input] "m" (input),
+              [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES),
+              [m] "m" (m), [x_addr] "m" (x_addr)
+            : "memory", "cc",
+              "r0", "r1", "r2", "r3",
+              "r4", "r5", "r6", "r7",
+              "r8", "r9", "r10", "r11", "r12", "r14",
+              "q0",  "q1",  "q2", "q3", "q4",
+              "q5",  "q6",  "q7", "q8", "q9",
+              "q10", "q11", "q12", "q13", "q14", "q15"
+    );
+
+#endif /* __aarch64__ */
+    return CHACHA_CHUNK_BYTES * 4;
+}
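
Two idioms recur throughout both the AArch64 and 32-bit paths: rotations that can't be done with byte-level tricks (REV32 for <<<16, TBL for <<<8 on AArch64) are built from a left shift plus a shift-right-and-insert (SHL/SRI, or VSHL.I32/VSRI.I32 above), since NEON has no 32-bit rotate; and EXT rotates each block's rows between the odd and even rounds so the same column-wise code also covers the diagonals. A scalar sketch of the shift-insert rotate:

```c
#include <stdint.h>

/* Rotate left by n (0 < n < 32) the way the NEON code does it:
 * a left shift makes the high part, and a shift-right-insert
 * merges back the bits that wrapped around. */
static uint32_t rotl_shift_insert(uint32_t v, unsigned n)
{
    uint32_t t = v << n;     /* SHL Vd, Vn, #n      */
    t |= v >> (32 - n);      /* SRI Vd, Vn, #(32-n) */
    return t;
}
```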
+
+
+static WC_INLINE int wc_Chacha_encrypt_128(const word32 input[CHACHA_CHUNK_WORDS], const byte* m, byte* c)
+{
+#ifdef CHACHA_TEST
+    printf("Entering wc_Chacha_encrypt_128\n");
+#endif /* CHACHA_TEST */
+
+#ifdef __aarch64__
+    __asm__ __volatile__ (
+        /* Load incrementer register to modify counter */
+        "LD1    {v22.16B}, [%[L_chacha20_neon_inc_first_word]] \n\t"
+        /* Load index look-up for rotating left 8 bits */
+        "LD1    {v23.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+        /* Load state to encrypt */
+        "LD1    {v18.4S-v21.4S}, [%[input]] \n\t"
+        /* Load message */
+        "LD1    {v14.4S-v17.4S}, [%[m]], #64 \n\t"
+        /* Move state into vector registers (x3) */
+        "MOV    v0.16B, v18.16B \n\t"
+        "MOV    v1.16B, v19.16B \n\t"
+        "MOV    v2.16B, v20.16B \n\t"
+        "MOV    v3.16B, v21.16B \n\t"
+        "MOV    v4.16B, v18.16B \n\t"
+        "MOV    v5.16B, v19.16B \n\t"
+        "MOV    v6.16B, v20.16B \n\t"
+        "MOV    v7.16B, v21.16B \n\t"
+        /* Add counter word */
+        "ADD    v7.4S, v7.4S, v22.4S \n\t"
+        /* Set number of odd+even rounds to perform */
+        "MOV    w3, #10 \n\t"
+        "\n"
+    "L_chacha20_arm64_128_loop_%=: \n\t"
+        "SUBS   w3, w3, #1 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        "REV32  v7.8H, v7.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SHL    v5.4S, v13.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        "SRI    v5.4S, v13.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v23.16B \n\t"
+        "TBL    v7.16B, { v7.16B }, v23.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SHL    v5.4S, v13.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "SRI    v5.4S, v13.4S, #25 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v5.16B, v5.16B, v5.16B, #4 \n\t"
+        "EXT    v6.16B, v6.16B, v6.16B, #8 \n\t"
+        "EXT    v7.16B, v7.16B, v7.16B, #12 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        "REV32  v7.8H, v7.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SHL    v5.4S, v13.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        "SRI    v5.4S, v13.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "ADD    v4.4S, v4.4S, v5.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "EOR    v7.16B, v7.16B, v4.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v23.16B \n\t"
+        "TBL    v7.16B, { v7.16B }, v23.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "ADD    v6.4S, v6.4S, v7.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "EOR    v13.16B, v5.16B, v6.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SHL    v5.4S, v13.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "SRI    v5.4S, v13.4S, #25 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v5.16B, v5.16B, v5.16B, #12 \n\t"
+        "EXT    v6.16B, v6.16B, v6.16B, #8 \n\t"
+        "EXT    v7.16B, v7.16B, v7.16B, #4 \n\t"
+        "BNE    L_chacha20_arm64_128_loop_%= \n\t"
+        /* Add back state, XOR in message and store (load next block) */
+        "ADD    v0.4S, v0.4S, v18.4S \n\t"
+        "ADD    v1.4S, v1.4S, v19.4S \n\t"
+        "ADD    v2.4S, v2.4S, v20.4S \n\t"
+        "ADD    v3.4S, v3.4S, v21.4S \n\t"
+        "EOR    v0.16B, v0.16B, v14.16B \n\t"
+        "EOR    v1.16B, v1.16B, v15.16B \n\t"
+        "EOR    v2.16B, v2.16B, v16.16B \n\t"
+        "EOR    v3.16B, v3.16B, v17.16B \n\t"
+        "LD1    {v14.4S-v17.4S}, [%[m]], #64 \n\t"
+        "ST1    {v0.4S-v3.4S}, [%[c]], #64 \n\t"
+        "ADD    v21.4S, v21.4S, v22.4S \n\t"
+        "ADD    v4.4S, v4.4S, v18.4S \n\t"
+        "ADD    v5.4S, v5.4S, v19.4S \n\t"
+        "ADD    v6.4S, v6.4S, v20.4S \n\t"
+        "ADD    v7.4S, v7.4S, v21.4S \n\t"
+        "EOR    v4.16B, v4.16B, v14.16B \n\t"
+        "EOR    v5.16B, v5.16B, v15.16B \n\t"
+        "EOR    v6.16B, v6.16B, v16.16B \n\t"
+        "EOR    v7.16B, v7.16B, v17.16B \n\t"
+        "ST1    {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+        : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c)
+        : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8),
+          [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+        : "memory", "x3", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
+          "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
+          "v16", "v17", "v18", "v19", "v20", "v21"
+    );
+#else
+    __asm__ __volatile__ (
+        "MOV r11, %[rounds] \n\t"
+        "MOV r12, #1 \n\t"
+        "VLDM %[input], { q0-q3 } \n\t"
+        "VMOV.I32 q8, #0 \n\t"
+        "VMOV q4, q0 \n\t"
+        "VMOV.I32 d16[0], r12 \n\t"
+        "VMOV q5, q1 \n\t"
+        "VMOV q6, q2 \n\t"
+        "VADD.I32 q7, q3, q8 \n\t" // add one to counter
+
+        // save a copy of the input state for the final add
+        "VMOV q10, q0 \n\t"
+        "VMOV q11, q1 \n\t"
+        "VMOV q12, q2 \n\t"
+        "VMOV q13, q3 \n\t"
+        "\n"
+    "L_chacha20_arm32_128_loop_%=: \n\t"
+        "SUBS r11, r11, #1 \n\t"
+
+        // ODD ROUND
+        "VADD.I32 q0, q0, q1 \n\t"
+        "VADD.I32 q4, q4, q5 \n\t"
+        "VEOR q8, q3, q0 \n\t"
+        "VEOR q9, q7, q4 \n\t"
+        // rotating left by 16 == swapping the 16-bit halves of each 32-bit word
+        "VREV32.16 q3, q8 \n\t"
+        "VREV32.16 q7, q9 \n\t"
+
+        "VADD.I32 q2, q2, q3 \n\t"
+        "VADD.I32 q6, q6, q7 \n\t"
+        "VEOR q8, q1, q2 \n\t"
+        "VEOR q9, q5, q6 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q1, q8, #12 \n\t"
+        "VSHL.I32 q5, q9, #12 \n\t"
+        "VSRI.I32 q1, q8, #20 \n\t"
+        "VSRI.I32 q5, q9, #20 \n\t"
+
+        "VADD.I32 q0, q0, q1 \n\t"
+        "VADD.I32 q4, q4, q5 \n\t"
+        "VEOR q8, q3, q0 \n\t"
+        "VEOR q9, q7, q4 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q3, q8, #8 \n\t"
+        "VSHL.I32 q7, q9, #8 \n\t"
+        "VSRI.I32 q3, q8, #24 \n\t"
+        "VSRI.I32 q7, q9, #24 \n\t"
+
+        "VADD.I32 q2, q2, q3 \n\t"
+        "VADD.I32 q6, q6, q7 \n\t"
+        "VEOR q8, q1, q2 \n\t"
+        "VEOR q9, q5, q6 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q1, q8, #7 \n\t"
+        "VSHL.I32 q5, q9, #7 \n\t"
+        "VSRI.I32 q1, q8, #25 \n\t"
+        "VSRI.I32 q5, q9, #25 \n\t"
+
+        // EVEN ROUND
+
+        "VEXT.8 q1, q1, q1, #4 \n\t" // permute elements left by one
+        "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+        "VEXT.8 q3, q3, q3, #12 \n\t" // permute elements left by three
+
+        "VEXT.8 q5, q5, q5, #4 \n\t" // permute elements left by one
+        "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+        "VEXT.8 q7, q7, q7, #12 \n\t" // permute elements left by three
+
+        "VADD.I32 q0, q0, q1 \n\t"
+        "VADD.I32 q4, q4, q5 \n\t"
+        "VEOR q8, q3, q0 \n\t"
+        "VEOR q9, q7, q4 \n\t"
+        // rotating left by 16 == swapping the 16-bit halves of each 32-bit word
+        "VREV32.16 q3, q8 \n\t"
+        "VREV32.16 q7, q9 \n\t"
+
+        "VADD.I32 q2, q2, q3 \n\t"
+        "VADD.I32 q6, q6, q7 \n\t"
+        "VEOR q8, q1, q2 \n\t"
+        "VEOR q9, q5, q6 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q1, q8, #12 \n\t"
+        "VSHL.I32 q5, q9, #12 \n\t"
+        "VSRI.I32 q1, q8, #20 \n\t"
+        "VSRI.I32 q5, q9, #20 \n\t"
+
+        "VADD.I32 q0, q0, q1 \n\t"
+        "VADD.I32 q4, q4, q5 \n\t"
+        "VEOR q8, q3, q0 \n\t"
+        "VEOR q9, q7, q4 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q3, q8, #8 \n\t"
+        "VSHL.I32 q7, q9, #8 \n\t"
+        "VSRI.I32 q3, q8, #24 \n\t"
+        "VSRI.I32 q7, q9, #24 \n\t"
+
+        "VADD.I32 q2, q2, q3 \n\t"
+        "VADD.I32 q6, q6, q7 \n\t"
+        "VEOR q8, q1, q2 \n\t"
+        "VEOR q9, q5, q6 \n\t"
+        // NEON has no vector rotate; emulate it with shifts and a helper register
+        "VSHL.I32 q1, q8, #7 \n\t"
+        "VSHL.I32 q5, q9, #7 \n\t"
+        "VSRI.I32 q1, q8, #25 \n\t"
+        "VSRI.I32 q5, q9, #25 \n\t"
+
+        "VEXT.8 q1, q1, q1, #12 \n\t" // permute elements left by three
+        "VEXT.8 q2, q2, q2, #8 \n\t" // permute elements left by two
+        "VEXT.8 q3, q3, q3, #4 \n\t" // permute elements left by one
+
+        "VEXT.8 q5, q5, q5, #12 \n\t" // permute elements left by three
+        "VEXT.8 q6, q6, q6, #8 \n\t" // permute elements left by two
+        "VEXT.8 q7, q7, q7, #4 \n\t" // permute elements left by one
+
+        "BNE L_chacha20_arm32_128_loop_%= \n\t"
+
+        "VMOV.I32 q8, #0 \n\t"
+        "VADD.I32 q0, q0, q10 \n\t"
+        "VADD.I32 q1, q1, q11 \n\t"
+        "VMOV.I32 d16[0], r12 \n\t"
+        "VADD.I32 q2, q2, q12 \n\t"
+        "VADD.I32 q3, q3, q13 \n\t"
+
+        "VADD.I32 q13, q13, q8 \n\t" // add one to counter
+
+        "VADD.I32 q4, q4, q10 \n\t"
+        "VADD.I32 q5, q5, q11 \n\t"
+        "VADD.I32 q6, q6, q12 \n\t"
+        "VADD.I32 q7, q7, q13 \n\t"
+
+        "VLDM %[m], { q8-q15 } \n\t"
+        "VEOR q0, q0, q8 \n\t"
+        "VEOR q1, q1, q9 \n\t"
+        "VEOR q2, q2, q10 \n\t"
+        "VEOR q3, q3, q11 \n\t"
+        "VEOR q4, q4, q12 \n\t"
+        "VEOR q5, q5, q13 \n\t"
+        "VEOR q6, q6, q14 \n\t"
+        "VEOR q7, q7, q15 \n\t"
+        "VSTM %[c], { q0-q7 } \n\t"
+
+        : [c] "+r" (c), [m] "+r" (m)
+        : [rounds] "I" (ROUNDS/2), [input] "r" (input),
+          [chacha_chunk_bytes] "I" (CHACHA_CHUNK_BYTES)
+        : "memory", "cc",
+          "r11", "r12",
+          "q0",  "q1",  "q2", "q3", "q4",
+          "q5",  "q6",  "q7", "q8", "q9",
+          "q10", "q11", "q12", "q13", "q14", "q15"
+    );
+#endif /* __aarch64__ */
+    return CHACHA_CHUNK_BYTES * 2;
+}
+
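+/* wc_Chacha_encrypt_64 produces one 64-byte keystream block per iteration
+ * and drains a final partial block by XORing progressively smaller pieces
+ * (32, 16 and 8 bytes, then single bytes). A reference-only sketch of what
+ * that drain computes, assuming ks[] already holds the keystream block
+ * (the name and helper are illustrative, not part of the wolfCrypt API): */
+#ifdef CHACHA_TEST
+static WC_INLINE void wc_Chacha_ref_xor_tail(const byte* ks, const byte* m,
+                                             byte* c, word32 bytes)
+{
+    word32 i;
+    for (i = 0; i < bytes; i++) /* ciphertext = message XOR keystream */
+        c[i] = (byte)(m[i] ^ ks[i]);
+}
+#endif /* CHACHA_TEST */
+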
+static WC_INLINE void wc_Chacha_encrypt_64(const word32* input, const byte* m,
+                                           byte* c, word32 bytes)
+{
+#ifdef CHACHA_TEST
+    printf("Entering wc_Chacha_encrypt_64 with %u bytes\n", bytes);
+#endif /* CHACHA_TEST */
+
+#ifdef __aarch64__
+    __asm__ __volatile__ (
+        /* Load index look-up for rotating left 8 bits */
+        "LD1    {v13.16B}, [%[L_chacha20_neon_rol8]] \n\t"
+        "LD1    {v14.4S}, [%[L_chacha20_neon_inc_first_word]] \n\t"
+        /* Load state to encrypt */
+        "LD1    {v8.4S-v11.4S}, [%[input]] \n\t"
+        "\n"
+    "L_chacha20_arm64_64_loop_%=: \n\t"
+        /* Move state into vector registers (x3) */
+        "MOV    v0.16B, v8.16B \n\t"
+        "MOV    v1.16B, v9.16B \n\t"
+        "MOV    v2.16B, v10.16B \n\t"
+        "MOV    v3.16B, v11.16B \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #12 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #4 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "REV32  v3.8H, v3.8H \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #12 \n\t"
+        "SRI    v1.4S, v12.4S, #20 \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "ADD    v0.4S, v0.4S, v1.4S \n\t"
+        "EOR    v3.16B, v3.16B, v0.16B \n\t"
+        "TBL    v3.16B, { v3.16B }, v13.16B \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "ADD    v2.4S, v2.4S, v3.4S \n\t"
+        "EOR    v12.16B, v1.16B, v2.16B \n\t"
+        "SHL    v1.4S, v12.4S, #7 \n\t"
+        "SRI    v1.4S, v12.4S, #25 \n\t"
+        "EXT    v3.16B, v3.16B, v3.16B, #4 \n\t"
+        "EXT    v1.16B, v1.16B, v1.16B, #12 \n\t"
+        "EXT    v2.16B, v2.16B, v2.16B, #8 \n\t"
+        /* Add back state */
+        "ADD    v0.4S, v0.4S, v8.4S \n\t"
+        "ADD    v1.4S, v1.4S, v9.4S \n\t"
+        "ADD    v2.4S, v2.4S, v10.4S \n\t"
+        "ADD    v3.4S, v3.4S, v11.4S \n\t"
+        "CMP    %[bytes], #64 \n\t"
+        "BLT    L_chacha20_arm64_64_lt_64_%= \n\t"
+        "LD1    {v4.4S-v7.4S}, [%[m]], #64 \n\t"
+        "EOR    v4.16B, v4.16B, v0.16B \n\t"
+        "EOR    v5.16B, v5.16B, v1.16B \n\t"
+        "EOR    v6.16B, v6.16B, v2.16B \n\t"
+        "EOR    v7.16B, v7.16B, v3.16B \n\t"
+        "ST1    {v4.4S-v7.4S}, [%[c]], #64 \n\t"
+        "SUBS   %[bytes], %[bytes], #64 \n\t"
+        "ADD    v11.4S, v11.4S, v14.4S \n\t"
+        "BNE    L_chacha20_arm64_64_loop_%= \n\t"
+        "B      L_chacha20_arm64_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm64_64_lt_64_%=: \n\t"
+        "CMP    %[bytes], #32 \n\t"
+        "BLT    L_chacha20_arm64_64_lt_32_%= \n\t"
+        "LD1    {v4.4S, v5.4S}, [%[m]], #32 \n\t"
+        "EOR    v4.16B, v4.16B, v0.16B \n\t"
+        "EOR    v5.16B, v5.16B, v1.16B \n\t"
+        "ST1    {v4.4S, v5.4S}, [%[c]], #32 \n\t"
+        "SUBS   %[bytes], %[bytes], #32 \n\t"
+        "MOV    v0.16B, v2.16B \n\t"
+        "MOV    v1.16B, v3.16B \n\t"
+        "BEQ    L_chacha20_arm64_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm64_64_lt_32_%=: \n\t"
+        "CMP    %[bytes], #16 \n\t"
+        "BLT    L_chacha20_arm64_64_lt_16_%= \n\t"
+        "LD1    {v4.4S}, [%[m]], #16 \n\t"
+        "EOR    v4.16B, v4.16B, v0.16B \n\t"
+        "ST1    {v4.4S}, [%[c]], #16 \n\t"
+        "SUBS   %[bytes], %[bytes], #16 \n\t"
+        "MOV    v0.16B, v1.16B \n\t"
+        "BEQ    L_chacha20_arm64_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm64_64_lt_16_%=: \n\t"
+        "CMP    %[bytes], #8 \n\t"
+        "BLT    L_chacha20_arm64_64_lt_8_%= \n\t"
+        "LD1    {v4.2S}, [%[m]], #8 \n\t"
+        "EOR    v4.8B, v4.8B, v0.8B \n\t"
+        "ST1    {v4.2S}, [%[c]], #8 \n\t"
+        "SUBS   %[bytes], %[bytes], #8 \n\t"
+        "MOV    v0.D[0], v0.D[1] \n\t"
+        "BEQ    L_chacha20_arm64_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm64_64_lt_8_%=: \n\t"
+        "MOV    x4, v0.D[0] \n\t"
+        "LSL    x5, %[bytes], #3 \n\t"
+        "\n"
+    "L_chacha20_arm64_64_loop_lt_8_%=: \n\t"
+        "LDRB   w6, [%[m], %[bytes]] \n\t"
+        "ROR    x7, x4, x5 \n\t"
+        "EOR    w6, w6, w7 \n\t"
+        "STRB   w6, [%[c], %[bytes]] \n\t"
+        "SUBS   %[bytes], %[bytes], #1 \n\t"
+        "SUB    x5, x5, #8 \n\t"
+        "BGE    L_chacha20_arm64_64_loop_lt_8_%= \n\t"
+        "\n"
+    "L_chacha20_arm64_64_done_%=: \n\t"
+        : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes)
+        : [L_chacha20_neon_rol8] "r" (L_chacha20_neon_rol8),
+          [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+        : "memory", "x4", "x5", "x6", "x7", "v0", "v1", "v2", "v3",
+          "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11"
+    );
+#else
+    __asm__ __volatile__ (
+        /* Get the input state */
+        "VLDM       %[input], { q8-q11 } \n\t"
+        /* Get the incrementer register */
+        "VLDM       %[L_chacha20_neon_inc_first_word], { q14 } \n\t"
+        "\n"
+    "L_chacha20_arm32_64_outer_loop_%=: \n\t"
+        /* Copy over the input state */
+        "VMOV       q0, q8               \n\t"
+        "VMOV       q1, q9               \n\t"
+        "VMOV       q2, q10              \n\t"
+        "VMOV       q3, q11              \n\t"
+        /* Compute quarter rounds */
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Odd Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Odd->Even */
+        "VEXT.8     q1, q1, q1, #4       \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #12      \n\t"
+        /* Even Round */
+        /* a += b; d ^= a; d <<<= 16; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VREV32.16  q3, q4               \n\t"
+        /* c += d; b ^= c; b <<<= 12; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #12          \n\t"
+        "VSRI.I32   q1, q4, #20          \n\t"
+        /* a += b; d ^= a; d <<<= 8; */
+        "VADD.I32   q0, q0, q1           \n\t"
+        "VEOR       q4, q3, q0           \n\t"
+        "VSHL.I32   q3, q4, #8           \n\t"
+        "VSRI.I32   q3, q4, #24          \n\t"
+        /* c += d; b ^= c; b <<<= 7; */
+        "VADD.I32   q2, q2, q3           \n\t"
+        "VEOR       q4, q1, q2           \n\t"
+        "VSHL.I32   q1, q4, #7           \n\t"
+        "VSRI.I32   q1, q4, #25          \n\t"
+        /* Permute Even->Odd */
+        "VEXT.8     q1, q1, q1, #12      \n\t"
+        "VEXT.8     q2, q2, q2, #8       \n\t"
+        "VEXT.8     q3, q3, q3, #4       \n\t"
+        /* Add back state */
+        "VADD.I32   q0, q0, q8           \n\t"
+        "VADD.I32   q1, q1, q9           \n\t"
+        "VADD.I32   q2, q2, q10          \n\t"
+        "VADD.I32   q3, q3, q11          \n\t"
+        "CMP        %[bytes], #64        \n\t"
+        "BLT        L_chacha20_arm32_64_lt_64_%= \n\t"
+        /* XOR full 64 byte block */
+        "VLDM       %[m], { q4-q7 }      \n\t"
+        "ADD        %[m], %[m], #64      \n\t"
+        "VEOR       q0, q0, q4           \n\t"
+        "VEOR       q1, q1, q5           \n\t"
+        "VEOR       q2, q2, q6           \n\t"
+        "VEOR       q3, q3, q7           \n\t"
+        "VSTM       %[c], { q0-q3 }      \n\t"
+        "ADD        %[c], %[c], #64      \n\t"
+        "SUBS       %[bytes], %[bytes], #64 \n\t"
+        "VADD.I32   q11, q11, q14        \n\t"
+        "BNE        L_chacha20_arm32_64_outer_loop_%= \n\t"
+        "B          L_chacha20_arm32_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_64_%=: \n\t"
+        /* XOR 32 bytes */
+        "CMP        %[bytes], #32        \n\t"
+        "BLT        L_chacha20_arm32_64_lt_32_%= \n\t"
+        "VLDM       %[m], { q4-q5 }      \n\t"
+        "ADD        %[m], %[m], #32      \n\t"
+        "VEOR       q4, q4, q0           \n\t"
+        "VEOR       q5, q5, q1           \n\t"
+        "VSTM       %[c], { q4-q5 }      \n\t"
+        "ADD        %[c], %[c], #32      \n\t"
+        "SUBS       %[bytes], %[bytes], #32 \n\t"
+        "VMOV       q0, q2               \n\t"
+        "VMOV       q1, q3               \n\t"
+        "BEQ        L_chacha20_arm32_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_32_%=: \n\t"
+        /* XOR 16 bytes */
+        "CMP        %[bytes], #16        \n\t"
+        "BLT        L_chacha20_arm32_64_lt_16_%= \n\t"
+        "VLDM       %[m], { q4 }         \n\t"
+        "ADD        %[m], %[m], #16      \n\t"
+        "VEOR       q4, q4, q0           \n\t"
+        "VSTM       %[c], { q4 }         \n\t"
+        "ADD        %[c], %[c], #16      \n\t"
+        "SUBS       %[bytes], %[bytes], #16 \n\t"
+        "VMOV       q0, q1               \n\t"
+        "BEQ        L_chacha20_arm32_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_16_%=: \n\t"
+        /* XOR 8 bytes */
+        "CMP        %[bytes], #8         \n\t"
+        "BLT        L_chacha20_arm32_64_lt_8_%= \n\t"
+        "VLDR       d8, [%[m], #0]       \n\t"
+        "ADD        %[m], %[m], #8       \n\t"
+        "VEOR       d8, d8, d0           \n\t"
+        "VSTR       d8, [%[c], #0]       \n\t"
+        "ADD        %[c], %[c], #8       \n\t"
+        "SUBS       %[bytes], %[bytes], #8 \n\t"
+        "VMOV       d0, d1               \n\t"
+        "BEQ        L_chacha20_arm32_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_8_%=: \n\t"
+        /* XOR 4 bytes */
+        "CMP        %[bytes], #4         \n\t"
+        "BLT        L_chacha20_arm32_64_lt_4_%= \n\t"
+        "LDR        r12, [%[m]], #4      \n\t"
+        "VMOV       r14, d0[0]           \n\t"
+        "EOR        r12, r12, r14        \n\t"
+        "STR        r12, [%[c]], #4      \n\t"
+        "SUBS       %[bytes], %[bytes], #4 \n\t"
+        "VTRN.32    d0, d0               \n\t"
+        "BEQ        L_chacha20_arm32_64_done_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_4_%=: \n\t"
+        /* XOR remaining bytes */
+        "VMOV       r14, d0[0]           \n\t"
+        "\n"
+    "L_chacha20_arm32_64_lt_4_loop_%=: \n\t"
+        "LDRB       r12, [%[m]], #1      \n\t"
+        "EOR        r12, r12, r14        \n\t"
+        "STRB       r12, [%[c]], #1      \n\t"
+        "SUBS       %[bytes], %[bytes], #1 \n\t"
+        "LSR        r14, r14, #8         \n\t"
+        "BGT        L_chacha20_arm32_64_lt_4_loop_%= \n\t"
+        "\n"
+    "L_chacha20_arm32_64_done_%=: \n\t"
+        : [input] "+r" (input), [m] "+r" (m), [c] "+r" (c), [bytes] "+r" (bytes)
+        : [L_chacha20_neon_inc_first_word] "r" (L_chacha20_neon_inc_first_word)
+        : "memory", "cc",
+          "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q14", "r12", "r14"
+    );
+#endif /* __aarch64__ */
+}
+
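+/* For reference, one scalar ChaCha quarter round (RFC 7539), which the
+ * NEON rounds above compute on four columns/diagonals at a time:
+ *
+ *     a += b; d ^= a; d <<<= 16;
+ *     c += d; b ^= c; b <<<= 12;
+ *     a += b; d ^= a; d <<<=  8;
+ *     c += d; b ^= c; b <<<=  7;
+ */
+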
+/**
+ * Encrypt a stream of bytes.
+ */
+static void wc_Chacha_encrypt_bytes(ChaCha* ctx, const byte* m, byte* c,
+                                    word32 bytes)
+{
+    int    processed;
+
+#ifdef __aarch64__
+    if (bytes >= CHACHA_CHUNK_BYTES * 5) {
+        processed = (bytes / (CHACHA_CHUNK_BYTES * 5)) * CHACHA_CHUNK_BYTES * 5;
+        wc_Chacha_encrypt_320(ctx->X, m, c, processed);
+
+        bytes -= processed;
+        c += processed;
+        m += processed;
+        ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+    }
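+    /* fewer than 320 bytes remain here, so a single 256-byte pass
+     * suffices; the 32-bit build has no 320-byte kernel and must
+     * loop over 256-byte chunks instead */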
+    if (bytes >= CHACHA_CHUNK_BYTES * 4) {
+#else
+    while (bytes >= CHACHA_CHUNK_BYTES * 4) {
+#endif /*__aarch64__ */
+        processed = wc_Chacha_encrypt_256(ctx->X, m, c);
+
+        bytes -= processed;
+        c += processed;
+        m += processed;
+        ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+    }
+    if (bytes >= CHACHA_CHUNK_BYTES * 2) {
+        processed = wc_Chacha_encrypt_128(ctx->X, m, c);
+
+        bytes -= processed;
+        c += processed;
+        m += processed;
+        ctx->X[CHACHA_IV_BYTES] = PLUS(ctx->X[CHACHA_IV_BYTES], processed / CHACHA_CHUNK_BYTES);
+    }
+    if (bytes > 0) {
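+        /* the tail kernel consumes one 64-byte keystream block, or
+         * two when more than 64 bytes remain, so advance the block
+         * counter to match */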
+        wc_Chacha_encrypt_64(ctx->X, m, c, bytes);
+        if (bytes > 64)
+            ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
+        ctx->X[CHACHA_IV_BYTES] = PLUSONE(ctx->X[CHACHA_IV_BYTES]);
+    }
+}
+
+/**
+ * API to encrypt/decrypt a message of any size.
+ */
+int wc_Chacha_Process(ChaCha* ctx, byte* output, const byte* input,
+                      word32 msglen)
+{
+    if (ctx == NULL || output == NULL || input == NULL)
+        return BAD_FUNC_ARG;
+
+    wc_Chacha_encrypt_bytes(ctx, input, output, msglen);
+
+    return 0;
+}
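+
+/* Example usage (a minimal sketch; the tests in wolfcrypt/test/test.c
+ * below follow the same pattern):
+ *
+ *     ChaCha enc;
+ *     wc_Chacha_SetKey(&enc, key, 32);
+ *     wc_Chacha_SetIV(&enc, iv, 0);       0 = initial block counter
+ *     wc_Chacha_Process(&enc, cipher, plain, plainSz);
+ */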
+
+#endif /* HAVE_CHACHA */
+
+#endif /* WOLFSSL_ARMASM */

+ 208 - 4
wolfcrypt/test/test.c

@@ -4363,14 +4363,138 @@ int chacha_test(void)
 
     const byte* keys[] = {key1, key2, key3, key4};
 
-    static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
-    static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
-    static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01};
-    static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+    static const byte ivs1[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+    static const byte ivs2[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
+    static const byte ivs3[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00};
+    static const byte ivs4[] = {0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
 
 
     const byte* ivs[] = {ivs1, ivs2, ivs3, ivs4};
 
+#ifndef BENCH_EMBEDDED
+    static const byte cipher_big_result[] = {
+        0x06, 0xa6, 0x5d, 0x31, 0x21, 0x6c, 0xdb, 0x37, 0x48, 0x7c, 0x01, 0x9d,
+        0x72, 0xdf, 0x0a, 0x5b, 0x64, 0x74, 0x20, 0xba, 0x9e, 0xe0, 0x26, 0x7a,
+        0xbf, 0xdf, 0x83, 0x34, 0x3b, 0x4f, 0x94, 0x3f, 0x37, 0x89, 0xaf, 0x00,
+        0xdf, 0x0f, 0x2e, 0x75, 0x16, 0x41, 0xf6, 0x7a, 0x86, 0x94, 0x9d, 0x32,
+        0x56, 0xf0, 0x79, 0x71, 0x68, 0x6f, 0xa6, 0x6b, 0xc6, 0x59, 0x49, 0xf6,
+        0x10, 0x34, 0x03, 0x03, 0x16, 0x53, 0x9a, 0x98, 0x2a, 0x46, 0xde, 0x17,
+        0x06, 0x65, 0x70, 0xca, 0x0a, 0x1f, 0xab, 0x80, 0x26, 0x96, 0x3f, 0x3e,
+        0x7a, 0x3c, 0xa8, 0x87, 0xbb, 0x65, 0xdd, 0x5e, 0x07, 0x7b, 0x34, 0xe0,
+        0x56, 0xda, 0x32, 0x13, 0x30, 0xc9, 0x0c, 0xd7, 0xba, 0xe4, 0x1f, 0xa6,
+        0x91, 0x4f, 0x72, 0x9f, 0xd9, 0x5c, 0x62, 0x7d, 0xa6, 0xc2, 0xbc, 0x87,
+        0xae, 0x64, 0x11, 0x94, 0x3b, 0xbc, 0x6c, 0x23, 0xbd, 0x7d, 0x00, 0xb4,
+        0x99, 0xf2, 0x68, 0xb5, 0x59, 0x70, 0x93, 0xad, 0x69, 0xd0, 0xb1, 0x28,
+        0x70, 0x92, 0xeb, 0xec, 0x39, 0x80, 0x82, 0xde, 0x44, 0xe2, 0x8a, 0x26,
+        0xb3, 0xe9, 0x45, 0xcf, 0x83, 0x76, 0x9f, 0x6a, 0xa0, 0x46, 0x4a, 0x3d,
+        0x26, 0x56, 0xaf, 0x49, 0x41, 0x26, 0x1b, 0x6a, 0x41, 0x37, 0x65, 0x91,
+        0x72, 0xc4, 0xe7, 0x3c, 0x17, 0x31, 0xae, 0x2e, 0x2b, 0x31, 0x45, 0xe4,
+        0x93, 0xd3, 0x10, 0xaa, 0xc5, 0x62, 0xd5, 0x11, 0x4b, 0x57, 0x1d, 0xad,
+        0x48, 0x06, 0xd0, 0x0d, 0x98, 0xa5, 0xc6, 0x5b, 0xd0, 0x9e, 0x22, 0xc0,
+        0x00, 0x32, 0x5a, 0xf5, 0x1c, 0x89, 0x6d, 0x54, 0x97, 0x55, 0x6b, 0x46,
+        0xc5, 0xc7, 0xc4, 0x48, 0x9c, 0xbf, 0x47, 0xdc, 0x03, 0xc4, 0x1b, 0xcb,
+        0x65, 0xa6, 0x91, 0x9d, 0x6d, 0xf1, 0xb0, 0x7a, 0x4d, 0x3b, 0x03, 0x95,
+        0xf4, 0x8b, 0x0b, 0xae, 0x39, 0xff, 0x3f, 0xf6, 0xc0, 0x14, 0x18, 0x8a,
+        0xe5, 0x19, 0xbd, 0xc1, 0xb4, 0x05, 0x4e, 0x29, 0x2f, 0x0b, 0x33, 0x76,
+        0x28, 0x16, 0xa4, 0xa6, 0x93, 0x04, 0xb5, 0x55, 0x6b, 0x89, 0x3d, 0xa5,
+        0x0f, 0xd3, 0xad, 0xfa, 0xd9, 0xfd, 0x05, 0x5d, 0x48, 0x94, 0x25, 0x5a,
+        0x2c, 0x9a, 0x94, 0x80, 0xb0, 0xe7, 0xcb, 0x4d, 0x77, 0xbf, 0xca, 0xd8,
+        0x55, 0x48, 0xbd, 0x66, 0xb1, 0x85, 0x81, 0xb1, 0x37, 0x79, 0xab, 0x52,
+        0x08, 0x14, 0x12, 0xac, 0xcd, 0x45, 0x4d, 0x53, 0x6b, 0xca, 0x96, 0xc7,
+        0x3b, 0x2f, 0x73, 0xb1, 0x5a, 0x23, 0xbd, 0x65, 0xd5, 0xea, 0x17, 0xb3,
+        0xdc, 0xa1, 0x17, 0x1b, 0x2d, 0xb3, 0x9c, 0xd0, 0xdb, 0x41, 0x77, 0xef,
+        0x93, 0x20, 0x52, 0x3e, 0x9d, 0xf5, 0xbf, 0x33, 0xf7, 0x52, 0xc1, 0x90,
+        0xa0, 0x15, 0x17, 0xce, 0xf7, 0xf7, 0xd0, 0x3a, 0x3b, 0xd1, 0x72, 0x56,
+        0x31, 0x81, 0xae, 0x60, 0xab, 0x40, 0xc1, 0xd1, 0x28, 0x77, 0x53, 0xac,
+        0x9f, 0x11, 0x0a, 0x88, 0x36, 0x4b, 0xda, 0x57, 0xa7, 0x28, 0x5c, 0x85,
+        0xd3, 0x85, 0x9b, 0x79, 0xad, 0x05, 0x1c, 0x37, 0x14, 0x5e, 0x0d, 0xd0,
+        0x23, 0x03, 0x42, 0x1d, 0x48, 0x5d, 0xc5, 0x3c, 0x5a, 0x08, 0xa9, 0x0d,
+        0x6e, 0x82, 0x7c, 0x2e, 0x3c, 0x41, 0xcc, 0x96, 0x8e, 0xad, 0xee, 0x2a,
+        0x61, 0x0b, 0x16, 0x0f, 0xa9, 0x24, 0x40, 0x85, 0xbc, 0x9f, 0x28, 0x8d,
+        0xe6, 0x68, 0x4d, 0x8f, 0x30, 0x48, 0xd9, 0x73, 0x73, 0x6c, 0x9a, 0x7f,
+        0x67, 0xf7, 0xde, 0x4c, 0x0a, 0x8b, 0xe4, 0xb3, 0x08, 0x2a, 0x52, 0xda,
+        0x54, 0xee, 0xcd, 0xb5, 0x62, 0x4a, 0x26, 0x20, 0xfb, 0x40, 0xbb, 0x39,
+        0x3a, 0x0f, 0x09, 0xe8, 0x00, 0xd1, 0x24, 0x97, 0x60, 0xe9, 0x83, 0x83,
+        0xfe, 0x9f, 0x9c, 0x15, 0xcf, 0x69, 0x03, 0x9f, 0x03, 0xe1, 0xe8, 0x6e,
+        0xbd, 0x87, 0x58, 0x68, 0xee, 0xec, 0xd8, 0x29, 0x46, 0x23, 0x49, 0x92,
+        0x72, 0x95, 0x5b, 0x49, 0xca, 0xe0, 0x45, 0x59, 0xb2, 0xca, 0xf4, 0xfc,
+        0xb7, 0x59, 0x37, 0x49, 0x28, 0xbc, 0xf3, 0xd7, 0x61, 0xbc, 0x4b, 0xf3,
+        0xa9, 0x4b, 0x2f, 0x05, 0xa8, 0x01, 0xa5, 0xdc, 0x00, 0x6e, 0x01, 0xb6,
+        0x45, 0x3c, 0xd5, 0x49, 0x7d, 0x5c, 0x25, 0xe8, 0x31, 0x87, 0xb2, 0xb9,
+        0xbf, 0xb3, 0x01, 0x62, 0x0c, 0xd0, 0x48, 0x77, 0xa2, 0x34, 0x0f, 0x16,
+        0x22, 0x28, 0xee, 0x54, 0x08, 0x93, 0x3b, 0xe4, 0xde, 0x7e, 0x63, 0xf7,
+        0x97, 0x16, 0x5d, 0x71, 0x58, 0xc2, 0x2e, 0xf2, 0x36, 0xa6, 0x12, 0x65,
+        0x94, 0x17, 0xac, 0x66, 0x23, 0x7e, 0xc6, 0x72, 0x79, 0x24, 0xce, 0x8f,
+        0x55, 0x19, 0x97, 0x44, 0xfc, 0x55, 0xec, 0x85, 0x26, 0x27, 0xdb, 0x38,
+        0xb1, 0x42, 0x0a, 0xdd, 0x05, 0x99, 0x28, 0xeb, 0x03, 0x6c, 0x9a, 0xe9,
+        0x17, 0xf6, 0x2c, 0xb0, 0xfe, 0xe7, 0xa4, 0xa7, 0x31, 0xda, 0x4d, 0xb0,
+        0x29, 0xdb, 0xdd, 0x8d, 0x12, 0x13, 0x9c, 0xb4, 0xcc, 0x83, 0x97, 0xfb,
+        0x1a, 0xdc, 0x08, 0xd6, 0x30, 0x62, 0xe8, 0xeb, 0x8b, 0x61, 0xcb, 0x1d,
+        0x06, 0xe3, 0xa5, 0x4d, 0x35, 0xdb, 0x59, 0xa8, 0x2d, 0x87, 0x27, 0x44,
+        0x6f, 0xc0, 0x38, 0x97, 0xe4, 0x85, 0x00, 0x02, 0x09, 0xf6, 0x69, 0x3a,
+        0xcf, 0x08, 0x1b, 0x21, 0xbb, 0x79, 0xb1, 0xa1, 0x34, 0x09, 0xe0, 0x80,
+        0xca, 0xb0, 0x78, 0x8a, 0x11, 0x97, 0xd4, 0x07, 0xbe, 0x1b, 0x6a, 0x5d,
+        0xdb, 0xd6, 0x1f, 0x76, 0x6b, 0x16, 0xf0, 0x58, 0x84, 0x5f, 0x59, 0xce,
+        0x62, 0x34, 0xc3, 0xdf, 0x94, 0xb8, 0x2f, 0x84, 0x68, 0xf0, 0xb8, 0x51,
+        0xd9, 0x6d, 0x8e, 0x4a, 0x1d, 0xe6, 0x5c, 0xd8, 0x86, 0x25, 0xe3, 0x24,
+        0xfd, 0x21, 0x61, 0x13, 0x48, 0x3e, 0xf6, 0x7d, 0xa6, 0x71, 0x9b, 0xd2,
+        0x6e, 0xe6, 0xd2, 0x08, 0x94, 0x62, 0x6c, 0x98, 0xfe, 0x2f, 0x9c, 0x88,
+        0x7e, 0x78, 0x15, 0x02, 0x00, 0xf0, 0xba, 0x24, 0x91, 0xf2, 0xdc, 0x47,
+        0x51, 0x4d, 0x15, 0x5e, 0x91, 0x5f, 0x57, 0x5b, 0x1d, 0x35, 0x24, 0x45,
+        0x75, 0x9b, 0x88, 0x75, 0xf1, 0x2f, 0x85, 0xe7, 0x89, 0xd1, 0x01, 0xb4,
+        0xc8, 0x18, 0xb7, 0x97, 0xef, 0x4b, 0x90, 0xf4, 0xbf, 0x10, 0x27, 0x3c,
+        0x60, 0xff, 0xc4, 0x94, 0x20, 0x2f, 0x93, 0x4b, 0x4d, 0xe3, 0x80, 0xf7,
+        0x2c, 0x71, 0xd9, 0xe3, 0x68, 0xb4, 0x77, 0x2b, 0xc7, 0x0d, 0x39, 0x92,
+        0xef, 0x91, 0x0d, 0xb2, 0x11, 0x50, 0x0e, 0xe8, 0xad, 0x3b, 0xf6, 0xb5,
+        0xc6, 0x14, 0x4d, 0x33, 0x53, 0xa7, 0x60, 0x15, 0xc7, 0x27, 0x51, 0xdc,
+        0x54, 0x29, 0xa7, 0x0d, 0x6a, 0x7b, 0x72, 0x13, 0xad, 0x7d, 0x41, 0x19,
+        0x4e, 0x42, 0x49, 0xcc, 0x42, 0xe4, 0xbd, 0x99, 0x13, 0xd9, 0x7f, 0xf3,
+        0x38, 0xa4, 0xb6, 0x33, 0xed, 0x07, 0x48, 0x7e, 0x8e, 0x82, 0xfe, 0x3a,
+        0x9d, 0x75, 0x93, 0xba, 0x25, 0x4e, 0x37, 0x3c, 0x0c, 0xd5, 0x69, 0xa9,
+        0x2d, 0x9e, 0xfd, 0xe8, 0xbb, 0xf5, 0x0c, 0xe2, 0x86, 0xb9, 0x5e, 0x6f,
+        0x28, 0xe4, 0x19, 0xb3, 0x0b, 0xa4, 0x86, 0xd7, 0x24, 0xd0, 0xb8, 0x89,
+        0x7b, 0x76, 0xec, 0x05, 0x10, 0x5b, 0x68, 0xe9, 0x58, 0x66, 0xa3, 0xc5,
+        0xb6, 0x63, 0x20, 0x0e, 0x0e, 0xea, 0x3d, 0x61, 0x5e, 0xda, 0x3d, 0x3c,
+        0xf9, 0xfd, 0xed, 0xa9, 0xdb, 0x52, 0x94, 0x8a, 0x00, 0xca, 0x3c, 0x8d,
+        0x66, 0x8f, 0xb0, 0xf0, 0x5a, 0xca, 0x3f, 0x63, 0x71, 0xbf, 0xca, 0x99,
+        0x37, 0x9b, 0x75, 0x97, 0x89, 0x10, 0x6e, 0xcf, 0xf2, 0xf5, 0xe3, 0xd5,
+        0x45, 0x9b, 0xad, 0x10, 0x71, 0x6c, 0x5f, 0x6f, 0x7f, 0x22, 0x77, 0x18,
+        0x2f, 0xf9, 0x99, 0xc5, 0x69, 0x58, 0x03, 0x12, 0x86, 0x82, 0x3e, 0xbf,
+        0xc2, 0x12, 0x35, 0x43, 0xa3, 0xd9, 0x18, 0x4f, 0x41, 0x11, 0x6b, 0xf3,
+        0x67, 0xaf, 0x3d, 0x78, 0xe4, 0x22, 0x2d, 0xb3, 0x48, 0x43, 0x31, 0x1d,
+        0xef, 0xa8, 0xba, 0x49, 0x8e, 0xa9, 0xa7, 0xb6, 0x18, 0x77, 0x84, 0xca,
+        0xbd, 0xa2, 0x02, 0x1b, 0x6a, 0xf8, 0x5f, 0xda, 0xff, 0xcf, 0x01, 0x6a,
+        0x86, 0x69, 0xa9, 0xe9, 0xcb, 0x60, 0x1e, 0x15, 0xdc, 0x8f, 0x5d, 0x39,
+        0xb5, 0xce, 0x55, 0x5f, 0x47, 0x97, 0xb1, 0x19, 0x6e, 0x21, 0xd6, 0x13,
+        0x39, 0xb2, 0x24, 0xe0, 0x62, 0x82, 0x9f, 0xed, 0x12, 0x81, 0xed, 0xee,
+        0xab, 0xd0, 0x2f, 0x19, 0x89, 0x3f, 0x57, 0x2e, 0xc2, 0xe2, 0x67, 0xe8,
+        0xae, 0x03, 0x56, 0xba, 0xd4, 0xd0, 0xa4, 0x89, 0x03, 0x06, 0x5b, 0xcc,
+        0xf2, 0x22, 0xb8, 0x0e, 0x76, 0x79, 0x4a, 0x42, 0x1d, 0x37, 0x51, 0x5a,
+        0xaa, 0x46, 0x6c, 0x2a, 0xdd, 0x66, 0xfe, 0xc6, 0x68, 0xc3, 0x38, 0xa2,
+        0xae, 0x5b, 0x98, 0x24, 0x5d, 0x43, 0x05, 0x82, 0x38, 0x12, 0xd3, 0xd1,
+        0x75, 0x2d, 0x4f, 0x61, 0xbd, 0xb9, 0x10, 0x87, 0x44, 0x2a, 0x78, 0x07,
+        0xff, 0xf4, 0x0f, 0xa1, 0xf3, 0x68, 0x9f, 0xbe, 0xae, 0xa2, 0x91, 0xf0,
+        0xc7, 0x55, 0x7a, 0x52, 0xd5, 0xa3, 0x8d, 0x6f, 0xe4, 0x90, 0x5c, 0xf3,
+        0x5f, 0xce, 0x3d, 0x23, 0xf9, 0x8e, 0xae, 0x14, 0xfb, 0x82, 0x9a, 0xa3,
+        0x04, 0x5f, 0xbf, 0xad, 0x3e, 0xf2, 0x97, 0x0a, 0x60, 0x40, 0x70, 0x19,
+        0x72, 0xad, 0x66, 0xfb, 0x78, 0x1b, 0x84, 0x6c, 0x98, 0xbc, 0x8c, 0xf8,
+        0x4f, 0xcb, 0xb5, 0xf6, 0xaf, 0x7a, 0xb7, 0x93, 0xef, 0x67, 0x48, 0x02,
+        0x2c, 0xcb, 0xe6, 0x77, 0x0f, 0x7b, 0xc1, 0xee, 0xc5, 0xb6, 0x2d, 0x7e,
+        0x62, 0xa0, 0xc0, 0xa7, 0xa5, 0x80, 0x31, 0x92, 0x50, 0xa1, 0x28, 0x22,
+        0x95, 0x03, 0x17, 0xd1, 0x0f, 0xf6, 0x08, 0xe5, 0xec
+    };
+#define CHACHA_BIG_TEST_SIZE 1305
+#ifndef WOLFSSL_SMALL_STACK
+    byte   cipher_big[CHACHA_BIG_TEST_SIZE] = {0};
+    byte   plain_big[CHACHA_BIG_TEST_SIZE] = {0};
+    byte   input_big[CHACHA_BIG_TEST_SIZE] = {0};
+#else
+    byte*  cipher_big;
+    byte*  plain_big;
+    byte*  input_big;
+#endif /* WOLFSSL_SMALL_STACK */
+    int    block_size;
+#endif /* BENCH_EMBEDDED */
 
     byte a[] = {0x76,0xb8,0xe0,0xad,0xa0,0xf1,0x3d,0x90};
     byte b[] = {0x45,0x40,0xf0,0x5a,0x9f,0x1f,0xb2,0x96};
@@ -4384,6 +4508,26 @@ int chacha_test(void)
     test_chacha[2] = c;
     test_chacha[3] = d;
 
+#ifndef BENCH_EMBEDDED
+#ifdef WOLFSSL_SMALL_STACK
+    cipher_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    if (cipher_big == NULL) {
+        return MEMORY_E;
+    }
+    plain_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    if (plain_big == NULL) {
+        XFREE(cipher_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        return MEMORY_E;
+    }
+    input_big = (byte*)XMALLOC(CHACHA_BIG_TEST_SIZE, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    if (input_big == NULL) {
+        XFREE(cipher_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        XFREE(plain_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+        return MEMORY_E;
+    }
+    XMEMSET(cipher_big, 0, CHACHA_BIG_TEST_SIZE);
+    XMEMSET(plain_big, 0, CHACHA_BIG_TEST_SIZE);
+    XMEMSET(input_big, 0, CHACHA_BIG_TEST_SIZE);
+#endif /* WOLFSSL_SMALL_STACK */
+#endif /* BENCH_EMBEDDED */
+
     for (i = 0; i < times; ++i) {
         if (i < 3) {
             keySz = 32;
@@ -4444,6 +4588,66 @@ int chacha_test(void)
     if (XMEMCMP(plain + 64, sliver, 64))
         return -4320;
 
+#ifndef BENCH_EMBEDDED
+    /* test of encrypting more data */
+    keySz = 32;
+
+    ret |= wc_Chacha_SetKey(&enc, keys[0], keySz);
+    ret |= wc_Chacha_SetKey(&dec, keys[0], keySz);
+    if (ret != 0)
+        return ret;
+
+    ret |= wc_Chacha_SetIV(&enc, ivs[2], 0);
+    ret |= wc_Chacha_SetIV(&dec, ivs[2], 0);
+    if (ret != 0)
+        return ret;
+
+    ret |= wc_Chacha_Process(&enc, cipher_big, plain_big,  CHACHA_BIG_TEST_SIZE);
+    ret |= wc_Chacha_Process(&dec, plain_big,  cipher_big, CHACHA_BIG_TEST_SIZE);
+    if (ret != 0)
+        return ret;
+
+    if (XMEMCMP(plain_big, input_big, CHACHA_BIG_TEST_SIZE))
+        return -4330;
+
+    if (XMEMCMP(cipher_big, cipher_big_result, CHACHA_BIG_TEST_SIZE))
+        return -4331;
+
+    for (i = 0; i < 18; ++i) {
+        /* this will test all paths; block sizes:
+         * 1 2 3 4 7 8 15 16 31 32 63 64 127 128 255 256 511 512 */
+        block_size = (2 << (i % 9)) - (i < 9 ? 1 : 0);
+        keySz = 32;
+
+        ret |= wc_Chacha_SetKey(&enc, keys[0], keySz);
+        ret |= wc_Chacha_SetKey(&dec, keys[0], keySz);
+        if (ret != 0)
+            return ret;
+
+        ret |= wc_Chacha_SetIV(&enc, ivs[2], 0);
+        ret |= wc_Chacha_SetIV(&dec, ivs[2], 0);
+        if (ret != 0)
+            return ret;
+
+        ret |= wc_Chacha_Process(&enc, cipher_big, plain_big,  block_size);
+        ret |= wc_Chacha_Process(&dec, plain_big,  cipher_big, block_size);
+        if (ret != 0)
+            return ret;
+
+        if (XMEMCMP(plain_big, input_big, block_size))
+            return -4340-i;
+
+        if (XMEMCMP(cipher_big, cipher_big_result, block_size))
+            return -4360-i;
+    }
+
+#ifdef WOLFSSL_SMALL_STACK
+    XFREE(cipher_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    XFREE(plain_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+    XFREE(input_big, NULL, DYNAMIC_TYPE_TMP_BUFFER);
+#endif /* WOLFSSL_SMALL_STACK */
+#endif /* BENCH_EMBEDDED */
+
     return 0;
 }
 #endif /* HAVE_CHACHA */