瀏覽代碼

Merge pull request #7283 from SparkiDev/lms

LMS: initial implementation
Daniel Pouzzner 2 月之前
父節點
當前提交
3fd6af0cd2

+ 4 - 0
configure.ac

@@ -1289,6 +1289,10 @@ do
     LMS_VERIFY_ONLY=yes
     AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_LMS_VERIFY_ONLY"
     ;;
+  small)
+    ENABLED_WC_LMS=yes
+    AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_WC_LMS_SMALL"
+    ;;
   wolfssl)
     ENABLED_WC_LMS=yes
     AM_CFLAGS="$AM_CFLAGS -DWOLFSSL_WC_LMS"

+ 1 - 0
src/include.am

@@ -710,6 +710,7 @@ endif
 
 if BUILD_WC_LMS
 src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_lms.c
+src_libwolfssl@LIBSUFFIX@_la_SOURCES += wolfcrypt/src/wc_lms_impl.c
 endif
 
 if BUILD_WC_XMSS

+ 240 - 48
wolfcrypt/benchmark/benchmark.c

@@ -178,6 +178,8 @@
     #include <wolfssl/wolfcrypt/lms.h>
     #ifdef HAVE_LIBLMS
         #include <wolfssl/wolfcrypt/ext_lms.h>
+    #else
+        #include <wolfssl/wolfcrypt/wc_lms.h>
     #endif
 #endif
 #if defined(WOLFSSL_HAVE_XMSS) && !defined(WOLFSSL_XMSS_VERIFY_ONLY)
@@ -9448,7 +9450,7 @@ static const byte lms_pub_L4_H5_W8[60] =
     0x74,0x24,0x12,0xC8
 };
 
-static int lms_write_key_mem(const byte * priv, word32 privSz, void *context)
+static int lms_write_key_mem(const byte* priv, word32 privSz, void* context)
 {
    /* WARNING: THIS IS AN INSECURE WRITE CALLBACK THAT SHOULD ONLY
     * BE USED FOR TESTING PURPOSES! Production applications should
@@ -9457,15 +9459,128 @@ static int lms_write_key_mem(const byte * priv, word32 privSz, void *context)
     return WC_LMS_RC_SAVED_TO_NV_MEMORY;
 }
 
-static int lms_read_key_mem(byte * priv, word32 privSz, void *context)
+/* Benchmark private-key read callback: restores the LMS/HSS private key by
+ * copying privSz bytes from the in-memory buffer 'context' into 'priv'.
+ * Always reports success via WC_LMS_RC_READ_TO_MEMORY. */
+static int lms_read_key_mem(byte* priv, word32 privSz, void* context)
 {
   /* WARNING: THIS IS AN INSECURE READ CALLBACK THAT SHOULD ONLY
    * BE USED FOR TESTING PURPOSES! */
    XMEMCPY(priv, context, privSz);
    return WC_LMS_RC_READ_TO_MEMORY;
 }
+/* Insecure in-memory store for the benchmark's LMS/HSS private key; shared
+ * by the write/read callbacks and the sign/verify benchmark. */
+static byte lms_priv[HSS_MAX_PRIVATE_KEY_LEN];
 
-static void bench_lms_sign_verify(enum wc_LmsParm parm)
+/* Benchmark LMS/HSS key generation for parameter set 'parm'.
+ * 'parm' is an int rather than enum wc_LmsParm so implementation-specific
+ * sets (e.g. 0x100 in bench_lms()) can also be passed -- TODO confirm the
+ * implementation accepts such values. On success the raw public key is
+ * exported into 'pub', which is assumed to hold at least
+ * HSS_MAX_PUBLIC_KEY_LEN bytes (confirm at call sites). Timing results are
+ * reported via bench_stats_asym_finish(); failures are printed and cause
+ * an early return. */
+static void bench_lms_keygen(int parm, byte* pub)
+{
+    WC_RNG      rng;
+    LmsKey      key;
+    int         ret;
+    word32      pubLen = HSS_MAX_PUBLIC_KEY_LEN;
+    int         times = 0;
+    int         count = 0;
+    double      start = 0.0F;
+    int         levels;
+    int         height;
+    int         winternitz;
+    const char* str = wc_LmsKey_ParmToStr(parm);
+    DECLARE_MULTI_VALUE_STATS_VARS()
+
+#ifndef HAVE_FIPS
+    ret = wc_InitRng_ex(&rng, HEAP_HINT, INVALID_DEVID);
+#else
+    ret = wc_InitRng(&rng);
+#endif
+    if (ret != 0) {
+        fprintf(stderr, "error: wc_InitRng failed: %d\n", ret);
+        return;
+    }
+
+    ret = wc_LmsKey_Init(&key, NULL, INVALID_DEVID);
+    if (ret) {
+        printf("wc_LmsKey_Init failed: %d\n", ret);
+        wc_FreeRng(&rng);
+        return;
+    }
+
+    count = 0;
+    bench_stats_start(&count, &start);
+
+    do {
+        /* LMS is stateful. Async queuing not practical. */
+        for (times = 0; times < 1; ++times) {
+
+            /* Free and re-init the key so every iteration measures a
+             * complete keygen from a fresh state. */
+            wc_LmsKey_Free(&key);
+
+            ret = wc_LmsKey_Init(&key, NULL, INVALID_DEVID);
+            if (ret) {
+                printf("wc_LmsKey_Init failed: %d\n", ret);
+                goto exit_lms_keygen;
+            }
+
+            ret = wc_LmsKey_SetLmsParm(&key, parm);
+            if (ret) {
+                printf("wc_LmsKey_SetLmsParm failed: %d\n", ret);
+                goto exit_lms_keygen;
+            }
+
+            ret = wc_LmsKey_GetParameters(&key, &levels, &height, &winternitz);
+            if (ret) {
+                fprintf(stderr, "error: wc_LmsKey_GetParameters failed: %d\n",
+                    ret);
+                goto exit_lms_keygen;
+            }
+
+            ret = wc_LmsKey_SetWriteCb(&key, lms_write_key_mem);
+            if (ret) {
+                fprintf(stderr, "error: wc_LmsKey_SetWriteCb failed: %d\n",
+                    ret);
+                goto exit_lms_keygen;
+            }
+
+            ret = wc_LmsKey_SetReadCb(&key, lms_read_key_mem);
+            if (ret) {
+                fprintf(stderr, "error: wc_LmsKey_SetReadCb failed: %d\n", ret);
+                goto exit_lms_keygen;
+            }
+
+            /* Private key material is persisted into the shared lms_priv
+             * buffer via the insecure test callbacks above. */
+            ret = wc_LmsKey_SetContext(&key, (void*)lms_priv);
+            if (ret) {
+                fprintf(stderr, "error: wc_LmsKey_SetContext failed: %d\n",
+                    ret);
+                goto exit_lms_keygen;
+            }
+
+            ret = wc_LmsKey_MakeKey(&key, &rng);
+            if (ret) {
+                printf("wc_LmsKey_MakeKey failed: %d\n", ret);
+                goto exit_lms_keygen;
+            }
+
+            RECORD_MULTI_VALUE_STATS();
+        }
+
+        count += times;
+    } while (bench_stats_check(start)
+#ifdef MULTI_VALUE_STATISTICS
+       || runs < minimum_runs
+#endif
+       );
+
+    /* Report keygen rate; levels * height is used as the "size" label. */
+    bench_stats_asym_finish(str, levels * height, "keygen", 0,
+                            count, start, ret);
+#ifdef MULTI_VALUE_STATISTICS
+    bench_multi_value_stats(max, min, sum, squareSum, runs);
+#endif
+
+    /* Export the generated public key for the matching sign/verify bench. */
+    ret = wc_LmsKey_ExportPubRaw(&key, pub, &pubLen);
+    if (ret) {
+        fprintf(stderr, "error: wc_LmsKey_ExportPubRaw failed: %d\n", ret);
+    }
+
+exit_lms_keygen:
+    wc_LmsKey_Free(&key);
+    wc_FreeRng(&rng);
+}
+
+static void bench_lms_sign_verify(int parm, byte* pub)
 {
     LmsKey       key;
     int          ret = 0;
@@ -9478,8 +9593,8 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
     int          times = 0;
     int          count = 0;
     double       start = 0.0F;
-    byte         priv[HSS_MAX_PRIVATE_KEY_LEN];
     const char * str = wc_LmsKey_ParmToStr(parm);
+    DECLARE_MULTI_VALUE_STATS_VARS()
 
     ret = wc_LmsKey_Init(&key, NULL, INVALID_DEVID);
     if (ret) {
@@ -9495,33 +9610,33 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
 
     switch (parm) {
     case WC_LMS_PARM_L2_H10_W2:
-        XMEMCPY(priv, lms_priv_L2_H10_W2, sizeof(lms_priv_L2_H10_W2));
-        XMEMCPY(key.pub, lms_pub_L2_H10_W2, sizeof(lms_pub_L2_H10_W2));
+        XMEMCPY(lms_priv, lms_priv_L2_H10_W2, sizeof(lms_priv_L2_H10_W2));
+        XMEMCPY(key.pub, lms_pub_L2_H10_W2, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_L2_H10_W4:
-        XMEMCPY(priv, lms_priv_L2_H10_W4, sizeof(lms_priv_L2_H10_W4));
-        XMEMCPY(key.pub, lms_pub_L2_H10_W4, sizeof(lms_pub_L2_H10_W4));
+        XMEMCPY(lms_priv, lms_priv_L2_H10_W4, sizeof(lms_priv_L2_H10_W4));
+        XMEMCPY(key.pub, lms_pub_L2_H10_W4, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_L3_H5_W4:
-        XMEMCPY(priv, lms_priv_L3_H5_W4, sizeof(lms_priv_L3_H5_W4));
-        XMEMCPY(key.pub, lms_pub_L3_H5_W4, sizeof(lms_pub_L3_H5_W4));
+        XMEMCPY(lms_priv, lms_priv_L3_H5_W4, sizeof(lms_priv_L3_H5_W4));
+        XMEMCPY(key.pub, lms_pub_L3_H5_W4, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_L3_H5_W8:
-        XMEMCPY(priv, lms_priv_L3_H5_W8, sizeof(lms_priv_L3_H5_W8));
-        XMEMCPY(key.pub, lms_pub_L3_H5_W8, sizeof(lms_pub_L3_H5_W8));
+        XMEMCPY(lms_priv, lms_priv_L3_H5_W8, sizeof(lms_priv_L3_H5_W8));
+        XMEMCPY(key.pub, lms_pub_L3_H5_W8, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_L3_H10_W4:
-        XMEMCPY(priv, lms_priv_L3_H10_W4, sizeof(lms_priv_L3_H10_W4));
-        XMEMCPY(key.pub, lms_pub_L3_H10_W4, sizeof(lms_pub_L3_H10_W4));
+        XMEMCPY(lms_priv, lms_priv_L3_H10_W4, sizeof(lms_priv_L3_H10_W4));
+        XMEMCPY(key.pub, lms_pub_L3_H10_W4, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_L4_H5_W8:
-        XMEMCPY(priv, lms_priv_L4_H5_W8, sizeof(lms_priv_L4_H5_W8));
-        XMEMCPY(key.pub, lms_pub_L4_H5_W8, sizeof(lms_pub_L4_H5_W8));
+        XMEMCPY(lms_priv, lms_priv_L4_H5_W8, sizeof(lms_priv_L4_H5_W8));
+        XMEMCPY(key.pub, lms_pub_L4_H5_W8, HSS_MAX_PUBLIC_KEY_LEN);
         break;
 
     case WC_LMS_PARM_NONE:
@@ -9529,9 +9644,9 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
     case WC_LMS_PARM_L1_H15_W4:
     case WC_LMS_PARM_L2_H10_W8:
     case WC_LMS_PARM_L3_H5_W2:
-        printf("bench_lms_sign_verify: unsupported benchmark option: %d\n",
-               parm);
-        goto exit_lms_sign_verify;
+    default:
+        XMEMCPY(key.pub, pub, HSS_MAX_PUBLIC_KEY_LEN);
+        break;
     }
 
     ret = wc_LmsKey_SetWriteCb(&key, lms_write_key_mem);
@@ -9546,7 +9661,7 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
         goto exit_lms_sign_verify;
     }
 
-    ret = wc_LmsKey_SetContext(&key, (void *) priv);
+    ret = wc_LmsKey_SetContext(&key, (void*)lms_priv);
     if (ret) {
         fprintf(stderr, "error: wc_LmsKey_SetContext failed: %d\n", ret);
         goto exit_lms_sign_verify;
@@ -9555,35 +9670,68 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
     /* Even with saved priv/pub keys, we must still reload the private
      * key before using it. Reloading the private key is the bottleneck
      * for larger heights. Only print load time in debug builds. */
-#if defined(DEBUG_WOLFSSL)
+    count = 0;
     bench_stats_start(&count, &start);
-#endif /* if defined DEBUG_WOLFSSL*/
 
+#ifndef WOLFSSL_WC_LMS_SMALL
+    do {
+    #ifdef WOLFSSL_WC_LMS
+        key.priv.inited = 0;
+        key.state = WC_LMS_STATE_PARMSET;
+    #endif
+        ret = wc_LmsKey_Reload(&key);
+        if (ret) {
+            printf("wc_LmsKey_Reload failed: %d\n", ret);
+            goto exit_lms_sign_verify;
+        }
+        RECORD_MULTI_VALUE_STATS();
+
+        count++;
+
+        ret = wc_LmsKey_GetSigLen(&key, &sigSz);
+        if (ret) {
+            printf("wc_LmsKey_GetSigLen failed: %d\n", ret);
+            goto exit_lms_sign_verify;
+        }
+
+        ret = wc_LmsKey_GetPrivLen(&key, &privLen);
+        if (ret) {
+            printf("wc_LmsKey_GetPrivLen failed: %d\n", ret);
+            goto exit_lms_sign_verify;
+        }
+    #ifdef HAVE_LIBLMS
+        break;
+    #endif
+    } while (bench_stats_check(start)
+#ifdef MULTI_VALUE_STATISTICS
+       || runs < minimum_runs
+#endif
+       );
+
+    bench_stats_asym_finish(str, (int)privLen, "load", 0,
+                            count, start, ret);
+#ifdef MULTI_VALUE_STATISTICS
+    bench_multi_value_stats(max, min, sum, squareSum, runs);
+#endif
+
+    RESET_MULTI_VALUE_STATS_VARS();
+#else
     ret = wc_LmsKey_Reload(&key);
     if (ret) {
         printf("wc_LmsKey_Reload failed: %d\n", ret);
         goto exit_lms_sign_verify;
     }
-
-    count +=1;
-
     ret = wc_LmsKey_GetSigLen(&key, &sigSz);
     if (ret) {
         printf("wc_LmsKey_GetSigLen failed: %d\n", ret);
         goto exit_lms_sign_verify;
     }
-
     ret = wc_LmsKey_GetPrivLen(&key, &privLen);
     if (ret) {
         printf("wc_LmsKey_GetPrivLen failed: %d\n", ret);
         goto exit_lms_sign_verify;
     }
-
-#if defined(DEBUG_WOLFSSL)
-    bench_stats_check(start);
-    bench_stats_asym_finish(str, (int)privLen, "load", 0,
-                            count, start, ret);
-#endif /* if defined DEBUG_WOLFSSL*/
+#endif
 
     loaded = 1;
 
@@ -9598,22 +9746,29 @@ static void bench_lms_sign_verify(enum wc_LmsParm parm)
 
     do {
         /* LMS is stateful. Async queuing not practical. */
-        for (times = 0; times < ntimes; ++times) {
-
+#ifndef WOLFSSL_WC_LMS_SMALL
+        for (times = 0; times < ntimes; ++times)
+#else
+        for (times = 0; times < 1; ++times)
+#endif
+        {
             ret = wc_LmsKey_Sign(&key, sig, &sigSz, (byte *) msg, msgSz);
             if (ret) {
                 printf("wc_LmsKey_Sign failed: %d\n", ret);
                 goto exit_lms_sign_verify;
             }
             RECORD_MULTI_VALUE_STATS();
+            if (!wc_LmsKey_SigsLeft(&key)) {
+                break;
+            }
         }
 
         count += times;
-    } while (bench_stats_check(start)
+    } while (wc_LmsKey_SigsLeft(&key) && (bench_stats_check(start)
 #ifdef MULTI_VALUE_STATISTICS
        || runs < minimum_runs
 #endif
-       );
+       ));
 
     bench_stats_asym_finish(str, (int)sigSz, "sign", 0,
                             count, start, ret);
@@ -9653,25 +9808,62 @@ exit_lms_sign_verify:
 
     if (loaded) {
         wc_LmsKey_Free(&key);
-        loaded = 0;
-    }
-
-    if (sig != NULL) {
-        XFREE(sig, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER);
-        sig = NULL;
     }
+    XFREE(sig, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER);
 
     return;
 }
 
 void bench_lms(void)
 {
-    bench_lms_sign_verify(WC_LMS_PARM_L2_H10_W2);
-    bench_lms_sign_verify(WC_LMS_PARM_L2_H10_W4);
-    bench_lms_sign_verify(WC_LMS_PARM_L3_H5_W4);
-    bench_lms_sign_verify(WC_LMS_PARM_L3_H5_W8);
-    bench_lms_sign_verify(WC_LMS_PARM_L3_H10_W4);
-    bench_lms_sign_verify(WC_LMS_PARM_L4_H5_W8);
+    /* Public key produced by each keygen bench and consumed by the matching
+     * sign/verify bench. */
+    byte pub[HSS_MAX_PUBLIC_KEY_LEN];
+
+    /* Each parameter set is benched only when the wolfCrypt implementation's
+     * compile-time limits (LMS_MAX_LEVELS / LMS_MAX_HEIGHT) allow it; when
+     * WOLFSSL_WC_LMS is not defined (external LMS library) all sets run.
+     * Every group that runs (re)defines the LMS_PARAMS_BENCHED marker so the
+     * fallback at the bottom fires only if no standard set was benched. */
+#ifdef BENCH_LMS_SLOW_KEYGEN
+#if !defined(WOLFSSL_WC_LMS) || (LMS_MAX_HEIGHT >= 15)
+    /* Height-15 trees: keygen is slow, so only benched on request. */
+    bench_lms_keygen(WC_LMS_PARM_L1_H15_W2, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L1_H15_W2, pub);
+    bench_lms_keygen(WC_LMS_PARM_L1_H15_W4, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L1_H15_W4, pub);
+    #undef LMS_PARAMS_BENCHED
+    #define LMS_PARAMS_BENCHED
+#endif
+#endif
+#if !defined(WOLFSSL_WC_LMS) || ((LMS_MAX_LEVELS >= 2) && \
+        (LMS_MAX_HEIGHT >= 10))
+    bench_lms_keygen(WC_LMS_PARM_L2_H10_W2, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L2_H10_W2, pub);
+    bench_lms_keygen(WC_LMS_PARM_L2_H10_W4, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L2_H10_W4, pub);
+    #undef LMS_PARAMS_BENCHED
+    #define LMS_PARAMS_BENCHED
+#ifdef BENCH_LMS_SLOW_KEYGEN
+    bench_lms_keygen(WC_LMS_PARM_L2_H10_W8, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L2_H10_W8, pub);
+#endif
+#endif
+#if !defined(WOLFSSL_WC_LMS) || (LMS_MAX_LEVELS >= 3)
+    bench_lms_keygen(WC_LMS_PARM_L3_H5_W4, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L3_H5_W4, pub);
+    bench_lms_keygen(WC_LMS_PARM_L3_H5_W8, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L3_H5_W8, pub);
+    #undef LMS_PARAMS_BENCHED
+    #define LMS_PARAMS_BENCHED
+#endif
+#if !defined(WOLFSSL_WC_LMS) || ((LMS_MAX_LEVELS >= 3) && \
+        (LMS_MAX_HEIGHT >= 10))
+    bench_lms_keygen(WC_LMS_PARM_L3_H10_W4, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L3_H10_W4, pub);
+#endif
+#if !defined(WOLFSSL_WC_LMS) || (LMS_MAX_LEVELS >= 4)
+    bench_lms_keygen(WC_LMS_PARM_L4_H5_W8, pub);
+    bench_lms_sign_verify(WC_LMS_PARM_L4_H5_W8, pub);
+#endif
+
+#if defined(WOLFSSL_WC_LMS) && !defined(LMS_PARAMS_BENCHED)
+    /* No standard set fit the compile-time limits: bench the
+     * implementation-specific parameter set 0x100 instead.
+     * NOTE(review): 0x100 is outside enum wc_LmsParm -- confirm wc_lms maps
+     * it to a valid custom parameter set. */
+    bench_lms_keygen(0x100, pub);
+    bench_lms_sign_verify(0x100, pub);
+#endif
+
     return;
 }
 

+ 12 - 0
wolfcrypt/src/misc.c

@@ -460,10 +460,16 @@ WC_MISC_STATIC WC_INLINE void c16toa(word16 wc_u16, byte* c)
 /* convert 32 bit integer to opaque */
+/* Writes wc_u32 to c in big-endian (network) byte order. */
 WC_MISC_STATIC WC_INLINE void c32toa(word32 wc_u32, byte* c)
 {
+#ifdef WOLFSSL_USE_ALIGN
+    /* Byte-wise stores: safe for any alignment of c. */
     c[0] = (byte)((wc_u32 >> 24) & 0xff);
     c[1] = (byte)((wc_u32 >> 16) & 0xff);
     c[2] = (byte)((wc_u32 >>  8) & 0xff);
     c[3] =  (byte)(wc_u32        & 0xff);
+#elif defined(LITTLE_ENDIAN_ORDER)
+    /* NOTE(review): single word store assumes c is word32-aligned and that
+     * the platform tolerates this type-punned access; builds that cannot
+     * guarantee this must define WOLFSSL_USE_ALIGN. */
+    *(word32*)c = ByteReverseWord32(wc_u32);
+#else
+    /* Big-endian host: memory order already matches; same alignment
+     * assumption as above applies. */
+    *(word32*)c = wc_u32;
+#endif
 }
 #endif
 
@@ -492,10 +498,16 @@ WC_MISC_STATIC WC_INLINE void ato16(const byte* c, word16* wc_u16)
 /* convert opaque to 32 bit integer */
+/* Reads a big-endian (network order) 32-bit value from c into *wc_u32. */
 WC_MISC_STATIC WC_INLINE void ato32(const byte* c, word32* wc_u32)
 {
+#ifdef WOLFSSL_USE_ALIGN
+    /* Byte-wise loads: safe for any alignment of c. */
     *wc_u32 = ((word32)c[0] << 24) |
               ((word32)c[1] << 16) |
               ((word32)c[2] << 8) |
                (word32)c[3];
+#elif defined(LITTLE_ENDIAN_ORDER)
+    /* NOTE(review): single word load assumes c is word32-aligned and that
+     * the platform tolerates this type-punned access; builds that cannot
+     * guarantee this must define WOLFSSL_USE_ALIGN. */
+    *wc_u32 = ByteReverseWord32(*(word32*)c);
+#else
+    /* Big-endian host: memory order already matches; same alignment
+     * assumption as above applies. */
+    *wc_u32 = *(word32*)c;
+#endif
 }
 
 /* convert opaque to 32 bit integer. Interpret as little endian. */

+ 122 - 38
wolfcrypt/src/port/arm/armv8-sha256.c

@@ -130,8 +130,8 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
     word32* k = (word32*)K;
 
     __asm__ volatile (
-    "#load leftover data\n"
-    "LD1 {v0.2d-v3.2d}, %[buffer]   \n"
+    "# load first block of data\n"
+    "LD1 {v0.16b-v3.16b}, [%[dataIn]], #64   \n"
 
     "#load current digest\n"
     "LD1 {v12.2d-v13.2d}, %[digest] \n"
@@ -293,10 +293,9 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
     "2:\n"
     "ST1 {v12.2d-v13.2d}, %[out] \n"
 
-    : [out] "=m" (sha256->digest), "=m" (sha256->buffer), "=r" (numBlocks),
-      "=r" (data), "=r" (k)
-    : [k] "4" (k), [digest] "m" (sha256->digest), [buffer] "m" (sha256->buffer),
-      [blocks] "2" (numBlocks), [dataIn] "3" (data)
+    : [out] "=m" (sha256->digest), "=r" (numBlocks), "=r" (data), "=r" (k)
+    : [k] "3" (k), [digest] "m" (sha256->digest), [blocks] "1" (numBlocks),
+      [dataIn] "2" (data)
     : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
                       "v8",  "v9",  "v10", "v11", "v12", "v13", "v14",
                       "v15", "v16", "v17", "v18", "v19", "v20", "v21",
@@ -306,7 +305,8 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
 }
 
 /* ARMv8 hardware acceleration */
-static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data,
+    word32 len)
 {
     word32 add;
     word32 numBlocks;
@@ -315,26 +315,32 @@ static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 le
     if (len > 0) {
         AddLength(sha256, len);
 
-        /* fill leftover buffer with data */
-        add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
-        XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
-        sha256->buffLen += add;
-        data            += add;
-        len             -= add;
+        if (sha256->buffLen > 0) {
+             /* fill leftover buffer with data */
+             add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+             XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
+             sha256->buffLen += add;
+             data            += add;
+             len             -= add;
+             if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
+                 Sha256Transform(sha256, (byte*)sha256->buffer, 1);
+                 sha256->buffLen = 0;
+             }
+        }
 
         /* number of blocks in a row to complete */
         numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
 
         if (numBlocks > 0) {
-            /* get leftover amount after blocks */
-            add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE;
-
             Sha256Transform(sha256, data, numBlocks);
-            data += numBlocks * WC_SHA256_BLOCK_SIZE - sha256->buffLen;
+            data += numBlocks * WC_SHA256_BLOCK_SIZE;
+            len  -= numBlocks * WC_SHA256_BLOCK_SIZE;
+        }
 
+        if (len > 0) {
             /* copy over any remaining data leftover */
-            XMEMCPY(sha256->buffer, data, add);
-            sha256->buffLen = add;
+            XMEMCPY(sha256->buffer, data, len);
+            sha256->buffLen = len;
         }
     }
 
@@ -702,8 +708,9 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
     word32* digPt = sha256->digest;
 
     __asm__ volatile (
-    "#load leftover data\n"
-    "VLDM %[buffer]!, {q0-q3} \n"
+    "# load first block of data\n"
+    "VLD1.8 {d0-d3}, [%[dataIn]]! \n"
+    "VLD1.8 {d4-d7}, [%[dataIn]]! \n"
 
     "#load current digest\n"
     "VLDM %[digest], {q12-q13} \n"
@@ -863,10 +870,8 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
     "BEQ 2f \n"
 
     "#load in message and schedule updates \n"
-    "VLD1.32 {q0}, [%[dataIn]]!   \n"
-    "VLD1.32 {q1}, [%[dataIn]]!   \n"
-    "VLD1.32 {q2}, [%[dataIn]]!   \n"
-    "VLD1.32 {q3}, [%[dataIn]]!   \n"
+    "VLD1.8 {d0-d3}, [%[dataIn]]! \n"
+    "VLD1.8 {d4-d7}, [%[dataIn]]! \n"
 
     /* reset K pointer */
     "SUB %[k], %[k], #160 \n"
@@ -892,7 +897,8 @@ static WC_INLINE void Sha256Transform(wc_Sha256* sha256, const byte* data,
 }
 
 /* ARMv8 hardware acceleration Aarch32 */
-static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data,
+    word32 len)
 {
     word32 add;
     word32 numBlocks;
@@ -901,26 +907,32 @@ static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 le
     if (len > 0) {
         AddLength(sha256, len);
 
-        /* fill leftover buffer with data */
-        add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
-        XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
-        sha256->buffLen += add;
-        data            += add;
-        len             -= add;
+        if (sha256->buffLen > 0) {
+             /* fill leftover buffer with data */
+             add = min(len, WC_SHA256_BLOCK_SIZE - sha256->buffLen);
+             XMEMCPY((byte*)(sha256->buffer) + sha256->buffLen, data, add);
+             sha256->buffLen += add;
+             data            += add;
+             len             -= add;
+             if (sha256->buffLen == WC_SHA256_BLOCK_SIZE) {
+                 Sha256Transform(sha256, (byte*)sha256->buffer, 1);
+                 sha256->buffLen = 0;
+             }
+        }
 
         /* number of blocks in a row to complete */
         numBlocks = (len + sha256->buffLen)/WC_SHA256_BLOCK_SIZE;
 
         if (numBlocks > 0) {
-            /* get leftover amount after blocks */
-            add = (len + sha256->buffLen) - numBlocks * WC_SHA256_BLOCK_SIZE;
-
             Sha256Transform(sha256, data, numBlocks);
-            data += numBlocks * WC_SHA256_BLOCK_SIZE - sha256->buffLen;
+            data += numBlocks * WC_SHA256_BLOCK_SIZE;
+            len  -= numBlocks * WC_SHA256_BLOCK_SIZE;
+        }
 
+        if (len > 0) {
             /* copy over any remaining data leftover */
-            XMEMCPY(sha256->buffer, data, add);
-            sha256->buffLen = add;
+            XMEMCPY(sha256->buffer, data, len);
+            sha256->buffLen = len;
         }
     }
 
@@ -1619,6 +1631,78 @@ int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data)
 }
 #endif
 
+#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH)
+/* One block will be used from data.
+ * hash must be big enough to hold all of digest output.
+ *
+ * Hashes exactly one 64-byte block from 'data' into the running digest.
+ * When 'hash' is non-NULL the digest is written out in big-endian byte
+ * order and the internal digest is then reset to the SHA-256 initial hash
+ * values, ready for the next message; when 'hash' is NULL the running
+ * digest is left as-is (multi-block use).
+ *
+ * sha256  initialized hash state (must not be NULL)
+ * data    one WC_SHA256_BLOCK_SIZE block of input (must not be NULL)
+ * hash    output buffer of WC_SHA256_DIGEST_SIZE bytes, or NULL
+ * returns 0 on success, BAD_FUNC_ARG on NULL sha256/data.
+ */
+int wc_Sha256HashBlock(wc_Sha256* sha256, const unsigned char* data,
+    unsigned char* hash)
+{
+    int ret = 0;
+
+    if ((sha256 == NULL) || (data == NULL)) {
+        return BAD_FUNC_ARG;
+    }
+
+#ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
+    Sha256Transform(sha256, data, 1);
+#else
+    Transform_Sha256_Len(sha256, data, WC_SHA256_BLOCK_SIZE);
+#endif
+
+    if (hash != NULL) {
+#ifdef LITTLE_ENDIAN_ORDER
+    #ifndef WOLFSSL_ARMASM_NO_HW_CRYPTO
+        #ifdef __aarch64__
+            /* Byte-reverse each 32-bit digest word with NEON REV32. */
+            __asm__ __volatile__ (
+                "LD1 {v0.2d-v1.2d}, [%[digest]]   \n"
+                "REV32 v0.16b, v0.16b \n"
+                "REV32 v1.16b, v1.16b \n"
+                "ST1 {v0.16b-v1.16b}, [%[hash]]  \n"
+                :
+                : [digest] "r" (sha256->digest), [hash] "r" (hash)
+                : "memory", "v0", "v1"
+            );
+        #else
+            /* Aarch32 equivalent: VREV32 byte-reverses each digest word. */
+            __asm__ __volatile__ (
+                "VLDM %[digest], {q0-q1} \n"
+                "VREV32.8 q0, q0 \n"
+                "VREV32.8 q1, q1 \n"
+                "VST1.8 {d0-d3}, [%[hash]] \n"
+                :
+                : [digest] "r" (sha256->digest), [hash] "r" (hash)
+                : "memory", "q0", "q1"
+            );
+        #endif
+    #else
+        /* NOTE(review): word32 stores assume 'hash' is suitably aligned --
+         * confirm callers pass an aligned buffer. */
+        word32* hash32 = (word32*)hash;
+        word32* digest = (word32*)sha256->digest;
+        hash32[0] = ByteReverseWord32(digest[0]);
+        hash32[1] = ByteReverseWord32(digest[1]);
+        hash32[2] = ByteReverseWord32(digest[2]);
+        hash32[3] = ByteReverseWord32(digest[3]);
+        hash32[4] = ByteReverseWord32(digest[4]);
+        hash32[5] = ByteReverseWord32(digest[5]);
+        hash32[6] = ByteReverseWord32(digest[6]);
+        hash32[7] = ByteReverseWord32(digest[7]);
+    #endif /* !WOLFSSL_ARMASM_NO_HW_CRYPTO */
+#else
+        XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+#endif
+        /* Reset to the SHA-256 initial hash values (FIPS 180-4) so the
+         * state is immediately reusable without a wc_InitSha256 call. */
+        sha256->digest[0] = 0x6A09E667L;
+        sha256->digest[1] = 0xBB67AE85L;
+        sha256->digest[2] = 0x3C6EF372L;
+        sha256->digest[3] = 0xA54FF53AL;
+        sha256->digest[4] = 0x510E527FL;
+        sha256->digest[5] = 0x9B05688CL;
+        sha256->digest[6] = 0x1F83D9ABL;
+        sha256->digest[7] = 0x5BE0CD19L;
+    }
+
+    return ret;
+}
+#endif /* WOLFSSL_HAVE_LMS && !WOLFSSL_LMS_FULL_HASH */
+
+
 #endif /* !NO_SHA256 */
 
 

+ 179 - 181
wolfcrypt/src/sha256.c

@@ -169,6 +169,38 @@ on the specific device platform.
 #endif
 
 
+#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
+    #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
+          defined(CONFIG_IDF_TARGET_ESP8684) || \
+          defined(CONFIG_IDF_TARGET_ESP32C3) || \
+          defined(CONFIG_IDF_TARGET_ESP32C6)    \
+        ) && \
+        defined(WOLFSSL_ESP32_CRYPT) &&         \
+        !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
+        !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
+        /* For Espressif RISC-V Targets, we *may* need to reverse bytes
+         * depending on if HW is active or not. */
+        #define SHA256_REV_BYTES(ctx) \
+            (esp_sha_need_byte_reversal(ctx))
+    #endif
+#endif
+#ifndef SHA256_REV_BYTES
+    #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
+        #define SHA256_REV_BYTES(ctx)       1
+    #else
+        #define SHA256_REV_BYTES(ctx)       0
+    #endif
+#endif
+#if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA) && \
+        defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
+        (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
+    #define SHA256_UPDATE_REV_BYTES(ctx) \
+        (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
+#else
+    #define SHA256_UPDATE_REV_BYTES(ctx)    SHA256_REV_BYTES(ctx)
+#endif
+
+
 #if !defined(WOLFSSL_PIC32MZ_HASH) && !defined(STM32_HASH_SHA2) && \
     (!defined(WOLFSSL_IMX6_CAAM) || defined(NO_IMX6_CAAM_HASH) || \
      defined(WOLFSSL_QNX_CAAM)) && \
@@ -188,11 +220,6 @@ on the specific device platform.
 
 static int InitSha256(wc_Sha256* sha256)
 {
-    int ret = 0;
-
-    if (sha256 == NULL)
-        return BAD_FUNC_ARG;
-
     XMEMSET(sha256->digest, 0, sizeof(sha256->digest));
     sha256->digest[0] = 0x6A09E667L;
     sha256->digest[1] = 0xBB67AE85L;
@@ -227,7 +254,7 @@ static int InitSha256(wc_Sha256* sha256)
     sha256->hSession = NULL;
 #endif
 
-    return ret;
+    return 0;
 }
 #endif
 
@@ -590,7 +617,7 @@ static int InitSha256(wc_Sha256* sha256)
     {
         int ret = 0;
 
-        if (sha256 == NULL || (data == NULL && len > 0)) {
+        if (sha224 == NULL || (data == NULL && len > 0)) {
             return BAD_FUNC_ARG;
         }
 
@@ -736,10 +763,6 @@ static int InitSha256(wc_Sha256* sha256)
     {
         int ret = 0; /* zero = success */
 
-        if (sha256 == NULL) {
-            return BAD_FUNC_ARG;
-        }
-
         /* We may or may not need initial digest for HW.
          * Always needed for SW-only. */
         sha256->digest[0] = 0x6A09E667L;
@@ -1049,21 +1072,13 @@ static int InitSha256(wc_Sha256* sha256)
     }
 
     /* do block size increments/updates */
-    static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
+    static WC_INLINE int Sha256Update(wc_Sha256* sha256, const byte* data,
+        word32 len)
     {
         int ret = 0;
         word32 blocksLen;
         byte* local;
 
-        if (sha256 == NULL || (data == NULL && len > 0)) {
-            return BAD_FUNC_ARG;
-        }
-
-        if (data == NULL && len == 0) {
-            /* valid, but do nothing */
-            return 0;
-        }
-
         /* check that internal buffLen is valid */
         if (sha256->buffLen >= WC_SHA256_BLOCK_SIZE) {
             return BUFFER_E;
@@ -1092,34 +1107,13 @@ static int InitSha256(wc_Sha256* sha256)
                 }
             #endif
 
-
-            #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-                #if defined(WOLFSSL_X86_64_BUILD) && \
-                          defined(USE_INTEL_SPEEDUP) && \
-                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
-                if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
-                #endif
-                #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-                      defined(CONFIG_IDF_TARGET_ESP8684) || \
-                      defined(CONFIG_IDF_TARGET_ESP32C3) || \
-                      defined(CONFIG_IDF_TARGET_ESP32C6)    \
-                    ) && \
-                    defined(WOLFSSL_ESP32_CRYPT) &&         \
-                   !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-                   !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-                /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-                 * depending on if HW is active or not. */
-                    if (esp_sha_need_byte_reversal(&sha256->ctx))
-                #endif
-                {
-                    ByteReverseWords(sha256->buffer, sha256->buffer,
-                        WC_SHA256_BLOCK_SIZE);
-                }
-            #endif
+            if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
+                ByteReverseWords(sha256->buffer, sha256->buffer,
+                    WC_SHA256_BLOCK_SIZE);
+            }
 
             #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
                !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-
                 if (sha256->ctx.mode == ESP32_SHA_SW) {
                     #if defined(WOLFSSL_DEBUG_MUTEX)
                     {
@@ -1146,7 +1140,6 @@ static int InitSha256(wc_Sha256* sha256)
                 /* Always SW */
                 ret = XTRANSFORM(sha256, (const byte*)local);
             #endif
-
                 if (ret == 0)
                     sha256->buffLen = 0;
                 else
@@ -1161,12 +1154,13 @@ static int InitSha256(wc_Sha256* sha256)
         if (Transform_Sha256_Len_p != NULL)
         #endif
         {
-            /* get number of blocks */
-            /* 64-1 = 0x3F (~ Inverted = 0xFFFFFFC0) */
-            /* len (masked by 0xFFFFFFC0) returns block aligned length */
-            blocksLen = len & ~((word32)WC_SHA256_BLOCK_SIZE-1);
-            if (blocksLen > 0) {
-                /* Byte reversal and alignment handled in function if required */
+            if (len >= WC_SHA256_BLOCK_SIZE) {
+                /* get number of blocks */
+                /* 64-1 = 0x3F (~ Inverted = 0xFFFFFFC0) */
+                /* len (masked by 0xFFFFFFC0) returns block aligned length */
+                blocksLen = len & ~((word32)WC_SHA256_BLOCK_SIZE-1);
+                /* Byte reversal and alignment handled in function if required
+                 */
                 XTRANSFORM_LEN(sha256, data, blocksLen);
                 data += blocksLen;
                 len  -= blocksLen;
@@ -1209,28 +1203,9 @@ static int InitSha256(wc_Sha256* sha256)
                 }
             #endif
 
-            #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-                #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-                      defined(CONFIG_IDF_TARGET_ESP8684) || \
-                      defined(CONFIG_IDF_TARGET_ESP32C3) || \
-                      defined(CONFIG_IDF_TARGET_ESP32C6)    \
-                    ) && \
-                    defined(WOLFSSL_ESP32_CRYPT)         && \
-                   !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-                   !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-                /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-                 * depending on if HW is active or not. */
-                    if (esp_sha_need_byte_reversal(&sha256->ctx))
-                #endif
-                #if defined(WOLFSSL_X86_64_BUILD) && \
-                          defined(USE_INTEL_SPEEDUP) && \
-                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
-                if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
-                #endif
-                {
-                    ByteReverseWords(local32, local32, WC_SHA256_BLOCK_SIZE);
-                }
-            #endif
+            if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
+                ByteReverseWords(local32, local32, WC_SHA256_BLOCK_SIZE);
+            }
 
             #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
                !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
@@ -1267,14 +1242,16 @@ static int InitSha256(wc_Sha256* sha256)
 #else
     int wc_Sha256Update(wc_Sha256* sha256, const byte* data, word32 len)
     {
-        if (sha256 == NULL || (data == NULL && len > 0)) {
+        if (sha256 == NULL) {
             return BAD_FUNC_ARG;
         }
-
         if (data == NULL && len == 0) {
             /* valid, but do nothing */
             return 0;
         }
+        if (data == NULL) {
+            return BAD_FUNC_ARG;
+        }
 
     #ifdef WOLF_CRYPTO_CB
         #ifndef WOLF_CRYPTO_CB_FIND
@@ -1301,14 +1278,9 @@ static int InitSha256(wc_Sha256* sha256)
 
     static WC_INLINE int Sha256Final(wc_Sha256* sha256)
     {
-
         int ret;
         byte* local;
 
-        if (sha256 == NULL) {
-            return BAD_FUNC_ARG;
-        }
-
         /* we'll add a 0x80 byte at the end,
         ** so make sure we have appropriate buffer length. */
         if (sha256->buffLen > WC_SHA256_BLOCK_SIZE - 1) {
@@ -1326,8 +1298,6 @@ static int InitSha256(wc_Sha256* sha256)
                     WC_SHA256_BLOCK_SIZE - sha256->buffLen);
             }
 
-            sha256->buffLen += WC_SHA256_BLOCK_SIZE - sha256->buffLen;
-
         #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
            !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
             if (sha256->ctx.mode == ESP32_SHA_INIT) {
@@ -1335,28 +1305,10 @@ static int InitSha256(wc_Sha256* sha256)
             }
         #endif
 
-        #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-            #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-                  defined(CONFIG_IDF_TARGET_ESP8684) || \
-                  defined(CONFIG_IDF_TARGET_ESP32C3) || \
-                  defined(CONFIG_IDF_TARGET_ESP32C6)    \
-                )  && \
-                defined(WOLFSSL_ESP32_CRYPT) &&         \
-               !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-               !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-            /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-             * depending on if HW is active or not. */
-                if (esp_sha_need_byte_reversal(&sha256->ctx))
-            #endif
-            #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
-                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
-            if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
-            #endif
-            {
-                ByteReverseWords(sha256->buffer, sha256->buffer,
-                                                      WC_SHA256_BLOCK_SIZE);
-            }
-        #endif
+        if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
+            ByteReverseWords(sha256->buffer, sha256->buffer,
+                WC_SHA256_BLOCK_SIZE);
+        }
 
         #if defined(WOLFSSL_USE_ESP32_CRYPT_HASH_HW) && \
            !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
@@ -1393,28 +1345,10 @@ static int InitSha256(wc_Sha256* sha256)
     #endif
 
         /* store lengths */
-    #if defined(LITTLE_ENDIAN_ORDER) && !defined(FREESCALE_MMCAU_SHA)
-        #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-              defined(CONFIG_IDF_TARGET_ESP8684) || \
-              defined(CONFIG_IDF_TARGET_ESP32C3) || \
-              defined(CONFIG_IDF_TARGET_ESP32C6)    \
-            ) && \
-            defined(WOLFSSL_ESP32_CRYPT) &&         \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-            /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-             * depending on if HW is active or not. */
-            if (esp_sha_need_byte_reversal(&sha256->ctx))
-        #endif
-        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP) && \
-                          (defined(HAVE_INTEL_AVX1) || defined(HAVE_INTEL_AVX2))
-        if (!IS_INTEL_AVX1(intel_flags) && !IS_INTEL_AVX2(intel_flags))
-        #endif
-        {
+        if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
             ByteReverseWords(sha256->buffer, sha256->buffer,
-                WC_SHA256_BLOCK_SIZE);
+                WC_SHA256_PAD_SIZE);
         }
-    #endif
         /* ! 64-bit length ordering dependent on digest endian type ! */
         XMEMCPY(&local[WC_SHA256_PAD_SIZE], &sha256->hiLen, sizeof(word32));
         XMEMCPY(&local[WC_SHA256_PAD_SIZE + sizeof(word32)], &sha256->loLen,
@@ -1496,23 +1430,10 @@ static int InitSha256(wc_Sha256* sha256)
         }
 
     #ifdef LITTLE_ENDIAN_ORDER
-        #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-              defined(CONFIG_IDF_TARGET_ESP8684) || \
-              defined(CONFIG_IDF_TARGET_ESP32C3) || \
-              defined(CONFIG_IDF_TARGET_ESP32C6)    \
-            ) && \
-            defined(WOLFSSL_ESP32_CRYPT) &&         \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-            /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-             * depending on if HW is active or not. */
-            if (esp_sha_need_byte_reversal(&sha256->ctx))
-        #endif
-            {
-                ByteReverseWords((word32*)digest,
-                                 (word32*)sha256->digest,
-                                  WC_SHA256_DIGEST_SIZE);
-            }
+        if (SHA256_REV_BYTES(&sha256->ctx)) {
+            ByteReverseWords((word32*)digest, (word32*)sha256->digest,
+                              WC_SHA256_DIGEST_SIZE);
+        }
         XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE);
     #else
         XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
@@ -1556,22 +1477,10 @@ static int InitSha256(wc_Sha256* sha256)
         }
 
     #if defined(LITTLE_ENDIAN_ORDER)
-        #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-              defined(CONFIG_IDF_TARGET_ESP8684) || \
-              defined(CONFIG_IDF_TARGET_ESP32C3) || \
-              defined(CONFIG_IDF_TARGET_ESP32C6)    \
-            )  && \
-            defined(WOLFSSL_ESP32_CRYPT) &&         \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH) && \
-           !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256)
-            /* For Espressif RISC-V Targets, we *may* need to reverse bytes
-             * depending on if HW is active or not. */
-            if (esp_sha_need_byte_reversal(&sha256->ctx))
-        #endif
-            {
-                ByteReverseWords(sha256->digest, sha256->digest,
-                                 WC_SHA256_DIGEST_SIZE);
-            }
+        if (SHA256_REV_BYTES(&sha256->ctx)) {
+            ByteReverseWords(sha256->digest, sha256->digest,
+                WC_SHA256_DIGEST_SIZE);
+        }
     #endif
         XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
 
@@ -1583,18 +1492,115 @@ static int InitSha256(wc_Sha256* sha256)
 /* @param sha  a pointer to wc_Sha256 structure           */
 /* @param data data to be applied SHA256 transformation   */
 /* @return 0 on successful, otherwise non-zero on failure */
-    int wc_Sha256Transform(wc_Sha256* sha, const unsigned char* data)
+    int wc_Sha256Transform(wc_Sha256* sha256, const unsigned char* data)
     {
-        if (sha == NULL || data == NULL) {
+        if (sha256 == NULL || data == NULL) {
             return BAD_FUNC_ARG;
         }
-        return (Transform_Sha256(sha, data));
+        return Transform_Sha256(sha256, data);
     }
-    #endif
-#endif /* OPENSSL_EXTRA */
+#endif /* OPENSSL_EXTRA || HAVE_CURL */
+
+#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH)
+    /* One block will be used from data.
+     * hash must be big enough to hold all of digest output.
+     */
+    int wc_Sha256HashBlock(wc_Sha256* sha256, const unsigned char* data,
+        unsigned char* hash)
+    {
+        int ret;
+
+        if ((sha256 == NULL) || (data == NULL)) {
+            return BAD_FUNC_ARG;
+        }
+
+        if (SHA256_UPDATE_REV_BYTES(&sha256->ctx)) {
+            ByteReverseWords(sha256->buffer, (word32*)data,
+                WC_SHA256_BLOCK_SIZE);
+            data = (unsigned char*)sha256->buffer;
+        }
+        ret = XTRANSFORM(sha256, data);
+
+        if ((ret == 0) && (hash != NULL)) {
+            if (!SHA256_REV_BYTES(&sha256->ctx)) {
+                XMEMCPY(hash, sha256->digest, WC_SHA256_DIGEST_SIZE);
+            }
+            else {
+        #if defined(WOLFSSL_X86_64_BUILD) && defined(USE_INTEL_SPEEDUP)
+                __asm__ __volatile__ (
+                    "mov    0x00(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x00(%[h])\n\t"
+                    "mov    0x04(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x04(%[h])\n\t"
+                    "mov    0x08(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x08(%[h])\n\t"
+                    "mov    0x0c(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x0c(%[h])\n\t"
+                    "mov    0x10(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x10(%[h])\n\t"
+                    "mov    0x14(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x14(%[h])\n\t"
+                    "mov    0x18(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x18(%[h])\n\t"
+                    "mov    0x1c(%[d]), %%esi\n\t"
+                    "movbe  %%esi, 0x1c(%[h])\n\t"
+                    :
+                    : [d] "r" (sha256->digest), [h] "r" (hash)
+                    : "memory", "esi"
+                );
+        #else
+                word32* hash32 = (word32*)hash;
+                word32* digest = (word32*)sha256->digest;
+            #if WOLFSSL_GENERAL_ALIGNMENT < 4
+                ALIGN16 word32 buf[WC_SHA256_DIGEST_SIZE / sizeof(word32)];
+
+                if (((size_t)digest & 0x3) != 0) {
+                    if (((size_t)hash32 & 0x3) != 0) {
+                        XMEMCPY(buf, digest, WC_SHA256_DIGEST_SIZE);
+                        hash32 = buf;
+                        digest = buf;
+                    }
+                    else {
+                        XMEMCPY(hash, digest, WC_SHA256_DIGEST_SIZE);
+                        digest = hash32;
+                    }
+                }
+                else if (((size_t)hash32 & 0x3) != 0) {
+                    hash32 = digest;
+                }
+            #endif
+                hash32[0] = ByteReverseWord32(digest[0]);
+                hash32[1] = ByteReverseWord32(digest[1]);
+                hash32[2] = ByteReverseWord32(digest[2]);
+                hash32[3] = ByteReverseWord32(digest[3]);
+                hash32[4] = ByteReverseWord32(digest[4]);
+                hash32[5] = ByteReverseWord32(digest[5]);
+                hash32[6] = ByteReverseWord32(digest[6]);
+                hash32[7] = ByteReverseWord32(digest[7]);
+            #if WOLFSSL_GENERAL_ALIGNMENT < 4
+                if (hash != (byte*)hash32) {
+                    XMEMCPY(hash, hash32, WC_SHA256_DIGEST_SIZE);
+                }
+            #endif
+        #endif /* WOLFSSL_X86_64_BUILD && USE_INTEL_SPEEDUP */
+            }
+            sha256->digest[0] = 0x6A09E667L;
+            sha256->digest[1] = 0xBB67AE85L;
+            sha256->digest[2] = 0x3C6EF372L;
+            sha256->digest[3] = 0xA54FF53AL;
+            sha256->digest[4] = 0x510E527FL;
+            sha256->digest[5] = 0x9B05688CL;
+            sha256->digest[6] = 0x1F83D9ABL;
+            sha256->digest[7] = 0x5BE0CD19L;
+        }
 
+        return ret;
+    }
+#endif /* WOLFSSL_HAVE_LMS && !WOLFSSL_LMS_FULL_HASH */
 #endif /* !WOLFSSL_KCAPI_HASH */
 
+#endif /* XTRANSFORM */
+
 
 #ifdef WOLFSSL_SHA224
 
@@ -1713,10 +1719,6 @@ static int InitSha256(wc_Sha256* sha256)
     {
         int ret = 0;
 
-        if (sha224 == NULL) {
-            return BAD_FUNC_ARG;
-        }
-
         sha224->digest[0] = 0xc1059ed8;
         sha224->digest[1] = 0x367cd507;
         sha224->digest[2] = 0x3070dd17;
@@ -1817,7 +1819,14 @@ static int InitSha256(wc_Sha256* sha256)
     {
         int ret;
 
-        if (sha224 == NULL || (data == NULL && len > 0)) {
+        if (sha224 == NULL) {
+            return BAD_FUNC_ARG;
+        }
+        if (data == NULL && len == 0) {
+            /* valid, but do nothing */
+            return 0;
+        }
+        if (data == NULL) {
             return BAD_FUNC_ARG;
         }
 
@@ -1869,18 +1878,7 @@ static int InitSha256(wc_Sha256* sha256)
             return ret;
 
     #if defined(LITTLE_ENDIAN_ORDER)
-        #if ( defined(CONFIG_IDF_TARGET_ESP32C2) || \
-              defined(CONFIG_IDF_TARGET_ESP8684) || \
-              defined(CONFIG_IDF_TARGET_ESP32C3) || \
-              defined(CONFIG_IDF_TARGET_ESP32C6)    \
-            )  && \
-            defined(WOLFSSL_ESP32_CRYPT) && \
-           (!defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA256) || \
-            !defined(NO_WOLFSSL_ESP32_CRYPT_HASH_SHA224)    \
-           )
-            if (esp_sha_need_byte_reversal(&sha224->ctx))
-        #endif
-        {
+        if (SHA256_REV_BYTES(&sha224->ctx)) {
             ByteReverseWords(sha224->digest,
                              sha224->digest,
                              WC_SHA224_DIGEST_SIZE);

+ 70 - 80
wolfcrypt/src/sha256_asm.S

@@ -92,7 +92,6 @@ Transform_Sha256_SSE2_Sha:
 .p2align	4
 _Transform_Sha256_SSE2_Sha:
 #endif /* __APPLE__ */
-        leaq	32(%rdi), %rdx
         movdqa	L_sse2_sha256_shuf_mask(%rip), %xmm10
         movq	(%rdi), %xmm1
         movq	8(%rdi), %xmm2
@@ -100,10 +99,10 @@ _Transform_Sha256_SSE2_Sha:
         movhpd	24(%rdi), %xmm2
         pshufd	$27, %xmm1, %xmm1
         pshufd	$27, %xmm2, %xmm2
-        movdqu	(%rdx), %xmm3
-        movdqu	16(%rdx), %xmm4
-        movdqu	32(%rdx), %xmm5
-        movdqu	48(%rdx), %xmm6
+        movdqu	(%rsi), %xmm3
+        movdqu	16(%rsi), %xmm4
+        movdqu	32(%rsi), %xmm5
+        movdqu	48(%rsi), %xmm6
         pshufb	%xmm10, %xmm3
         movdqa	%xmm1, %xmm8
         movdqa	%xmm2, %xmm9
@@ -557,7 +556,6 @@ _Transform_Sha256_AVX1:
         pushq	%r14
         pushq	%r15
         subq	$0x40, %rsp
-        leaq	32(%rdi), %rax
         vmovdqa	L_avx1_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx1_sha256_shuf_00BA(%rip), %xmm11
         vmovdqa	L_avx1_sha256_shuf_DC00(%rip), %xmm12
@@ -570,12 +568,12 @@ _Transform_Sha256_AVX1:
         movl	24(%rdi), %r14d
         movl	28(%rdi), %r15d
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rax), %xmm0
-        vmovdqu	16(%rax), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
-        vmovdqu	32(%rax), %xmm2
-        vmovdqu	48(%rax), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         movl	%r9d, %ebx
@@ -2947,8 +2945,7 @@ _Transform_Sha256_AVX1_Len:
         pushq	%r14
         pushq	%r15
         pushq	%rbp
-        movq	%rsi, %rbp
-        movq	%rdx, %rsi
+        movq	%rdx, %rbp
         subq	$0x40, %rsp
         vmovdqa	L_avx1_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx1_sha256_shuf_00BA(%rip), %xmm11
@@ -2964,12 +2961,12 @@ _Transform_Sha256_AVX1_Len:
         # Start of loop processing a block
 L_sha256_len_avx1_start:
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rbp), %xmm0
-        vmovdqu	16(%rbp), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
-        vmovdqu	32(%rbp), %xmm2
-        vmovdqu	48(%rbp), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         movl	%r9d, %ebx
@@ -5311,8 +5308,8 @@ L_sha256_len_avx1_start:
         addl	20(%rdi), %r13d
         addl	24(%rdi), %r14d
         addl	28(%rdi), %r15d
-        addq	$0x40, %rbp
-        subl	$0x40, %esi
+        addq	$0x40, %rsi
+        subl	$0x40, %ebp
         movl	%r8d, (%rdi)
         movl	%r9d, 4(%rdi)
         movl	%r10d, 8(%rdi)
@@ -5414,14 +5411,13 @@ _Transform_Sha256_AVX1_RORX:
         vmovdqa	L_avx1_rorx_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx1_rorx_sha256_shuf_00BA(%rip), %xmm11
         vmovdqa	L_avx1_rorx_sha256_shuf_DC00(%rip), %xmm12
-        leaq	32(%rdi), %rax
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rax), %xmm0
-        vmovdqu	16(%rax), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
-        vmovdqu	32(%rax), %xmm2
-        vmovdqu	48(%rax), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         movl	(%rdi), %r8d
@@ -7759,8 +7755,7 @@ _Transform_Sha256_AVX1_RORX_Len:
         pushq	%r14
         pushq	%r15
         pushq	%rbp
-        movq	%rsi, %rbp
-        movq	%rdx, %rsi
+        movq	%rdx, %rbp
         subq	$0x40, %rsp
         vmovdqa	L_avx1_rorx_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx1_rorx_sha256_shuf_00BA(%rip), %xmm11
@@ -7776,12 +7771,12 @@ _Transform_Sha256_AVX1_RORX_Len:
         # Start of loop processing a block
 L_sha256_len_avx1_len_rorx_start:
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rbp), %xmm0
-        vmovdqu	16(%rbp), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
-        vmovdqu	32(%rbp), %xmm2
-        vmovdqu	48(%rbp), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         # set_w_k_xfer_4: 0
@@ -10082,8 +10077,8 @@ L_sha256_len_avx1_len_rorx_start:
         addl	20(%rdi), %r13d
         addl	24(%rdi), %r14d
         addl	28(%rdi), %r15d
-        addq	$0x40, %rbp
-        subl	$0x40, %esi
+        addq	$0x40, %rsi
+        subl	$0x40, %ebp
         movl	%r8d, (%rdi)
         movl	%r9d, 4(%rdi)
         movl	%r10d, 8(%rdi)
@@ -10152,7 +10147,6 @@ Transform_Sha256_AVX1_Sha:
 .p2align	4
 _Transform_Sha256_AVX1_Sha:
 #endif /* __APPLE__ */
-        leaq	32(%rdi), %rdx
         vmovdqa	L_avx1_sha256_shuf_mask(%rip), %xmm10
         vmovq	(%rdi), %xmm1
         vmovq	8(%rdi), %xmm2
@@ -10160,10 +10154,10 @@ _Transform_Sha256_AVX1_Sha:
         vmovhpd	24(%rdi), %xmm2, %xmm2
         vpshufd	$27, %xmm1, %xmm1
         vpshufd	$27, %xmm2, %xmm2
-        vmovdqu	(%rdx), %xmm3
-        vmovdqu	16(%rdx), %xmm4
-        vmovdqu	32(%rdx), %xmm5
-        vmovdqu	48(%rdx), %xmm6
+        vmovdqu	(%rsi), %xmm3
+        vmovdqu	16(%rsi), %xmm4
+        vmovdqu	32(%rsi), %xmm5
+        vmovdqu	48(%rsi), %xmm6
         vpshufb	%xmm10, %xmm3, %xmm3
         vmovdqa	%xmm1, %xmm8
         vmovdqa	%xmm2, %xmm9
@@ -10581,7 +10575,6 @@ _Transform_Sha256_AVX2:
         pushq	%r14
         pushq	%r15
         subq	$0x200, %rsp
-        leaq	32(%rdi), %rax
         vmovdqa	L_avx2_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx2_sha256_shuf_00BA(%rip), %ymm11
         vmovdqa	L_avx2_sha256_shuf_DC00(%rip), %ymm12
@@ -10594,12 +10587,12 @@ _Transform_Sha256_AVX2:
         movl	24(%rdi), %r14d
         movl	28(%rdi), %r15d
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rax), %xmm0
-        vmovdqu	16(%rax), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
-        vmovdqu	32(%rax), %xmm2
-        vmovdqu	48(%rax), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         movl	%r9d, %ebx
@@ -12971,13 +12964,12 @@ _Transform_Sha256_AVX2_Len:
         pushq	%r14
         pushq	%r15
         pushq	%rbp
-        movq	%rsi, %rbp
-        movq	%rdx, %rsi
+        movq	%rdx, %rbp
         subq	$0x200, %rsp
-        testb	$0x40, %sil
+        testb	$0x40, %bpl
         je	L_sha256_len_avx2_block
-        vmovdqu	(%rbp), %ymm0
-        vmovdqu	32(%rbp), %ymm1
+        vmovdqu	(%rsi), %ymm0
+        vmovdqu	32(%rsi), %ymm1
         vmovups	%ymm0, 32(%rdi)
         vmovups	%ymm1, 64(%rdi)
 #ifndef __APPLE__
@@ -12985,8 +12977,8 @@ _Transform_Sha256_AVX2_Len:
 #else
         call	_Transform_Sha256_AVX2
 #endif /* __APPLE__ */
-        addq	$0x40, %rbp
-        subl	$0x40, %esi
+        addq	$0x40, %rsi
+        subl	$0x40, %ebp
         jz	L_sha256_len_avx2_done
 L_sha256_len_avx2_block:
         vmovdqa	L_avx2_sha256_flip_mask(%rip), %ymm13
@@ -13003,18 +12995,18 @@ L_sha256_len_avx2_block:
         # Start of loop processing two blocks
 L_sha256_len_avx2_start:
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rbp), %xmm0
-        vmovdqu	16(%rbp), %xmm1
-        vmovdqu	64(%rbp), %xmm4
-        vmovdqu	80(%rbp), %xmm5
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
+        vmovdqu	64(%rsi), %xmm4
+        vmovdqu	80(%rsi), %xmm5
         vinserti128	$0x01, %xmm4, %ymm0, %ymm0
         vinserti128	$0x01, %xmm5, %ymm1, %ymm1
         vpshufb	%ymm13, %ymm0, %ymm0
         vpshufb	%ymm13, %ymm1, %ymm1
-        vmovdqu	32(%rbp), %xmm2
-        vmovdqu	48(%rbp), %xmm3
-        vmovdqu	96(%rbp), %xmm6
-        vmovdqu	112(%rbp), %xmm7
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
+        vmovdqu	96(%rsi), %xmm6
+        vmovdqu	112(%rsi), %xmm7
         vinserti128	$0x01, %xmm6, %ymm2, %ymm2
         vinserti128	$0x01, %xmm7, %ymm3, %ymm3
         vpshufb	%ymm13, %ymm2, %ymm2
@@ -17057,8 +17049,8 @@ L_sha256_len_avx2_start:
         addl	20(%rdi), %r13d
         addl	24(%rdi), %r14d
         addl	28(%rdi), %r15d
-        addq	$0x80, %rbp
-        subl	$0x80, %esi
+        addq	$0x80, %rsi
+        subl	$0x80, %ebp
         movl	%r8d, (%rdi)
         movl	%r9d, 4(%rdi)
         movl	%r10d, 8(%rdi)
@@ -17177,21 +17169,20 @@ _Transform_Sha256_AVX2_RORX:
         pushq	%r14
         pushq	%r15
         subq	$0x200, %rsp
-        leaq	32(%rdi), %rax
         vmovdqa	L_avx2_rorx_sha256_flip_mask(%rip), %xmm13
         vmovdqa	L_avx2_rorx_sha256_shuf_00BA(%rip), %ymm11
         vmovdqa	L_avx2_rorx_sha256_shuf_DC00(%rip), %ymm12
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rax), %xmm0
-        vmovdqu	16(%rax), %xmm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
         vpshufb	%xmm13, %xmm0, %xmm0
         vpshufb	%xmm13, %xmm1, %xmm1
         vpaddd	0+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
         vpaddd	32+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm5
         vmovdqu	%ymm4, (%rsp)
         vmovdqu	%ymm5, 32(%rsp)
-        vmovdqu	32(%rax), %xmm2
-        vmovdqu	48(%rax), %xmm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
         vpshufb	%xmm13, %xmm2, %xmm2
         vpshufb	%xmm13, %xmm3, %xmm3
         vpaddd	64+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
@@ -19542,13 +19533,12 @@ _Transform_Sha256_AVX2_RORX_Len:
         pushq	%r14
         pushq	%r15
         pushq	%rbp
-        movq	%rsi, %rbp
-        movq	%rdx, %rsi
+        movq	%rdx, %rbp
         subq	$0x200, %rsp
-        testb	$0x40, %sil
+        testb	$0x40, %bpl
         je	L_sha256_len_avx2_rorx_block
-        vmovdqu	(%rbp), %ymm0
-        vmovdqu	32(%rbp), %ymm1
+        vmovdqu	(%rsi), %ymm0
+        vmovdqu	32(%rsi), %ymm1
         vmovups	%ymm0, 32(%rdi)
         vmovups	%ymm1, 64(%rdi)
 #ifndef __APPLE__
@@ -19556,8 +19546,8 @@ _Transform_Sha256_AVX2_RORX_Len:
 #else
         call	_Transform_Sha256_AVX2_RORX
 #endif /* __APPLE__ */
-        addq	$0x40, %rbp
-        subl	$0x40, %esi
+        addq	$0x40, %rsi
+        subl	$0x40, %ebp
         jz	L_sha256_len_avx2_rorx_done
 L_sha256_len_avx2_rorx_block:
         vmovdqa	L_avx2_rorx_sha256_flip_mask(%rip), %ymm13
@@ -19574,20 +19564,20 @@ L_sha256_len_avx2_rorx_block:
         # Start of loop processing two blocks
 L_sha256_len_avx2_rorx_start:
         # X0, X1, X2, X3 = W[0..15]
-        vmovdqu	(%rbp), %xmm0
-        vmovdqu	16(%rbp), %xmm1
-        vinserti128	$0x01, 64(%rbp), %ymm0, %ymm0
-        vinserti128	$0x01, 80(%rbp), %ymm1, %ymm1
+        vmovdqu	(%rsi), %xmm0
+        vmovdqu	16(%rsi), %xmm1
+        vinserti128	$0x01, 64(%rsi), %ymm0, %ymm0
+        vinserti128	$0x01, 80(%rsi), %ymm1, %ymm1
         vpshufb	%ymm13, %ymm0, %ymm0
         vpshufb	%ymm13, %ymm1, %ymm1
         vpaddd	0+L_avx2_rorx_sha256_k(%rip), %ymm0, %ymm4
         vpaddd	32+L_avx2_rorx_sha256_k(%rip), %ymm1, %ymm5
         vmovdqu	%ymm4, (%rsp)
         vmovdqu	%ymm5, 32(%rsp)
-        vmovdqu	32(%rbp), %xmm2
-        vmovdqu	48(%rbp), %xmm3
-        vinserti128	$0x01, 96(%rbp), %ymm2, %ymm2
-        vinserti128	$0x01, 112(%rbp), %ymm3, %ymm3
+        vmovdqu	32(%rsi), %xmm2
+        vmovdqu	48(%rsi), %xmm3
+        vinserti128	$0x01, 96(%rsi), %ymm2, %ymm2
+        vinserti128	$0x01, 112(%rsi), %ymm3, %ymm3
         vpshufb	%ymm13, %ymm2, %ymm2
         vpshufb	%ymm13, %ymm3, %ymm3
         vpaddd	64+L_avx2_rorx_sha256_k(%rip), %ymm2, %ymm4
@@ -23449,7 +23439,7 @@ L_sha256_len_avx2_rorx_start:
         addl	%edx, %r8d
         xorl	%r10d, %eax
         addl	%eax, %r8d
-        addq	$0x80, %rbp
+        addq	$0x80, %rsi
         addl	(%rdi), %r8d
         addl	4(%rdi), %r9d
         addl	8(%rdi), %r10d
@@ -23458,7 +23448,7 @@ L_sha256_len_avx2_rorx_start:
         addl	20(%rdi), %r13d
         addl	24(%rdi), %r14d
         addl	28(%rdi), %r15d
-        subl	$0x80, %esi
+        subl	$0x80, %ebp
         movl	%r8d, (%rdi)
         movl	%r9d, 4(%rdi)
         movl	%r10d, 8(%rdi)

+ 1 - 1
wolfcrypt/src/wc_lms.c

@@ -1,6 +1,6 @@
 /* wc_lms.c
  *
- * Copyright (C) 2006-2023 wolfSSL Inc.
+ * Copyright (C) 2006-2024 wolfSSL Inc.
  *
  * This file is part of wolfSSL.
  *

+ 26 - 0
wolfcrypt/src/wc_lms_impl.c

@@ -0,0 +1,26 @@
+/* wc_lms_impl.c
+ *
+ * Copyright (C) 2006-2024 wolfSSL Inc.
+ *
+ * This file is part of wolfSSL.
+ *
+ * wolfSSL is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * wolfSSL is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA
+ */
+
+#include <wolfssl/wolfcrypt/settings.h>
+
+#ifdef WOLFSSL_HAVE_LMS
+    #error "Contact wolfSSL to get the implementation of this file"
+#endif

+ 60 - 21
wolfcrypt/test/test.c

@@ -325,6 +325,8 @@ const byte const_byte_array[] = "A+Gd\0\0\0";
     #include <wolfssl/wolfcrypt/lms.h>
 #ifdef HAVE_LIBLMS
     #include <wolfssl/wolfcrypt/ext_lms.h>
+#else
+    #include <wolfssl/wolfcrypt/wc_lms.h>
 #endif
 #endif
 #ifdef WOLFCRYPT_HAVE_ECCSI
@@ -625,12 +627,14 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t scrypt_test(void);
     #endif
 #endif
 #if defined(WOLFSSL_HAVE_LMS)
+    #if !defined(WOLFSSL_SMALL_STACK)
+        #if defined(WOLFSSL_WC_LMS) && (LMS_MAX_HEIGHT >= 10)
+    WOLFSSL_TEST_SUBROUTINE wc_test_ret_t  lms_test_verify_only(void);
+        #endif
+    #endif
     #if !defined(WOLFSSL_LMS_VERIFY_ONLY)
     WOLFSSL_TEST_SUBROUTINE wc_test_ret_t  lms_test(void);
     #endif
-    #if defined(WOLFSSL_LMS_VERIFY_ONLY) && !defined(WOLFSSL_SMALL_STACK)
-    WOLFSSL_TEST_SUBROUTINE wc_test_ret_t  lms_test_verify_only(void);
-    #endif
 #endif
 #ifdef WOLFCRYPT_HAVE_ECCSI
     WOLFSSL_TEST_SUBROUTINE wc_test_ret_t  eccsi_test(void);
@@ -1753,15 +1757,17 @@ options: [-s max_relative_stack_bytes] [-m max_relative_heap_memory_bytes]\n\
 #endif /* if defined(WOLFSSL_HAVE_XMSS) */
 
 #if defined(WOLFSSL_HAVE_LMS)
-    #if !defined(WOLFSSL_LMS_VERIFY_ONLY)
-    if ( (ret = lms_test()) != 0)
-        TEST_FAIL("LMS      test failed!\n", ret);
+    #if !defined(WOLFSSL_SMALL_STACK)
+        #if defined(WOLFSSL_WC_LMS) && (LMS_MAX_HEIGHT >= 10)
+    if ( (ret = lms_test_verify_only()) != 0)
+        TEST_FAIL("LMS Vfy  test failed!\n", ret);
     else
-        TEST_PASS("LMS      test passed!\n");
+        TEST_PASS("LMS Vfy  test passed!\n");
+        #endif
     #endif
 
-    #if defined(WOLFSSL_LMS_VERIFY_ONLY) && !defined(WOLFSSL_SMALL_STACK)
-    if ( (ret = lms_test_verify_only()) != 0)
+    #if !defined(WOLFSSL_LMS_VERIFY_ONLY)
+    if ( (ret = lms_test()) != 0)
         TEST_FAIL("LMS      test failed!\n", ret);
     else
         TEST_PASS("LMS      test passed!\n");
@@ -3392,6 +3398,35 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t sha256_test(void)
 #undef LARGE_HASH_TEST_INPUT_SZ
 #endif /* NO_LARGE_HASH_TEST */
 
+#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH)
+    unsigned char data_hb[WC_SHA256_BLOCK_SIZE] = {
+        0x61, 0x62, 0x63, 0x80, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+        0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x18
+    };
+
+    ret = wc_Sha256HashBlock(&sha, data_hb, hash);
+    if (ret != 0) {
+        ERROR_OUT(WC_TEST_RET_ENC_EC(ret), exit);
+    }
+    if (XMEMCMP(hash, b.output, WC_SHA256_DIGEST_SIZE) != 0) {
+{
+    for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++)
+        fprintf(stderr, " %02x", hash[ii]);
+    fprintf(stderr, "\n");
+    for (int ii = 0; ii < WC_SHA256_DIGEST_SIZE; ii++)
+        fprintf(stderr, " %02x", b.output[ii]);
+    fprintf(stderr, "\n");
+}
+        ERROR_OUT(WC_TEST_RET_ENC_NC, exit);
+    }
+#endif
+
 exit:
 
 #if !defined(NO_LARGE_HASH_TEST) && defined(WOLFSSL_SMALL_STACK)
@@ -37793,7 +37828,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test(void)
     if (ret != 0) { return WC_TEST_RET_ENC_EC(ret); }
 
     if (sigSz != WC_TEST_LMS_SIG_LEN) {
-        printf("error: got %d, expected %d\n", sigSz, WC_TEST_LMS_SIG_LEN);
+        printf("error: got %u, expected %d\n", sigSz, WC_TEST_LMS_SIG_LEN);
         return WC_TEST_RET_ENC_EC(sigSz);
     }
 
@@ -37827,7 +37862,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test(void)
 
             ret2 = wc_LmsKey_Verify(&verifyKey, sig, sigSz, (byte *) msg,
                                      msgSz);
-            if (ret2 != -1) {
+            if ((ret2 != -1) && (ret2 != SIG_VERIFY_E)) {
                 /* Verify passed when it should have failed. */
                 return WC_TEST_RET_ENC_I(j);
             }
@@ -37848,13 +37883,17 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test(void)
 
     wc_FreeRng(&rng);
 
+#if defined(WOLFSSL_SMALL_STACK) && !defined(WOLFSSL_NO_MALLOC)
+    XFREE(sig, HEAP_HINT, DYNAMIC_TYPE_TMP_BUFFER);
+#endif
+
     return ret;
 }
 
 #endif /* if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_VERIFY_ONLY) */
 
-#if defined(WOLFSSL_HAVE_LMS) && defined(WOLFSSL_LMS_VERIFY_ONLY) && \
-    !defined(WOLFSSL_SMALL_STACK)
+#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_SMALL_STACK)
+#if defined(WOLFSSL_WC_LMS) && (LMS_MAX_HEIGHT >= 10)
 
 /* A simple LMS verify only test.
  *
@@ -37868,7 +37907,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test(void)
  * */
 
 /* "wolfSSL LMS example message!" without null terminator. */
-static const byte lms_msg[28] =
+static byte lms_msg[28] =
 {
     0x77,0x6F,0x6C,0x66,0x53,0x53,0x4C,0x20,
     0x4C,0x4D,0x53,0x20,0x65,0x78,0x61,0x6D,
@@ -37890,7 +37929,7 @@ static const byte lms_L1H10W8_pub[HSS_MAX_PUBLIC_KEY_LEN] =
 
 #define LMS_L1H10W8_SIGLEN (1456)
 
-static const byte lms_L1H10W8_sig[LMS_L1H10W8_SIGLEN] =
+static byte lms_L1H10W8_sig[LMS_L1H10W8_SIGLEN] =
 {
     0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x01,
     0x00,0x00,0x00,0x04,0x18,0x70,0x09,0x2E,
@@ -38114,7 +38153,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test_verify_only(void)
     if (ret != 0) { return WC_TEST_RET_ENC_EC(ret); }
 
     if (pubLen != HSS_MAX_PUBLIC_KEY_LEN) {
-        printf("error: got %d, expected %d\n", pubLen, HSS_MAX_PUBLIC_KEY_LEN);
+        printf("error: got %u, expected %d\n", pubLen, HSS_MAX_PUBLIC_KEY_LEN);
         return WC_TEST_RET_ENC_EC(pubLen);
     }
 
@@ -38122,7 +38161,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test_verify_only(void)
     if (ret != 0) { return WC_TEST_RET_ENC_EC(ret); }
 
     if (sigSz != LMS_L1H10W8_SIGLEN) {
-        printf("error: got %d, expected %d\n", sigSz, LMS_L1H10W8_SIGLEN);
+        printf("error: got %u, expected %d\n", sigSz, LMS_L1H10W8_SIGLEN);
         return WC_TEST_RET_ENC_EC(sigSz);
     }
 
@@ -38137,7 +38176,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test_verify_only(void)
     lms_msg[msgSz / 2] ^= 1;
     ret2 = wc_LmsKey_Verify(&verifyKey, lms_L1H10W8_sig, LMS_L1H10W8_SIGLEN,
                             (byte *) lms_msg, msgSz);
-    if (ret2 != -1) {
+    if ((ret2 != -1) && (ret2 != SIG_VERIFY_E)) {
         printf("error: wc_LmsKey_Verify returned %d, expected -1\n", ret2);
         return WC_TEST_RET_ENC_EC(ret);
     }
@@ -38159,7 +38198,7 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test_verify_only(void)
         ret2 = wc_LmsKey_Verify(&verifyKey, lms_L1H10W8_sig,
                                 LMS_L1H10W8_SIGLEN,
                                 (byte *) lms_msg, msgSz);
-        if (ret2 != -1) {
+        if ((ret2 != -1) && (ret2 != SIG_VERIFY_E)) {
             /* Verify passed when it should have failed. */
             return WC_TEST_RET_ENC_I(j);
         }
@@ -38172,8 +38211,8 @@ WOLFSSL_TEST_SUBROUTINE wc_test_ret_t lms_test_verify_only(void)
     return ret;
 }
 
-#endif /* if defined(WOLFSSL_HAVE_LMS) && defined(WOLFSSL_LMS_VERIFY_ONLY) &&
-        *    !defined(WOLFSSL_SMALL_STACK) */
+#endif
+#endif /* if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_SMALL_STACK) */
 
 static const int fiducial3 = WC_TEST_RET_LN; /* source code reference point --
                                               * see print_fiducials() below.

+ 3 - 2
wolfssl/wolfcrypt/settings.h

@@ -3301,8 +3301,9 @@ extern void uITRON4_free(void *p) ;
     #define NO_SESSION_CACHE_REF
 #endif
 
-/* (D)TLS v1.3 requires 64-bit number wrappers */
-#if defined(WOLFSSL_TLS13) || defined(WOLFSSL_DTLS_DROP_STATS)
+/* (D)TLS v1.3 requires 64-bit number wrappers as does XMSS and LMS. */
+#if defined(WOLFSSL_TLS13) || defined(WOLFSSL_DTLS_DROP_STATS) || \
+    defined(WOLFSSL_WC_XMSS) || defined(WOLFSSL_WC_LMS)
     #undef WOLFSSL_W64_WRAPPER
     #define WOLFSSL_W64_WRAPPER
 #endif

+ 4 - 0
wolfssl/wolfcrypt/sha256.h

@@ -249,6 +249,10 @@ WOLFSSL_API void wc_Sha256Free(wc_Sha256* sha256);
 #if defined(OPENSSL_EXTRA) || defined(HAVE_CURL)
 WOLFSSL_API int wc_Sha256Transform(wc_Sha256* sha, const unsigned char* data);
 #endif
+#if defined(WOLFSSL_HAVE_LMS) && !defined(WOLFSSL_LMS_FULL_HASH)
+WOLFSSL_API int wc_Sha256HashBlock(wc_Sha256* sha, const unsigned char* data,
+    unsigned char* hash);
+#endif
 #if defined(WOLFSSL_HASH_KEEP)
 WOLFSSL_API int wc_Sha256_Grow(wc_Sha256* sha256, const byte* in, int inSz);
 #endif

+ 1 - 0
wolfssl/wolfcrypt/types.h

@@ -1052,6 +1052,7 @@ typedef struct w64wrapper {
         DYNAMIC_TYPE_SPHINCS      = 98,
         DYNAMIC_TYPE_SM4_BUFFER   = 99,
         DYNAMIC_TYPE_DEBUG_TAG    = 100,
+        DYNAMIC_TYPE_LMS          = 101,
         DYNAMIC_TYPE_SNIFFER_SERVER      = 1000,
         DYNAMIC_TYPE_SNIFFER_SESSION     = 1001,
         DYNAMIC_TYPE_SNIFFER_PB          = 1002,

+ 1 - 1
wolfssl/wolfcrypt/wc_lms.h

@@ -1,6 +1,6 @@
 /* wc_lms.h
  *
- * Copyright (C) 2006-2023 wolfSSL Inc.
+ * Copyright (C) 2006-2024 wolfSSL Inc.
  *
  * This file is part of wolfSSL.
  *