Browse Source

Fix for DES3 with STM32 and `STM32_CRYPTO_AES_ONLY` (broken in #5223). Add U5 PKA support and benchmarks. Fix MD5 with `OPENSSL_EXTRA` and `HAVE_MD5_CUST_API`.

David Garske 2 years ago
parent
commit
92fcea39db

+ 45 - 0
IDE/STM32Cube/STM32_Benchmarks.md

@@ -580,6 +580,51 @@ CPU: Cortex-M33 at 160 MHz
 IDE: STM32CubeIDE
 RTOS: FreeRTOS
 
+### STM32U585 (STM Symmetric AES/SHA acceleration, STM PKA w/Fast Math)
+
+This test uses `WOLFSSL_SMALL_STACK_CACHE`, which slightly improves the DRBG RNG performance.
+
+Only the ECC sign and verify are currently being accelerated by PKA.
+
+```
+------------------------------------------------------------------------------
+ wolfSSL version 5.3.1
+------------------------------------------------------------------------------
+Running wolfCrypt Benchmarks...
+wolfCrypt Benchmark (block bytes 1024, min 1.0 sec each)
+RNG                575 KB took 1.039 seconds,  553.417 KB/s
+AES-128-CBC-enc      6 MB took 1.000 seconds,    6.274 MB/s
+AES-128-CBC-dec      6 MB took 1.000 seconds,    6.128 MB/s
+AES-256-CBC-enc      6 MB took 1.000 seconds,    6.274 MB/s
+AES-256-CBC-dec      6 MB took 1.000 seconds,    6.152 MB/s
+AES-128-GCM-enc      6 MB took 1.000 seconds,    5.640 MB/s
+AES-128-GCM-dec      6 MB took 1.000 seconds,    5.566 MB/s
+AES-256-GCM-enc      6 MB took 1.000 seconds,    5.615 MB/s
+AES-256-GCM-dec      6 MB took 1.000 seconds,    5.542 MB/s
+GMAC Small          11 MB took 1.000 seconds,   11.499 MB/s
+CHACHA               4 MB took 1.000 seconds,    3.882 MB/s
+CHA-POLY             2 MB took 1.008 seconds,    2.470 MB/s
+3DES               200 KB took 1.071 seconds,  186.741 KB/s
+MD5                  6 MB took 1.000 seconds,    6.299 MB/s
+POLY1305            10 MB took 1.000 seconds,   10.449 MB/s
+SHA                  6 MB took 1.000 seconds,    6.299 MB/s
+SHA-256              6 MB took 1.000 seconds,    6.250 MB/s
+HMAC-MD5             6 MB took 1.000 seconds,    6.177 MB/s
+HMAC-SHA             6 MB took 1.000 seconds,    6.177 MB/s
+HMAC-SHA256          6 MB took 1.000 seconds,    6.104 MB/s
+RSA     2048 public         28 ops took 1.031 sec, avg 36.821 ms, 27.158 ops/sec
+RSA     2048 private         2 ops took 4.310 sec, avg 2155.000 ms, 0.464 ops/sec
+DH      2048 key gen         3 ops took 1.197 sec, avg 399.000 ms, 2.506 ops/sec
+DH      2048 agree           2 ops took 1.525 sec, avg 762.500 ms, 1.311 ops/sec
+ECC   [      SECP256R1]   256 key gen        50 ops took 1.019 sec, avg 20.380 ms, 49.068 ops/sec
+ECDHE [      SECP256R1]   256 agree          52 ops took 1.008 sec, avg 19.385 ms, 51.587 ops/sec
+ECDSA [      SECP256R1]   256 sign           56 ops took 1.000 sec, avg 17.857 ms, 56.000 ops/sec
+ECDSA [      SECP256R1]   256 verify         56 ops took 1.008 sec, avg 18.000 ms, 55.556 ops/sec
+Benchmark complete
+Benchmark Test: Return code 0
+```
+
+
 ### STM32U585 (STM Symmetric AES/SHA acceleration, SP Math ASM Cortex M)
 
 ```

+ 7 - 1
IDE/STM32Cube/default_conf.ftl

@@ -135,12 +135,18 @@ extern ${variable.value} ${variable.name};
     #define HAL_CONSOLE_UART huart1
     #define WOLFSSL_STM32U5
     #define STM32_HAL_V2
+    #ifdef STM32U585xx
+        #undef  NO_STM32_HASH
+        #undef  NO_STM32_CRYPTO
+        #define WOLFSSL_STM32_PKA
+    #endif
 #else
     #warning Please define a hardware platform!
     /* This means there is not a pre-defined platform for your board/CPU */
     /* You need to define a CPU type, HW crypto and debug UART */
     /* CPU Type: WOLFSSL_STM32F1, WOLFSSL_STM32F2, WOLFSSL_STM32F4, 
-        WOLFSSL_STM32F7, WOLFSSL_STM32H7, WOLFSSL_STM32L4 and WOLFSSL_STM32L5 */
+        WOLFSSL_STM32F7, WOLFSSL_STM32H7, WOLFSSL_STM32L4, WOLFSSL_STM32L5,
+        WOLFSSL_STM32G0, WOLFSSL_STM32WB and WOLFSSL_STM32U5 */
     #define WOLFSSL_STM32F4
 
     /* Debug UART used for printf */

+ 2 - 2
wolfcrypt/src/des3.c

@@ -1824,7 +1824,7 @@ void wc_Des_SetIV(Des* des, const byte* iv)
 {
     if (des && iv) {
         XMEMCPY(des->reg, iv, DES_BLOCK_SIZE);
-    #ifdef STM32_HAL_V2
+    #if defined(STM32_CRYPTO) && !defined(STM32_CRYPTO_AES_ONLY) && defined(STM32_HAL_V2)
         ByteReverseWords(des->reg, des->reg, DES_BLOCK_SIZE);
     #endif
     }
@@ -1839,7 +1839,7 @@ int wc_Des3_SetIV(Des3* des, const byte* iv)
     }
     if (iv) {
         XMEMCPY(des->reg, iv, DES_BLOCK_SIZE);
-    #ifdef STM32_HAL_V2
+    #if defined(STM32_CRYPTO) && !defined(STM32_CRYPTO_AES_ONLY) && defined(STM32_HAL_V2)
         ByteReverseWords(des->reg, des->reg, DES_BLOCK_SIZE);
     #endif
     }

+ 7 - 1
wolfcrypt/src/md5.c

@@ -551,6 +551,7 @@ int wc_Md5Copy(wc_Md5* src, wc_Md5* dst)
 
     return ret;
 }
+
 #ifdef OPENSSL_EXTRA
 /* Apply MD5 transformation to the data                   */
 /* @param md5  a pointer to wc_MD5 structure              */
@@ -562,9 +563,14 @@ int wc_Md5Transform(wc_Md5* md5, const byte* data)
     if (md5 == NULL || data == NULL) {
         return BAD_FUNC_ARG;
     }
+#ifndef HAVE_MD5_CUST_API
     return Transform(md5, data);
-}
+#else
+    return NOT_COMPILED_IN;
 #endif
+}
+#endif /* OPENSSL_EXTRA */
+
 #ifdef WOLFSSL_HASH_FLAGS
 int wc_Md5SetFlags(wc_Md5* md5, word32 flags)
 {

+ 71 - 15
wolfcrypt/src/port/st/stm32.c

@@ -205,7 +205,7 @@ static int wc_Stm32_Hash_WaitDone(STM32_HASH_Context* stmCtx)
     /* wait until hash digest is complete */
     while ((HASH->SR & HASH_SR_BUSY) &&
         #ifdef HASH_IMR_DCIE
-            (HASH->SR & HASH_SR_DCIS) == 0 && 
+            (HASH->SR & HASH_SR_DCIS) == 0 &&
         #endif
         ++timeout < STM32_HASH_TIMEOUT) {
     };
@@ -477,18 +477,29 @@ int wc_Stm32_Aes_Init(Aes* aes, CRYP_InitTypeDef* cryptInit,
 #if defined(WOLFSSL_STM32L5)
 #include <stm32l5xx_hal_conf.h>
 #include <stm32l5xx_hal_pka.h>
-#else
+#elif defined(WOLFSSL_STM32U5)
+#include <stm32u5xx_hal_conf.h>
+#include <stm32u5xx_hal_pka.h>
+#elif defined(WOLFSSL_STM32WB)
 #include <stm32wbxx_hal_conf.h>
 #include <stm32wbxx_hal_pka.h>
+#else
+#error Please add the hal_pk.h include
 #endif
 extern PKA_HandleTypeDef hpka;
 
+#if !defined(WOLFSSL_STM32_PKA_V2) && defined(PKA_ECC_SCALAR_MUL_IN_B_COEFF)
+/* PKA hardware like in U5 added coefB and primeOrder */
+#define WOLFSSL_STM32_PKA_V2
+#endif
+
 /* Reverse array in memory (in place) */
 #ifdef HAVE_ECC
 #include <wolfssl/wolfcrypt/ecc.h>
 
 /* convert from mp_int to STM32 PKA HAL integer, as array of bytes of size sz.
- * if mp_int has less bytes than sz, add zero bytes at most significant byte positions.
+ * if mp_int has less bytes than sz, add zero bytes at most significant byte
+ * positions.
  * This is when for example modulus is 32 bytes (P-256 curve)
  * and mp_int has only 31 bytes, we add leading zeros
  * so that result array has 32 bytes, same as modulus (sz).
@@ -523,7 +534,8 @@ static int stm32_get_from_mp_int(uint8_t *dst, const mp_int *a, int sz)
     return res;
 }
 
-/* ECC specs in lsbyte at lowest address format for direct use by STM32_PKA PKHA driver functions */
+/* ECC specs in lsbyte at lowest address format for direct use by
+ * STM32_PKA PKHA driver functions */
 #if defined(HAVE_ECC192) || defined(HAVE_ALL_CURVES)
 #define ECC192
 #endif
@@ -555,6 +567,11 @@ static const uint8_t stm32_ecc192_coef[ECC192_KEYSIZE] = {
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
 };
+static const uint8_t stm32_ecc192_coefB[ECC192_KEYSIZE] = {
+    0x64, 0x21, 0x05, 0x19, 0xe5, 0x9c, 0x80, 0xe7,
+    0x0f, 0xa7, 0xe9, 0xab, 0x72, 0x24, 0x30, 0x49,
+    0xfe, 0xb8, 0xde, 0xec, 0xc1, 0x46, 0xb9, 0xb1
+};
 static const uint8_t stm32_ecc192_pointX[ECC192_KEYSIZE] =  {
     0x18, 0x8D, 0xA8, 0x0E,  0xB0, 0x30, 0x90, 0xF6,
     0x7C, 0xBF, 0x20, 0xEB,  0x43, 0xA1, 0x88, 0x00,
@@ -588,6 +605,12 @@ static const uint8_t stm32_ecc224_coef[ECC224_KEYSIZE] = {
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x03
 };
+static const uint8_t stm32_ecc224_coefB[ECC224_KEYSIZE] = {
+    0xb4, 0x05, 0x0a, 0x85, 0x0c, 0x04, 0xb3, 0xab,
+    0xf5, 0x41, 0x32, 0x56, 0x50, 0x44, 0xb0, 0xb7,
+    0xd7, 0xbf, 0xd8, 0xba, 0x27, 0x0b, 0x39, 0x43,
+    0x23, 0x55, 0xff, 0xb4
+};
 static const uint8_t stm32_ecc224_pointX[ECC224_KEYSIZE] =  {
     0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F,
     0x32, 0x13, 0x90, 0xB9, 0x4A, 0x03, 0xC1, 0xD3,
@@ -601,9 +624,9 @@ static const uint8_t stm32_ecc224_pointY[ECC224_KEYSIZE] = {
     0x85, 0x00, 0x7E, 0x34
 };
 static const uint8_t stm32_ecc224_order[ECC224_KEYSIZE] = {
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 
-    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x16, 0xA2, 
-    0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45, 
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x16, 0xA2,
+    0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45,
     0x5C, 0x5C, 0x2A, 0x3D
 };
 #endif /* ECC224 */
@@ -624,6 +647,12 @@ static const uint8_t stm32_ecc256_coef[ECC256_KEYSIZE] = {
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
 };
+static const uint8_t stm32_ecc256_coefB[ECC256_KEYSIZE] = {
+    0x5a, 0xc6, 0x35, 0xd8, 0xaa, 0x3a, 0x93, 0xe7,
+    0xb3, 0xeb, 0xbd, 0x55, 0x76, 0x98, 0x86, 0xbc,
+    0x65, 0x1d, 0x06, 0xb0, 0xcc, 0x53, 0xb0, 0xf6,
+    0x3b, 0xce, 0x3c, 0x3e, 0x27, 0xd2, 0x60, 0x4b
+};
 static const uint8_t stm32_ecc256_pointX[ECC256_KEYSIZE] = {
     0x6b, 0x17, 0xd1, 0xf2, 0xe1, 0x2c, 0x42, 0x47,
     0xf8, 0xbc, 0xe6, 0xe5, 0x63, 0xa4, 0x40, 0xf2,
@@ -664,6 +693,14 @@ static const uint8_t stm32_ecc384_coef[ECC384_KEYSIZE] = {
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03
 };
+static const uint8_t stm32_ecc384_coefB[ECC384_KEYSIZE] = {
+    0xb3, 0x31, 0x2f, 0xa7, 0xe2, 0x3e, 0xe7, 0xe4,
+    0x98, 0x8e, 0x05, 0x6b, 0xe3, 0xf8, 0x2d, 0x19,
+    0x18, 0x1d, 0x9c, 0x6e, 0xfe, 0x81, 0x41, 0x12,
+    0x03, 0x14, 0x08, 0x8f, 0x50, 0x13, 0x87, 0x5a,
+    0xc6, 0x56, 0x39, 0x8d, 0x8a, 0x2e, 0xd1, 0x9d,
+    0x2a, 0x85, 0xc8, 0xed, 0xd3, 0xec, 0x2a, 0xef
+};
 static const uint8_t stm32_ecc384_pointX[ECC384_KEYSIZE] =  {
     0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37,
     0x8E, 0xB1, 0xC7, 0x1E, 0xF3, 0x20, 0xAD, 0x74,
@@ -691,13 +728,15 @@ static const uint8_t stm32_ecc384_order[ECC384_KEYSIZE] = {
 #endif /* ECC384 */
 
 static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef,
-    const uint32_t **coef_sign, const uint8_t **GenPointX, const uint8_t **GenPointY,
-    const uint8_t **order, int size)
+    const uint8_t **coefB, const uint32_t **coef_sign,
+    const uint8_t **GenPointX, const uint8_t **GenPointY, const uint8_t **order,
+    int size)
 {
     switch(size) {
     case 32:
         *prime = stm32_ecc256_prime;
         *coef = stm32_ecc256_coef;
+        if (coefB) *coefB = stm32_ecc256_coefB;
         *GenPointX = stm32_ecc256_pointX;
         *GenPointY = stm32_ecc256_pointY;
         *coef_sign = &stm32_ecc256_coef_sign;
@@ -707,6 +746,7 @@ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef,
     case 28:
         *prime = stm32_ecc224_prime;
         *coef = stm32_ecc224_coef;
+        if (coefB) *coefB = stm32_ecc224_coefB;
         *GenPointX = stm32_ecc224_pointX;
         *GenPointY = stm32_ecc224_pointY;
         *coef_sign = &stm32_ecc224_coef;
@@ -717,6 +757,7 @@ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef,
     case 24:
         *prime = stm32_ecc192_prime;
         *coef = stm32_ecc192_coef;
+        if (coefB) *coefB = stm32_ecc192_coefB;
         *GenPointX = stm32_ecc192_pointX;
         *GenPointY = stm32_ecc192_pointY;
         *coef_sign = &stm32_ecc192_coef;
@@ -727,6 +768,7 @@ static int stm32_get_ecc_specs(const uint8_t **prime, const uint8_t **coef,
     case 48:
         *prime = stm32_ecc384_prime;
         *coef = stm32_ecc384_coef;
+        if (coefB) *coefB = stm32_ecc384_coefB;
         *GenPointX = stm32_ecc384_pointX;
         *GenPointY = stm32_ecc384_pointY;
         *coef_sign = &stm32_ecc384_coef;
@@ -765,7 +807,7 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a,
     uint8_t kbin[STM32_MAX_ECC_SIZE];
     uint8_t PtXbin[STM32_MAX_ECC_SIZE];
     uint8_t PtYbin[STM32_MAX_ECC_SIZE];
-    const uint8_t *prime, *coef, *gen_x, *gen_y;
+    const uint8_t *prime, *coef, *coefB, *gen_x, *gen_y, *order;
     const uint32_t *coef_sign;
     (void)a;
     (void)heap;
@@ -792,7 +834,8 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a,
 
     size = (uint8_t)szModulus;
     /* find STM32_PKA friendly parameters for the selected curve */
-    if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, NULL, size)) {
+    if (0 != stm32_get_ecc_specs(&prime, &coef, &coefB, &coef_sign,
+            &gen_x, &gen_y, &order, size)) {
         return ECC_BAD_ARG_E;
     }
 
@@ -804,6 +847,13 @@ int wc_ecc_mulmod_ex(const mp_int *k, ecc_point *G, ecc_point *R, mp_int* a,
     pka_mul.pointY = Gybin;
     pka_mul.scalarMulSize = size;
     pka_mul.scalarMul = kbin;
+#ifdef WOLFSSL_STM32_PKA_V2
+    pka_mul.coefB = coefB;
+    pka_mul.primeOrder = order;
+#else
+    (void)order;
+    (void)coefB;
+#endif
 
     status = HAL_PKA_ECCMul(&hpka, &pka_mul, HAL_MAX_DELAY);
     if (status != HAL_OK) {
@@ -887,11 +937,11 @@ int stm32_ecc_verify_hash_ex(mp_int *r, mp_int *s, const byte* hash,
 
     size = (uint8_t)szModulus;
     /* find parameters for the selected curve */
-    if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) {
+    if (0 != stm32_get_ecc_specs(&prime, &coef, NULL, &coef_sign,
+            &gen_x, &gen_y, &order, size)) {
         return ECC_BAD_ARG_E;
     }
 
-
     pka_ecc.primeOrderSize =  size;
     pka_ecc.modulusSize =     size;
     pka_ecc.coefSign =        *coef_sign;
@@ -933,7 +983,7 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng,
     uint8_t Rbin[STM32_MAX_ECC_SIZE];
     uint8_t Sbin[STM32_MAX_ECC_SIZE];
     uint8_t Hashbin[STM32_MAX_ECC_SIZE];
-    const uint8_t *prime, *coef, *gen_x, *gen_y, *order;
+    const uint8_t *prime, *coef, *coefB, *gen_x, *gen_y, *order;
     const uint32_t *coef_sign;
     XMEMSET(&pka_ecc, 0x00, sizeof(PKA_ECDSASignInTypeDef));
     XMEMSET(&pka_ecc_out, 0x00, sizeof(PKA_ECDSASignOutTypeDef));
@@ -952,7 +1002,8 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng,
         return status;
 
     /* find parameters for the selected curve */
-    if (0 != stm32_get_ecc_specs(&prime, &coef, &coef_sign, &gen_x, &gen_y, &order, size)) {
+    if (0 != stm32_get_ecc_specs(&prime, &coef, &coefB, &coef_sign,
+            &gen_x, &gen_y, &order, size)) {
         return ECC_BAD_ARG_E;
     }
 
@@ -968,6 +1019,11 @@ int stm32_ecc_sign_hash_ex(const byte* hash, word32 hashlen, WC_RNG* rng,
     pka_ecc.modulusSize =     size;
     pka_ecc.coefSign =        *coef_sign;
     pka_ecc.coef =            coef;
+#ifdef WOLFSSL_STM32_PKA_V2
+    pka_ecc.coefB =           coefB;
+#else
+    (void)coefB;
+#endif
     pka_ecc.modulus =         prime;
     pka_ecc.basePointX =      gen_x;
     pka_ecc.basePointY =      gen_y;

+ 1 - 1
wolfssl/openssl/sha.h

@@ -95,7 +95,7 @@ typedef WOLFSSL_SHA_CTX SHA_CTX;
 
 /* adder for HW crypto */
 #ifdef STM32_HASH
-#define CTX_SHA2_HW_ADDER 30
+#define CTX_SHA2_HW_ADDER 34
 #else
 #define CTX_SHA2_HW_ADDER 0
 #endif