nRF5340 Development Kit board: ECDH implementation (found in nrf/samples/crypto/ecdh) using hardware (cc3xx) is significantly slower than when using software (oberon)

Performance on hardware (cc3xx) is significantly slower than performance on software (oberon). Does anyone have a good explanation for this? I have also tested ECDSA and can see that hardware is faster. I also tried a different elliptic curve, but the result is the same. 
I added the benchmarking code shown below; everything else can be found in nrf/samples/crypto/ecdh.
Development setup:
Macbook Air Apple M1
Toolchain version: 2.6.0
nRF5340 Development Kit board

This is my main.c file:

/*
 * Copyright (c) 2021 Nordic Semiconductor ASA
 *
 * SPDX-License-Identifier: LicenseRef-Nordic-5-Clause
 */

#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include <zephyr/logging/log.h>
#include <stdio.h>
#include <stdlib.h>
#include <psa/crypto.h>
#include <psa/crypto_extra.h>
#include <zephyr/timing/timing.h>
#include <math.h>

#include <zephyr/drivers/timer/system_timer.h>
#include <zephyr/sys/__assert.h>

#ifdef CONFIG_BUILD_WITH_TFM
#include <tfm_ns_interface.h>
#endif

#define APP_SUCCESS		(0)
#define APP_ERROR		(-1)
#define APP_SUCCESS_MESSAGE "Example finished successfully!"
#define APP_ERROR_MESSAGE "Example exited with error!"

/*
 * Log a labelled hex dump: a header line with the label and length
 * (via LOG_INF), the buffer contents (via LOG_HEXDUMP_INF), then a
 * closing line with the label.
 *
 * The body is a ({ ... }) statement expression. `len` is formatted with
 * %u, and `p_label` and `len` are each expanded more than once, so avoid
 * passing arguments with side effects.
 */
#define PRINT_HEX(p_label, p_text, len)\
	({\
		LOG_INF("---- %s (len: %u): ----", p_label, len);\
		LOG_HEXDUMP_INF(p_text, len, "Content:");\
		LOG_INF("---- %s end  ----", p_label);\
	})

LOG_MODULE_REGISTER(ecdh, LOG_LEVEL_DBG);

/* ====================================================================== */
/*				Global variables/defines for the ECDH example			  */

/* Presumably the ECDH key size in bits — TODO confirm. Not referenced by the
 * code visible here; create_ecdh_keypair() passes the literal 256 instead.
 */
#define NRF_CRYPTO_EXAMPLE_ECDH_KEY_BITS (256)
/* Presumably the byte length of an uncompressed secp256r1 public key
 * (1 + 2 * 32) — TODO confirm. Not referenced by the code visible here.
 */
#define NRF_CRYPTO_EXAMPLE_ECDH_PUBLIC_KEY_SIZE (65)

/* Key identifier filled in by create_ecdh_keypair() and released with
 * psa_destroy_key() in main().
 */
psa_key_id_t key_id_alice;

/* ====================================================================== */

/**
 * Bring up the PSA Crypto subsystem.
 *
 * @return APP_SUCCESS when psa_crypto_init() reports PSA_SUCCESS,
 *         APP_ERROR for any other status.
 */
int crypto_init(void)
{
	const psa_status_t init_status = psa_crypto_init();

	return (init_status == PSA_SUCCESS) ? APP_SUCCESS : APP_ERROR;
}

/**
 * Generate a volatile 256-bit SECP R1 key pair for ECDH.
 *
 * The key is created with usage PSA_KEY_USAGE_DERIVE and permitted
 * algorithm PSA_ALG_ECDH. No hash algorithm is configured here.
 *
 * @param key_id  Output: receives the identifier of the generated key.
 *
 * @return APP_SUCCESS if psa_generate_key() returned PSA_SUCCESS,
 *         APP_ERROR otherwise (the PSA status is logged).
 */
int create_ecdh_keypair(psa_key_id_t *key_id)
{
	psa_status_t status;
	psa_key_attributes_t key_attributes = PSA_KEY_ATTRIBUTES_INIT;

	/* Key policy: volatile ECC key pair on the SECP R1 family, 256 bits,
	 * usable for key agreement with ECDH.
	 */
	psa_set_key_usage_flags(&key_attributes, PSA_KEY_USAGE_DERIVE);
	psa_set_key_lifetime(&key_attributes, PSA_KEY_LIFETIME_VOLATILE);
	psa_set_key_algorithm(&key_attributes, PSA_ALG_ECDH);
	psa_set_key_type(&key_attributes, PSA_KEY_TYPE_ECC_KEY_PAIR(PSA_ECC_FAMILY_SECP_R1));
	psa_set_key_bits(&key_attributes, 256);

	/* Generate a key pair */
	status = psa_generate_key(&key_attributes, key_id);

	/* Reset the attributes on every path, not only on success, so the
	 * structure is always returned to its freshly-initialized state.
	 */
	psa_reset_key_attributes(&key_attributes);

	if (status != PSA_SUCCESS) {
		LOG_INF("psa_generate_key failed! (Error: %d)", status);
		return APP_ERROR;
	}

	return APP_SUCCESS;
}

/**
 * Benchmark ECDH key pair generation.
 *
 * Initializes PSA Crypto, performs one untimed warm-up generation, then
 * times `runs` calls to create_ecdh_keypair() with the Zephyr timing API
 * and prints total / mean / min / max / sample standard deviation in
 * cycles. Each generated key is destroyed outside the timed region.
 *
 * @return APP_SUCCESS when at least one timed run completed,
 *         APP_ERROR if crypto init, the warm-up, or the first run failed.
 */
int main(void)
{
    const char *benchmark_name = "ECDH Keypair Generation";
    int runs = 100;
    int status; /* crypto_init()/create_ecdh_keypair() return APP_* codes */

    /* Init crypto */
    status = crypto_init();
    if (status != APP_SUCCESS) {
        LOG_INF(APP_ERROR_MESSAGE);
        return APP_ERROR;
    }

    timing_t start_time, end_time;
    uint64_t total_cycles = 0;
    uint64_t min_cycles = UINT64_MAX;
    uint64_t max_cycles = 0;
    /* Running mean and sum of squared deviations (Welford's method).
     * Kept in floating point so the statistics are not truncated and do
     * not depend on 64-bit products of cycle counts.
     */
    double mean = 0.0;
    double m2 = 0.0;

    printf("Starting %s benchmark (%d runs)...\n", benchmark_name, runs);

    /* Warm-up run (not timed). If it fails, every timed run would fail
     * as well, so stop here instead of reporting meaningless numbers.
     */
    status = create_ecdh_keypair(&key_id_alice);
    if (status != APP_SUCCESS) {
        printf("%s warm-up failed with %d\n", benchmark_name, status);
        LOG_INF(APP_ERROR_MESSAGE);
        return APP_ERROR;
    }
    psa_destroy_key(key_id_alice);

    timing_init();
    timing_start();
    for (int i = 0; i < runs; i++) {
        start_time = timing_counter_get();

        /* Function to benchmark */
        status = create_ecdh_keypair(&key_id_alice);

        end_time = timing_counter_get();

        if (status != APP_SUCCESS) {
            printf("%s error on run %d\n", benchmark_name, i);
            runs = i; /* only the runs completed so far enter the statistics */
            break;
        }

        uint64_t cycles = timing_cycles_get(&start_time, &end_time);

        total_cycles += cycles;

        double delta = (double)cycles - mean;

        mean += delta / (double)(i + 1);
        m2 += delta * ((double)cycles - mean);

        if (cycles < min_cycles) {
            min_cycles = cycles;
        }

        if (cycles > max_cycles) {
            max_cycles = cycles;
        }

        /* Key destruction is deliberately outside the timed region. */
        psa_destroy_key(key_id_alice);
    }
    timing_stop();

    printf("Frequency: %u MHz\n", (unsigned int)timing_freq_get_mhz());

    if (runs <= 0) {
        /* No sample was collected; avoid dividing by zero below. */
        printf("%s benchmark produced no successful runs\n", benchmark_name);
        LOG_INF(APP_ERROR_MESSAGE);
        return APP_ERROR;
    }

    /* Sample variance needs at least two samples. */
    double var = (runs > 1) ? (m2 / (double)(runs - 1)) : 0.0;
    double std = sqrt(var);

    printf("%s benchmark results:\n", benchmark_name);
    printf("   Runs:    %d\n", runs);
    printf("   Total:   %llu cycles\n", (unsigned long long)total_cycles);
    printf("   Average: %.3f cycles\n", mean);
    printf("   Minimum: %llu cycles\n", (unsigned long long)min_cycles);
    printf("   Maximum: %llu cycles\n", (unsigned long long)max_cycles);
    printf("   Std:     %.3f cycles\n", std);

    return APP_SUCCESS;
}



This is my prj.conf:

# The Zephyr CMSIS emulation assumes that ticks are ms, currently
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1000

CONFIG_MAIN_STACK_SIZE=4096
CONFIG_HEAP_MEM_POOL_SIZE=4096

# Enable logging
CONFIG_CONSOLE=y
CONFIG_LOG=y

# Enable nordic security backend and PSA APIs
CONFIG_NRF_SECURITY=y
CONFIG_MBEDTLS_PSA_CRYPTO_C=y

CONFIG_MBEDTLS_ENABLE_HEAP=y
CONFIG_MBEDTLS_HEAP_SIZE=8192

CONFIG_PSA_WANT_ALG_ECDH=y
CONFIG_PSA_WANT_KEY_TYPE_ECC_KEY_PAIR=y
CONFIG_PSA_WANT_ECC_SECP_R1_256=y

# For key generation
CONFIG_PSA_WANT_GENERATE_RANDOM=y

# Timing functions
CONFIG_TIMING_FUNCTIONS=y
CONFIG_CBPRINTF_FP_SUPPORT=y



This is my nrf5340dk_nrf5340_cpuapp.conf:
# Using hardware crypto accelerator
CONFIG_PSA_CRYPTO_DRIVER_OBERON=n
CONFIG_PSA_CRYPTO_DRIVER_CC3XX=y




When I run this with CONFIG_PSA_CRYPTO_DRIVER_OBERON=y and CONFIG_PSA_CRYPTO_DRIVER_CC3XX=n, I get this output:

*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Starting ECDH Keypair Generation benchmark (100 runs)...
Frequency: 64 MHz
ECDH Keypair Generation benchmark results:
Runs: 100
Total: 1149048 cycles
Average: 11490.000 cycles
Minimum: 11490 cycles
Maximum: 11520 cycles
Std: 3.000 cycles
*** Booting nRF Connect SDK v3.5.99-ncs1 ***


but running it with CONFIG_PSA_CRYPTO_DRIVER_OBERON=n and CONFIG_PSA_CRYPTO_DRIVER_CC3XX=y, I get this output:

*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Starting ECDH Keypair Generation benchmark (100 runs)...
Frequency: 64 MHz
ECDH Keypair Generation benchmark results:
Runs: 100
Total: 97657851 cycles
Average: 976578.000 cycles
Minimum: 929049 cycles
Maximum: 1014195 cycles
Std: 13743.361 cycles
*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Parents
  • My bet is that the answer by Hieu is wrong.

    I think the two implementations differ in what they define as "keypair generation". The oberon lib only generates the random numbers that constitute the private key, while I think CC310 also derives the public key internally (which is the operation that actually costs time). Please try to redo your benchmark by performing key pair generation followed by public key export to make an apples-to-apples comparison.

    In any case, the CC310 is quite slow at big-number arithmetic (as used in public-key cryptography) for a hardware accelerator. Its internal multiplier is not very fast and it uses the generic Barrett reduction algorithm, while software implementations can be better optimized for the "special prime moduli" typically used for elliptic curves. The Cracen found in nRF54L is several times faster.

  • Hi Emil

    Here is the comparison between HW and SW for public key export after key generation:

    HW:

    Starting ECDH KEY PAIR GENERATION benchmark (100 runs)...
    Frequency: 64 MHz
    ECDH KEY PAIR GENERATION benchmark results:
       Runs:    100
       Total:   97356680 cycles
       Average: 973566.000 cycles
       Minimum: 943227 cycles
       Maximum: 1013147 cycles
       Std:     13392.071 cycles
    
    Starting ECDH PUBLIC KEY EXPORT benchmark (100 runs)...
    Frequency: 64 MHz
    ECDH PUBLIC KEY EXPORT benchmark results:
       Runs:    100
       Total:   90703453 cycles
       Average: 907034.000 cycles
       Minimum: 906987 cycles
       Maximum: 907036 cycles
       Std:     4.796 cycles
    

    SW:

    Starting ECDH KEY PAIR GENERATION benchmark (100 runs)...
    Frequency: 64 MHz
    ECDH KEY PAIR GENERATION benchmark results:
       Runs:    100
       Total:   1107485 cycles
       Average: 11074.000 cycles
       Minimum: 11074 cycles
       Maximum: 11153 cycles
       Std:     7.874 cycles
    
    Starting ECDH PUBLIC KEY EXPORT benchmark (100 runs)...
    Frequency: 64 MHz
    ECDH PUBLIC KEY EXPORT benchmark results:
       Runs:    100
       Total:   147704871 cycles
       Average: 1477048.000 cycles
       Minimum: 1477031 cycles
       Maximum: 1477049 cycles
       Std:     2.000 cycles
    


    I use the nRF5340 DK and according to https://www.nordicsemi.com/Products/nRF5340, it should use the Arm CryptoCell-312. I have seen that Cracen is also a possibility, but not on nRF5340 devices.

  • Hmm, something is very weird with key generation on the CryptoCell then, as it should definitely not take a million cycles (about 16 milliseconds at 64 MHz) to simply generate 32 random bytes...

Reply Children
No Data
Related