nRF5340 Development Kit board: ECDH implementation (found in nrf/samples/crypto/ecdh) using hardware (cc3xx) is significantly slower than when using software (oberon)

Performance on hardware (cc3xx) is significantly slower than performance on software (oberon). Does anyone have a good explanation for this? I have also tested ECDSA and can see that hardware is faster. I also tried a different elliptic curve, but the result is the same. 
I added the benchmarking code shown below; everything else can be found in nrf/samples/crypto/ecdh.
Development setup:
Macbook Air Apple M1
Toolchain version: 2.6.0
nRF5340 Development Kit board

This is my main.c file:

/*
 * Copyright (c) 2021 Nordic Semiconductor ASA
 *
 * SPDX-License-Identifier: LicenseRef-Nordic-5-Clause
 */

#include <zephyr/kernel.h>
#include <zephyr/sys/printk.h>
#include <zephyr/logging/log.h>
#include <stdio.h>
#include <stdlib.h>
#include <psa/crypto.h>
#include <psa/crypto_extra.h>
#include <zephyr/timing/timing.h>
#include <math.h>

#include <zephyr/drivers/timer/system_timer.h>
#include <zephyr/sys/__assert.h>

#ifdef CONFIG_BUILD_WITH_TFM
#include <tfm_ns_interface.h>
#endif

/* Return codes used by crypto_init(), create_ecdh_keypair() and main(). */
#define APP_SUCCESS		(0)
#define APP_ERROR		(-1)
/* Status messages; only APP_ERROR_MESSAGE is referenced by the code shown
 * here (logged by main() when crypto_init() fails).
 */
#define APP_SUCCESS_MESSAGE "Example finished successfully!"
#define APP_ERROR_MESSAGE "Example exited with error!"

/*
 * Log a labelled hex dump of a buffer at INFO level.
 *
 * p_label: string printed in the header and footer lines.
 * p_text:  pointer to the data handed to LOG_HEXDUMP_INF.
 * len:     number of bytes to dump; also printed with %u in the header.
 *
 * Written as a GNU statement expression; `len` and `p_label` are each
 * expanded more than once, so avoid arguments with side effects.
 * Not referenced by the code shown here.
 */
#define PRINT_HEX(p_label, p_text, len)\
	({\
		LOG_INF("---- %s (len: %u): ----", p_label, len);\
		LOG_HEXDUMP_INF(p_text, len, "Content:");\
		LOG_INF("---- %s end  ----", p_label);\
	})

LOG_MODULE_REGISTER(ecdh, LOG_LEVEL_DBG);

/* ====================================================================== */
/*				Global variables/defines for the ECDH example			  */

/* Key size in bits for the ECDH example. Not referenced by the code shown
 * here; create_ecdh_keypair() passes the literal 256 instead.
 */
#define NRF_CRYPTO_EXAMPLE_ECDH_KEY_BITS (256)
/* Not referenced by the code shown here.
 * NOTE(review): presumably the uncompressed public-key encoding size in
 * bytes (1 + 2 * 32) -- confirm against the original sample.
 */
#define NRF_CRYPTO_EXAMPLE_ECDH_PUBLIC_KEY_SIZE (65)

/* Key identifier filled in by create_ecdh_keypair() when called from main(),
 * and passed to psa_destroy_key() after each run.
 */
psa_key_id_t key_id_alice;

/* ====================================================================== */

/*
 * Initialize the PSA Crypto subsystem.
 *
 * Returns APP_SUCCESS when psa_crypto_init() reports PSA_SUCCESS,
 * APP_ERROR for any other status.
 */
int crypto_init(void)
{
	const psa_status_t init_status = psa_crypto_init();

	return (init_status == PSA_SUCCESS) ? APP_SUCCESS : APP_ERROR;
}

/*
 * Generate a volatile 256-bit SECP R1 key pair for use with ECDH.
 *
 * key_id: output; receives the identifier of the generated key. The caller
 *         owns the key and must release it with psa_destroy_key().
 *
 * Returns APP_SUCCESS on success, APP_ERROR if psa_generate_key() fails
 * (the PSA status is logged).
 */
int create_ecdh_keypair(psa_key_id_t *key_id)
{
	psa_status_t status;
	psa_key_attributes_t key_attributes = PSA_KEY_ATTRIBUTES_INIT;

	/* Key policy: derivation-only usage, volatile lifetime, ECDH algorithm,
	 * 256-bit key on the SECP R1 curve family. No hash algorithm is
	 * configured here.
	 */
	psa_set_key_usage_flags(&key_attributes, PSA_KEY_USAGE_DERIVE);
	psa_set_key_lifetime(&key_attributes, PSA_KEY_LIFETIME_VOLATILE);
	psa_set_key_algorithm(&key_attributes, PSA_ALG_ECDH);
	psa_set_key_type(&key_attributes, PSA_KEY_TYPE_ECC_KEY_PAIR(PSA_ECC_FAMILY_SECP_R1));
	psa_set_key_bits(&key_attributes, 256);

	/* Generate a key pair */
	status = psa_generate_key(&key_attributes, key_id);

	/* Reset the attributes on every path, including failure, so the
	 * structure is always returned to its freshly-initialized state.
	 */
	psa_reset_key_attributes(&key_attributes);

	if (status != PSA_SUCCESS) {
		LOG_INF("psa_generate_key failed! (Error: %d)", status);
		return APP_ERROR;
	}

	return APP_SUCCESS;
}

/*
 * Benchmark entry point.
 *
 * Initializes PSA Crypto, performs one untimed warm-up call, then times
 * create_ecdh_keypair() for a fixed number of runs using the Zephyr timing
 * API, and prints total / mean / min / max / sample standard deviation in
 * cycles.
 *
 * NOTE(review): the timed region covers create_ecdh_keypair() only, i.e.
 * psa_generate_key(); no key agreement is performed or measured here, so
 * the numbers compare key generation between drivers, not ECDH itself.
 *
 * Returns APP_SUCCESS when every run succeeded, APP_ERROR otherwise.
 */
int main(void)
{
    const char *benchmark_name = "ECDH Keypair Generation";
    const int requested_runs = 100;
    int status;

    /* Init crypto */
    status = crypto_init();
    if (status != APP_SUCCESS) {
        LOG_INF(APP_ERROR_MESSAGE);
        return APP_ERROR;
    }

    printf("Starting %s benchmark (%d runs)...\n", benchmark_name, requested_runs);

    /* Warm-up run: not timed. A failure here means the timed runs would
     * fail as well, so stop instead of benchmarking a broken setup.
     */
    status = create_ecdh_keypair(&key_id_alice);
    if (status != APP_SUCCESS) {
        printf("%s warm-up failed with %d\n", benchmark_name, status);
        return APP_ERROR;
    }
    (void)psa_destroy_key(key_id_alice);

    uint64_t total_cycles = 0;
    uint64_t min_cycles = UINT64_MAX;
    uint64_t max_cycles = 0;
    /* Running mean and sum of squared deviations (Welford's method).
     * This avoids the integer truncation and the overflow-prone
     * sum-of-squares / square-of-sum subtraction.
     */
    double mean = 0.0;
    double m2 = 0.0;
    int runs = 0; /* number of successfully completed runs */

    timing_init();
    timing_start();
    for (int i = 0; i < requested_runs; i++) {
        timing_t start_time = timing_counter_get();

        /* Function to benchmark */
        status = create_ecdh_keypair(&key_id_alice);

        timing_t end_time = timing_counter_get();

        if (status != APP_SUCCESS) {
            printf("%s error on run %d\n", benchmark_name, i);
            break;
        }

        uint64_t cycles = timing_cycles_get(&start_time, &end_time);

        runs++;
        total_cycles += cycles;

        double delta = (double)cycles - mean;

        mean += delta / (double)runs;
        m2 += delta * ((double)cycles - mean);

        if (cycles < min_cycles) {
            min_cycles = cycles;
        }

        if (cycles > max_cycles) {
            max_cycles = cycles;
        }

        /* Release the key so repeated runs do not exhaust key slots. */
        (void)psa_destroy_key(key_id_alice);
    }
    timing_stop();

    printf("Frequency: %u MHz\n", (unsigned int)timing_freq_get_mhz());

    if (runs == 0) {
        /* Nothing was measured; avoid dividing by zero below. */
        printf("%s benchmark produced no successful runs\n", benchmark_name);
        return APP_ERROR;
    }

    /* Sample standard deviation needs at least two samples. */
    double std = (runs > 1) ? sqrt(m2 / (double)(runs - 1)) : 0.0;

    printf("%s benchmark results:\n", benchmark_name);
    printf("   Runs:    %d\n", runs);
    printf("   Total:   %llu cycles\n", (unsigned long long)total_cycles);
    printf("   Average: %.3f cycles\n", mean);
    printf("   Minimum: %llu cycles\n", (unsigned long long)min_cycles);
    printf("   Maximum: %llu cycles\n", (unsigned long long)max_cycles);
    printf("   Std:     %.3f cycles\n", std);

    return (runs == requested_runs) ? APP_SUCCESS : APP_ERROR;
}



This is my prj.conf:

# The Zephyr CMSIS emulation assumes that ticks are ms, currently
CONFIG_SYS_CLOCK_TICKS_PER_SEC=1000

CONFIG_MAIN_STACK_SIZE=4096
CONFIG_HEAP_MEM_POOL_SIZE=4096

# Enable logging
CONFIG_CONSOLE=y
CONFIG_LOG=y

# Enable nordic security backend and PSA APIs
CONFIG_NRF_SECURITY=y
CONFIG_MBEDTLS_PSA_CRYPTO_C=y

CONFIG_MBEDTLS_ENABLE_HEAP=y
CONFIG_MBEDTLS_HEAP_SIZE=8192

CONFIG_PSA_WANT_ALG_ECDH=y
CONFIG_PSA_WANT_KEY_TYPE_ECC_KEY_PAIR=y
CONFIG_PSA_WANT_ECC_SECP_R1_256=y

# For key generation
CONFIG_PSA_WANT_GENERATE_RANDOM=y

# Timing functions
CONFIG_TIMING_FUNCTIONS=y
CONFIG_CBPRINTF_FP_SUPPORT=y



This is my nrf5340dk_nrf5340_cpuapp.conf:
# Using hardware crypto accelerator
CONFIG_PSA_CRYPTO_DRIVER_OBERON=n
CONFIG_PSA_CRYPTO_DRIVER_CC3XX=y




When I run this with CONFIG_PSA_CRYPTO_DRIVER_OBERON=y and CONFIG_PSA_CRYPTO_DRIVER_CC3XX=n, I get this output:

*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Starting ECDH Keypair Generation benchmark (100 runs)...
Frequency: 64 MHz
ECDH Keypair Generation benchmark results:
Runs: 100
Total: 1149048 cycles
Average: 11490.000 cycles
Minimum: 11490 cycles
Maximum: 11520 cycles
Std: 3.000 cycles
*** Booting nRF Connect SDK v3.5.99-ncs1 ***


but running it with CONFIG_PSA_CRYPTO_DRIVER_OBERON=n and CONFIG_PSA_CRYPTO_DRIVER_CC3XX=y, I get this output:

*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Starting ECDH Keypair Generation benchmark (100 runs)...
Frequency: 64 MHz
ECDH Keypair Generation benchmark results:
Runs: 100
Total: 97657851 cycles
Average: 976578.000 cycles
Minimum: 929049 cycles
Maximum: 1014195 cycles
Std: 13743.361 cycles
*** Booting nRF Connect SDK v3.5.99-ncs1 ***
Related