This post is older than 2 years and might not be relevant anymore
More Info: Consider searching for newer posts

Cause: Data bus error (return address in the stack frame is not related to the instruction that caused the error)

Dears,

I'm trying to run the mbedtls library with thread safety under FreeRTOS. I have already defined MBEDTLS_THREADING_C and MBEDTLS_THREADING_ALT in user_mbedtls_config.h, but the program crashes when calling xSemaphoreTake( mutex->mutex, portMAX_DELAY ). It seems to be a concurrency issue, so I'm asking for help with solving this problem; any suggestion is appreciated.

IDE: SES
SDK: nRF5_SDK_for_Thread_and_Zigbee_v3.1.0_c7c4730


the threading_alt.c file is:

/**
 * @brief Implementation of mbedtls_mutex_init for thread-safety.
 *
 * Creates a FreeRTOS mutex with xSemaphoreCreateMutex() and records in
 * is_valid whether the creation succeeded. When creation fails the handle is
 * NULL, is_valid is 0 and a debug message is logged.
 *
 * @param mutex  Mutex object to initialise. A NULL pointer is ignored.
 */
void user_mbedtls_mutex_init( mbedtls_threading_mutex_t * mutex )
{
    /* Guard against a NULL object instead of faulting on the dereference. */
    if( mutex == NULL )
    {
        return;
    }

    mutex->mutex = xSemaphoreCreateMutex();
    mutex->is_valid = ( mutex->mutex != NULL ) ? 1 : 0;

    if( mutex->is_valid == 0 )
    {
        NRF_LOG_DEBUG( ( "Failed to initialize mbedTLS mutex.\r\n" ) );
        NRF_LOG_FLUSH();
    }
}

/**
 * @brief Implementation of mbedtls_mutex_free for thread-safety.
 *
 * Deletes the underlying FreeRTOS mutex when the object is marked valid, then
 * clears the handle and the is_valid flag so the object cannot be used or
 * deleted a second time.
 *
 * @param mutex  Mutex object to release. A NULL pointer, or an object whose
 *               is_valid flag is not 1, is left untouched.
 */
void user_mbedtls_mutex_free( mbedtls_threading_mutex_t * mutex )
{
    if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
    {
        return;
    }

    vSemaphoreDelete( mutex->mutex );

    /* Do not keep a handle to a deleted semaphore around. */
    mutex->mutex = NULL;
    mutex->is_valid = 0;
}

/**
 * @brief Implementation of mbedtls_mutex_lock for thread-safety.
 *
 * Blocks with portMAX_DELAY until the FreeRTOS mutex has been taken.
 *
 * @param mutex  Mutex object to lock.
 *
 * @return 0 if successful, MBEDTLS_ERR_THREADING_MUTEX_ERROR if
 * xSemaphoreTake() reports failure, MBEDTLS_ERR_THREADING_BAD_INPUT_DATA if
 * the mutex pointer is NULL or the mutex is not valid.
 */
int user_mbedtls_mutex_lock( mbedtls_threading_mutex_t * mutex )
{
    /* Reject a NULL object, or one that init did not mark valid. */
    if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
    {
        return MBEDTLS_ERR_THREADING_BAD_INPUT_DATA;
    }

    if( xSemaphoreTake( mutex->mutex, portMAX_DELAY ) != pdFALSE )
    {
        return 0;
    }

    NRF_LOG_DEBUG( ( "Failed to obtain mbedTLS mutex.\r\n" ) );
    NRF_LOG_FLUSH();

    return MBEDTLS_ERR_THREADING_MUTEX_ERROR;
}

/**
 * @brief Implementation of mbedtls_mutex_unlock for thread-safety.
 *
 * Gives the FreeRTOS mutex back with xSemaphoreGive().
 *
 * @param mutex  Mutex object to unlock.
 *
 * @return 0 if successful, MBEDTLS_ERR_THREADING_MUTEX_ERROR if
 * xSemaphoreGive() reports failure, MBEDTLS_ERR_THREADING_BAD_INPUT_DATA if
 * the mutex pointer is NULL or the mutex is not valid.
 */
int user_mbedtls_mutex_unlock( mbedtls_threading_mutex_t * mutex )
{
    /* Reject a NULL object, or one that init did not mark valid. */
    if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
    {
        return MBEDTLS_ERR_THREADING_BAD_INPUT_DATA;
    }

    if( xSemaphoreGive( mutex->mutex ) != pdFALSE )
    {
        return 0;
    }

    NRF_LOG_DEBUG( ( "Failed to unlock mbedTLS mutex.\r\n" ) );
    NRF_LOG_FLUSH();

    return MBEDTLS_ERR_THREADING_MUTEX_ERROR;
}



/**
 * @brief Register the FreeRTOS-based mutex callbacks with mbedTLS.
 *
 * Passes the init, free, lock and unlock implementations from this file to
 * mbedtls_threading_set_alt().
 *
 * NOTE(review): the Mbed TLS threading.h documentation asks for this to be
 * called once, before any other Mbed TLS function (so that contexts created
 * later get their mutexes initialised through these callbacks) - confirm the
 * call order at the call site.
 */
void user_mbedtls_threading_set_alt(void)
{
    mbedtls_threading_set_alt(
        user_mbedtls_mutex_init,    /* mutex_init   */
        user_mbedtls_mutex_free,    /* mutex_free   */
        user_mbedtls_mutex_lock,    /* mutex_lock   */
        user_mbedtls_mutex_unlock   /* mutex_unlock */
    );
}





the trace of the calling function is:

step 1:--->/SDK/external/mbedtls/library/ssl_cli.c --->line3273
---> int mbedtls_ssl_handshake_client_step( mbedtls_ssl_context *ssl )
case MBEDTLS_SSL_CLIENT_HELLO:
ret = ssl_write_client_hello( ssl );
break;


step 2--->/SDK/external/mbedtls/library/ssl_cli.c --->line707
---> static int ssl_write_client_hello( mbedtls_ssl_context *ssl )
MBEDTLS_SSL_DEBUG_MSG( 3, ( "client hello, max version: [%d:%d]",
buf[4], buf[5] ) );

if( ( ret = ssl_generate_random( ssl ) ) != 0 )
{
MBEDTLS_SSL_DEBUG_RET( 1, "ssl_generate_random", ret );
return( ret );
}


step 3--->/SDK/external/mbedtls/library/ssl_cli.c --->line667
---> static int ssl_generate_random( mbedtls_ssl_context *ssl )
if( ( ret = ssl->conf->f_rng( ssl->conf->p_rng, p, 28 ) ) != 0 )
return( ret );

step 4--->/SDK/external/mbedtls/library/ctr_drbg.c --->line399
---> int mbedtls_ctr_drbg_random( void *p_rng, unsigned char *output, size_t output_len )
#if defined(MBEDTLS_THREADING_C)
if( ( ret = mbedtls_mutex_lock( &ctx->mutex ) ) != 0 )
return( ret );
#endif

step 5 -- threading_alt.c 
int user_mbedtls_mutex_lock( mbedtls_threading_mutex_t * mutex )
{
int ret = MBEDTLS_ERR_THREADING_BAD_INPUT_DATA;

if( mutex->is_valid == 1 )
{
if( xSemaphoreTake( mutex->mutex, portMAX_DELAY ) )
{


The program gets stuck here and then a hard fault occurs; see the following log:

<error> hardfault: HARD FAULT at 0x00026DFE

<error> hardfault: R0: 0x200136D4 R1: 0x00000004 R2: 0xFFFFC2F7 R3: 0x000148B5

<error> hardfault: R12: 0x2002EF94 LR: 0x00027A1B PSR: 0x21000000

<error> hardfault: Cause: Data bus error (return address in the stack frame is not related to the instruction that caused the error).

Thanks so much for any help.

Parents
  • You have the address of the instruction that caused the hard fault (0x00026DFE). You can open your application's .map file, or start the program in the debugger and look at the disassembly window, to see what resides at this address. This will give you better context than looking at a bigger chunk of the code.

    Can you tell us which instruction from your code snippet resides at 0x00026DFE, so that I can try to analyze the problem?

  • Dear Susheel,

    Thank you for your instructions. I re-ran the program and this time the hard fault occurred at address 0x00026E40. I can't find an entry for this exact address in app.map, but I can find entries near it.

    In debug mode, I captured the code that resides at 0x00026E40: it is in list.c from the FreeRTOS source files (per the map excerpt below, the address falls inside uxListRemove, which spans 0x00026e34–0x00026e53). Please see the following information.

    <error> hardfault: HARD FAULT at 0x00026E40 
    [2020-04-03 09:57:40:147_R:] <error> hardfault: R0: 0x00000004 R1: 0x08580968 R2: 0x200DB510 R3: 0x00000729 
    [2020-04-03 09:57:40:156_R:] <error> hardfault: R12: 0x2002EF94 LR: 0x00027A01 PSR: 0x21000000 
    [2020-04-03 09:57:40:160_R:] <error> hardfault: Cause: Data bus error (return address in the stack frame is not related to the instruction that caused the error).
    
    
    -

    .text.vListInitialise
    0x0000000000026dd2 0x16 Output/xx_nrf52 Release/Obj/list.o
    0x0000000000026dd2 vListInitialise
    .text.vListInitialiseItem
    0x0000000000026de8 0x6 Output/xx_nrf52 Release/Obj/list.o
    0x0000000000026de8 vListInitialiseItem
    .text.vListInsertEnd
    0x0000000000026dee 0x18 Output/xx_nrf52 Release/Obj/list.o
    0x0000000000026dee vListInsertEnd
    .text.vListInsert
    0x0000000000026e06 0x2e Output/xx_nrf52 Release/Obj/list.o
    0x0000000000026e06 vListInsert
    .text.uxListRemove
    0x0000000000026e34 0x20 Output/xx_nrf52 Release/Obj/list.o
    0x0000000000026e34 uxListRemove
    .text.prvIsQueueEmpty
    0x0000000000026e54 0x16 Output/xx_nrf52 Release/Obj/queue.o
    .text.prvCopyDataToQueue
    0x0000000000026e6a 0x66 Output/xx_nrf52 Release/Obj/queue.o
    .text.prvCopyDataFromQueue
    0x0000000000026ed0 0x2a Output/xx_nrf52 Release/Obj/queue.o
    .text.prvUnlockQueue
    0x0000000000026efa 0x70 Output/xx_nrf52 Release/Obj/queue.o
    *fill* 0x0000000000026f6a 0x2 
    .text.vQueueUnregisterQueue.part.1
    0x0000000000026f6c 0x28 Output/xx_nrf52 Release/Obj/queue.o
    .text.xQueueGenericReset
    0x0000000000026f94 0x6c Output/xx_nrf52 Release/Obj/queue.o
    0x0000000000026f94 xQueueGenericReset

    thank you so much

  • Sorry for the late reply vernon. 

    Is the hardfault consistently happening all the time? Can you please help me reproduce this on nRF52840 DK? I will try to debug it more to find out why the list remove function is hardfauling..

  • It's hard to reproduce this issue on the nRF52840 DK because my program runs with the azure-iot-sdk-c and a cellular module (BG96). The problem happens after the raw TCP connection is established and the handshake between the device and Azure IoT Hub starts; I have posted the specific steps above. If I comment out the macros MBEDTLS_THREADING_C/MBEDTLS_THREADING_ALT in the user mbedtls_config.h, it runs normally as expected. However, the project runs with FreeRTOS and mbedtls supports thread safety, so I modified the code as this link describes. When I run the program with those changes, the problem always occurs after the connection is established and the handshake begins.

  • vernon said:
    It's hard to reproduce this issue on nRF52840 DK because my program runs with the azure-iot-sdk-c and a cellular module(bg96)

    I did not know that this is external code and I do not think the Nordic Support team would have much knowledge of the external repositories. Have you tried creating an issue in the repository?

  • I haven't opened an issue in those repositories yet. I think this issue has nothing to do with that external code: it crashes when trying to take the mutex in the FreeRTOS code. It may be an invalid memory access somewhere, but I don't know how to debug it and catch what caused this exception.

  • I see. Sorry, I misunderstood the nature of what you were asking.

    Can I see the call trace of when user_mbedtls_mutex_lock is called? If this is being called in the interrupt context then you should call xSemaphoreTakeFromISR instead. Also check if it is a valid mutex being sent from app to lock the semaphore.

Reply Children
  • Thanks for your reply. I have posted the call trace for user_mbedtls_mutex_lock above; please have a look at the original post. I updated the SDK to 16.0 today and ran the program again. It no longer crashes when user_mbedtls_mutex_lock is called, but it gets stuck forever, and this link didn't help either. Do you know what the problem could be?

  • can you keep a count on calls to xSemaphoreGive and xSemaphoreTake to see if the semaphore is given enough times for this mutex to be unlocked? What I mean to say is, could it be possible that the application is not unlocking the mutex properly?

  • I added two variables to the struct mbedtls_threading_mutex_t for counting the mutex take/give operations and updated the init, free, lock and unlock functions accordingly; please see the following code snippet:
    /* Mutex object used by the FreeRTOS-backed mbedTLS threading callbacks below,
     * extended with two diagnostic counters. */
    typedef struct
    {
    SemaphoreHandle_t mutex; /* Handle returned by xSemaphoreCreateMutex() in the init callback. */
    char is_valid; /* 1 after a successful init, 0 after a failed init or after free. */
    uint8_t take_count; /* Incremented on each successful xSemaphoreTake(); being uint8_t it wraps after 255. */
    uint8_t give_count; /* Incremented on each successful xSemaphoreGive(); being uint8_t it wraps after 255. */
    } mbedtls_threading_mutex_t;

    /**
     * @brief Implementation of mbedtls_mutex_init for thread-safety.
     *
     * Creates a FreeRTOS mutex with xSemaphoreCreateMutex(), records in is_valid
     * whether the creation succeeded and resets both diagnostic counters. When
     * creation fails the handle is NULL, is_valid is 0 and a debug message is
     * logged.
     *
     * @param mutex  Mutex object to initialise. A NULL pointer is ignored.
     */
    void user_mbedtls_mutex_init( mbedtls_threading_mutex_t * mutex )
    {
        /* Guard against a NULL object instead of faulting on the dereference. */
        if( mutex == NULL )
        {
            return;
        }

        /* Reset the counters on every path so they never hold garbage, even
         * when the mutex could not be created. */
        mutex->take_count = 0;
        mutex->give_count = 0;

        mutex->mutex = xSemaphoreCreateMutex();
        mutex->is_valid = ( mutex->mutex != NULL ) ? 1 : 0;

        if( mutex->is_valid == 0 )
        {
            NRF_LOG_DEBUG( ( "Failed to initialize mbedTLS mutex.\r\n" ) );
            NRF_LOG_FLUSH();
        }
    }

    /**
     * @brief Implementation of mbedtls_mutex_free for thread-safety.
     *
     * Deletes the underlying FreeRTOS mutex when the object is marked valid,
     * resets the diagnostic counters, and clears the handle and the is_valid
     * flag so the object cannot be used or deleted a second time.
     *
     * @param mutex  Mutex object to release. A NULL pointer, or an object whose
     *               is_valid flag is not 1, is left untouched.
     */
    void user_mbedtls_mutex_free( mbedtls_threading_mutex_t * mutex )
    {
        if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
        {
            return;
        }

        mutex->take_count = 0;
        mutex->give_count = 0;

        vSemaphoreDelete( mutex->mutex );

        /* Do not keep a handle to a deleted semaphore around. */
        mutex->mutex = NULL;
        mutex->is_valid = 0;
    }

    /**
     * @brief Implementation of mbedtls_mutex_lock for thread-safety.
     *
     * Blocks with portMAX_DELAY until the FreeRTOS mutex has been taken, then
     * increments take_count while the mutex is held.
     *
     * @param mutex  Mutex object to lock.
     *
     * @return 0 if successful, MBEDTLS_ERR_THREADING_MUTEX_ERROR if
     * xSemaphoreTake() reports failure, MBEDTLS_ERR_THREADING_BAD_INPUT_DATA if
     * the mutex pointer is NULL or the mutex is not valid.
     */
    int user_mbedtls_mutex_lock( mbedtls_threading_mutex_t * mutex )
    {
        /* Reject a NULL object, or one that init did not mark valid. */
        if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
        {
            return MBEDTLS_ERR_THREADING_BAD_INPUT_DATA;
        }

        if( xSemaphoreTake( mutex->mutex, portMAX_DELAY ) != pdFALSE )
        {
            mutex->take_count++;
            return 0;
        }

        NRF_LOG_DEBUG( ( "Failed to obtain mbedTLS mutex.\r\n" ) );
        NRF_LOG_FLUSH();

        return MBEDTLS_ERR_THREADING_MUTEX_ERROR;
    }

    /**
     * @brief Implementation of mbedtls_mutex_unlock for thread-safety.
     *
     * Increments give_count and then gives the FreeRTOS mutex back with
     * xSemaphoreGive(). If the give fails, the increment is undone so that
     * give_count only reflects successful unlocks.
     *
     * @param mutex  Mutex object to unlock.
     *
     * @return 0 if successful, MBEDTLS_ERR_THREADING_MUTEX_ERROR if
     * xSemaphoreGive() reports failure, MBEDTLS_ERR_THREADING_BAD_INPUT_DATA if
     * the mutex pointer is NULL or the mutex is not valid.
     */
    int user_mbedtls_mutex_unlock( mbedtls_threading_mutex_t * mutex )
    {
        /* Reject a NULL object, or one that init did not mark valid. */
        if( ( mutex == NULL ) || ( mutex->is_valid != 1 ) )
        {
            return MBEDTLS_ERR_THREADING_BAD_INPUT_DATA;
        }

        /* Update the counter before releasing the mutex: once the mutex has been
         * given, another task may already own it and be touching the struct. */
        mutex->give_count++;

        if( xSemaphoreGive( mutex->mutex ) != pdFALSE )
        {
            return 0;
        }

        /* The give failed, so this unlock must not be counted. */
        mutex->give_count--;

        NRF_LOG_DEBUG( ( "Failed to unlock mbedTLS mutex.\r\n" ) );
        NRF_LOG_FLUSH();

        return MBEDTLS_ERR_THREADING_MUTEX_ERROR;
    }

    The take and give counts are both 0, so I would expect taking the mutex to succeed, BUT the program gets stuck at the if statement "if( xSemaphoreTake( mutex->mutex, portMAX_DELAY ) ) " forever.

Related