Connectivity issues and hardfaults (flash storage suspected culprit)

Hi Nordic Dev,

We are experiencing issues with sudden connectivity loss and hardlocked devices 1,5 years after product launch.
We are currently suspecting the flash storage to be the culprit, but further investigation is needed.

We are using nRF Connect SDK 1.6.0. Bluetooth information is saved to an internal flash storage partition with 2 flash pages. CONFIG_SETTINGS, CONFIG_BT_SETTINGS and CONFIG_SETTINGS_NVS are all set.

We have built a runlog that saves up to 32 bytes using Zephyr settings every time the device is stopped. The assumed worst case is that the device is started and stopped 20 times/day (which is a lot!).

As far as I understand, NVS settings function like a circular buffer and don't write to the same point in flash unless all the storage space is used; then the first page is erased and reused. Is that correct, or are we writing to the same point in flash constantly?

Assuming something is wrong and we write to the flash storage once every minute 24 hours/day (but not accounting for Bluetooth settings):
Running the calculation found in the docs: SECTOR_COUNT * SECTOR_SIZE * PAGE_ERASES / (NS * (DS+8)) minutes

2 * 4096 * 10000 / (1 * (32+8)) = 2,048,000 minutes ≈ 3.9 years

The cases experiencing the issue dies even faster than that (1,5 years).

What could be the issue?

See run_log.c below:

#include "run_log.h"
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>
#include <errno.h>
#include <string.h>
#include <kernel.h>
#include <sys/atomic.h>
#include <device.h>
#include <init.h>
#include "settings/settings.h"

/* Bit positions within the shared atomic 'flags' word. */
#define BIT_POWER_FAIL 		0
#define BIT_PENDING_WRITE 	1
/* Mask forms of the same bits, for testing the value of atomic_get(). */
#define FLAG_POWER_FAIL		(0x01 << BIT_POWER_FAIL)
#define FLAG_PENDING_WRITE	(0x01 << BIT_PENDING_WRITE)

/* Power-fail / pending-write state shared between threads. */
static atomic_t flags = ATOMIC_INIT(0);

/* Guards runlogs[], sorted_by_id[], current_start_ts and current_duration. */
static K_MUTEX_DEFINE(mutex);

/* RAM copy of the persisted run logs (RUNLOG_COUNT comes from run_log.h). */
static struct runlog runlogs[RUNLOG_COUNT];

/* One entry of the sort index: maps a rank position to a runlogs[] slot. */
struct index_list {
	uint_fast8_t index;	/* slot in runlogs[] */
	uint_fast16_t id;	/* copy of runlogs[index].id, the sort key */
};

/* Kept sorted in descending id order; entry 0 refers to the newest log. */
static struct index_list sorted_by_id[RUNLOG_COUNT];

/* Start timestamp and duration of the run currently in progress
 * (0 means no run is active / value already consumed). */
static time_t current_start_ts = 0;
static uint32_t current_duration = 0;

static void update_sorted_by_id(bool virgin)
{
	k_mutex_lock(&mutex, K_FOREVER);
	if (virgin) {		
		for (uint_fast8_t c = 0; c < RUNLOG_COUNT ; c++) {
			sorted_by_id[c].index = c;
			sorted_by_id[c].id = runlogs[c].id;
		}
	}
	/* bubble sort for small footprint
  	   and fine performance on small lists
       also very fast on an already sorted list */
	for (uint_fast8_t c = 0; c < (RUNLOG_COUNT-1) ; c++) {
		for (uint_fast8_t d = 0; d < (RUNLOG_COUNT-c-1); d++) {
			if (sorted_by_id[d].id < sorted_by_id[d+1].id) {
				/* swap */
				struct index_list t;
				t = sorted_by_id[d];
				sorted_by_id[d] = sorted_by_id[d+1];
				sorted_by_id[d+1] = t;
			}
		}
	}
	k_mutex_unlock(&mutex);
}

/* Record the duration of the run currently in progress.
 * A nonzero value surviving to the next boot indicates the device
 * lost power while running (see runlog_settings_commit). */
void runlog_update_current_duration(uint32_t duration)
{
	k_mutex_lock(&mutex, K_FOREVER);
	current_duration = duration;
	k_mutex_unlock(&mutex);
}

/* Record the start timestamp of the run currently in progress;
 * consumed (and reset to 0) by runlog_put_log(). */
void runlog_update_current_start_ts(time_t start_ts)
{
	k_mutex_lock(&mutex, K_FOREVER);
	current_start_ts = start_ts;
	k_mutex_unlock(&mutex);
}

/**
 * Append a new run log entry and persist it.
 *
 * Consumes current_start_ts (reset to 0 so it is not re-saved), assigns
 * the next sequential id, overwrites the oldest entry in runlogs[] and
 * triggers a settings save.
 *
 * @param state    Reason/state the run stopped with.
 * @param duration Duration of the finished run.
 */
void runlog_put_log(enum spring_state state, uint32_t duration)
{
	k_mutex_lock(&mutex, K_FOREVER);

	time_t ts = current_start_ts;
	current_start_ts = 0;

	struct runlog log = {
		.state = state,
		.start_ts = ts,
		.duration = duration,
	};

	/* sorted_by_id[0] is the newest entry: new id = newest id + 1 */
	log.id = runlogs[sorted_by_id[0].index].id + 1;
	/* overwrite the oldest runlog (last in the sorted index list) */
	runlogs[sorted_by_id[RUNLOG_COUNT-1].index] = log;
	/* and update the index list with the new id */
	sorted_by_id[RUNLOG_COUNT-1].id = log.id;

	/* FIX(review): re-sort while still holding the mutex. The original
	 * unlocked first, leaving a window where another thread (e.g. via
	 * runlog_get_latest_log) could observe the index list with the new
	 * id still parked at the end. Zephyr's k_mutex supports recursive
	 * locking by the owner, which this module already relies on in
	 * runlog_settings_commit(). */
	update_sorted_by_id(false);

	k_mutex_unlock(&mutex);

	atomic_set_bit(&flags, BIT_PENDING_WRITE);

	runlog_save();
}

void runlog_clear_all(void)
{
	k_mutex_lock(&mutex, K_FOREVER);

	struct runlog log = {
		.state = 0,
		.start_ts = 0,
		.duration = 0,
		.id = 0,
	};
	
	for (uint_fast8_t i = 0; i < RUNLOG_COUNT; i++) {
		runlogs[i] = log;
		sorted_by_id[i].id = 0;
		sorted_by_id[i].index = 0;
	}

	k_mutex_unlock(&mutex);

	atomic_set_bit(&flags, BIT_PENDING_WRITE);

	runlog_save();
}

/* Copy the (offset)-th most recent log into *log; offset 0 is the
 * newest entry. Returns 0 on success, -1 if offset is out of range. */
int_fast8_t runlog_get_latest_log(uint_fast8_t offset, struct runlog *log)
{
	int_fast8_t rc = -1;

	if (offset < RUNLOG_COUNT) {
		k_mutex_lock(&mutex, K_FOREVER);
		*log = runlogs[sorted_by_id[offset].index];
		k_mutex_unlock(&mutex);
		rc = 0;
	}

	return rc;
}

/* Mark that a power failure is in progress; while set, runlog_save()
 * refuses to start a flash write (see runlog_save). */
void runlog_set_power_fail(void)
{
	atomic_set_bit(&flags, BIT_POWER_FAIL);
}

/* Power is stable again; flash writes are allowed once more. */
void runlog_clear_power_fail(void)
{
	atomic_clear_bit(&flags, BIT_POWER_FAIL);
}

/* True when no runlog data is waiting to be written to flash, i.e.
 * it is safe to enter low-power sleep. */
bool runlog_is_ready_to_sleep(void)
{
	return (atomic_get(&flags) & FLAG_PENDING_WRITE) == 0;
}

/* Persist pending runlog state, unless a power failure is in progress.
 *
 * NOTE(review): per the Zephyr settings API, settings_save() exports
 * EVERY registered subtree, not just the values that changed — each
 * call here rewrites "runlog/start_ts", "runlog/duration" and all
 * RUNLOG_COUNT "runlog/<i>" entries (see runlog_settings_export), plus
 * whatever other handlers (e.g. the Bluetooth stack with
 * CONFIG_BT_SETTINGS) export. The flash traffic per save is therefore
 * far larger than the 32 bytes assumed in the endurance estimate;
 * settings_save_one() on only the changed keys would drastically
 * reduce wear — confirm against the SDK version in use.
 *
 * NOTE(review): the FLAG_POWER_FAIL test and the later clear of
 * BIT_PENDING_WRITE are two separate atomic operations; a write
 * requested between them could have its pending bit cleared without a
 * second save. Presumably acceptable here since settings_save() just
 * ran — verify.
 */
void runlog_save(void)
{
	if ((atomic_get(&flags) & FLAG_POWER_FAIL) != FLAG_POWER_FAIL) {
		settings_save();
		atomic_clear_bit(&flags, BIT_PENDING_WRITE);
	}
}

/* Settings "commit" callback: invoked by the settings subsystem once
 * every value of the "runlog" subtree has been loaded.
 *
 * Rebuilds the sort index and, if the previous session ended with a
 * nonzero current_duration (power was cut while a run was active),
 * converts that leftover state into a proper runlog entry.
 *
 * NOTE(review): runlog_put_log() is called with 'mutex' already held;
 * this relies on Zephyr k_mutex recursive locking by the owning
 * thread — confirm that assumption holds for the kernel config used. */
int runlog_settings_commit(void)
{
	/* all values have been loaded; build the sort index from scratch */
	update_sorted_by_id(true);

	/* current_duration is nonzero only if power was lost mid-run,
	   so record a runlog with power-off as the stop reason */
	k_mutex_lock(&mutex, K_FOREVER);	
	if (current_duration) {
		/* runlog_put_log triggers a save, so clear duration first
		   (and it clears start_ts itself) so the persisted values
		   are zeroed by that same save. */
		uint32_t dur = current_duration;
		current_duration = 0;
		runlog_put_log(SPRING_STATE_STOPPED_REASON_POWER_OFF, dur);

	}
	k_mutex_unlock(&mutex);
	return 0;
}

/**
 * Settings "set" handler for the "runlog" subtree.
 *
 * Called by the settings subsystem during load for every stored key
 * under "runlog/". Recognized leaf keys: "start_ts", "duration" and
 * one numeric key per log slot ("0" .. RUNLOG_COUNT-1).
 *
 * @return 0 on success, a negative read_cb error on read failure,
 *         -ENOENT for unknown keys.
 */
int runlog_settings_set(const char *name, size_t len, settings_read_cb read_cb,
		  void *cb_arg)
{
	const char *next;	
	size_t name_len;
	char buf[20];
	int rc;

	name_len = settings_name_next(name, &next);

	if (!next) {
		/* FIX(review): require an exact-length key match. The
		 * original strncmp(name, key, name_len) also accepted any
		 * prefix of the key (e.g. "start" matched "start_ts"). */
		if (name_len == strlen("start_ts") &&
		    !strncmp(name, "start_ts", name_len)) {
			rc = read_cb(cb_arg, &current_start_ts,
				     sizeof(current_start_ts));
			/* FIX(review): propagate read errors instead of
			 * silently ignoring rc. */
			return (rc < 0) ? rc : 0;
		}

		if (name_len == strlen("duration") &&
		    !strncmp(name, "duration", name_len)) {
			rc = read_cb(cb_arg, &current_duration,
				     sizeof(current_duration));
			return (rc < 0) ? rc : 0;
		}

		for (uint_fast8_t i = 0; i < RUNLOG_COUNT; i++) {
			snprintf(buf, sizeof(buf), "%d", i);
			if (name_len == strlen(buf) &&
			    !strncmp(name, buf, name_len)) {
				rc = read_cb(cb_arg, &runlogs[i],
					     sizeof(struct runlog));
				return (rc < 0) ? rc : 0;
			}
		}
	}

	return -ENOENT;
}

/* Settings "export" handler: hands every runlog key/value pair to cb.
 * Note: every settings_save() therefore rewrites 2 + RUNLOG_COUNT
 * entries, regardless of how many values actually changed. */
static int runlog_settings_export(int (*cb)(const char *name,
			       const void *value, size_t val_len))
{
	char key[20];

	(void)cb("runlog/start_ts", &current_start_ts, sizeof(current_start_ts));
	(void)cb("runlog/duration", &current_duration, sizeof(current_duration));

	for (uint_fast8_t slot = 0; slot < RUNLOG_COUNT; slot++) {
		snprintf(key, sizeof(key), "runlog/%d", slot);
		(void)cb(key, &runlogs[slot], sizeof(runlogs[slot]));
	}

	return 0;
}

/* Register the "runlog" settings subtree handlers with the settings
 * subsystem at build time (no get handler needed). */
SETTINGS_STATIC_HANDLER_DEFINE(run_log, "runlog", NULL,
			       runlog_settings_set, runlog_settings_commit,
			       runlog_settings_export);

  • Hi

    I need to get back to you tomorrow with more information, as I need some more time to get into this issue. Thank you for your patience.

    Best regards,

    Simon

  • Hi Bastian,

    With the formula it seems like there is still time for the flash write to reach the limit of the flash endurance. I have some questions, which hopefully you can answer.

    1. Is this the same issue on all of your devices, or is it happening on only a few? If it is happening on a few, then what percentage of the devices are failing this way?
    2. Why do you suspect flash endurance here?
    3. It could be a hardfault or any other fault in your device, and the possible reasons for such faults are countless. Without a log of what happened before the fault, or a hardfault trace (log), it is almost impossible to narrow this down to flash writes alone. It could be a memory overflow in any part of your application, modules, drivers or libraries. We need the log or a way to reproduce it.
    4. You can speed up the testing by writing every 10 milliseconds instead of once per minute — a factor of 6000 — so an expected lifetime of 18 months shrinks to 18/6000 ≈ 0.003 months, i.e. about 131 minutes.
Related