LOG_BACKEND_FS on SD card crashes when accessing SD card from own code

Hello,

we are using NRF SDK 2.3.0 and a NRF9160 to do some sensor logging on a SD card connected with SPI.

To do so, we are using the fs.h libary for mounting and writing on the SD card. At the same time we have the LOG_BACKEND_FS activated to log some information about system state on the SD card.

But it seems like the fs logging backend crashes and never comes back, if a simultaneos access to SD card happens (It happens by chance sometimes).

I was able to write a code that reproduces that case: It constantly writes LOG messages and reads the content of the card each 20 log messages. In theory the size of the log files should increase, no matter what.

main.c:

#include <zephyr/kernel.h>
#include <zephyr/storage/disk_access.h>
#include <ff.h>
#include <zephyr/fs/fs.h>
#include <zephyr/logging/log.h>
#include <zephyr/logging/log_ctrl.h>
LOG_MODULE_REGISTER(main);

struct fs_file_t filep;

static FATFS sfat_fs;
/* mounting info */
static struct fs_mount_t mp = {
	.type = FS_FATFS,
	.fs_data = &sfat_fs,
};
/*
*  Note the fatfs library is able to mount only strings inside _VOLUME_STRS
*  in ffconf.h
*/
static const char *disk_mount_pt = "/SD:";

int SDMount(){
    /* raw disk i/o */
	do {
		static const char *disk_pdrv = "SD";
		uint64_t memory_size_mb;
		uint32_t block_count;
		uint32_t block_size;

		if (disk_access_init(disk_pdrv) != 0) {
			LOG_ERR("Storage init Error!%s","");
			break;
		}

		if (disk_access_ioctl(disk_pdrv,
				DISK_IOCTL_GET_SECTOR_COUNT, &block_count)) {
			LOG_ERR("Unable to get sector count%s","");
			break;
		}
		//LOG_INF("Block count %u", block_count);

		if (disk_access_ioctl(disk_pdrv,
				DISK_IOCTL_GET_SECTOR_SIZE, &block_size)) {
			LOG_ERR("Unable to get sector size%s","");
			break;
		}
		//printk("Sector size %u\n", block_size);

		memory_size_mb = (uint64_t)block_count * block_size;
		//printk("Memory Size(MB) %u\n", (uint32_t)(memory_size_mb >> 20));
	} while (0);

	mp.mnt_point = disk_mount_pt;

	int res = fs_mount(&mp);
	

	if (res == FR_OK) {
		LOG_INF("Disk mounted.\n");
		return 0;
	} else {
		LOG_ERR("Error mounting disk (E: %i)\n", res);
		return -1;
	}
}

int lsdir(const char *path)
{
	int res;
	struct fs_dir_t dirp;
	static struct fs_dirent entry;

	fs_dir_t_init(&dirp);

	/* Verify fs_opendir() */
	res = fs_opendir(&dirp, path);
	if (res) {
		LOG_ERR("Error opening dir %s [%d]\n", path, res);
		return res;
	}

	LOG_INF("\nListing dir %s ...\n", path);
	for (;;) {
		/* Verify fs_readdir() */
		res = fs_readdir(&dirp, &entry);

		/* entry.name[0] == 0 means end-of-dir */
		if (res || entry.name[0] == 0) {
			break;
		}

		if (entry.type == FS_DIR_ENTRY_DIR) {
			LOG_INF("[DIR ] %s\n", entry.name);
		} else {
			LOG_INF("[FILE] %s (size = %zu)\n",
				entry.name, entry.size);
		}
	}

	/* Verify fs_closedir() */
	fs_closedir(&dirp);

	return res;
}

void main(void)
{
	printk("Hello World! %s\n", CONFIG_BOARD);

	int err = SDMount();
	if (err != 0){
		LOG_ERR("Error mounting SD card: E:%i", err);
	}
	lsdir("/SD:");
	int counter = 0;
	while (1)
	{
		LOG_INF("%i", counter);
		counter++;
		k_sleep(K_MSEC(300));
		if (counter == 20){
			lsdir("/SD:");
			counter = 0;
		}
	}
}
And proj.conf

#Boot
CONFIG_BOOTLOADER_MCUBOOT=y

CONFIG_LOG=y
CONFIG_PRINTK=y
CONFIG_MAIN_STACK_SIZE=2048

#SD card & SPI 
#####################################################################
CONFIG_DISK_ACCESS=y
CONFIG_FILE_SYSTEM=y
CONFIG_FAT_FILESYSTEM_ELM=y


#Log
CONFIG_LOG_BACKEND_FS=y
CONFIG_LOG_BACKEND_FS_OUTPUT_TEXT=n
CONFIG_LOG_BACKEND_FS_DIR="/SD:"
CONFIG_LOG_BACKEND_FS_FILE_PREFIX="log"
CONFIG_LOG_BACKEND_FS_FILES_LIMIT=5
CONFIG_LOG_BACKEND_FS_FILE_SIZE=4096
CONFIG_LOG_BACKEND_FS_OVERWRITE=y


# Need this when storage is on flash
CONFIG_MPU_ALLOW_FLASH_WRITE=y

This is the log from the console:

*** Booting Zephyr OS build v3.2.99-ncs2 ***
Hello World! actinius_icarus
[00:00:02.488,403] <inf> sd: Detected SD card
[00:00:02.507,934] <inf> sd: Maximum SD clock is under 25MHz, using clock of 24000000Hz
[00:00:02.513,549] <inf> main: Disk mounted.

[00:00:02.513,641] <inf> main: 
Listing dir /SD: ...

[00:00:02.515,441] <inf> main: [FILE] LOG0008 (size = 4060)

[00:00:02.515,502] <inf> main: [FILE] LOG0009 (size = 4060)

[00:00:02.515,563] <inf> main: [FILE] LOG0010 (size = 129)

[00:00:02.519,165] <inf> main: [FILE] DEVICEID.TXT (size = 5)

[00:00:02.531,677] <inf> main: [FILE] HOSTNAME.TXT (size = 28)

[00:00:02.531,738] <inf> main: [DIR ] SYSTEM~1

[00:00:02.533,569] <inf> main: 0
[00:00:02.833,648] <inf> main: 1
[00:00:03.133,789] <inf> main: 2
[00:00:03.433,898] <inf> main: 3
[00:00:03.734,008] <inf> main: 4
[00:00:04.034,210] <inf> main: 5
[00:00:04.334,320] <inf> main: 6
[00:00:04.634,429] <inf> main: 7
[00:00:04.934,539] <inf> main: 8
[00:00:05.234,649] <inf> main: 9
[00:00:05.534,759] <inf> main: 10
[00:00:05.834,869] <inf> main: 11
[00:00:06.134,979] <inf> main: 12
[00:00:06.435,089] <inf> main: 13
[00:00:06.735,198] <inf> main: 14
[00:00:07.035,308] <inf> main: 15
[00:00:07.335,418] <inf> main: 16
[00:00:07.635,528] <inf> main: 17
[00:00:07.935,638] <inf> main: 18
[00:00:08.235,748] <inf> main: 19
[00:00:08.535,919] <inf> main: 
Listing dir /SD: ...

[00:00:08.535,980] <inf> main: [FILE] LOG0008 (size = 4060)

[00:00:08.536,041] <inf> main: [FILE] LOG0009 (size = 4060)

[00:00:08.536,071] <inf> main: [FILE] LOG0010 (size = 129)

[00:00:08.536,132] <inf> main: [FILE] LOG0000 (size = 1402)

[00:00:08.539,733] <inf> main: [FILE] DEVICEID.TXT (size = 5)

[00:00:08.552,062] <inf> main: [FILE] HOSTNAME.TXT (size = 28)

[00:00:08.552,124] <inf> main: [DIR ] SYSTEM~1

[00:00:08.553,161] <wrn> sd: Could not get SD card mutex1
[00:00:08.553,192] <err> fs: file write error (-5)
[00:00:08.553,375] <wrn> sd: Could not get SD card mutex1
[00:00:08.553,436] <err> fs: directory read error (-5)
backend corrupted
[00:00:08.553,833] <inf> main: 0
[00:00:08.853,912] <inf> main: 1
[00:00:09.154,022] <inf> main: 2
[00:00:09.454,132] <inf> main: 3
[00:00:09.754,241] <inf> main: 4
[00:00:10.054,351] <inf> main: 5
[00:00:10.354,492] <inf> main: 6
[00:00:10.654,602] <inf> main: 7
[00:00:10.954,711] <inf> main: 8
[00:00:11.254,821] <inf> main: 9
[00:00:11.554,931] <inf> main: 10
[00:00:11.855,041] <inf> main: 11
[00:00:12.155,151] <inf> main: 12
[00:00:12.455,261] <inf> main: 13
[00:00:12.755,371] <inf> main: 14
[00:00:13.055,480] <inf> main: 15
[00:00:13.355,590] <inf> main: 16
[00:00:13.655,700] <inf> main: 17
[00:00:13.955,810] <inf> main: 18
[00:00:14.255,920] <inf> main: 19
[00:00:14.556,091] <inf> main: 
Listing dir /SD: ...

[00:00:14.557,922] <inf> main: [FILE] LOG0008 (size = 4060)

[00:00:14.557,952] <inf> main: [FILE] LOG0009 (size = 4060)

[00:00:14.558,013] <inf> main: [FILE] LOG0010 (size = 129)

[00:00:14.558,074] <inf> main: [FILE] LOG0000 (size = 1402)

[00:00:14.561,645] <inf> main: [FILE] DEVICEID.TXT (size = 5)

[00:00:14.573,974] <inf> main: [FILE] HOSTNAME.TXT (size = 28)

[00:00:14.574,035] <inf> main: [DIR ] SYSTEM~1

[00:00:14.575,714] <inf> main: 0
[00:00:14.875,823] <inf> main: 1
[00:00:15.175,933] <inf> main: 2
[00:00:15.476,043] <inf> main: 3
[00:00:15.776,153] <inf> main: 4
[00:00:16.076,263] <inf> main: 5
[00:00:16.376,403] <inf> main: 6
[00:00:16.676,513] <inf> main: 7
[00:00:16.976,623] <inf> main: 8
[00:00:17.276,733] <inf> main: 9
[00:00:17.576,843] <inf> main: 10
[00:00:17.876,953] <inf> main: 11
[00:00:18.177,062] <inf> main: 12
[00:00:18.477,172] <inf> main: 13
[00:00:18.777,313] <inf> main: 14
[00:00:19.077,423] <inf> main: 15
[00:00:19.377,532] <inf> main: 16
[00:00:19.677,642] <inf> main: 17
[00:00:19.977,752] <inf> main: 18
[00:00:20.277,862] <inf> main: 19
[00:00:20.578,033] <inf> main: 
Listing dir /SD: ...

[00:00:20.579,833] <inf> main: [FILE] LOG0008 (size = 4060)

[00:00:20.579,895] <inf> main: [FILE] LOG0009 (size = 4060)

[00:00:20.579,956] <inf> main: [FILE] LOG0010 (size = 129)

[00:00:20.580,017] <inf> main: [FILE] LOG0000 (size = 1402)

[00:00:20.583,557] <inf> main: [FILE] DEVICEID.TXT (size = 5)

[00:00:20.595,886] <inf> main: [FILE] HOSTNAME.TXT (size = 28)

[00:00:20.595,947] <inf> main: [DIR ] SYSTEM~1

You can see that it first log to the file "LOG0000". Then tries to read the dir and the fs backend crashes. After that the size of LOG0000 doesn't increase, so the backend stopped working, which is not good.

When tracing it back, I found the critical part in zephyr/subsys/sd/sdmmc.c:

/* Reads data from SD card memory card */
int sdmmc_read_blocks(struct sd_card *card, uint8_t *rbuf,
	uint32_t start_block, uint32_t num_blocks)
{
	.
	.
	.
	
	ret = k_mutex_lock(&card->lock, K_NO_WAIT);
	if (ret) {
		LOG_WRN("Could not get SD card mutex1");
		return -EBUSY;
	}

The code doesn't "wait" for a mutex to come back, because it has K_NO_WAIT. When changing it to 500msec wait it works without any problems. (Same thing in sdmmc_write_blocks())

But changing the zephyr code is not a good option for us. So we were wondering if there is a thing we missed (maybe a mutex that can be accessed from outside) or a good workaround for that problem.

Again thank you very much for your help!

Best regards, Moritz

Parents
  • Hello

    It looks like your directory might not get closed properly before you try to write again.

    You could try to check the return value of fs_closedir, the same way you do with fs_opendir, that could give you some more context about what goes wrong. Since you say it works to wait for the mutex, maybe it would help to insert a short wait directly after fs_closedir.

    Best regards,

    Einar

  • Hello Einar,

    thanks for the help! I added a check of return value and a wait for 500msec (both busy_wait and sleep), but unfortunately it didn't help. The file gets closed properly each time.

        res = fs_closedir(&dirp);
    	printk("Closing: %i\n", res);
    	if (res) {
    		LOG_ERR("Error closing dir %s [%d]\n", path, res);
    		return res;
    	}
    	k_sleep(K_MSEC(500));

    Listing dir /SD: ...
    
    [00:00:48.159,942] <inf> main: [FILE] LOG0000 (size = 4071)
    
    [00:00:48.160,003] <inf> main: [FILE] LOG0001 (size = 4081)
    
    [00:00:48.160,064] <inf> main: [FILE] LOG0002 (size = 1021)
    
    [00:00:48.160,095] <inf> main: [FILE] TWIN.TXT (size = 795)
    
    [00:00:48.163,635] <inf> main: [FILE] DEVICEID.TXT (size = 5)
    
    [00:00:48.175,842] <inf> main: [FILE] HOSTNAME.TXT (size = 28)
    
    [00:00:48.175,903] <inf> main: [DIR ] SYSTEM~1
    
    Closing: 0
    [00:00:48.677,734] <inf> main: 0
    [00:00:48.977,844] <inf> main: 1
    [00:00:49.277,984] <inf> main: 2
    [00:00:49.578,094] <inf> main: 3
    [00:00:49.878,204] <inf> main: 4
    [00:00:50.178,375] <inf> main: 5
    [00:00:50.478,515] <inf> main: 6
    [00:00:50.778,625] <inf> main: 7
    [00:00:51.078,735] <inf> main: 8
    [00:00:51.378,906] <inf> main: 9
    [00:00:51.679,046] <inf> main: 10
    [00:00:51.979,156] <inf> main: 11
    [00:00:52.279,266] <inf> main: 12
    [00:00:52.579,437] <inf> main: 13
    [00:00:52.879,577] <inf> main: 14
    [00:00:53.179,687] <inf> main: 15
    [00:00:53.479,797] <inf> main: 16
    [00:00:53.779,968] <inf> main: 17
    [00:00:54.080,108] <inf> main: 18
    [00:00:54.380,218] <inf> main: 19
    [00:00:54.680,389] <inf> main: 
    Listing dir /SD: ...
    
    [00:00:54.680,450] <inf> main: [FILE] LOG0000 (size = 4071)
    
    [00:00:54.680,511] <inf> main: [FILE] LOG0001 (size = 4081)
    
    [00:00:54.680,572] <inf> main: [FILE] LOG0002 (size = 2342)
    
    [00:00:54.680,603] <inf> main: [FILE] TWIN.TXT (size = 795)
    
    [00:00:54.684,143] <inf> main: [FILE] DEVICEID.TXT (size = 5)
    
    [00:00:54.696,289] <inf> main: [FILE] HOSTNAME.TXT (size = 28)
    
    [00:00:54.696,350] <inf> main: [DIR ] SYSTEM~1
    
    [00:00:54.697,326] <wrn> sd: Could not get SD card mutex1
    [00:00:54.697,387] <err> fs: file write error (-5)
    [00:00:54.697,570] <wrn> sd: Could not get SD card mutex1
    [00:00:54.697,601] <err> fs: directory read error (-5)
    backend corrupted
    Closing: 0
    [00:00:55.198,120] <inf> main: 0
    [00:00:55.498,229] <inf> main: 1
    [00:00:55.798,339] <inf> main: 2
    [00:00:56.098,449] <inf> main: 3
    [00:00:56.398,559] <inf> main: 4
    [00:00:56.698,699] <inf> main: 5
    [00:00:56.998,809] <inf> main: 6
    [00:00:57.298,919] <inf> main: 7
    [00:00:57.599,029] <inf> main: 8
    [00:00:57.899,169] <inf> main: 9
    [00:00:58.199,279] <inf> main: 10
    [00:00:58.499,389] <inf> main: 11
    [00:00:58.799,499] <inf> main: 12
    [00:00:59.099,639] <inf> main: 13
    [00:00:59.399,749] <inf> main: 14
    [00:00:59.699,859] <inf> main: 15
    [00:00:59.999,969] <inf> main: 16
    [00:01:00.300,109] <inf> main: 17
    [00:01:00.600,219] <inf> main: 18
    [00:01:00.900,329] <inf> main: 19
    [00:01:01.200,500] <inf> main: 
    Listing dir /SD: ...
    
    [00:01:01.202,301] <inf> main: [FILE] LOG0000 (size = 4071)
    
    [00:01:01.202,362] <inf> main: [FILE] LOG0001 (size = 4081)
    
    [00:01:01.202,423] <inf> main: [FILE] LOG0002 (size = 2342)
    
    [00:01:01.202,484] <inf> main: [FILE] TWIN.TXT (size = 795)
    
    [00:01:01.205,993] <inf> main: [FILE] DEVICEID.TXT (size = 5)
    
    [00:01:01.218,170] <inf> main: [FILE] HOSTNAME.TXT (size = 28)
    
    [00:01:01.218,231] <inf> main: [DIR ] SYSTEM~1
    
    Closing: 0

    There must be something else..

  • Hm I see. In that case I think it's interesting that you get a file write error after printing DIR SYSTEM~1, then followed by a directory error..

    Maybe your end-of-dir check doesn't work as expected, or maybe you're encountering a file type you haven't accounted for?

    It might be a good idea to insert a couple debug prints to see what's going on more clearly.

    -Einar

  • Hi Einarh,

    I took the lsdir function from ncs example "fat_fs". So I guess it should be alright. I am using this function basically just to create some traffic on the SD card to recreate the bug. You can use any other SD card call to get that error.

    Are you able to repoduce this error on your side?

    And we found out, that replacing K_NO_WAIT with 500msec as proposed in the first post leads to very strange write results when we were trying to write some data to the SD card, while logging to SD card. We don't suggest to use that workaround.

    Where could we add debug prints? As said lsdir() should be alright.

    Another thing to mention: I ncs 2.0.2 this whole issue with the fs backend didn't occur. ( Will double check that)

    Thanks for your help and best regards,

    Moritz

Reply
  • Hi Einarh,

    I took the lsdir function from ncs example "fat_fs". So I guess it should be alright. I am using this function basically just to create some traffic on the SD card to recreate the bug. You can use any other SD card call to get that error.

    Are you able to repoduce this error on your side?

    And we found out, that replacing K_NO_WAIT with 500msec as proposed in the first post leads to very strange write results when we were trying to write some data to the SD card, while logging to SD card. We don't suggest to use that workaround.

    Where could we add debug prints? As said lsdir() should be alright.

    Another thing to mention: I ncs 2.0.2 this whole issue with the fs backend didn't occur. ( Will double check that)

    Thanks for your help and best regards,

    Moritz

Children
  • Hi

    have you made any progress on this?

    I would not assume a function from some sample is ok to use in any context, you'll have to verify that it works for your project yourself.

    Still, it sounds to me like your issue is primarily related to restricting data access, so I would advise to double- and triple check how and when your SD card is accessed and if you can spot anywhere an unfortunate event could occur.

    -Einar

  • Hi Einar,

    yes just found a solution that seems to work!

    The trick is to deactivate the FS_Backend before every SD card operation and wait 50ms. After the SD card operation turn the backend on again. The good thing is, no Log message will be dropped, even during deactived backend it seems to buffer all the messages and writes them as soon it is activated again.

    We are using mutexes for our SD card operations. When taking a mutex, it automatically deactivates the fs backend now. After unlocking the mutex it activates the backend again.

    #define BACKEND_IDENTIFIER_FS "fs"
    /// @brief De/Activate FS Backend. Prevents FS Backend from writting on SD card. Method wont drop LOG messages (by experience)
    /// @param desired_state True to turn backend on, False to turn backend of
    void fs_backend_activate(bool desired_state){
    	int backendslength = log_backend_count_get();
    	for (int i = 0; i <backendslength; i++){
    		const struct log_backend *backend = log_backend_get(i);
    		if (strstr(backend->name, BACKEND_IDENTIFIER_FS)){	
    			if (desired_state == true){
    				log_backend_activate(backend, backend->cb->ctx);
    				LOG_INF("Fs active");
    			} else {
    				log_backend_deactivate(backend);
    				LOG_INF("Fs inactive");
    				k_msleep(50);
    			}
    		}
    	}
    }

    int SD_lock_wait(){
    	LOG_DBG("SD mutex take");
    	fs_backend_activate(false);
    	int ret = k_mutex_lock(&SD_mutex, K_FOREVER);
    	if(ret){
    		LOG_ERR("Error taking SD mutex: %d", ret);
    	}
    	return ret;
    }
    
    int SD_unlock(){
    	LOG_DBG("SD mutex give");
    	int ret = k_mutex_unlock(&SD_mutex);
    	if(ret){
    		LOG_ERR("Error giving SD mutex: %d", ret);
    	}
    	fs_backend_activate(true);
    	return ret;
    }

    Thanks for checking this case with us, Einar

Related