performance issue with multi threadn in ncs with nrf52832

hello Nordic

i am working with nrf52832, with zephyr, ncs v1.7.1,

the device has a sensor that sample at 26khz each sample is 3 channels of 2 bytes data. the sensor use spi1 and has high thread priority (cooperative) the samples are then stream via a double buffer to the flash which uses spi2.

when tying to write a page (256 bytes) to the flash at a test in the beginning of the app it take 1ms to write that page but when trying to write within the app, while the other sensor is sampling, it takes 3ms and also i get a BUS FAULT. here is the log:

[00000018] <err> os: ***** BUS FAULT *****
[00000018] <err> os:   Unstacking error
[00000018] <err> os: ***** HARD FAULT *****
[00000018] [1;31m<err> os:   Fault escalation (see below)
[00000018] <err> os: ***** BUS FAULT *****
[00000018] <err> os:   Precise data bus error
[00000018] <err> os:   BFAR Address: 0xfe80fec0
[00000018] <err> os: r0/a1:  0x2000bef0  r1/a2:  0xfe80fec0  r2/a3:  0xfe80ff28
[00000018] <err> os: r3/a4:  0x2000beef r12/ip:  0xfa000000 r14/lr:  0x00016d79
[00000018] <err> os:  xpsr:  0x81000005
[00000018] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
[00000018] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
[00000018] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
[00000018] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0x00000000
[00000018] <err> os: fpscr:  0x20001d78
[00000018] <err> os: Faulting instruction address (r15/pc): 0x0003ef10
[00000018] <err> os: >>> ZEPHYR FATAL ERROR 0: CPU exception on CPU 0
[00000018] <err> os: Fault during interrupt handling

[00000018] <err> os: Current thread: 0x20003210 (unknown)
[00000018] <err> fatal_error: Resetting system
*** Booting Zephyr OS build v2.6.99-ncs1-1  ***

now i do have in the app another driver that uses spi (at the same time (getting samples and writing samples to flash), but it uses a different instance of spi so i don't see why it shell create such a delay .. here is my complete .dts file:

// Copyright (c) 2022 Nordic Semiconductor ASA
// SPDX-License-Identifier: Apache-2.0

/dts-v1/;
#include <nordic/nrf52832_qfaa.dtsi>

/ {
	model = "halo_ep2";
	compatible = "halo-ep2";

	chosen {
		zephyr,sram = &sram0;
		zephyr,flash = &flash0;
		zephyr,code-partition = &slot0_partition;
		nordic,pm-ext-flash = &augu_flash_is25lp128;
	};

	sensors_on: sensors_on_node {
		compatible = "augury,gpio-power";
		power_gpios = <&gpio0 22 GPIO_ACTIVE_LOW>;
		label = "SENSORS_ON";
		status = "okay";
		#power_resource-cells = <0>;
	};

	flash_on: flash_on_node {
		compatible = "augury,gpio-power";
		power_gpios = <&gpio0 23 (GPIO_PULL_UP | GPIO_ACTIVE_LOW)>;
		label = "FLASH_ON";
		status = "okay";
		#power_resource-cells = <0>;
	};	

    leds {
        compatible = "gpio-leds";
        led0_red: led_0 {
            gpios = <&gpio0 25 GPIO_ACTIVE_HIGH>;
            label = "RED_LED_0";
        };
        led1_orange: led_1 {
            gpios = <&gpio0 26 GPIO_ACTIVE_HIGH>;
            label = "ORANGE_LED_1";
        };
        led2_green: led_2 {
            gpios = <&gpio0 27 GPIO_ACTIVE_HIGH>;
            label = "GREEN_LED_2";
        };							
    };

	aliases {
		led0-red = &led0_red;
		led1-orange = &led1_orange;
		led2-green = &led2_green;

	};
};

	&adc {
		status ="okay";
	};

	&gpiote {
		status ="okay";
	};

	&gpio0 {
		status ="okay";
	};


	&spi1 {
		compatible = "nordic,nrf-spim";
		status = "okay";
		sck-pin = <20>;
		mosi-pin = <14>;    // p0.14
		miso-pin = <16>;    // p0.16
		cs-gpios = <&gpio0 13 GPIO_ACTIVE_LOW>;
		iis3dwb@0 {
			compatible = "st,iis3dwb";
			reg = <0>;
			int1_gpios = <&gpio0 11 (GPIO_ACTIVE_HIGH | GPIO_PULL_DOWN)>;
			int2_gpios = <&gpio0 12 (GPIO_ACTIVE_HIGH | GPIO_PULL_DOWN)>;
			label = "IIS3DWB";
			power_resources = <&sensors_on>;
			spi-max-frequency = <8000000>;
		};
	};

	&spi2 {
		status = "okay";
		sck-pin = <5>; // gpio 0 pin 5
		mosi-pin = <4>; // gpio 0 pin 4
		miso-pin = <2>; // gpio 0 pin 2
		compatible = "nordic,nrf-spi";
		cs-gpios = <&gpio0 3 (GPIO_PULL_UP | GPIO_ACTIVE_LOW)>;
		augu_flash_is25lp128: augu_flash_is25lp128@0 {
			compatible = "issi,augu_flash_is25lp128";
			reg = <0>;
			label = "AUGU_FLASH_IS25LP128";
			power_resources = <&flash_on>;
			jedec_id = [9d 60 18];
			spi-max-frequency = <8000000>; //<133000000>;
			page_size = <256>;
			block_size = <65536>;
			size = <0x8000000>;
		};
	};
	&flash0 {
		partitions {
			compatible = "fixed-partitions";
			#address-cells = <1>;
			#size-cells = <1>;

			mbr_partition: partition@0 {
				label = "mbr";
				reg = <0x00000000 0x00001000>;
			};
			boot_partition: partition@1000 {
				label = "mcuboot";
				reg = <0x1000 0xc000>;
			};
			slot0_partition: partition@d000 {
				label = "image-0";
				reg = <0xd000 0x6c800>;
			};
			slot1_partition: partition@79800 {
				label = "image-1";
				reg = <0x79800 0x800>;
			};
			storage_partition: partition@7a000 {
				label = "storage";
				reg = <0x7a000 0x6000>;
			};
		};
	};

(running the app on another board with nrd52840, with nand flash that writes 2k each time, works well, so i don't know if it is a performance issue or something else.. i will also mention that the RAM take on the nrf52832 with zephyr and all is high:

[567/572] Linking C executable zephyr/zephyr.elf
Memory region         Used Size  Region Size  %age Used
           FLASH:      253604 B     419328 B     60.48%
            SRAM:       60282 B        64 KB     91.98%
        IDT_LIST:          0 GB         2 KB      0.00%
[568/572] Generating zephyr/mcuboot_primary_app.hex
[569/572] Generating zephyr/mcuboot_primary.hex
[570/572] Generating ../../zephyr/app_update.bin, ../../zephyr/app_signed.hex, ../../zephyr/app_test_update.hex, ../../zephyr/app_moved_test_update.hex
[571/572] Generating ../../zephyr/dfu_application.zip
[572/572] Generating zephyr/merged.hex

any idea why i get the BUS FAULT ? 

p.s. i don't know if it has any affect but the page data is saved in a heap section previously allocated like so

K_HEAP_DEFINE(my_heap, 3072);   // 256*3*2 = 1536 + safe buffer

int mem_heap_alloc(int size, struct k_mem_block *block)
{
    block->data = NULL;
    block->data = k_heap_alloc(&my_heap, size, K_NO_WAIT);
    if (block->data == NULL)
    {
        return -ENOMEM;
    }

    return 0;
}

void mem_heap_free(struct k_mem_block *block)
{
    k_heap_free(&my_heap, block->data);
}

and the call is for 

mem_heap_alloc(with 2 page size) 
like a double buffer for steaming the 
data collected to flash, one page gets 
fill with samples while the other is written to flash, in theaory 

the other question i will leave for now it is also regarding a try to improve the current spi_nor_write function but anyhow it will not solve the 3ms issue and if it will remain on 1ms per page write as seen in the first test then we will be ok 

we think it is a performance issue (which we don't see when the same code runs on nrf52840 with a nand flash, which also writes 2k each time and not 256 pages)

one of the possible solutions is to change the api of the sensor driver so instead of using many time fetch and get, to get one api for fetching a pointer to a buffer of the samples stored in the driver, another idea is to have the driver use easy dma to put the samples to a buffer in memory with an interrupt after reaching 256 bytes, and then write to flash from this buffer. any idea if there is some support for one of this operations in zephyr/ncs ?,

any ideas as for which is better ?

any ideas of beast way to do preferred one ?

hope to read from you soon

best regards

Ziv

Parents
  • Hi Ziv

    How are you reading the sensor over SPI? 

    Are you using the normal or the async driver?

    Running an SPI transaction every ~38us is not ideal, but it is very odd that the 256 byte write should slow down when it is running on a separate SPI bus. At the lower levels the SPI peripheral should use DMA, which means the 256 byte transaction should go through in the background (unfortunately it would have to be split in 2 transactions, since the maximum length of one transaction is 255 bytes in the nRF52832) . 

    The nRF52840 allows you to send much longer packets in a single transaction, which might be why it is working better. 

    I agree that sampling the sensor in a more automated way would be a better approach. The SPIM module in the nRF52 devices support a so called list feature, which allows you to queue a list of transactions and have them triggered in the background from a hardware event (such as a timer compare event). In this setup you could have the SPIM module run several operations in the background until you have filled the flash buffer, and run the processing and SPI interaction at a less frequent rate. 

    You will need to use the nrfx_spim driver directly in this case, since the generic Zephyr SPI driver doesn't support the list feature of the SPIM peripheral. 

    Best regards
    Torbjørn

  • hi overbekk

    How are you reading the sensor over SPI?

    yes spi1, the flash works on spi2

    Are you using the normal or the async driver?

    i have CONFIG_SPI_ASYNC=y in my .config file under the build/zephyr, so async is how i work.

    In this setup you could have the SPIM module run several operations in the background until you have filled the flash buffer

    maybe i misunderstood your intended use, but 1. i need to be writing while sampling for a second so if my buffers is filling up faster then the are dequeued then its not good enough, also since nrf52832 have 1/4 the ram the nrf52840 has, i need to find a way to reduce my buffers and not put more cause i am already around 90% ram take. 2. there is some unclear issue to work with spim in the flash cause of  CONFIG_SOC_NRF52832_ALLOW_SPIM_DESPITE_PAN_58=y,

    it fails here (on the spi_nrfx_spim.c file on the ncs (1.7.1 v): 

    		/* This SPIM driver is only used by the NRF52832 if
    		   SOC_NRF52832_ALLOW_SPIM_DESPITE_PAN_58 is enabled */
    		if (IS_ENABLED(CONFIG_SOC_NRF52832) &&
    		   (xfer.rx_length == 1 && xfer.tx_length <= 1)) {
    			LOG_WRN("Transaction aborted since it would trigger nRF52832 PAN 58");
    			error = -EIO;
    		}

    i had to change to spi

    Running an SPI transaction every ~38us is not ideal,

    the 38us spi call is called via ppi and i was thinking that what slows the flash page write is not the spi itself but available cpu and ram .. but i am not sure anymore and the coming questions are about that: 

    1. i sometimes get this error, and i wonder if it is related to the 90% ram take or is it something else ??

    [00000078] <err> spi_nrfx_spi: chunk_length - 4
    [00000078] <err> spi_nrfx_spi: chunk_length - 256
    [00000078] <err> spi_nrfx_spi: chunk_length - 0
    [00000078] <err> os: ***** MPU FAULT *****
    [00000078] <err> os:   Stacking error (context area might be not valid)
    [00000078] <err> os: r0/a1:  0xbc1c36a8  r1/a2:  0xfcb9b13f  r2/a3:  0x00000000
    [00000078] <err> os: r3/a4:  0x20007080 r12/ip:  0x00000010 r14/lr:  0x00000000
    [00000078] <err> os:  xpsr:  0x81000000
    [00000078] [1;31m<err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000078] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000078] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000078] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0x00000000
    [00000078] <err> os: fpscr:  0x0000dd9e
    [00000078] <err> os: Faulting instruction address (r15/pc): 0x0003efe6
    [00000078] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000078] <err> os: Current thread: 0x20003210 (unknown)
    [00000079] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    

    2. i used ppk with nrf connect power manager desktop application to monitor some gpios and this is the outcome 

    (line 0 - iis3dwb sensor int1 

    line 3 - flash spi chip select 

    line 2 - gpio(0,26) which is up at the beginning of 'spi_nor_cmd_addr_write(...)' and down before the return from it.

    line 4 - gpio(0,27) - the flash_write api function ( 'spi_nor_write(...) ' ), gpio is up at entry and down before return.

    as can be seen in the following code parts:

    /* i dismissed some 'if's in the function them seems not so importent in my case
    static int spi_nor_write(const struct device *dev, off_t addr, const void *src, size_t size)
    {
    	int ret = 0;
    	uint32_t start_time_ms, end_time_ms;
    	static int counter = 0;
    
    	gpio_pin_set(gpio_dev_blue, 27, 1);
    
    	acquire_device(dev);
    
    		while (size > 0) {
    			size_t to_write = size;
    
    			/* Don't write more than a page. */
    			if (to_write >= page_size) {
    				to_write = page_size;
    			}
    
    			/* Don't write across a page boundary */
    			if (((addr + to_write - 1U) / page_size)
    			!= (addr / page_size)) {
    				to_write = page_size - (addr % page_size);
    			}
    			spi_nor_cmd_write(dev, SPI_NOR_CMD_WREN);
    			ret = spi_nor_cmd_addr_write(dev, SPI_NOR_CMD_PP, addr, src, to_write);
    			if (ret != 0) {
    				break;
    			}
    
    			size -= to_write;
    			src = (const uint8_t *)src + to_write;
    			addr += to_write;
    			counter++;
    			spi_nor_wait_until_ready(dev);
    		}
    
    	release_device(dev);
    	gpio_pin_set(gpio_dev_blue, 27, 0);
    	return ret;
    }
    
    static int spi_nor_access(const struct device *const dev,
    			  uint8_t opcode, unsigned int access,
    			  off_t addr, void *data, size_t length)
    {
    	gpio_pin_set(gpio_dev_green, 26, 1);
    	const struct spi_nor_config *const driver_cfg = dev->config;
    	struct spi_nor_data *const driver_data = dev->data;
    	bool is_addressed = (access & NOR_ACCESS_ADDRESSED) != 0U;
    	bool is_write = (access & NOR_ACCESS_WRITE) != 0U;
    	uint8_t buf[5] = { 0 };
    	
    	struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 1,
    		},
    		{
    			.buf = data,
    			.len = length
    		}
    	};
    
    	buf[0] = opcode;
    	if (is_addressed) {
    		bool access_24bit = (access & NOR_ACCESS_24BIT_ADDR) != 0;
    		bool access_32bit = (access & NOR_ACCESS_32BIT_ADDR) != 0;
    		bool use_32bit = (access_32bit
    				  || (!access_24bit
    				      && driver_data->flag_access_32bit));
    		union {
    			uint32_t u32;
    			uint8_t u8[4];
    		} addr32 = {
    			.u32 = sys_cpu_to_be32(addr),
    		};
    
    		if (use_32bit) {
    			memcpy(&buf[1], &addr32.u8[0], 4);
    			spi_buf[0].len += 4;
    		} else {
    			memcpy(&buf[1], &addr32.u8[1], 3);
    			spi_buf[0].len += 3;
    		}
    	};
    
    	const struct spi_buf_set tx_set = {
    		.buffers = spi_buf,
    		.count = (length != 0) ? 2 : 1,
    	};
    
    	const struct spi_buf_set rx_set = {
    		.buffers = spi_buf,
    		.count = 2,
    	};
    
    	if (is_write) {
    		return spi_write_dt(&driver_cfg->spi, &tx_set);
    	}
    	int res = spi_transceive_dt(&driver_cfg->spi, &tx_set, &rx_set);
    	// LOG_ERR("spi_transceive_dt ret - %d\n", res);
    	gpio_pin_set(gpio_dev_green, 26, 0);
    	return res; //spi_transceive_dt(&driver_cfg->spi, &tx_set, &rx_set);
    }

    (after i moved the ' spi_nor_wait_until_ready(dev); ' from its original place to just before the ' spi_nor_cmd_write(dev, SPI_NOR_CMD_WREN); '  it did go faster but still i see there is a long delay between first read of the flash ready state to the second one, almost 1ms which is a lot in my case .. so why is it and how can i reduce it ??

    3. spi speed is set to 8mhz so passing 256 bytes should take around 250us but, looking at the nrf ppk image above, it looks like it actually takes almost 1ms for chip select to go back up..  what can cause this delay ? the flash itself can work up to 133mhz so it should not be the source for the delay

    4. going in to the  ' spi_nrfx_spi.c ' and ' spi_context.h ' for the implementation of the ' transceive(...) ' i don't understand how the ' transfer_next_chunk( ..) '  is called twice and if i understand it correct then the event handler when first spi is done calls it again, but do no check the count there only later after some if's 

    which also brings to to the efficiency question we have double calls with chunk 0 and chunk 1 for simple WREN command instead of only one dive into functions ? and the "count check" also seems wasteful sorry to say again if i understood it right then a lot of operations can be saved by checking it earlier

    and i wonder if instead of having this structure :

    struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 1,
    		},
    		{
    			.buf = data,
    			.len = length
    		}
    	};

    ,which do save coping of the data to one buffer, is actually more efficient then having one spi_buf struct ? so instead of having a chunk of 4 and another chunk of 256 bytes with 2 separate spi calls, i will have one with 260 bytes ?

    5. in the following function in the spi_context.h file:

    static inline int spi_context_wait_for_completion(struct spi_context *ctx)
    {
    	int status = 0;
    	uint32_t timeout_ms;
    
    	timeout_ms = MAX(ctx->tx_len, ctx->rx_len) * 8 * 1000 /
    		     ctx->config->frequency;
    	timeout_ms += CONFIG_SPI_COMPLETION_TIMEOUT_TOLERANCE;
    
    #ifdef CONFIG_SPI_ASYNC
    	if (!ctx->asynchronous) {
    		if (k_sem_take(&ctx->sync, K_MSEC(timeout_ms))) {
    			LOG_ERR("Timeout waiting for transfer complete");
    			return -ETIMEDOUT;
    		}
    		status = ctx->sync_status;
    	}...

    i see that  CONFIG_SPI_COMPLETION_TIMEOUT_TOLERANCE=200ms by default, what should happen if i change it to '0' and is that what maybe slows the re-checking of the 'is flash ready' read command ?  also maybe i am missing something but 8000/8000000 (8Mhz = 8000000) will always come out as '0' cause 'timeout_ms' is not a double or float am i wrong ?

    6. i tried to improve the - spi_nor_cmd_addr_write(devopcodeaddrsrclength)

    with my own specific program page function: 

    p.s. also tried to implement a page program leaner function instead of the  ' spi_nor_cmd_addr_write '  which is here

    static int augu_flash_page_program(const struct device *const dev, uint32_t addr, const void* data, size_t data_length)
    {
    	const struct spi_nor_config *const driver_cfg = dev->config;
    
    	uint8_t buf[4] = { SPI_NOR_CMD_PP, 0, 0, 0 };
    
    	struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 4,
    		},
    		{
    			.buf = data,
    			.len = data_length,
    		}
    	};
    
    	union {
    		uint32_t u32;
    		uint8_t u8[4];
    	} addr32 = {
    		.u32 = sys_cpu_to_be32(addr),
    	};
    	memcpy(&buf[1], &addr32.u8[1], 3);
    
    	const struct spi_buf_set tx_set = {
    		.buffers = spi_buf,
    		.count = 2,
    	};
     
    	return spi_write_dt(&driver_cfg->spi, &tx_set);
    	// return spi_transceive(&driver_cfg->spi, &tx_set, &rx_set);
    }

    but it actually took longer time which i can't understand why ?

     

    sorry it is a long question but i am really trying to get to the bottom of it and see how i can improve and maybe even change configurations to use the spi_nrfx_spi driver in a better way or if i have a way to make modification without changing the ncs or rewriting an spi driver myself (which i don't want to do)

    hope to read from you soon  

    best regards

    Ziv

  • Hi Ziv

    Did you get my comment in the other case regarding increasing the flash size? 

    Have you tried to test the speed of the external flash access without the sensor SPI being active, to see whether or not the slow down is caused by the rapid SPI communication to the sensor?
    Essentially just send 256 bytes of dummy data at the same rate that you would normally do, and see if you get different behavior on the bus (using the scope).

    I think I might have to do some testing on my own in order to try and reproduce your issue. The nRF52840DK has a NOR flash device on the board, which should allow me to set up a comparable test to what you are doing.

    Best regards
    Torbjørn

  • hi Torbjørn

    Did you get my comment in the other case regarding increasing the flash size? 

    saw your answer on the other thread (i will close it later), 1. i did not find in my .config of the build an spi stack size configuration, it is probably default and i am not sure exactly how to change it but the MPU fault does not happen only when trying to write to flash while trying to sample it can also just happen on startup but it is happening less if i am at 88% SRAM then if i am at 92% SRAM. so i am trying to save on ram where i can.

    the sensor spi works on a different instance the the flash one so i don't see why it should matter .. what i see that is strange for me is to my opinion in the ncs level .. here are the correct pictures of oscilloscope catch i did today 

    1. the yellow line frames the call for the flash write function ( the pink line is the spi clock for the external flash  and the blue line is the MOSI )

    first there is a millisecond until the page transfer starts which is in the drivers level 

    2. inside the page transfer part we see a delay in clock agter every 10 bytes for 10microsecond

    3. inside each group of 10 bytes we see 2 microsecond delay between each byte 

    so the spi speed it set to 8Mhz but in actual it is much slower

    it will be great to understand and fix that behaviour for start and then move to the rest of the stuff 

    p.s. the sensor spi sriver works with spi module that we wrote that bypass the zephyr implementation and uses the nrfx_spim module directly (again we have an issue with using spim cause of the areta previously mentioned)

    I think I might have to do some testing on my own in order to try and reproduce your issue. The nRF52840DK has a NOR flash device on the board

    you can try though i work with nrf52832 but i guess to test the ncs implementation of the spi driver it should be fine

    hope to read from you soon

    best regards

    Ziv

  • ok i think i know what the issue is .. 

    the spi implementation does not use DMA and there is a buffer of 2 bytes but every time with cpu intervention and within an interrupt handler another 2 bytes are passed

    with a light bare metal application it might not be an issue but with all the zephyr overhead and a sensor interrupt happening every ~37 microseconds (which also explains the 10 microseconds between each 10 bytes transferred), then it is too slow

    so my guess i need to figure out how to work with dma -> work with spim despite the erata

    i will try to test it tomorrow adjusting my spi_nor driver to work with spim and maybe bypass the areta issue by forcing rx != 1 no matter what .. see if it works

    well i tried but it seems this structure 

    struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 1,
    		},
    		{
    			.buf = data,
    			.len = length
    		}
    	};
    	
    	and
    	
    	const struct spi_buf_set tx_set = {
    		.buffers = spi_buf,
    		.count = (length != 0) ? 2 : 1,
    	};
    
    	const struct spi_buf_set rx_set = {
    		.buffers = spi_buf,
    		.count = 2,
    	};

    does not allow to work with the limitation of the PAN_58 erata since you can't just put len=2 on the first buf without it messing the data read

    would really like if you have ideas regarding this issue ???

    beside that 

    1. if you have another idea/ explanation for the above condition please share

    2. still not clear why the reduced function for page program mentioned in the first post of this thread is slower then the original on ??

    3. going back to the MPU fault and maybe how to figure out what is the meaning of all the lines in the error message so i can look for the stack or some other problem causing it

    hope to read from you soon 

    best regards

    Ziv

Reply
  • ok i think i know what the issue is .. 

    the spi implementation does not use DMA and there is a buffer of 2 bytes but every time with cpu intervention and within an interrupt handler another 2 bytes are passed

    with a light bare metal application it might not be an issue but with all the zephyr overhead and a sensor interrupt happening every ~37 microseconds (which also explains the 10 microseconds between each 10 bytes transferred), then it is too slow

    so my guess i need to figure out how to work with dma -> work with spim despite the erata

    i will try to test it tomorrow adjusting my spi_nor driver to work with spim and maybe bypass the areta issue by forcing rx != 1 no matter what .. see if it works

    well i tried but it seems this structure 

    struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 1,
    		},
    		{
    			.buf = data,
    			.len = length
    		}
    	};
    	
    	and
    	
    	const struct spi_buf_set tx_set = {
    		.buffers = spi_buf,
    		.count = (length != 0) ? 2 : 1,
    	};
    
    	const struct spi_buf_set rx_set = {
    		.buffers = spi_buf,
    		.count = 2,
    	};

    does not allow to work with the limitation of the PAN_58 erata since you can't just put len=2 on the first buf without it messing the data read

    would really like if you have ideas regarding this issue ???

    beside that 

    1. if you have another idea/ explanation for the above condition please share

    2. still not clear why the reduced function for page program mentioned in the first post of this thread is slower then the original on ??

    3. going back to the MPU fault and maybe how to figure out what is the meaning of all the lines in the error message so i can look for the stack or some other problem causing it

    hope to read from you soon 

    best regards

    Ziv

Children
  • Hi Ziv

    You mean to say that both the sensor and the flash is accessed without using DMA? 
    If so it will make quite a big impact on CPU usage, especially when you want to send a large transaction to the flash device. 

    I would strongly recommend integrating DMA, and rather applying the workaround code if you encounter 1 byte transactions. Can you let me know in which file you found the problematic structure with the 1 byte buffer length?

    Regarding the MPU fault, could you try to run your program with the MPU stack guard enabled, and see if you spot the stack overflow quicker?

    CONFIG_MPU_STACK_GUARD=y

    If the issue occurs more often when RAM consumption is higher it could be a stack overflow issue that isn't detected until the stack overflows into other variables.

    Best regards
    Torbjørn

  • hi Torbjørn

    You mean to say that both the sensor and the flash is accessed without using DMA? 

    no, the sensor uses ppi and spim (->uses DMA)

    Can you let me know in which file you found the problematic structure with the 1 byte buffer length?

    NCS v1.7.1, in the  zephyr/drivers/flash/spi_nor.c , in the implementation of spi_nor_access(...)

    static int spi_nor_access(const struct device *const dev,
    			  uint8_t opcode, unsigned int access,
    			  off_t addr, void *data, size_t length)
    {
    	const struct spi_nor_config *const driver_cfg = dev->config;
    	struct spi_nor_data *const driver_data = dev->data;
    	bool is_addressed = (access & NOR_ACCESS_ADDRESSED) != 0U;
    	bool is_write = (access & NOR_ACCESS_WRITE) != 0U;
    	uint8_t buf[5] = { 0 };
    	struct spi_buf spi_buf[2] = {
    		{
    			.buf = buf,
    			.len = 1,
    		},
    		{
    			.buf = data,
    			.len = length
    		}
    	};
    ....

    when calling the command WREN (write enable) it sends only one byte, also when checking for is flash ready it sends one chunk with length 1 and then the data chunk so in the 1 byte sending it fails.

    i think to avoid this i will just use the nrfx_spim directly from my flash driver make it more efficient as well 

    i tested the CONFIG_MPU_STACK_GUARD=y

    not sure what kind of extra information i was supposed to get  ???

    but this is how the MPU FAULT looks like:

    [00000022] 1m<err> os: ***** MPU FAULT *****
    [00000022] <err> os:   Stacking error (context area might be not valid)
    [00000022] [1;31m<err> os: r0/a1:  0xbc1c16a8  r1/a2:  0x7cb9b13d  r2/a3:  0x803faa2e
    [00000022] <err> os: r3/a4:  0x03ff41f3 r12/ip:  0x00000001 r14/lr:  0x00000000
    [00000022] <err> os:  xpsr:  0x81000000
    [00000022] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0xffffffff
    [00000022] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000022] [1;31m<err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000022] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000022] <err> os: fpscr:  0x00000000
    [00000022] <err> os: Faulting instruction address (r15/pc): 0x0003e540
    [00000022] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000022] <err> os: Current thread: 0x20003228 (unknown)
    [00000023] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***

    also there is still some delays in the app that i don't know where they come from, any tool you can recommend for better understanding that (tried CONFIG_THREAD_ANALIZE it just gives thread stack take and not in real time)  ?

    hope to read from you soon

    best regards

    Ziv

  • Hi Ziv

    I am very sorry for the slow response, I was out in holiday for the last 4 weeks. 

    Have you had any progress on your end? 

    If not I will set aside some time this week to try and reproduce the issue. 

    Best regards
    Torbjørn

  • hi Torbjørn

    i actually had a lot of progress, i freed some time from a high priority thread that enable some other things to work better, i changed the nor_spi driver to work with spim and created my own spim layer. 

    still there are some issues that i can not figure out, the MPU FAULT i don't understand how to read all the log under it and how can it help me to figure out where the problem is, it sometimes happen and sometimes not, also i can find a tool that will show in real time what tasks/threads are running and when do they context switch and how long does this switching takes

    beside the occasional MPU FAULTS,  when i want to check if flash is ready and loop on it but not all the time but with some delay, i set 50 microsecond between each call to read the flash register but for some reason, as i saw with a picoscop there is a delay of more then 200 microsecond and i don't know why .. 

    the blue line is a toggle in the following code section:

    the yello/orange line is the spi clock 

    the red line is a function running on a higher priority thread

    static int augu_nor_flash_wait_until_ready(augu_spi_data_t *spi_data, int polling_interval_usec)
    {
    	int ret = 0;
    	int count = 0;
    	uint8_t status_reg[2] = { 0, 1 }; // INITIATED WITH BUSY STATE
    	uint8_t cmd_buf[1] = { SPI_NOR_CMD_RDSR };
    	const struct device *gpio_dev_blue = device_get_binding("GPIO_0");
    	gpio_pin_toggle(gpio_dev_blue, 27);
    	ret = augu_spi_xfer(spi_data, cmd_buf, RDSR_CMD_LENGTH, status_reg, sizeof(status_reg));
    	// gpio_pin_set(gpio_dev_blue, 27, 0);
    
    	while (!ret && (status_reg[1] & SPI_NOR_WIP_BIT))
    	{
    		k_sleep(K_USEC(polling_interval_usec));
    		gpio_pin_toggle(gpio_dev_blue, 27);
    		ret = augu_spi_xfer(spi_data, cmd_buf, RDSR_CMD_LENGTH, status_reg, sizeof(status_reg));
    		// gpio_pin_set(gpio_dev_blue, 27, 0);
    
    		if(BUSY_CHECKS_COUNTS <= ++count )
    		{
    			LOG_ERR("wait for flash busy timed out!!!");
    			return -ETIMEDOUT;
    		}
    	}
    	return ret;
    }

    so, i need some help to better understand how to extract some info for the MPU FAULTS description (like the once i posted before in this thread and i add more here, maybe they are all the same), the environment what is working and what is happening underneath  

     <inf> AUGU_SPI: nrfx_ppi_channel_alloc for spi: chan = 2
    [00000010] <err> os: ***** MPU FAULT *****
    [00000010] <err> os:   Stacking error (context area might be not valid)
    [00000010] <err> os: r0/a1:  0x974b1c56  r1/a2:  0xa9037e23  r2/a3:  0xfd7b35b1
    [00000010] <err> os: r3/a4:  0x04365cc4 r12/ip:  0x00000001 r14/lr:  0x00000000
    [00000010] <err> os:  xpsr:  0x81000000
    [00000010] [1;31m<err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000010] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000010] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0xffffffff
    [00000010] [1;31m<err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000010] <err> os: fpscr:  0x00000000[0m
    [00000010] <err> os: Faulting instruction address (r15/pc): 0x0003febc
    [00000010] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000010] <err> os: Current thread: 0x20003200 (unknown)
    [00000011] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    
    
    [00000016] <inf> AUGU_SPI: nrfx_ppi_channel_alloc for spi: chan = 2
    [00000017] <err> os: ***** MPU FAULT *****
    [00000017] <err> os:   Stacking error (context area might be not valid)
    [00000017] <err> os: r0/a1:  0xfd7b35b1  r1/a2:  0x04365cc4  r2/a3:  0x00000001
    [00000017] <err> os: r3/a4:  0x00000000 r12/ip:  0x00000000 r14/lr:  0xe000ed00
    [00000017] <err> os:  xpsr:  0x81000000[0m
    [00000017] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000017] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000017] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000017] <err> os: s[12]:  0x00000000  s[13]:  0xffffffff  s[14]:  0x00000000  s[15]:  0x00000000
    [00000017] [1;31m<err> os: fpscr:  0x00000000
    [00000017] <err> os: Faulting instruction address (r15/pc): 0x000321b8[0m
    [00000017] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000017] <err> os: Current thread: 0x20003200 (unknown)
    [00000017] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    
    
    [00000008] <inf> AUGU_SPI: nrfx_ppi_channel_alloc for spi: chan = 2
    [00000009] <err> os: ***** MPU FAULT *****
    [00000009] <err> os:   Stacking error (context area might be not valid)
    [00000009] <err> os: r0/a1:  0x974b1c56  r1/a2:  0xa9037e23  r2/a3:  0xfd7b35b1
    [00000009] <err> os: r3/a4:  0x04365cc4 r12/ip:  0x20007742 r14/lr:  0x200073e0
    [00000009] <err> os:  xpsr:  0x81000000
    [00000009] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000009] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000009] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000009] [1;31m<err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000009] <err> os: fpscr:  0x00000000
    [00000009] <err> os: Faulting instruction address (r15/pc): 0x0003feb2
    [00000009] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0m
    [00000009] <err> os: Current thread: 0x20003200 (unknown)
    [00000010] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    
    
    [00000008] <inf> AUGU_SPI: nrfx_ppi_channel_alloc for spi: chan = 2
    [00000009] <err> os: ***** MPU FAULT *****
    [00000009] <err> os:   Stacking error (context area might be not valid)
    [00000009] [1;31m<err> os: r0/a1:  0x974b1c56  r1/a2:  0xa9037e23  r2/a3:  0xfd7b35b1
    [00000009] <err> os: r3/a4:  0x04365cc4 r12/ip:  0x00000001 r14/lr:  0x00000000
    [00000009] <err> os:  xpsr:  0x81000000
    [00000009] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0xffffffff
    [00000009] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000009] [1;31m<err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000009] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000009] <err> os: fpscr:  0x00000000
    [00000009] <err> os: Faulting instruction address (r15/pc): 0x0003feb2
    [00000009] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000009] [1;31m<err> os: Current thread: 0x20003200 (unknown)
    [00000010] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    
    
    [00000017] <inf> AUGU_SPI: nrfx_ppi_channel_alloc for spi: chan = 2
    [00000018] <err> os: ***** MPU FAULT *****
    [00000018] <err> os:   Stacking error (context area might be not valid)
    [00000018] <err> os: r0/a1:  0x974b1c56  r1/a2:  0xa9037e23  r2/a3:  0xfd7b35b1
    [00000018] <err> os: r3/a4:  0x04365cc4 r12/ip:  0x00000001 r14/lr:  0x00000000
    [00000018] <err> os:  xpsr:  0x81000000
    [00000018] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000018] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000018] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000018] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000018] <err> os: fpscr:  0x00000000
    [00000018] <err> os: Faulting instruction address (r15/pc): 0x0003feb2
    [00000018] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000018] <err> os: Current thread: 0x20003200 (unknown)
    [00000018] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    
    
    # at restart
    [00000000] <inf> COMM_MNG: Bluetooth advertising started! duration = 30000
    [00000000] <err> os: ***** MPU FAULT *****
    [00000000] <err> os:   Stacking error (context area might be not valid)
    [00000000] <err> os: r0/a1:  0xfd7b35b1  r1/a2:  0x04365cc4  r2/a3:  0x00000001
    [00000000] <err> os: r3/a4:  0x00000000 r12/ip:  0x20003200 r14/lr:  0x00000000
    [00000000] <err> os:  xpsr:  0x81000000
    [00000000] <err> os: s[ 0]:  0x00000000  s[ 1]:  0x00000000  s[ 2]:  0x00000000  s[ 3]:  0x00000000
    [00000000] <err> os: s[ 4]:  0x00000000  s[ 5]:  0x00000000  s[ 6]:  0x00000000  s[ 7]:  0x00000000
    [00000000] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000000  s[10]:  0x00000000  s[11]:  0x00000000
    [00000000] <err> os: s[12]:  0x00000000  s[13]:  0x00000000  s[14]:  0x00000000  s[15]:  0xffffffff
    [00000000] <err> os: fpscr:  0x00000000
    [00000000] <err> os: Faulting instruction address (r15/pc): 0x0003feb2
    [00000000] <err> os: >>> ZEPHYR FATAL ERROR 2: Stack overflow on CPU 0
    [00000000] <err> os: Current thread: 0x20003200 (unknown)
    [00000002] <err> fatal_error: Resetting system
    *** Booting Zephyr OS build v2.6.99-ncs1-1  ***
    

    hope to read from you soon

    best regards 

    Ziv

  • Hi Ziv,

     

    It seems that you have stack issues, most likely related to the stack size of the current thread running.

    To backtrack a fault, you can use arm-none-eabi-addr2line with the faulting address as an input:

    arm-none-eabi-addr2line -e build-folder/zephyr/zephyr.elf 0x0003febc

     

    In this case, you probably want to check which thread faulted, which you can check by opening your .map file in build-folder/zephyr/zephyr.map, and find which thread is running on "0x20003200", and then increase the stack for this specific thread.

    If that is the main thread, you can increase it by adjusting "CONFIG_MAIN_STACK_SIZE" in your prj.conf.

     

    Kind regards,

    Håkon

Related