Beware that this post is related to an SDK in maintenance mode
More Info: Consider nRF Connect SDK for new designs
This post is older than 2 years and might not be relevant anymore
More Info: Consider searching for newer posts

FDS garbage collection does not clean up dirty records

I added the following code to FDS:

static void dump_headers(void) {
    for (uint16_t page = 0; page < FDS_DATA_PAGES; page++) {
        uint32_t const *p_page_end = (m_pages[page].p_addr + FDS_PAGE_SIZE);

        fds_header_t const *p_header = (fds_header_t *) (m_pages[page].p_addr + FDS_PAGE_TAG_SIZE);

        bool page_valid = true;
        while (page_valid && header_has_next(p_header, p_page_end)) {
            switch (header_check(p_header, p_page_end)) {
                case FDS_HEADER_VALID:
                    NRF_LOG_INFO("Valid header on page %d, offset %d, file %d, record key %d, length %d words, record id %d", page, ((uint32_t)p_header) - ((uint32_t)m_pages[page].p_addr), p_header->file_id, p_header->record_key, p_header->length_words, p_header->record_id);
                    p_header = header_jump(p_header);
                    break;

                case FDS_HEADER_DIRTY:
                    NRF_LOG_INFO("Dirty header on page %d, offset %d, file %d, record key %d, length %d words, record id %d", page, ((uint32_t)p_header) - ((uint32_t)m_pages[page].p_addr), p_header->file_id, p_header->record_key, p_header->length_words, p_header->record_id);
                    p_header = header_jump(p_header);
                    break;

                case FDS_HEADER_CORRUPT:
                    // We can't reliably jump over this record.
                    // There is nothing more we can do on this page.
                    NRF_LOG_INFO("Corrupt header on page %d, offset %d, file %d, record key %d, length %d words, record id %d", page, ((uint32_t)p_header) - ((uint32_t)m_pages[page].p_addr), p_header->file_id, p_header->record_key, p_header->length_words, p_header->record_id);
                    page_valid = false;
                    break;
            }

        }
    }
}


static void queue_process(ret_code_t result)
{
    // snip
        switch (m_p_cur_op->op_code)
        {
            // snip
            case FDS_OP_GC:
                result = gc_execute(result);
                if(result == FDS_OP_COMPLETED) {
                    NRF_LOG_INFO("GC complete. Dumping storage headers:");
                    dump_headers();
                }
                break;
        }
}

It seems sometimes this works fine, and after GC is complete all record headers are valid. However, sometimes it doesn't, and dirty records remain after GC:

<info> app: GC complete. Dumping storage headers:
<info> app: Valid header on page 0, offset 8, file 3, record key 1, length 3 words, record id 1
<info> app: Valid header on page 0, offset 32, file 4, record key 1, length 4 words, record id 2
<info> app: Valid header on page 0, offset 60, file 1, record key 1, length 1 words, record id 3
<info> app: Valid header on page 0, offset 76, file 2, record key 1, length 4 words, record id 4
<info> app: Valid header on page 0, offset 104, file 1, record key 4, length 8 words, record id 5
<info> app: Valid header on page 0, offset 148, file 2, record key 4, length 4 words, record id 6
<info> app: Dirty header on page 0, offset 176, file 1, record key 0, length 1 words, record id 7
<info> app: Dirty header on page 0, offset 192, file 2, record key 0, length 4 words, record id 8
<info> app: Valid header on page 0, offset 220, file 1, record key 2, length 3 words, record id 9
<info> app: Valid header on page 0, offset 244, file 2, record key 2, length 4 words, record id 10
<info> app: Dirty header on page 0, offset 272, file 1, record key 0, length 1 words, record id 11
<info> app: Dirty header on page 0, offset 288, file 2, record key 0, length 4 words, record id 12
<info> app: Valid header on page 0, offset 316, file 1, record key 7, length 1 words, record id 13
<info> app: Valid header on page 0, offset 332, file 2, record key 7, length 4 words, record id 14
<info> app: Valid header on page 0, offset 360, file 1, record key 6, length 30 words, record id 15
<info> app: Valid header on page 0, offset 492, file 2, record key 6, length 4 words, record id 16
<info> app: Valid header on page 0, offset 520, file 1, record key 5, length 26 words, record id 17
<info> app: Valid header on page 0, offset 636, file 2, record key 5, length 4 words, record id 18
<info> app: Dirty header on page 0, offset 664, file 1, record key 0, length 236 words, record id 19
<info> app: Dirty header on page 0, offset 1620, file 2, record key 0, length 4 words, record id 20
<info> app: Dirty header on page 0, offset 1648, file 1, record key 0, length 1 words, record id 21
<info> app: Dirty header on page 0, offset 1664, file 2, record key 0, length 4 words, record id 22
<info> app: Valid header on page 0, offset 1692, file 1, record key 3, length 1 words, record id 23
<info> app: Valid header on page 0, offset 1708, file 2, record key 3, length 4 words, record id 24
<info> app: Dirty header on page 0, offset 1736, file 1, record key 0, length 234 words, record id 25
<info> app: Dirty header on page 0, offset 2684, file 2, record key 0, length 4 words, record id 26
<info> app: Dirty header on page 0, offset 2712, file 1, record key 0, length 1 words, record id 27
<info> app: Dirty header on page 0, offset 2728, file 2, record key 0, length 4 words, record id 28
<info> app: Dirty header on page 0, offset 2756, file 1, record key 0, length 233 words, record id 29
<info> app: Dirty header on page 0, offset 3700, file 2, record key 0, length 4 words, record id 30
<info> app: Dirty header on page 0, offset 3728, file 1, record key 0, length 1 words, record id 31
<info> app: Dirty header on page 0, offset 3744, file 2, record key 0, length 4 words, record id 32
<info> app: Dirty header on page 0, offset 3772, file 2, record key 0, length 4 words, record id 34
<info> app: Dirty header on page 0, offset 3800, file 1, record key 0, length 1 words, record id 35
<info> app: Dirty header on page 0, offset 3816, file 2, record key 0, length 4 words, record id 36
<info> app: Dirty header on page 0, offset 3844, file 2, record key 0, length 4 words, record id 38
<info> app: Dirty header on page 0, offset 3872, file 1, record key 0, length 1 words, record id 39
<info> app: Dirty header on page 0, offset 3888, file 2, record key 0, length 4 words, record id 40
<info> app: Dirty header on page 0, offset 3916, file 2, record key 0, length 4 words, record id 42
<info> app: Dirty header on page 0, offset 3944, file 1, record key 0, length 1 words, record id 43
<info> app: Dirty header on page 0, offset 3960, file 2, record key 0, length 4 words, record id 44
<info> app: Dirty header on page 0, offset 3988, file 2, record key 0, length 4 words, record id 46
<info> app: Dirty header on page 0, offset 4016, file 1, record key 0, length 1 words, record id 47
<info> app: Dirty header on page 0, offset 4032, file 2, record key 0, length 4 words, record id 48
<info> app: Dirty header on page 0, offset 4060, file 2, record key 0, length 4 words, record id 50
<info> app: Dirty header on page 1, offset 8, file 1, record key 0, length 233 words, record id 33
<info> app: Dirty header on page 1, offset 952, file 1, record key 0, length 233 words, record id 37
<info> app: Dirty header on page 1, offset 1896, file 1, record key 0, length 233 words, record id 41
<info> app: Dirty header on page 1, offset 2840, file 1, record key 0, length 50 words, record id 45
<info> app: Dirty header on page 1, offset 3052, file 1, record key 0, length 234 words, record id 49
<info> app: Valid header on page 1, offset 4000, file 1, record key 9, length 1 words, record id 51
<info> app: Valid header on page 1, offset 4016, file 2, record key 9, length 4 words, record id 52

This causes FDS_ERR_NO_SPACE_IN_FLASH on next write, and repeating GC doesn't fix it. Why would GC succeed when it doesn't clean up dirty records? And why is garbage collection not cleaning up records?

Parents
  • After several hours of debugging, I think I have figured it out. The problem is that sometimes I leave records open, and if I open the same record twice fds_record_open does not check if the record is already open, and so increments the records_open counter again. This means the count is too high, and will never return to 0 even when all records are closed, so GC can never run.

    fds_record_open should be modified either to return an error if p_desc is already open, or not to increment the records_open count if it is already open. My preference is the latter, because it allows you to open the record again without error handling and extra checking in user code. This also mirrors the behaviour of fds_record_close, which just ignores the record if not open. I think the only risk is if p_desc was not initialised to 0 by the user and so record_is_open is non-zero garbage. To avoid this, record_find could be modified to initialise record_is_open, i.e.:

    // Record found; update the descriptor.
    p_desc->record_id    = p_header->record_id;
    p_desc->p_record     = p_token->p_addr;
    p_desc->gc_run_count = m_gc.run_count
    p_desc->record_is_open = false;

    Here is my modification to fds_record_open:

    ret_code_t fds_record_open(fds_record_desc_t  * const p_desc,
                               fds_flash_record_t * const p_flash_rec)
    {
        uint16_t page;
    
        if ((p_desc == NULL) || (p_flash_rec == NULL))
        {
            return FDS_ERR_NULL_ARG;
        }
        
        uint32_t const *existing_record = p_desc->p_record;
    
        // Find the record if necessary.
        if (record_find_by_desc(p_desc, &page))
        {
            fds_header_t const * const p_header = (fds_header_t*)p_desc->p_record;
    
    #if (FDS_CRC_CHECK_ON_READ)
            if (!crc_verify_success(p_header->crc16,
                                    p_header->length_words,
                                    p_desc->p_record))
            {
                return FDS_ERR_CRC_CHECK_FAILED;
            }
    #endif
    
            // Check if the record is already open, and only increment the counter if not
            if(p_desc->record_is_open) {
                if(existing_record != p_desc->p_record) {
                    // the passed p_desc was not valid (open, but pointer not correct), so we should return error
                    return FDS_ERR_INVALID_ARG;
                }
            } else {
                (void) nrf_atomic_u32_add(&m_pages[page].records_open, 1);
    
                // Set the record as open in the descriptor.
                p_desc->record_is_open = true;
                NRF_LOG_INFO("Opened record %d, now %d records open", p_desc->record_id, m_pages[page].records_open);
            }
    
            // Initialize p_flash_rec.
            p_flash_rec->p_header = p_header;
            p_flash_rec->p_data   = (p_desc->p_record + FDS_HEADER_SIZE);
    
            return NRF_SUCCESS;
        }
    
        // The record could not be found.
        // It either never existed or it has been deleted.
        return FDS_ERR_NOT_FOUND;
    }

    Additionally, I noticed that while fds_record_open uses nrf_atomic_u32_add, fds_record_close does not use nrf_atomic_u32_sub. Perhaps this is due to the use of CRITICAL_SECTION macros, but these are not defined to anything unless FDS_THREADS is defined. I think it should be modified like so:

    ret_code_t fds_record_close(fds_record_desc_t * const p_desc)
    {
        ret_code_t ret;
        uint16_t   page;
    
        if (p_desc == NULL)
        {
            return FDS_ERR_NULL_ARG;
        }
    
        if (record_find_by_desc((fds_record_desc_t*)p_desc, &page))
        {
            CRITICAL_SECTION_ENTER();
            if ((m_pages[page].records_open > 0) && (p_desc->record_is_open))
            {
                nrf_atomic_u32_sub(&m_pages[page].records_open, 1);
                p_desc->record_is_open = false;
    
                ret = NRF_SUCCESS;
            }
            else
            {
                ret = FDS_ERR_NO_OPEN_RECORDS;
            }
            CRITICAL_SECTION_EXIT();
        }
        else
        {
            ret = FDS_ERR_NOT_FOUND;
        }
    
        return ret;
    }

Reply
  • After several hours of debugging, I think I have figured it out. The problem is that sometimes I leave records open, and if I open the same record twice fds_record_open does not check if the record is already open, and so increments the records_open counter again. This means the count is too high, and will never return to 0 even when all records are closed, so GC can never run.

    fds_record_open should be modified either to return an error if p_desc is already open, or not to increment the records_open count if it is already open. My preference is the latter, because it allows you to open the record again without error handling and extra checking in user code. This also mirrors the behaviour of fds_record_close, which just ignores the record if not open. I think the only risk is if p_desc was not initialised to 0 by the user and so record_is_open is non-zero garbage. To avoid this, record_find could be modified to initialise record_is_open, i.e.:

    // Record found; update the descriptor.
    p_desc->record_id    = p_header->record_id;
    p_desc->p_record     = p_token->p_addr;
    p_desc->gc_run_count = m_gc.run_count
    p_desc->record_is_open = false;

    Here is my modification to fds_record_open:

    ret_code_t fds_record_open(fds_record_desc_t  * const p_desc,
                               fds_flash_record_t * const p_flash_rec)
    {
        uint16_t page;
    
        if ((p_desc == NULL) || (p_flash_rec == NULL))
        {
            return FDS_ERR_NULL_ARG;
        }
        
        uint32_t const *existing_record = p_desc->p_record;
    
        // Find the record if necessary.
        if (record_find_by_desc(p_desc, &page))
        {
            fds_header_t const * const p_header = (fds_header_t*)p_desc->p_record;
    
    #if (FDS_CRC_CHECK_ON_READ)
            if (!crc_verify_success(p_header->crc16,
                                    p_header->length_words,
                                    p_desc->p_record))
            {
                return FDS_ERR_CRC_CHECK_FAILED;
            }
    #endif
    
            // Check if the record is already open, and only increment the counter if not
            if(p_desc->record_is_open) {
                if(existing_record != p_desc->p_record) {
                    // the passed p_desc was not valid (open, but pointer not correct), so we should return error
                    return FDS_ERR_INVALID_ARG;
                }
            } else {
                (void) nrf_atomic_u32_add(&m_pages[page].records_open, 1);
    
                // Set the record as open in the descriptor.
                p_desc->record_is_open = true;
                NRF_LOG_INFO("Opened record %d, now %d records open", p_desc->record_id, m_pages[page].records_open);
            }
    
            // Initialize p_flash_rec.
            p_flash_rec->p_header = p_header;
            p_flash_rec->p_data   = (p_desc->p_record + FDS_HEADER_SIZE);
    
            return NRF_SUCCESS;
        }
    
        // The record could not be found.
        // It either never existed or it has been deleted.
        return FDS_ERR_NOT_FOUND;
    }

    Additionally, I noticed that while fds_record_open uses nrf_atomic_u32_add, fds_record_close does not use nrf_atomic_u32_sub. Perhaps this is due to the use of CRITICAL_SECTION macros, but these are not defined to anything unless FDS_THREADS is defined. I think it should be modified like so:

    ret_code_t fds_record_close(fds_record_desc_t * const p_desc)
    {
        ret_code_t ret;
        uint16_t   page;
    
        if (p_desc == NULL)
        {
            return FDS_ERR_NULL_ARG;
        }
    
        if (record_find_by_desc((fds_record_desc_t*)p_desc, &page))
        {
            CRITICAL_SECTION_ENTER();
            if ((m_pages[page].records_open > 0) && (p_desc->record_is_open))
            {
                nrf_atomic_u32_sub(&m_pages[page].records_open, 1);
                p_desc->record_is_open = false;
    
                ret = NRF_SUCCESS;
            }
            else
            {
                ret = FDS_ERR_NO_OPEN_RECORDS;
            }
            CRITICAL_SECTION_EXIT();
        }
        else
        {
            ret = FDS_ERR_NOT_FOUND;
        }
    
        return ret;
    }

Children
No Data
Related