nrf modem: 1.3.4
sdk: 2.3.0
Custom board.
We have an asset tracker that caches events when out of connectivity zone. We can accumulate 10-100 messages depending upon the duration. Once in a while, when emptying the cache the socket/mqtt does not return and after 15 minutes the watchdog would reset the CPU. So overcome this challenge, it set a timeout on the socket to 15 seconds.
err = setsockopt(client.transport.tls.sock, SOL_SOCKET , SO_SNDTIMEO, &timeout_mqtt, sizeof(timeout_mqtt)); if (err) { #ifdef DEBUG_PRINT_MQTT printf("failed to set send timeout. err: %d. errno: %d\n", err, errno); #endif }2023-12-06 - modem trace on fw 91-22.log
This is allowing our application to continue and not reset but after 15 seconds error -11 is return followed by closing of the socket and mqtt connection. This is happening around the same time at the 16/17th message (each message is sent with a second delay in between to allow mqtt to do its thing).
00> CACHE, 12-07T 2:53:58.517Z, INFO, Sending event: 16 of 99. eventCounter: 120 00> MQTT, 12-07T 2:53:58.518Z, INFO, Publishing mqtt msg id: 1037, 00> Publishing 00> [00:07:55.668,853] <inf> nrf_modem: send() fd 0x0, len 178, blocking 00> [00:07:55.676,086] <inf> nrf_modem: RPC_IP_STATUS_NTF fd 0xff, RPC flow ctrl on 00> [00:07:55.876,098] <inf> nrf_modem: RPC_IP_SEND_RES fd 0x0, result RPC_IP_ERR_NO_MEM 00> [00:08:10.876,739] <err> net_mqtt: Transport write failed, err_code = -11, closing connection 00> [00:08:10.877,258] <inf> net_mqtt_sock_tls: Closing socket 0 00> [00:08:10.877,685] <dbg> net_sock: z_impl_zsock_close: (main): close: ctx=0x2001108c, fd=0 00> [00:08:10.878,234] <inf> nrf_modem: close() fd 0x0 00> [00:08:10.885,406] <inf> nrf_modem: RPC_IP_CLOSE_RES fd 0x0, result RPC_IP_ERR_OK 00> [00:08:10.885,894] <inf> nrf_modem: poll() fd 0x0, nval:1 00> MQTT, 12-07T 2:54:13.740Z, ERROR, mqtt_evt_handler, client disconnected: -11 00> MQTT, 12-07T 2:54:13.741Z, ERROR, mqttSend, publish failed. mqttState: 0, err: -11, errno: 11
even though it happens at "sending event 16 of 99" everytime, it does not always happen. Sometimes it will happen within 5 mintues of running the code, other times it could be hours. LTE connectivity is not changed during this time as I am doing this simulated caching test from my work desk.
Once the socket and connection is closed by the mqtt.c, the next attempt to connect on mqtt results in errno -12 (ENOMEM).
Summary of issues:
1. Mqtt data transfer freezer without timeout on socket (resulting in watchdog reset after 15 minutes)
2. Mqtt data transfer returns -11 with timeout on socket following by -12 (ENOMEM) for next mqtt connection attempts
3. There is any way to get prints as well as trace over RTT? I collected trace but i am not sure if this event was captured in it.
Here are my proj.conf settings and I have also attached a trace.
# # Copyright (c) 2019 Nordic Semiconductor ASA # # SPDX-License-Identifier: LicenseRef-BSD-5-Clause-Nordic # General config CONFIG_NEWLIB_LIBC=y CONFIG_NEWLIB_LIBC_NANO=n #disable for 64-bit long long support CONFIG_NEWLIB_LIBC_FLOAT_PRINTF=y CONFIG_HW_STACK_PROTECTION=y #added on 2023-11-24 to detect stack overflow CONFIG_PM=y # # Required to disable default behavior of deep sleep on timeout CONFIG_PM_DEVICE=y #to implement our own error handler so we can store critical data before rebooting CONFIG_RESET_ON_FATAL_ERROR=n # Main CONFIG_NCS_SAMPLES_DEFAULTS=n #we want to be able to reboot CONFIG_REBOOT=y CONFIG_FPU=y #added on 2023-11-27 to deal with ENOMEM error on mqtt connect #after mqtt send returns -11 CONFIG_NRF_MODEM_LIB_HEAP_SIZE=4096 CONFIG_NRF_MODEM_LIB_SENDMSG_BUF_SIZE=2048 CONFIG_POSIX_MAX_FDS=10 CONFIG_NET_BUF_RX_COUNT=32 CONFIG_NET_BUF_TX_COUNT=32 CONFIG_NET_PKT_RX_COUNT=16 CONFIG_NET_PKT_TX_COUNT=16 CONFIG_HEAP_MEM_POOL_SIZE=28672 CONFIG_MAIN_STACK_SIZE=28672 CONFIG_ISR_STACK_SIZE=10240 CONFIG_SYSTEM_WORKQUEUE_STACK_SIZE=12288 #Watchdog CONFIG_WATCHDOG=y # Following 2 are added to avoid k_timeout_t error CONFIG_DATE_TIME=y #nrfx drivers #set all to no for power measurement CONFIG_SERIAL=y CONFIG_UART_INTERRUPT_DRIVEN=y #for RTT # disabled for RTT Modem trace CONFIG_USE_SEGGER_RTT=y CONFIG_RTT_CONSOLE=y CONFIG_LOG_BACKEND_UART=n CONFIG_THREAD_NAME=y #dont halt but reset the system on panic CONFIG_TFM_HALT_ON_CORE_PANIC=n # for modem trace. not working as of 2023-12-06 CONFIG_LOG_BACKEND_RTT=n CONFIG_NRF_MODEM_LIB_TRACE_ENABLED=y CONFIG_NRF_MODEM_LIB_TRACE_BACKEND_RTT=y # Network CONFIG_NETWORKING=y CONFIG_NET_SOCKETS=y CONFIG_NET_SOCKETS_OFFLOAD=y CONFIG_NET_NATIVE=n CONFIG_NET_SOCKETS_POSIX_NAMES=y #Zephyr http api - sockets CONFIG_HTTP_CLIENT=y # SMS CONFIG_SMS=y # nRF modem library # Modem library CONFIG_NRF_MODEM_LIB=y CONFIG_NRF_MODEM_LIB_SYS_INIT=n CONFIG_MODEM_INFO=y #for nrf temperature # LTE link control #use 60 seconds timeout CONFIG_LTE_NETWORK_TIMEOUT=45 CONFIG_LTE_LINK_CONTROL=y CONFIG_LTE_AUTO_INIT_AND_CONNECT=n CONFIG_LTE_NETWORK_MODE_LTE_M=y #Bands CONFIG_LTE_LOCK_BANDS=y # BANDS FOR US/CAN: 2, 4, 5, 12, 13, 17, 25, 26. BANDS FOR CHINA: 8 # BANDS FOR CHILE: 3, 5 CONFIG_LTE_LOCK_BAND_MASK="0000010001100000011110" CONFIG_LTE_LC_TAU_PRE_WARNING_NOTIFICATIONS=y #psm #101 = minuts, 1010 = 10 (10 minutes PSM) #001 - 1 hours 00001 (1) = 1 hours, 01010 = 10 hours, 00110 = 6 hours, 11010 = 26 hours CONFIG_LTE_PSM_REQ_RPTAU="00100110" #000 - 2 second 00001 (2) = 2 seconds on time, 00011 = 6 second on time CONFIG_LTE_PSM_REQ_RAT="00000011" # Library for buttons and LEDs CONFIG_DK_LIBRARY=n #flash with nvs CONFIG_FLASH=y CONFIG_FLASH_PAGE_LAYOUT=y CONFIG_NVS=y # CONFIG_NVS_LOG_LEVEL_DBG=y CONFIG_MPU_ALLOW_FLASH_WRITE=y CONFIG_SOC_FLASH_NRF_EMULATE_ONE_BYTE_WRITE_ACCESS=y # Image manager CONFIG_IMG_MANAGER=y CONFIG_IMG_ERASE_PROGRESSIVELY=y # FOTA library CONFIG_FOTA_DOWNLOAD=y CONFIG_FOTA_DOWNLOAD_PROGRESS_EVT=y # Download client CONFIG_DOWNLOAD_CLIENT=y # CONFIG_DOWNLOAD_CLIENT_TLS=y CONFIG_DOWNLOAD_CLIENT_STACK_SIZE=4096 CONFIG_DOWNLOAD_CLIENT_BUF_SIZE=4096 CONFIG_DOWNLOAD_CLIENT_HTTP_FRAG_SIZE_2048=y CONFIG_NET_SOCKETS_TLS_SET_MAX_FRAGMENT_LENGTH=n # DFU Target CONFIG_DFU_TARGET=y # Modem key management CONFIG_MODEM_KEY_MGMT=y # Application Upgrade support CONFIG_BOOTLOADER_MCUBOOT=y # MQTT CONFIG_MQTT_LIB=y CONFIG_MQTT_LIB_TLS=y #17 minutes, in seconds CONFIG_MQTT_KEEPALIVE=1020 CONFIG_MQTT_CLEAN_SESSION=n CONFIG_MQTT_TLS_SESSION_CACHING=y # GPIO CONFIG_GPIO=y # ADC CONFIG_ADC=y CONFIG_ADC_NRFX_SAADC=y CONFIG_ADC_ASYNC=n CONFIG_I2C_NRFX=y CONFIG_I2C=y # CONFIG_NRFX_TWI=y CONFIG_AT_MONITOR=y
and my mqttSend function:
int mqttSend()// uint8_t *data_to_send , uint16_t len) { int err = 0; uint8_t pubAckCheckCounter = 0; mydbg_MqttEvents.noOfMqttSendAttempted++; // err = mqttDataPublish(&client,MQTT_QOS_1_AT_LEAST_ONCE,lteMsgString, myLteMsgStringLength); err = mqttDataPublish(&client,MQTT_QOS_0_AT_MOST_ONCE,lteMsgString, myLteMsgStringLength); if (err != 0) { #ifdef DEBUG_PRINT_MQTT printf(" MQTT, %s, ERROR, mqttSend, publish failed. mqttState: %d, err: %d, errno: %d\n", myFormattedDateTime(), mqttIsConnected, err, errno); #endif mydbg_MqttEvents.noOfFailedMqttSends++; mydbg_MqttEvents.lastMqttSendError = err; lastMqttCommFailedWithEAGAIN = true; //we came here because mqttDataPublish returned -11 and closed the socket and connection. //just restart the thread. //Also tried without restarting thread and the effect on being able to create the subsequent mqtt //connection is the same with -12 (ENOMEM) stopMqttThread(); return ERROR_FAILURE; } else { lastMqttCommFailedWithEAGAIN = false; mydbg_MqttEvents.noOfMqttMsgsSent++; #ifdef DEBUG_PRINT_MQTT printf(" MQTT, %s, INFO, mqttSend, sent the MQTT data\n", myFormattedDateTime()); #endif #if defined(UNITTEST_MQTT) && defined(ENABLE_PRINTS) if (enum_myMqttUntitTest == unittest_mqtt_ignorePubAck) { return ERROR_SUCCESS; } #endif return ERROR_SUCCESS; } return ERROR_FAILURE; }
Thank you for your continued support
Regards
Noaman