BT HCI Core fault disappears when I set log level to debug.

I'm using the nRF9160DK as a BLE-to-LTE bridge, and I have the onboard 52840 configured as my Bluetooth controller. The 9160DK advertises as a peripheral and waits for a connection from a different board, which automatically connects. If they connect and disconnect even once, I get the following error:

00> [00:05:21.990,570] <inf> tracing: Connected: C2:5E:E8:30:50:1C (random)...
00> [00:05:21.998,229] <inf> tracing: ADV CONNECTED!
00> [00:07:17.994,842] <err> tracing: Disconnected, reason 0x08 
00> 
00> [00:07:18.002,105] <err> bt_hci_core: OpCode 0x0001 completed instead of expected 0x2039
00> ASSERTION FAIL [err == 0] @ WEST_TOPDIR/zephyr/subsys/bluetooth/host/hci_core.c:430
00>   Controller unresponsive, command opcode 0x2039 timeout with err -11
00> [00:07:38.015,686] <err> os: r0/a1:  0x00000003  r1/a2:  0x20011e00  r2/a3:  0x00000000
00> [00:07:38.024,749] <err> os: r3/a4:  0x00000003 r12/ip:  0x00000000 r14/lr:  0x0000f02b
00> [00:07:38.033,813] <err> os:  xpsr:  0x01000000
00> [00:07:38.039,306] <err> os: s[ 0]:  0xffffffff  s[ 1]:  0xffffffff  s[ 2]:  0x2001bd44  s[ 3]:  0x00000103
00> [00:07:38.050,292] <err> os: s[ 4]:  0x2001bce8  s[ 5]:  0x2001bd7c  s[ 6]:  0x2001bcc8  s[ 7]:  0x2001bcc8
00> [00:07:38.061,248] <err> os: s[ 8]:  0x00000000  s[ 9]:  0x00000001  s[10]:  0x2001bcd8  s[11]:  0x2001bcd8
00> [00:07:38.072,235] <err> os: s[12]:  0x00000000  s[13]:  0x2001bd7c  s[14]:  0x2001d4b4  s[15]:  0x2001d4a8
00> [00:07:38.083,160] <err> os: fpscr:  0x00000001
00> [00:07:38.088,623] <err> os: Faulting instruction address (r15/pc): 0x0000f03a
00> [00:07:38.096,862] <err> os: >>> ZEPHYR FATAL ERROR 3: Kernel oops on CPU 0
00> [00:07:38.104,858] <err> os: Current thread: 0x2000f4a8 (sysworkq)
00> [00:07:38.112,030] [1;31m<err> os: Halting system[0m

After I saw this, I dug into hci_core.c. It appears a 10-second wait for a response is timing out on line 430. I tried changing that timeout to 20 seconds, and it still crashed.

Then I tried increasing the BT_RX_WQ stack size. That had no effect, and thread analyzer shows it at 20% stack usage.

Next I enabled the debug logs. (CONFIG_BT_HCI_CORE_LOG_LEVEL_DBG=y) This logs a lot of information to the console, and I have log mode set to immediate. I think it probably slowed the execution speed down a bit. It has not crashed a single time with debug logs enabled, even though I've tested for a while.

This smells like a timing issue to me. Has anyone run into this before? The only result Google had for that error message was unrelated, I think. It was something about the system workqueue being interrupted for too long.

Happy to provide more info if you need it. My prj.conf is below.

# Imported from zephyr/samples/net/mqtt_publisher/prj.conf
# If this doesn't work, I blame the idiots who created this mess of a build system.

#--- SECURITY ---#
CONFIG_BOOTLOADER_MCUBOOT=y
CONFIG_BUILD_WITH_TFM=y
CONFIG_TRUSTED_EXECUTION_NONSECURE=y
CONFIG_HW_STACK_PROTECTION=y

#--- I/O ---#
# CONFIG_CONSOLE=y
# CONFIG_STDOUT_CONSOLE=y
# CONFIG_PRINTK=y
CONFIG_SERIAL=y

#--- KERNEL OPTIONS ---#
CONFIG_POSIX_API=y
# Maximum no. of file descriptors.
CONFIG_POSIX_MAX_FDS=32
CONFIG_ZVFS_OPEN_MAX=32
# Enable the ring buffer library.
CONFIG_RING_BUFFER=y
# Otherwise you can't print floating point numbers!
CONFIG_FPU=y 
CONFIG_CBPRINTF_FP_SUPPORT=y
# Enable the GPIO capability.
CONFIG_GPIO=y
CONFIG_PINCTRL=y
CONFIG_UART_USE_RUNTIME_CONFIGURE=n
#CONFIG_UART_INTERRUPT_DRIVEN=y
CONFIG_UART_LINE_CTRL=y
# CONFIG_UART_ASYNC_API=y
CONFIG_THREAD_NAME=y

#--- STACK/BUFFER SIZES ---#
CONFIG_MAIN_STACK_SIZE=1536
CONFIG_IDLE_STACK_SIZE=320
CONFIG_ISR_STACK_SIZE=2048
CONFIG_HEAP_MEM_POOL_SIZE=2048
CONFIG_NET_RX_STACK_SIZE=2048
CONFIG_NET_TX_STACK_SIZE=4096
# CONFIG_NET_MGMT_EVENT_STACK_SIZE=2048
CONFIG_NET_BUF_DATA_SIZE=2048

#--- NETWORKING ---#
CONFIG_NETWORKING=y
CONFIG_NET_SOCKETS=y
CONFIG_NET_SOCKETS_OFFLOAD=y
# Enable TLS credential management.
CONFIG_TLS_CREDENTIALS=y
# Enable IPv6 support
CONFIG_NET_IPV6=y
# Enable IPv4 support
CONFIG_NET_IPV4=y
# Enable the MQTT Lib
CONFIG_MQTT_LIB=y
CONFIG_MQTT_LIB_TLS=y
CONFIG_MQTT_KEEPALIVE=1200
CONFIG_MQTT_CLEAN_SESSION=y
# Below depends on CONFIG_POSIX_MAX_FDS
CONFIG_NET_SOCKETS_POLL_MAX=32 

#--- DNS ---#
CONFIG_DNS_RESOLVER=y
# Enable additional buffers
CONFIG_DNS_RESOLVER_ADDITIONAL_BUF_CTR=5
# Enable additional queries
CONFIG_DNS_RESOLVER_ADDITIONAL_QUERIES=2
# Enable mDNS support
CONFIG_MDNS_RESOLVER=y
CONFIG_DNS_RESOLVER_MAX_SERVERS=3
CONFIG_DNS_SERVER_IP_ADDRESSES=y
CONFIG_DNS_NUM_CONCUR_QUERIES=5
# Google DNS IPv4 and IPv6 servers
CONFIG_DNS_SERVER1="8.8.8.8"
CONFIG_DNS_SERVER2="2001:4860:4860::8888"

#--- MODEM ---#
CONFIG_MODEM=y
CONFIG_MODEM_KEY_MGMT=y
CONFIG_LTE_LINK_CONTROL=y
CONFIG_NRF_MODEM_LIB=y
CONFIG_MODEM_SOCKET=y
CONFIG_MODEM_SOCKET_PACKET_COUNT=50
# Reset the modem if it encounters a fault in its firmware.
CONFIG_NRF_MODEM_LIB_ON_FAULT_RESET_MODEM=y
CONFIG_NRF_MODEM_LIB_SENDMSG_BUF_SIZE=4096
CONFIG_NRF_MODEM_LIB_SHMEM_RX_SIZE=4096
CONFIG_NRF_MODEM_LIB_SHMEM_TX_SIZE=4096
CONFIG_NRF_MODEM_LIB_HEAP_SIZE=4096

#--- BLUETOOTH ---#
CONFIG_BT=y
CONFIG_BT_DEVICE_NAME="MyDev"

CONFIG_BT_H4=y
CONFIG_BT_HCI=y
CONFIG_BT_PERIPHERAL=y
CONFIG_BT_EXT_ADV=y
# CONFIG_BT_CTLR=y
# CONFIG_BT_LL_SW_SPLIT=y
# CONFIG_BT_LL_SOFTDEVICE=y
CONFIG_BT_DRIVERS=y
CONFIG_BT_WAIT_NOP=y
CONFIG_NRF_SW_LPUART=n

CONFIG_BT_PER_ADV=y
CONFIG_BT_PER_ADV_SYNC_TRANSFER_SENDER=y
CONFIG_BT_PER_ADV_RSP=y

CONFIG_BT_ATT_PREPARE_COUNT=2
CONFIG_BT_L2CAP_TX_BUF_COUNT=8
CONFIG_BT_L2CAP_TX_MTU=247
CONFIG_BT_BUF_ACL_RX_SIZE=250

CONFIG_BT_HCI_CORE_LOG_LEVEL_DBG=y

CONFIG_NRFX_UARTE0=y
CONFIG_NRFX_UARTE1=y
CONFIG_NRFX_UARTE2=y
CONFIG_NRFX_UARTE3=y
 
#--- DATE & TIME ---#
CONFIG_DATE_TIME=y
CONFIG_DATE_TIME_MODEM=y
CONFIG_DATE_TIME_AUTO_UPDATE=y

#--- ZCBOR LIBRARY ---#
# CONFIG_ZCBOR=y
# 
######### DEBUGGING MODE ##########
# CONFIG_DEBUG=y
# CONFIG_DEBUG_INFO=y
# CONFIG_DEBUG_THREAD_INFO=y
CONFIG_RTT_CONSOLE=y
CONFIG_USE_SEGGER_RTT=y

# CONFIG_NRF_MODEM_LIB_TRACE=y
# CONFIG_NRF_MODEM_LIB_TRACE_BACKEND_UART=y

# Debugging configuration
# CONFIG_THREAD_ANALYZER=y
# CONFIG_THREAD_ANALYZER_AUTO=y
# CONFIG_THREAD_ANALYZER_AUTO_STACK_SIZE=2048
# CONFIG_THREAD_ANALYZER_RUN_UNLOCKED=y
# CONFIG_THREAD_ANALYZER_USE_PRINTK=y

# CONFIG_TRACING=y
# CONFIG_TRACING_NETWORKING=y
# CONFIG_TRACING_NET_CORE=y
# CONFIG_TRACING_NET_SOCKETS=y
# CONFIG_TRACING_OBJECT_TRACKING=y
# 
# CONFIG_SEGGER_SYSTEMVIEW=y

CONFIG_NET_LOG=y

# CONFIG_MQTT_LOG_LEVEL_DBG=y
CONFIG_LOG=y
CONFIG_LOG_BACKEND_RTT=y
CONFIG_LOG_MODE_IMMEDIATE=y
# CONFIG_LOG_MODE_DEFERRED=n 
# If you want deferred logging, set stack sizes below.
# CONFIG_LOG_BUFFER_SIZE=4096
# CONFIG_LOG_PROCESS_THREAD_STACK_SIZE=4096

# CONFIG_NRF_MODEM_LIB_MEM_DIAG=y
# CONFIG_NRF_MODEM_LIB_MEM_DIAG_ALLOC=y
# CONFIG_NRF_MODEM_LIB_MEM_DIAG_DUMP=y

# Add asserts
# CONFIG_ASSERT=y
# CONFIG_ASSERT_VERBOSE=y
# CONFIG_ASSERT_NO_COND_INFO=n
# CONFIG_ASSERT_NO_MSG_INFO=n
# CONFIG_THREAD_NAME=y
# CONFIG_STACK_SENTINEL=y

CONFIG_WATCHDOG=y

Related