Hardware
- Module: Fanstel WT02P40P (dual-band 2.4 GHz / 5 GHz)
- SoC: nRF52 + Wi-Fi coprocessor (WT02 series)
Software
- nRF Connect SDK Version: v2.6.2
- OS: Zephyr RTOS (default networking stack)
Problem Description
We are observing intermittent TLS/WebSocket connection issues during long runtime testing while the device is connected over Wi-Fi.
After extended operation, the firmware occasionally gets stuck during secure socket connection establishment at the following API:
connect(sock, (struct sockaddr *)&server, sizeof(struct sockaddr_in));
The issue is observed in both:
- TLS server connection flow
- WebSocket secure connection flow
Observed Behavior
- Device operates normally for several hours/days.
- The issue is primarily observed during the periodic HubStatus cycle.
- During the HubStatus flow, the firmware performs:
  WebSocket Stop → TLS Connect → HTTP APIs → Disconnect → WebSocket Restart
- During this reconnect sequence, the system intermittently gets stuck inside the connect() API.
- In some cases the connection eventually fails and retries continue.
- In other cases the thread appears blocked for an extended duration, causing communication instability and occasional watchdog reset/reboot.
Current Workaround
To avoid blocking the main work-queue thread, we implemented a dedicated thread for the connect() API handling along with timeout monitoring and watchdog feeding.
The devices are currently under long runtime testing to validate whether this workaround fully resolves the issue.
We would like support from the Nordic team to understand:
- Possible reasons for connect() blocking during the TLS/WebSocket handshake on Zephyr/nRF Connect SDK.
- Whether this behaviour is expected with the Wi-Fi/TLS stack.
- Recommended socket handling/recovery mechanism for long-running TLS/WebSocket applications.
- Any known limitations, fixes, or configuration recommendations for NCS v2.6.2 related to TLS socket stability.
Relevant source files
/******************** Include header files ********************/
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <zephyr/kernel.h>
#include <zephyr/logging/log.h>
#include <zephyr/net/socket.h>
#include <zephyr/net/tls_credentials.h>
#include <zephyr/net/websocket.h>
#include <zephyr/posix/fcntl.h>
/* -- User defined Includes -- */
#include "NK_WM_WebsocketManager.h"
#include "NK_WM_WifiManager.h"
#include "NK_WD_WatchDog.h"
#include "NK_TS_TaskScheduler.h"
#include "NK_AH_AdcHandler.h"
#include "NK_GPIO_Handler.h"
#include "NK_Cloud_HubStatus.h"
#include "NK_Net_Conn.h"
#include "NK_Net_Manager.h"
#include "NK_Cloud_ObjectTime.h"
#include "NK_BLE_CommandHandler.h"
#include "NK_FM_FlashManager.h"
#include "NK_WD_WatchDog.h"
/* -- User defined Includes -- */
/******************** Include header files ********************/
/* Log module for the WebSocket manager; messages at INFO level and above. */
LOG_MODULE_REGISTER(nk_ws, LOG_LEVEL_INF);
/* CONFIG */
/* Host used for DNS resolution, TLS hostname verification and the WS request. */
#define WS_HOSTNAME "api.nexkey.com"
/* NOTE(review): not referenced in the visible code — ws_resolve() passes the literal "443". */
#define WS_PORT 443
/* Request path used for the WebSocket upgrade. */
#define WS_URL "/"
/* TLS credential tag under which the CA certificate is registered. */
#define WS_TLS_SEC_TAG 1
/* Size of the receive buffer; one byte is reserved for a NUL terminator by ws_rx_loop(). */
#define WS_RECV_BUF_SIZE 2048
/* Size of the scratch buffer handed to websocket_connect() via req.tmp_buf. */
#define WS_TMP_BUF_SIZE 256
/* Attempt limit shared by ws_tcp_connect() and ws_connect(). */
#define WS_HANDSHAKE_MAX_RETRY 3
/* STATE */
/* TLS socket created by ws_tcp_connect(); -1 when closed. */
static int tcp_sock = -1;
/* Descriptor returned by websocket_connect(); -1 when closed. */
static int ws_sock = -1;
/* Set by ws_connect() on success, cleared by NK_WS_Stop(). */
static bool ws_connected = false;
/* Server address filled in by ws_resolve() (IPv4 only). */
static struct sockaddr_storage server_addr;
/* Buffer for one received WebSocket message. */
static uint8_t ws_rx_buf[WS_RECV_BUF_SIZE];
/* Scratch buffer for the WebSocket handshake. */
static uint8_t tmp_buf[WS_TMP_BUF_SIZE];
/* Last "objectId" value found in a hub command; exposed via hub_request_objectId(). */
char SetHubReqObjectId[32] = {0};
/* CERTIFICATE */
/* CA certificate defined in another translation unit; its size is not known here. */
extern const unsigned char ca_certificate[];
/* ========================= CONNECT THREAD STATE ========================= */
/*
 * Why connect() runs in its own thread
 * ------------------------------------
 * The socket is created with IPPROTO_TLS_1_2, so connect() covers both the
 * TCP connection and the TLS handshake and can keep its caller blocked for
 * several seconds. Running it in a dedicated thread lets the calling thread
 * keep feeding the watchdog every WS_CONNECT_THREAD_POLL_MS while it waits.
 *
 * The waiting side gives up after WS_CONNECT_THREAD_TIMEOUT_MS; see
 * ws_tcp_connect() for how a thread that is still blocked at that point is
 * handled.
 *
 * NOTE(review): the claim that O_NONBLOCK has no effect on the TLS handshake
 * comes from the original author and is not verifiable from this file.
 */
/* How long to wait for connect() to finish (covers worst-case TLS on weak WiFi) */
#define WS_CONNECT_THREAD_TIMEOUT_MS 15000
#define WS_CONNECT_THREAD_POLL_MS 100 /* watchdog feed interval */
/*
 * NOTE(review): the whole TLS handshake executes on this stack. Confirm the
 * high-water mark on target (e.g. with the thread analyzer) — TODO confirm
 * that 3072 bytes leaves headroom.
 */
#define WS_CONNECT_THREAD_STACK_SIZE 3072
#define WS_CONNECT_THREAD_PRIORITY 5
/* State values set by the connect thread, read by the calling thread */
typedef enum
{
WS_TC_IDLE = 0, /* not started yet */
WS_TC_RUNNING = 1, /* connect() is executing */
WS_TC_OK = 2, /* connect() returned 0 */
WS_TC_FAIL = 3, /* connect() returned error */
} ws_tc_state_t;
/* Current ws_tc_state_t value, shared between the two threads. */
static atomic_t ws_tc_state = ATOMIC_INIT(WS_TC_IDLE);
/* errno captured by the connect thread when connect() fails; 0 otherwise. */
static int ws_tc_errno = 0;
/* Thread object + stack — statically allocated (no heap needed) */
static struct k_thread ws_tc_thread;
K_THREAD_STACK_DEFINE(ws_tc_stack, WS_CONNECT_THREAD_STACK_SIZE);
/* Semaphore: connect thread signals this when connect() returns */
static K_SEM_DEFINE(ws_tc_done_sem, 0, 1);
/* Copy of the most recent command text, used by is_duplicate_cmd(). */
static char last_cmd[512];
/**
 * Copy the string value of `"key":"value"` out of a JSON text.
 *
 * This is a plain substring search, not a JSON parser: it matches the first
 * occurrence of `"<key>":"` (no whitespace tolerated around the colon) and
 * copies everything up to the next double quote. Escaped quotes inside the
 * value are not understood.
 *
 * @param msg      NUL-terminated JSON text to search.
 * @param key      Key name without quotes.
 * @param out      Destination buffer; always NUL-terminated on success.
 * @param out_size Size of @p out in bytes; values that do not fit are truncated.
 * @return true when the key was found and a value was copied, false otherwise
 *         (including NULL arguments, a zero-sized buffer, or a key too long
 *         for the internal search pattern).
 */
static bool json_get_string(const char *msg, const char *key, char *out, size_t out_size)
{
    char pattern[32];
    const char *start;
    const char *end;
    size_t len;
    int written;

    /* out_size == 0 would make "out_size - 1" wrap around below. */
    if (msg == NULL || key == NULL || out == NULL || out_size == 0) {
        return false;
    }

    /* A truncated pattern would silently match a different key. */
    written = snprintf(pattern, sizeof(pattern), "\"%s\":\"", key);
    if (written < 0 || (size_t)written >= sizeof(pattern)) {
        return false;
    }

    start = strstr(msg, pattern);
    if (start == NULL) {
        return false;
    }
    start += (size_t)written;

    end = strchr(start, '"');
    if (end == NULL) {
        return false;
    }

    len = (size_t)(end - start);
    if (len >= out_size) {
        len = out_size - 1;
    }
    memcpy(out, start, len);
    out[len] = '\0';
    return true;
}
/**
 * Copy the value of the LAST `"objectId":"..."` occurrence in a JSON text.
 *
 * Plain substring search (no JSON parsing, no whitespace tolerance, no
 * escape handling).
 *
 * @param msg      NUL-terminated JSON text to search.
 * @param out      Destination buffer; always NUL-terminated on success.
 * @param out_size Size of @p out in bytes; longer values are truncated.
 * @return true when an objectId value was copied, false otherwise (including
 *         NULL arguments or a zero-sized buffer).
 */
static bool json_get_last_object_id(const char *msg, char *out, size_t out_size)
{
    static const char key[] = "\"objectId\":\"";
    const size_t key_len = sizeof(key) - 1;
    const char *value = NULL;
    const char *scan;
    const char *end;
    size_t len;

    /* out_size == 0 would make "out_size - 1" wrap around below. */
    if (msg == NULL || out == NULL || out_size == 0) {
        return false;
    }

    /* Walk every match and remember where the value of the last one starts. */
    for (scan = strstr(msg, key); scan != NULL; scan = strstr(scan, key)) {
        scan += key_len;
        value = scan;
    }
    if (value == NULL) {
        return false;
    }

    end = strchr(value, '"');
    if (end == NULL) {
        return false;
    }

    len = (size_t)(end - value);
    if (len >= out_size) {
        len = out_size - 1;
    }
    memcpy(out, value, len);
    out[len] = '\0';
    return true;
}
/**
 * Read the integer value of `"key":<number>` from a JSON text.
 *
 * Plain substring search for `"<key>":` followed by a decimal number (leading
 * whitespace after the colon is accepted).
 *
 * @param msg   NUL-terminated JSON text to search.
 * @param key   Key name without quotes.
 * @param value Receives the parsed number; left untouched on failure.
 * @return true only when the key exists AND is followed by a decimal integer
 *         that fits in an int. Non-numeric values (null, strings, ...) and
 *         out-of-range numbers return false.
 */
static bool json_get_int(const char *msg, const char *key, int *value)
{
    char pattern[32];
    const char *start;
    char *end = NULL;
    long parsed;
    int written;

    if (msg == NULL || key == NULL || value == NULL) {
        return false;
    }

    /* A truncated pattern would silently match a different key. */
    written = snprintf(pattern, sizeof(pattern), "\"%s\":", key);
    if (written < 0 || (size_t)written >= sizeof(pattern)) {
        return false;
    }

    start = strstr(msg, pattern);
    if (start == NULL) {
        return false;
    }
    start += (size_t)written;

    /* strtol, unlike atoi, tells "no digits" and "overflow" apart from 0. */
    errno = 0;
    parsed = strtol(start, &end, 10);
    if (end == start || errno == ERANGE || parsed < INT_MIN || parsed > INT_MAX) {
        return false;
    }

    *value = (int)parsed;
    return true;
}
/**
 * Tell whether @p cmd is identical to the previously seen command, and
 * remember @p cmd as the new "previous" command when it is not.
 *
 * last_cmd can only hold the first sizeof(last_cmd) - 1 bytes of a message,
 * so comparing against it alone can never match a longer message. The total
 * length and a 32-bit FNV-1a hash of the full text are therefore tracked as
 * well; a message counts as a duplicate when length, hash and the stored
 * prefix all agree.
 *
 * @param cmd NUL-terminated command text (NULL is never a duplicate).
 * @return true when @p cmd repeats the previous command.
 */
bool is_duplicate_cmd(const char *cmd)
{
    /* Initial values describe the empty string that last_cmd starts out as. */
    static size_t last_len = 0;
    static uint32_t last_hash = 2166136261u;

    if (cmd == NULL) {
        return false;
    }

    const size_t len = strlen(cmd);
    const size_t prefix = (len < sizeof(last_cmd) - 1) ? len : sizeof(last_cmd) - 1;

    /* FNV-1a over the whole message, including the part last_cmd cannot hold. */
    uint32_t hash = 2166136261u;
    for (size_t i = 0; i < len; i++) {
        hash ^= (uint8_t)cmd[i];
        hash *= 16777619u;
    }

    if (len == last_len && hash == last_hash && memcmp(last_cmd, cmd, prefix) == 0) {
        LOG_INF("Duplicate WS command ignored");
        return true;
    }

    memcpy(last_cmd, cmd, prefix);
    last_cmd[prefix] = '\0';
    last_len = len;
    last_hash = hash;
    return false;
}
/**
 * Push the already-prepared hub-request status to the cloud over a
 * short-lived TLS session, then restore the WebSocket and the periodic
 * HubStatus timer.
 *
 * Sequence: stop HubStatus timer -> stop WebSocket -> DNS resolve -> TLS
 * setup -> TLS connect -> status request(s) -> log/hub-status requests ->
 * disconnect -> restart WebSocket -> restart timer.
 *
 * On any failure the function hands over to NK_NET_Manager_Start() and
 * returns without restarting the WebSocket or the timer.
 * NOTE(review): the original code skipped NK_NET_Manager_Start() when TLS
 * setup failed, leaving WebSocket and timer stopped with nothing scheduled to
 * recover; it is called here for consistency with the other failure paths —
 * confirm that this is the intended recovery entry point.
 *
 * @param report_idle When true, a second status request carrying
 *                    HUB_REQ_RETURNED_TO_IDLE is sent after the first one.
 */
static void ws_publish_hub_request_result(bool report_idle)
{
    TS_StopHubStatusManageTimer();
    NK_WS_Stop(); /* the WebSocket is closed before the HTTPS session is opened */

    /* DNS resolve */
    LOG_INF("HubStatus: resolve server");
    if (NK_NET_Resolve())
    {
        NK_NET_Manager_Start();
        LOG_ERR("HubStatus failed → DNS resolve");
        return;
    }

    /* TLS setup */
    LOG_INF("HubStatus: TLS setup");
    if (NK_NET_TLS_Setup())
    {
        LOG_ERR("HubStatus failed TLS setup");
        NK_NET_Manager_Start();
        return;
    }

    /* TLS connect */
    LOG_INF("HubStatus: TLS connect");
    if (NK_NET_Connect())
    {
        LOG_ERR("HubStatus failed TLS connect");
        NK_NET_Manager_Start();
        return;
    }

    NK_Cloud_SetHubStatusRequest();
    if (report_idle)
    {
        prepareSetHubReqStatusPayload(HUB_REQ_RETURNED_TO_IDLE, HUB_REQ_SUCCESSS_STATES_LEN);
        NK_Cloud_SetHubStatusRequest();
    }
    NK_Cloud_LogUnlockRequestAPI();
    NK_Cloud_HubStatus_Request();
    NK_NET_Disconnect();

    NK_WS_Start(); /* reconnect the WebSocket */
    TS_StartHubStatusManageTimer(K_MINUTES(HUBSTATUS_CYCLE_TIME), K_MINUTES(HUBSTATUS_CYCLE_TIME)); /* restart periodic hubstatus timer */
}

/**
 * Handle one received WebSocket text message that may carry a hub command.
 *
 * Only messages containing `"op":"create"` that are not a repeat of the
 * previous message are processed. `"status":10` is treated as UNLOCK and
 * `"status":11` as LOCK; anything else is ignored after the fields have been
 * logged.
 *
 * Side effects: updates SetHubReqObjectId, may drive the relay, and runs the
 * full (blocking) cloud report sequence on the calling thread.
 *
 * @param msg NUL-terminated message text; NULL or very short input is ignored.
 */
static void ws_handle_command(const char *msg)
{
    if (msg == NULL || strlen(msg) < 10)
        return;
    if (!strstr(msg, "\"op\":\"create\""))
        return;
    if (is_duplicate_cmd(msg))
        return;

    LOG_INF("WS Hub command received");

    /* -------- Extract fields from packet -------- */
    char lockId[16] = {0};
    int duration = 0;
    uint32_t unlockDuration = 0;

    json_get_string(msg, "lockId", lockId, sizeof(lockId));
    json_get_last_object_id(msg, SetHubReqObjectId, sizeof(SetHubReqObjectId));
    /* json_get_int() writes an int, so parse into one and convert explicitly;
     * a missing or negative duration is treated as 0. */
    if (json_get_int(msg, "duration", &duration) && duration > 0)
    {
        unlockDuration = (uint32_t)duration;
    }

    LOG_INF("LockID: %s", lockId);
    LOG_INF("ObjectID: %s", SetHubReqObjectId);
    LOG_INF("unlockDuration: %u", (unsigned int)unlockDuration);

    if (strstr(msg, "\"status\":10"))
    {
        LOG_INF("UNLOCK command received");
        /* Door-state sensing (NK_AdcReading()) is disabled; the state is
         * pinned so that the relay branch is always taken. */
        uint8_t door_state = 0;
        if (door_state != 1)
        {
            LOG_INF("Triggering relay");
            settings.business_hour_relock = unlockDuration;
            UnLockCommand_Handle();
            prepareSetHubReqStatusPayload(HUB_REQ_UNLOCKED, HUB_REQ_SUCCESSS_STATES_LEN);
            ws_publish_hub_request_result(true);
        }
        else
        {
            LOG_INF("Door already unlocked");
            prepareSetHubReqStatusPayload(HUB_REQ_UNLOCKED_ALREADY, HUB_REQ_SUCCESSS_STATES_LEN);
            ws_publish_hub_request_result(false);
        }
    }
    /* Lock */
    else if (strstr(msg, "\"status\":11"))
    {
        LOG_INF("LOCK command received");
        /* Door-state sensing (NK_AdcReading()) is disabled; the state is
         * pinned so that the lock branch is always taken. */
        uint8_t door_state = 1;
        if (door_state != 2)
        {
            LOG_INF("Door LOCK triggered");
            LockCommand_Handle();
            prepareSetHubReqStatusPayload(HUB_REQ_LOCKED, HUB_REQ_SUCCESSS_STATES_LEN);
            ws_publish_hub_request_result(true);
        }
        else
        {
            LOG_INF("Door already locked");
            prepareSetHubReqStatusPayload(HUB_REQ_LOCKED_ALREADY, HUB_REQ_SUCCESSS_STATES_LEN);
            ws_publish_hub_request_result(false);
        }
    }
}
/* ========================= TLS ========================= */
/**
 * Register the CA certificate under WS_TLS_SEC_TAG.
 *
 * The certificate is declared `extern` with unknown size, so its length is
 * derived with strlen(). This assumes ca_certificate is NUL-terminated PEM
 * text — TODO confirm; for DER data strlen() would be meaningless. For PEM
 * the length handed to the TLS stack must include the terminating NUL, hence
 * the "+ 1".
 *
 * @return 0 on success or when the credential is already registered,
 *         otherwise the negative error from tls_credential_add().
 */
static int ws_tls_setup(void)
{
    const size_t cert_len = strlen((const char *)ca_certificate) + 1;

    int err = tls_credential_add(
        WS_TLS_SEC_TAG,
        TLS_CREDENTIAL_CA_CERTIFICATE,
        ca_certificate,
        cert_len);

    /* Another module may already have registered the same tag. */
    if (err == -EEXIST)
        return 0;

    if (err < 0)
    {
        LOG_ERR("WS TLS credential failed (%d)", err);
        return err;
    }

    LOG_INF("WS TLS ready");
    return 0;
}
/* ========================= DNS ========================= */
/**
 * Resolve WS_HOSTNAME to an IPv4 address and store it in server_addr.
 *
 * Up to three attempts are made; every failed attempt is followed by a 2 s
 * pause during which the watchdog is fed every 50 ms.
 *
 * @return 0 on success, -EAGAIN when all attempts failed.
 */
static int ws_resolve(void)
{
    struct addrinfo hints = {
        .ai_family = AF_INET,
        .ai_socktype = SOCK_STREAM};
    int err;
    int retries = 3;

    for (int attempt = 1; attempt <= retries; attempt++)
    {
        /* Must start out NULL: it is inspected below even when getaddrinfo()
         * fails and may not have written to it. */
        struct addrinfo *result = NULL;

        watchdog_feed_periodic();
        err = getaddrinfo(WS_HOSTNAME, "443", &hints, &result);
        if (!err && result)
        {
            memcpy(&server_addr, result->ai_addr, sizeof(struct sockaddr_in));
            freeaddrinfo(result);
            LOG_INF("WS DNS OK (attempt %d)", attempt);
            return 0;
        }

        LOG_WRN("WS DNS failed (attempt %d/%d), err=%d", attempt, retries, err);
        if (result)
        {
            freeaddrinfo(result);
            result = NULL;
        }

        /* watchdog-safe delay */
        int delay = 0;
        while (delay < 2000)
        {
            watchdog_feed_periodic();
            k_sleep(K_MSEC(50));
            delay += 50;
        }
    }

    LOG_ERR("WS DNS failed after %d attempts", retries);
    return -EAGAIN;
}
/*
 * Disabled (compiled out) earlier version of ws_resolve(): a single
 * getaddrinfo() attempt without retries or watchdog feeding. Kept for
 * reference only; the preprocessor never passes it to the compiler.
 */
#if 0
static int ws_resolve(void)
{
struct addrinfo hints = {
.ai_family = AF_INET,
.ai_socktype = SOCK_STREAM};
struct addrinfo *result;
int err = getaddrinfo(WS_HOSTNAME, "443", &hints, &result);
if (err || !result)
{
LOG_ERR("WS DNS failed");
return -EIO;
}
memcpy(&server_addr, result->ai_addr, sizeof(struct sockaddr_in));
freeaddrinfo(result);
LOG_INF("WS DNS OK");
return 0;
}
#endif
/* ========================= TCP CONNECT THREAD ========================= */
/**
 * Entry point of the dedicated connect thread.
 *
 * Calls connect() on tcp_sock towards server_addr in blocking mode. Because
 * the socket is a TLS socket, the call only returns once connection setup
 * (including the handshake) has finished or failed, so all of that blocking
 * happens here instead of in the thread that spawned us.
 *
 * When connect() returns, the thread
 *   1. stores the outcome in ws_tc_state (WS_TC_OK / WS_TC_FAIL) and, on
 *      failure, the errno in ws_tc_errno,
 *   2. gives ws_tc_done_sem to wake the waiting thread,
 *   3. returns, which ends the thread.
 *
 * NOTE(review): SO_RCVTIMEO/SO_SNDTIMEO are set on the socket before this
 * thread is spawned; whether they bound the handshake itself is not visible
 * from this file.
 *
 * @param a Unused.
 * @param b Unused.
 * @param c Unused.
 */
static void ws_tc_thread_fn(void *a, void *b, void *c)
{
    ARG_UNUSED(a);
    ARG_UNUSED(b);
    ARG_UNUSED(c);

    LOG_INF("[WS-TC Thread] connect() started — TLS handshake will happen here");

    int err = connect(tcp_sock,
                      (struct sockaddr *)&server_addr,
                      sizeof(struct sockaddr_in));
    /* Capture errno right away: any later call (logging included) may
     * overwrite it. */
    int connect_errno = errno;

    if (err == 0)
    {
        LOG_INF("[WS-TC Thread] connect() SUCCESS — TCP + TLS done");
        ws_tc_errno = 0;
        atomic_set(&ws_tc_state, WS_TC_OK);
    }
    else
    {
        ws_tc_errno = connect_errno;
        LOG_ERR("[WS-TC Thread] connect() FAILED errno=%d", connect_errno);
        atomic_set(&ws_tc_state, WS_TC_FAIL);
    }

    /* Wake up the calling thread */
    k_sem_give(&ws_tc_done_sem);
}
/* ========================= TCP CONNECT ========================= */
static int ws_tcp_connect(void)
{
int attempt;
/*
* SO_SNDTIMEO on the socket acts as a BACKUP timeout inside the
* TLS stack. If the network layer hangs, this causes connect() to
* return with an error after 12s instead of blocking forever.
* Our thread timeout (15s) is longer — so the socket timeout fires
* first, which is cleaner than aborting the thread mid-TLS.
*/
struct timeval sock_timeout = {.tv_sec = 12, .tv_usec = 0};
for (attempt = 1; attempt <= WS_HANDSHAKE_MAX_RETRY; attempt++)
{
watchdog_feed_periodic();
/* ── Step 1: Create socket ── */
tcp_sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TLS_1_2);
if (tcp_sock < 0)
{
LOG_ERR("WS socket create fail errno=%d (attempt %d)", errno, attempt);
/* Feed watchdog during retry wait */
int d = 0;
while (d < 2000)
{
watchdog_feed_periodic();
k_sleep(K_MSEC(100));
d += 100;
}
continue;
}
/* ── Step 2: TLS settings (unchanged from original) ── */
sec_tag_t sec_tag_list[] = {WS_TLS_SEC_TAG};
setsockopt(tcp_sock, SOL_TLS, TLS_SEC_TAG_LIST, sec_tag_list, sizeof(sec_tag_list));
setsockopt(tcp_sock, SOL_TLS, TLS_HOSTNAME, WS_HOSTNAME, strlen(WS_HOSTNAME));
/* ── Step 3: Socket timeouts ── */
/* These apply to send/recv AFTER connection, and act as backup during TLS */
setsockopt(tcp_sock, SOL_SOCKET, SO_RCVTIMEO, &sock_timeout, sizeof(sock_timeout));
setsockopt(tcp_sock, SOL_SOCKET, SO_SNDTIMEO, &sock_timeout, sizeof(sock_timeout));
/* ── Step 4: Reset thread state before spawning ── */
k_sem_reset(&ws_tc_done_sem);
atomic_set(&ws_tc_state, WS_TC_RUNNING);
ws_tc_errno = 0;
LOG_INF("WS TCP connect: spawning thread (attempt %d/%d)",
attempt, WS_HANDSHAKE_MAX_RETRY);
/* ── Step 5: Spawn the connect thread ── */
/*
* connect() runs inside ws_tc_thread_fn() from this point on.
* THIS thread (the calling work queue thread) is now FREE.
* It will spend its time below feeding the watchdog.
*/
k_thread_create(
&ws_tc_thread,
ws_tc_stack,
K_THREAD_STACK_SIZEOF(ws_tc_stack),
ws_tc_thread_fn,
NULL, NULL, NULL,
WS_CONNECT_THREAD_PRIORITY,
0,
K_NO_WAIT);
/* ── Step 6: Wait loop — THIS is where watchdog gets fed ── */
/*
* We wait 100ms at a time. Each iteration:
* - k_sem_take() waits up to 100ms for the thread to signal
* - If no signal (EAGAIN): thread still running → feed WD → repeat
* - If signal (0): thread done → check result → break
*
* This means even if connect() takes 10 seconds:
* → watchdog is fed 100 times during those 10 seconds
* → device stays alive
* → you stay in control
*/
int elapsed = 0;
int wait_result = -ETIMEDOUT;
while (elapsed < WS_CONNECT_THREAD_TIMEOUT_MS)
{
/* CRITICAL: feed watchdog every 100ms — never miss this */
watchdog_feed_periodic();
/* Wait up to 100ms for thread to signal completion */
int sem_err = k_sem_take(&ws_tc_done_sem, K_MSEC(WS_CONNECT_THREAD_POLL_MS));
if (sem_err == 0)
{
/* Thread signaled — connect() has returned */
ws_tc_state_t final = (ws_tc_state_t)atomic_get(&ws_tc_state);
wait_result = (final == WS_TC_OK) ? 0 : -EIO;
break;
}
/* sem_err == -EAGAIN: 100ms passed, thread still running */
elapsed += WS_CONNECT_THREAD_POLL_MS;
/* Log progress every 2 seconds so you can see it in RTT */
if ((elapsed % 2000) == 0)
{
LOG_INF("WS TCP: TLS handshake in progress... %d ms elapsed", elapsed);
}
}
/* Reset state for next call */
atomic_set(&ws_tc_state, WS_TC_IDLE);
/* ── Step 7: Handle result ── */
if (wait_result == -ETIMEDOUT)
{
/*
* connect() is STILL blocked inside the thread after 15 seconds.
* This is the "forever stuck" scenario you described.
*
* We handle it cleanly:
* 1. Abort the thread (stops it immediately)
* 2. Close the socket (releases the TLS context + network buffers)
* 3. Log the event
* 4. Fall through to retry loop
*
* NO POWER CYCLE NEEDED. The next attempt creates a fresh socket
* and fresh thread. The device stays running.
*/
LOG_ERR("WS TCP connect: TIMEOUT at %d ms — aborting stuck thread",
WS_CONNECT_THREAD_TIMEOUT_MS);
// k_thread_abort(tid);
k_sleep(K_MSEC(500));
if (tcp_sock >= 0)
{
shutdown(tcp_sock, SHUT_RDWR);
close(tcp_sock);
tcp_sock = -1;
}
/* Wait before retry — let network stack recover */
int d = 0;
while (d < 3000)
{
watchdog_feed_periodic();
k_sleep(K_MSEC(100));
d += 100;
}
continue; /* try next attempt */
}
if (wait_result == 0)
{
LOG_INF("WS TCP connected (attempt %d) — TCP + TLS complete!", attempt);
return 0; /* ← SUCCESS */
}
/* connect() returned an error (not timeout) */
LOG_ERR("WS TCP connect failed errno=%d (attempt %d)", ws_tc_errno, attempt);
if (tcp_sock >= 0)
{
shutdown(tcp_sock, SHUT_RDWR);
close(tcp_sock);
tcp_sock = -1;
}
/* Backoff delay before next attempt */
if (attempt < WS_HANDSHAKE_MAX_RETRY)
{
int delay = (1000 << attempt); /* 2s, 4s */
int d = 0;
LOG_WRN("WS TCP: retrying in %d ms...", delay);
while (d < delay)
{
watchdog_feed_periodic();
k_sleep(K_MSEC(50));
d += 50;
}
}
}
LOG_ERR("WS TCP connect failed after %d attempts", WS_HANDSHAKE_MAX_RETRY);
return -EIO;
}
/* ========================= WS CONNECT ========================= */
/**
 * Perform the WebSocket upgrade handshake on the already connected tcp_sock.
 *
 * Up to WS_HANDSHAKE_MAX_RETRY attempts are made with exponential,
 * watchdog-fed backoff (2 s, 4 s) in between. On success the ping and receive
 * timers are started and the initial ping and "connect" messages are sent.
 *
 * When every attempt fails, tcp_sock is closed here: nothing else closes it
 * on this path, and the next ws_tcp_connect() would otherwise overwrite the
 * descriptor and leak it.
 *
 * @return 0 on success, -EIO when the handshake failed on every attempt
 *         (ws_sock and tcp_sock are then both -1).
 */
static int ws_connect(void)
{
    for (int attempt = 1; attempt <= WS_HANDSHAKE_MAX_RETRY; attempt++)
    {
        watchdog_feed_periodic();

        struct websocket_request req = {0};
        req.host = WS_HOSTNAME;
        req.url = WS_URL;
        req.tmp_buf = tmp_buf;
        req.tmp_buf_len = sizeof(tmp_buf);

        LOG_INF("WS handshake attempt %d", attempt);
        ws_sock = websocket_connect(tcp_sock, &req, 5000, "ws");
        if (ws_sock >= 0)
        {
            ws_connected = true;
            LOG_INF("WS connected");
            NK_WS_SendPing();
            TS_StartWsPingManageTimer(K_SECONDS(20), K_SECONDS(20));
            NK_WS_SendConnect();
            TS_StartWsRxManageTimer(K_MSEC(50), K_MSEC(50));
            return 0;
        }

        LOG_ERR("WS handshake failed (attempt %d)", attempt);
        /* websocket_connect() returned an error code, not a descriptor, so
         * there is nothing to close; normalise to the "closed" marker. */
        ws_sock = -1;

        if (attempt < WS_HANDSHAKE_MAX_RETRY)
        {
            int delay = (1000 << attempt);
            int elapsed = 0;
            while (elapsed < delay)
            {
                watchdog_feed_periodic();
                k_sleep(K_MSEC(50));
                elapsed += 50;
            }
        }
    }

    LOG_ERR("WS handshake failed after %d attempts", WS_HANDSHAKE_MAX_RETRY);

    /* Release the TLS socket that the failed handshake was using. */
    if (tcp_sock >= 0)
    {
        shutdown(tcp_sock, SHUT_RDWR);
        close(tcp_sock);
        tcp_sock = -1;
    }
    return -EIO;
}
/* ========================= RX LOOP ========================= */
void ws_rx_loop(void)
{
uint64_t remaining;
uint32_t opcode;
int ret;
watchdog_feed_periodic(); // ADD
ret = websocket_recv_msg(ws_sock,
ws_rx_buf,
sizeof(ws_rx_buf) - 1,
&opcode,
&remaining,
100);
/* If timeout or no data -> do nothing */
if (ret <= 0)
{
return;
}
/* Null terminate safely */
ws_rx_buf[ret] = '\0';
/* Print log ONLY when message received */
LOG_INF("WS RX: %s", ws_rx_buf);
/* handle connect */
if (strstr(ws_rx_buf, "\"op\":\"connected\""))
{
// NK_WS_SendSubscribe("4LMA3sZ5uF");
const char *hub_id = NK_Cloud_GetHubObjectId();
if (hub_id == NULL || strlen(hub_id) == 0)
{
LOG_ERR("Hub ObjectID not available");
return;
}
LOG_INF("Subscribing Hub: %s", hub_id);
NK_WS_SendSubscribe(hub_id);
}
/* handle subscribed */
else if (strstr(ws_rx_buf, "\"op\":\"subscribed\""))
{
LOG_INF("WS subscribed OK");
}
/* handle hub commands */
ws_handle_command(ws_rx_buf);
}
/* ========================= PUBLIC ========================= */
/**
 * Initialise the WebSocket module by registering its TLS credential.
 *
 * @return 0 on success (including "credential already registered"),
 *         otherwise the negative error reported by ws_tls_setup(), so that
 *         the caller can see a missing CA certificate instead of only
 *         discovering it at handshake time.
 */
int NK_WS_Init(void)
{
    return ws_tls_setup();
}
/**
 * Bring the WebSocket up: DNS resolve, TLS connect, WebSocket handshake.
 *
 * Does nothing when Wi-Fi is not connected. The chain stops at the first
 * step that fails; no status is reported to the caller.
 */
void NK_WS_Start(void)
{
    if (wifi_manager_is_connected())
    {
        LOG_INF("WS start");

        /* Each step runs only if the previous one returned 0. */
        if (ws_resolve() == 0 && ws_tcp_connect() == 0)
        {
            (void)ws_connect();
        }
    }
}
/**
 * Tear the WebSocket down: mark it disconnected, shut down and close the
 * WebSocket descriptor and the underlying TLS socket (in that order), then
 * stop the receive and ping timers.
 */
void NK_WS_Stop(void)
{
    const int open_socks[] = {ws_sock, tcp_sock};

    ws_connected = false;

    for (size_t idx = 0; idx < sizeof(open_socks) / sizeof(open_socks[0]); idx++)
    {
        if (open_socks[idx] < 0)
        {
            continue;
        }
        shutdown(open_socks[idx], SHUT_RDWR);
        close(open_socks[idx]);
    }

    /* Both descriptors are invalid from here on. */
    ws_sock = -1;
    tcp_sock = -1;

    LOG_INF("WS stopped");
    TS_StopWsRxManageTimer();
    TS_StopWsPingManageTimer();
}
/**
 * Report the WebSocket connection flag.
 *
 * @return true after ws_connect() succeeded and until NK_WS_Stop() is called.
 *         The flag reflects the last known state; it is not refreshed by
 *         probing the socket.
 */
bool NK_WS_IsConnected(void)
{
return ws_connected;
}
/* ========================= SEND ========================= */
/**
 * Send one final, masked text frame over the WebSocket.
 *
 * The send is bounded by a finite timeout instead of SYS_FOREVER_MS so that
 * a stalled link cannot block the calling thread (and with it the watchdog
 * feeding) indefinitely.
 *
 * @param msg NUL-terminated text to send.
 * @return -1 when the WebSocket is not connected, -EINVAL for a NULL message,
 *         otherwise the result of websocket_send_msg().
 */
static int ws_send_text(const char *msg)
{
    const int32_t send_timeout_ms = 5000;

    if (!ws_connected)
        return -1;
    if (msg == NULL)
        return -EINVAL;

    return websocket_send_msg(ws_sock,
                              msg,
                              strlen(msg),
                              WEBSOCKET_OPCODE_DATA_TEXT,
                              true, true,
                              send_timeout_ms);
}
/**
 * Send the `{"op":"connect", ...}` message that opens the session.
 *
 * NOTE(review): the applicationId is hard-coded in the source — confirm that
 * this is intended and that the value is not sensitive.
 *
 * @return The result of ws_send_text().
 */
int NK_WS_SendConnect(void)
{
return ws_send_text(
"{\"op\":\"connect\",\"applicationId\":\"znQMq11i08XE5VXT0J1D83zc7I61t5zWVWeRu3vC\"}");
}
/**
 * Subscribe to HubRequest objects that point at the given hub.
 *
 * @param hub_id Object id of the hub; inserted into the query verbatim.
 * @return -EINVAL for a NULL id, -ENOMEM when the message does not fit the
 *         local buffer (a truncated, malformed JSON text is never sent),
 *         otherwise the result of ws_send_text().
 */
int NK_WS_SendSubscribe(const char *hub_id)
{
    char buf[256];
    int written;

    if (hub_id == NULL)
    {
        return -EINVAL;
    }

    written = snprintf(buf, sizeof(buf),
                       "{\"op\":\"subscribe\",\"requestId\":3,\"query\":{\"className\":\"HubRequest\",\"where\":{\"hub\":{\"__type\":\"Pointer\",\"className\":\"Hub\",\"objectId\":\"%s\"}}}}",
                       hub_id);
    if (written < 0 || (size_t)written >= sizeof(buf))
    {
        LOG_ERR("WS subscribe message too long");
        return -ENOMEM;
    }

    return ws_send_text(buf);
}
/**
 * Send a WebSocket PING; on failure restart the WebSocket.
 *
 * The send is bounded by a finite timeout instead of SYS_FOREVER_MS so that
 * a stalled link cannot block the calling thread indefinitely.
 *
 * NOTE(review): ws_connect() calls this function, and the failure path here
 * calls NK_WS_Start(), which calls ws_connect() again — on a persistently
 * bad link this nests one level per failed ping. Confirm that the nesting
 * depth is acceptable for the calling thread's stack.
 */
void NK_WS_SendPing(void)
{
    const int32_t ping_timeout_ms = 5000;

    if (!ws_connected)
    {
        LOG_INF("WS server not connected");
        return;
    }

    int ret = websocket_send_msg(ws_sock, NULL, 0, WEBSOCKET_OPCODE_PING, true, true, ping_timeout_ms);
    if (ret < 0)
    {
        LOG_INF("WS ping failed");
        k_sleep(K_SECONDS(1));
        NK_WS_Stop();
        NK_WS_Start();
        return;
    }

    LOG_INF("WS ping");
}
/**
 * Give access to the object id stored from the most recent hub command.
 *
 * @return Pointer to the SetHubReqObjectId buffer. The buffer is shared and
 *         is overwritten by ws_handle_command(); copy the contents if they
 *         must outlive the next command.
 */
char *hub_request_objectId(void)
{
return SetHubReqObjectId;
}
/******************** Include header files ********************/
#include <stdio.h>
#include <zephyr/kernel.h>
#include <zephyr/net/socket.h>
#include <zephyr/net/tls_credentials.h>
#include <zephyr/logging/log.h>
#include <string.h>
#include <errno.h>
#include <zephyr/posix/fcntl.h>
/* -- User defined Includes -- */
#include "NK_Net_Conn.h"
#include "NK_TS_TaskScheduler.h"
#include "NK_HTTP_Core.h"
#include "NK_Cloud_Login.h"
#include "NK_Cloud_ObjectTime.h"
#include "NK_Cloud_HubStatus.h"
#include "NK_Net_Manager.h"
#include "NK_WM_WebsocketManager.h"
#include "NK_WD_WatchDog.h"
/* -- User defined Includes -- */
/******************** Include header files ********************/
/* Log module for the TLS connection helper; messages at INFO level and above. */
LOG_MODULE_REGISTER(nk_net_conn, LOG_LEVEL_INF);
/* Server config */
/* Host used for DNS resolution and TLS hostname verification. */
#define NK_SERVER_HOSTNAME "api.nexkey.com"
/* Service/port string passed to getaddrinfo(). */
#define NK_SERVER_PORT "443"
/* TLS credential tag under which the CA certificate is registered. */
#define NK_TLS_SEC_TAG 1
/* Upper bound, in milliseconds, used by NKi_WaitWithTimeout(). */
#define NK_WAIT_TIMEOUT_MS 10000
/* TLS certificate */
/*
 * CA certificate bytes pulled in from certificate.h.
 * NOTE(review): this array is `static` (internal linkage) and typed `char`,
 * while the WebSocket module declares `extern const unsigned char
 * ca_certificate[]` — confirm which definition that declaration binds to.
 */
static const char ca_certificate[] = {
#include "certificate.h"
};
/* Internal state */
/* TLS socket to the server; -1 when closed. */
static int nk_sock = -1;
/* Set by NK_NET_Connect() on success, cleared by NK_NET_Disconnect(). */
static bool nk_connected = false;
/* Server address filled in by NK_NET_Resolve() (IPv4 only). */
static struct sockaddr_storage nk_server_addr;
/* NOTE(review): not referenced by the visible code, and not static. */
struct timeval timeoutVlue;
/* Dotted-quad form of the resolved address. */
static char nk_server_ip[NET_IPV4_ADDR_LEN];
/* Copy of nk_server_ip returned by NK_NET_Get_ServerIP(). */
static char server_ip_str[16];
/**
 * Reset the connection module to its initial state.
 *
 * Stops the WebSocket, closes the TLS socket if one is still open (so that
 * re-initialising does not leak the descriptor), and clears the cached
 * server address.
 *
 * @return Always 0.
 */
int NK_NET_Init(void)
{
    NK_WS_Stop();

    if (nk_sock >= 0)
    {
        close(nk_sock);
    }
    nk_sock = -1;
    nk_connected = false;

    memset(&nk_server_addr, 0, sizeof(nk_server_addr));
    memset(nk_server_ip, 0, sizeof(nk_server_ip));
    return 0;
}
/**
 * Resolve NK_SERVER_HOSTNAME to an IPv4 address.
 *
 * On success the address and port are stored in nk_server_addr and the
 * textual IP in nk_server_ip / server_ip_str. Up to three attempts are made;
 * each failure is followed by a 2 s pause during which the watchdog is fed,
 * in line with the other retry loops in this code base.
 *
 * @return 0 on success, -EAGAIN when every attempt failed.
 */
int NK_NET_Resolve(void)
{
    int retry = 3;
    int err;

    while (retry--)
    {
        struct addrinfo *result = NULL;
        struct addrinfo hints = {
            .ai_family = AF_INET,
            .ai_socktype = SOCK_STREAM};

        watchdog_feed_periodic();
        err = getaddrinfo(NK_SERVER_HOSTNAME,
                          NK_SERVER_PORT,
                          &hints,
                          &result);
        if (!err && result)
        {
            const struct sockaddr_in *resolved =
                (const struct sockaddr_in *)result->ai_addr;
            struct sockaddr_in *server4 =
                (struct sockaddr_in *)&nk_server_addr;

            server4->sin_addr.s_addr = resolved->sin_addr.s_addr;
            server4->sin_family = AF_INET;
            server4->sin_port = resolved->sin_port;

            inet_ntop(AF_INET,
                      &server4->sin_addr.s_addr,
                      nk_server_ip,
                      sizeof(nk_server_ip));
            LOG_INF("Server IP: %s", nk_server_ip);
            snprintf(server_ip_str, sizeof(server_ip_str), "%s", nk_server_ip);
            freeaddrinfo(result);
            return 0;
        }

        LOG_ERR("DNS resolve failed (%d), retrying...", err);

        /* 2 s pause, split into slices so the watchdog keeps being fed. */
        for (int waited = 0; waited < 2000; waited += 100)
        {
            watchdog_feed_periodic();
            k_sleep(K_MSEC(100));
        }
    }

    LOG_ERR("DNS resolve failed after retries");
    return -EAGAIN;
}
/**
 * Register the CA certificate under NK_TLS_SEC_TAG, once.
 *
 * After the first successful call (or after the stack reported that the
 * credential already exists) further calls return immediately.
 *
 * @return 0 when the credential is available, otherwise the negative error
 *         from tls_credential_add().
 */
int NK_NET_TLS_Setup(void)
{
    static bool tls_loaded = false;

    if (!tls_loaded)
    {
        const int rc = tls_credential_add(
            NK_TLS_SEC_TAG,
            TLS_CREDENTIAL_CA_CERTIFICATE,
            ca_certificate,
            sizeof(ca_certificate));

        if (rc == -EEXIST)
        {
            LOG_INF("TLS certificate already loaded");
        }
        else if (rc < 0)
        {
            LOG_ERR("TLS credential add failed (%d)", rc);
            return rc;
        }
        else
        {
            LOG_INF("TLS credentials loaded");
        }

        tls_loaded = true;
    }

    return 0;
}
/* Connect TLS socket */
/* Connect TLS socket */
int NK_NET_Connect(void)
{
int retry = 3;
int err;
// timeoutVlue.tv_sec = 10; // 10 seconds timeout
// timeoutVlue.tv_usec = 0;
struct timeval timeout = {
.tv_sec = 3, // reduced from 10 sec
.tv_usec = 0};
if (nk_connected)
{
LOG_INF("Already connected");
return 0;
}
/* safety: close old socket */
if (nk_sock >= 0)
{
close(nk_sock);
nk_sock = -1;
}
while (retry--)
{
watchdog_feed_periodic(); // IMPORTANT
nk_sock = socket(AF_INET, SOCK_STREAM, IPPROTO_TLS_1_2);
if (nk_sock < 0)
{
LOG_ERR("Socket create failed (%d)", errno);
// k_sleep(K_SECONDS(2));
continue;
}
/* Apply TLS security tag */
sec_tag_t sec_tag_list[] = {NK_TLS_SEC_TAG};
setsockopt(nk_sock,
SOL_TLS,
TLS_SEC_TAG_LIST,
sec_tag_list,
sizeof(sec_tag_list));
/* Apply hostname verification */
setsockopt(nk_sock,
SOL_TLS,
TLS_HOSTNAME,
NK_SERVER_HOSTNAME,
sizeof(NK_SERVER_HOSTNAME));
setsockopt(nk_sock, SOL_SOCKET, SO_RCVTIMEO, &timeout, sizeof(timeout));
setsockopt(nk_sock, SOL_SOCKET, SO_SNDTIMEO, &timeout, sizeof(timeout));
int flags = fcntl(nk_sock, F_GETFL, 0);
fcntl(nk_sock, F_SETFL, flags | O_NONBLOCK);
/* feed watchdog BEFORE connect */
watchdog_feed_periodic();
int64_t start_time = k_uptime_get();
err = connect(nk_sock,
(struct sockaddr *)&nk_server_addr,
sizeof(struct sockaddr_in));
if (err < 0 && errno == EINPROGRESS)
{
struct pollfd fds;
fds.fd = nk_sock;
fds.events = ZSOCK_POLLOUT | ZSOCK_POLLERR | ZSOCK_POLLHUP;
int elapsed = 0;
while (elapsed < 5000) // real timeout 5 sec
{
int poll_res = poll(&fds, 1, 100); // small interval
watchdog_feed_periodic(); // CRITICAL
if (poll_res > 0)
{
int result;
socklen_t len = sizeof(result);
getsockopt(nk_sock, SOL_SOCKET, SO_ERROR, &result, &len);
if (result == 0)
{
err = 0;
break;
}
else
{
err = result;
break;
}
}
elapsed += 100;
}
}
int64_t duration = k_uptime_get() - start_time;
if (duration > 6000)
{
LOG_ERR("Connect blocked too long (%lld ms)", duration);
err = -ETIMEDOUT;
}
if (err == 0)
{
nk_connected = true;
LOG_INF("Server connected (TLS)");
NK_HTTP_Core_Init();
return 0;
}
LOG_ERR("Server connect failed (%d), retrying...", err);
close(nk_sock); // MUST cleanup
nk_sock = -1;
}
LOG_ERR("TLS connect failed after retries");
NK_NET_Manager_Start();
return -EIO;
}
/**
 * Close the TLS socket, if one is open, and mark the module disconnected.
 * Safe to call when no connection exists.
 */
void NK_NET_Disconnect(void)
{
    const bool socket_open = (nk_sock >= 0);

    if (socket_open)
    {
        close(nk_sock);
        nk_sock = -1;
    }

    nk_connected = false;
    LOG_INF("Server disconnected");
}
/**
 * Poll a predicate every 50 ms until it returns true or
 * NK_WAIT_TIMEOUT_MS has elapsed.
 *
 * The watchdog is fed on every poll, matching the other wait loops in this
 * code base, so that a wait of up to NK_WAIT_TIMEOUT_MS cannot by itself
 * starve it.
 * NOTE(review): confirm that feeding from this calling context is
 * appropriate for the watchdog configuration in use.
 *
 * @param func Predicate to poll; a NULL pointer is reported as failure.
 * @return true when the predicate became true in time, false on timeout.
 */
static bool NKi_WaitWithTimeout(bool (*func)(void))
{
    int elapsed = 0;

    if (func == NULL)
    {
        return false;
    }

    while (!func())
    {
        watchdog_feed_periodic();
        k_sleep(K_MSEC(50));
        elapsed += 50;
        if (elapsed >= NK_WAIT_TIMEOUT_MS)
        {
            LOG_ERR("Timeout waiting!");
            return false;
        }
    }
    return true;
}
/**
 * Run the cloud bootstrap: login, object-time request, hub-status request,
 * each followed by a bounded wait for its completion flag.
 *
 * If any wait times out, the sequence stops there. After all three steps
 * succeed, the network manager is stopped, the WebSocket is started and the
 * periodic HubStatus timer is armed.
 */
void cloud_bootstrap_sequence(void)
{
    bool step_done;

    LOG_INF("CLOUD BOOTSTRAP START");

    /* Step 1: login */
    NK_Cloud_Login_Start();
    step_done = NKi_WaitWithTimeout(NK_Cloud_Login_IsDone);
    if (step_done == false)
    {
        LOG_ERR("Cloud login request timeout !!!");
        return;
    }

    /* Step 2: object time */
    NK_Cloud_ObjectTime_Request();
    step_done = NKi_WaitWithTimeout(NK_Cloud_ObjectTime_IsReady);
    if (step_done == false)
    {
        LOG_ERR("Object time request Timeout!!!");
        return;
    }

    /* Step 3: hub status */
    NK_Cloud_HubStatus_Request();
    step_done = NKi_WaitWithTimeout(NK_Cloud_HubStatus_LastOk);
    if (step_done == false)
    {
        LOG_ERR("Cloud hub status request Timeout!!!");
        return;
    }

    LOG_INF("CLOUD BOOTSTRAP DONE");
    NK_NET_Manager_Stop();

    /* Hand over to the WebSocket and the periodic HubStatus cycle. */
    NK_WS_Start();
    TS_StartHubStatusManageTimer(K_MINUTES(HUBSTATUS_CYCLE_TIME), K_MINUTES(HUBSTATUS_CYCLE_TIME));
}
/**
 * Return the module's TLS socket descriptor.
 *
 * @return The current value of nk_sock; -1 when no socket is open.
 */
int NK_NET_GetSocket(void)
{
return nk_sock;
}
/**
 * Report the connection flag.
 *
 * @return true after NK_NET_Connect() succeeded and until
 *         NK_NET_Disconnect() or NK_NET_Init() clears the flag. The flag is
 *         not refreshed by probing the socket.
 */
bool NK_NET_IsConnected(void)
{
return nk_connected;
}
/**
 * Return the textual server IP stored by the last successful
 * NK_NET_Resolve().
 *
 * @return Pointer to the internal server_ip_str buffer (empty string until
 *         the first successful resolve). The buffer is overwritten by later
 *         resolves.
 */
const char *NK_NET_Get_ServerIP(void)
{
return server_ip_str;
}
Questions / Assistance Needed
- Are there any known issues with TLS or TCP socket connections in nRF Connect SDK v2.6.2?
- Could this be related to Wi-Fi stack timing, memory usage, or TLS configuration?
- Are there recommended configurations or patches for improving TLS stability over router networks?
Any guidance, debugging suggestions, or known limitations would be greatly appreciated.