Persistent (-1) error in POSIX connect() function for TCP reconnection

I am doing HTTP POSTs and GETs from my nRF52840-based device to a Java Spring Boot HTTP server running on a laptop via TCP. The device is communicating with a border router via OpenThread. The border router and my laptop are both on my local network (LAN); the laptop's IP is 192.168.1.27. When the device boots up and the server is running, I can do POSTs and GETs without issue. When I turn off the server to test the device's reconnection logic, I see the SYN message from the device trying to establish a connection with the server. The connect() function subsequently returns (-1). If I bring the server back up while the device is still making these SYN attempts, the connection is re-established and everything resumes properly.

However, if I do not bring up the server after some time (usually a few minutes), the device stops sending SYN messages and only does DNS lookups of the server hostname. The connect() function still returns error (-1) but no longer actually tries to establish a connection. If I bring the server back up, the device never re-establishes connection with the server due to the lack of SYN messages.

What is going on with the device's TCP connection that causes it to stop SYN reconnection attempts?

OpenThread Wireshark traces:

   

Relevant Code:

#include <errno.h>

/*
 * printf-style template for an HTTP/1.1 POST request with a JSON body.
 *
 * Conversion arguments, in order:
 *   %s  request path (the template supplies the leading '/')
 *   %s  value for the Host header
 *   %d  body length in bytes, emitted as Content-Length (must be an int)
 *   %s  request body
 *
 * "Connection: close" is always sent, so each request uses its own connection.
 */
#define HTTP_POST_MESSAGE_FORMAT                  \
	"POST /%s HTTP/1.1\r\n"                   \
	"Host: %s\r\n"                            \
	"Connection: close\r\n"                   \
	"Accept: application/json\r\n"            \
	"Content-Type: application/json\r\n"      \
	"Content-Length: %d\r\n"                  \
	"\r\n"                                    \
	"%s"
	
/*
 * OpenThread DNS address-resolution callback.
 *
 * aError    - result of the DNS query as reported by OpenThread.
 * aResponse - response handle passed to otDnsAddressResponseGetAddress().
 * aContext  - destination for the resolved address; it is handed to
 *             otDnsAddressResponseGetAddress() and otIp6AddressToString(),
 *             so it must point to an otIp6Address.
 *
 * The first address that can be read from the response is stored in
 * aContext. getHTTPaddr_sem is given on every path (success or failure) so
 * that the thread waiting for the lookup is always released.
 */
static void otHTTPDNScallback(otError aError, const otDnsAddressResponse *aResponse, void *aContext)
{
	char addr_text[50];

	if (aError != OT_ERROR_NONE)
	{
		LOG_ERR("Error in HTTP DNS lookup (err: %d)", aError);
	}
	else
	{
		/* Bounded scan so that a misbehaving response can never loop forever. */
		for (uint16_t idx = 0; idx < 40; idx++)
		{
			otError rc = otDnsAddressResponseGetAddress(aResponse, idx, aContext, NULL);

			if (rc == OT_ERROR_NONE)
			{
				/* The first usable address wins. */
				otIp6AddressToString(aContext, addr_text, sizeof(addr_text));
				LOG_DBG("DNS returned IP addr #%d: %s",idx,addr_text);
				break;
			}

			LOG_ERR("Error in otDnsAddressResponseGetAddress (err %d)", rc);

			if (rc == OT_ERROR_NOT_FOUND)
			{
				/* No more entries in the response. */
				break;
			}
		}
	}

	/* Release the caller that is blocked waiting for this callback. */
	k_sem_give(&getHTTPaddr_sem);
}

char *HTTPPOST(char *ipv4host, char *path, uint8_t HTTP_API_TYPE, char *payload, uint16_t port_no)                
{
	int sock, err, msglen, recvlen;
    char bufipv6[50];
    char port_buf[7];

    snprintf(port_buf, sizeof(port_buf),"%d",port_no);

    memset(&response[0], 0, sizeof(response)); //Rezero response buffer
 
#if IS_ENABLED(CONFIG_OPENTHREAD_DNS_CLIENT)
	otError oterr;
    otIp6Address ot6addr;

	LOG_DBG("otDnsClientResolveIp4Address server: %s",ipv4host);

	if (otSrpClientIsRunning(openthread_get_default_instance()))
	{
		LOG_DBG("SRP client is running!");
	}
	else
	{
		LOG_WRN("SRP client is NOT running!");
	}

	oterr = otDnsClientResolveIp4Address(openthread_get_default_instance(), ipv4host,
  										otHTTPDNScallback, &ot6addr,
  										NULL);
	
	k_sem_take(&getHTTPaddr_sem, K_SECONDS(5)); // Wait for DNS callback

    //Convert openthread IPv6 address to string type
    otIp6AddressToString(&ot6addr, bufipv6, sizeof(bufipv6));

    LOG_DBG("IPv6 Address: %s", bufipv6);
    
    static struct sockaddr brok;
    struct sockaddr_in6 *brok6 = ((struct sockaddr_in6 *)&brok);
    inet_pton(AF_INET6, bufipv6, &brok6->sin6_addr);
    brok6->sin6_family = AF_INET6;
    brok6->sin6_port = htons(port_no);

	sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP);
    if (sock<0) {
        LOG_ERR("Error setting socket for HTTP POST, err %d. Returning NULL pointer.", sock);
        (void)close(sock);
        return NULL;
    }

	LOG_WRN("sock = %d\n", sock);

    msglen = snprintf(msg,sizeof(msg), HTTP_POST_MESSAGE_FORMAT, path, ipv4host, strlen(payload), payload);
    LOG_DBG("POST request:\n%s", msg);

    err = connect(sock, &brok, sizeof(struct sockaddr_in6));
    if (err<0) {
		LOG_ERR("Failed to connect to host for HTTP POST, err %d. Returning NULL pointer.", err);
        (void)close(sock);
		return NULL;
	}

    err = send(sock, msg, msglen, 0);
    if (err<0) {
		LOG_ERR("Failed to send HTTP POST, err %d. Returning NULL pointer.", err);
        (void)close(sock);
		return NULL;
	}

    recvlen = recv(sock, response, sizeof(response) - 1, 0);
    if (recvlen < 0) {
        LOG_ERR("Error reading HTTP POSTT response, err %d. Returning NULL pointer.\n", err);
        (void)close(sock);
        return NULL;
    }

    LOG_WRN("HTTP response code: %d", ret_code);

    LOG_DBG("recvlen = %d",recvlen);
    LOG_DBG("HTTP POST Response:\n%s", response);

	err = close(sock);
    return (&response[0]);
}

Relevant configs:

CONFIG_ENABLE_THREAD_NETWORK=y

CONFIG_OPENTHREAD_THREAD_VERSION_1_2=y
CONFIG_OPENTHREAD_NORDIC_LIBRARY_MTD=y
CONFIG_OPENTHREAD_FTD=n
CONFIG_OPENTHREAD_MTD=y
CONFIG_OPENTHREAD_MTD_SED=y

CONFIG_SYSTEM_WORKQUEUE_STACK_SIZE=4096

CONFIG_OPENTHREAD_SHELL=n
CONFIG_OT_CHANNEL=19
CONFIG_NET_IPV6=y
CONFIG_NET_IPV4=n
CONFIG_NET_CONFIG_SETTINGS=y
CONFIG_NET_CONFIG_NEED_IPV4=n
CONFIG_NET_CONFIG_NEED_IPV6=y

# Configure dependencies
CONFIG_IEEE802154_2015=y
CONFIG_IEEE802154_NRF5_RX_STACK_SIZE=800
CONFIG_NRF_802154_ENCRYPTION=y
CONFIG_IEEE802154_CSL_ENDPOINT=y
CONFIG_NET_PKT_TXTIME=y
CONFIG_NET_PKT_TIMESTAMP=y
CONFIG_OPENTHREAD_MAC_SOFTWARE_TX_SECURITY_ENABLE=n
CONFIG_IEEE802154_DRIVER_LOG_LEVEL_INF=y


CONFIG_OPENTHREAD_DUA=y
CONFIG_OPENTHREAD_MLR=y
#CONFIG_OPENTHREAD_BACKBONE_ROUTER=y
CONFIG_OPENTHREAD_LINK_METRICS_INITIATOR=y
CONFIG_OPENTHREAD_LINK_METRICS_SUBJECT=y
CONFIG_OPENTHREAD_CSL_RECEIVER=y

# CSL configuration
#CONFIG_OPENTHREAD_CSL_RECEIVE_TIME_AHEAD=3000
#CONFIG_OPENTHREAD_CSL_MIN_RECEIVE_ON=300

##### OPENTHREAD #####
CONFIG_OPENTHREAD_THREAD_STACK_SIZE=7250
CONFIG_OPENTHREAD_DEBUG=y
CONFIG_OPENTHREAD_L2_DEBUG=y
CONFIG_OPENTHREAD_MANUAL_START=y
CONFIG_OPENTHREAD_JOINER=y
CONFIG_OPENTHREAD_JOINER_AUTOSTART=n

CONFIG_OPENTHREAD_POLL_PERIOD=1000

# Generic networking options
CONFIG_NETWORKING=y

# Kernel options
CONFIG_INIT_STACKS=y

# Increase set for threads with meta-irq priority
CONFIG_NUM_METAIRQ_PRIORITIES=1

# Logging
CONFIG_NET_LOG=y   #POWERSAVING
CONFIG_NET_STATISTICS=y

# Disable certain parts of Zephyr IPv6 stack
CONFIG_NET_IPV6_NBR_CACHE=n
CONFIG_NET_IPV6_MLD=n

# Stack sizes configuration
CONFIG_NET_TX_STACK_SIZE=1500
CONFIG_NET_RX_STACK_SIZE=1500

# Network buffers
CONFIG_NET_PKT_RX_COUNT=8
CONFIG_NET_PKT_TX_COUNT=8
CONFIG_NET_BUF_RX_COUNT=26
CONFIG_NET_BUF_TX_COUNT=26

# L2 OpenThread enabling
CONFIG_NET_L2_OPENTHREAD=y
CONFIG_OPENTHREAD_L2_LOG_LEVEL_INF=y

# Select OpenThread nRF Security backends
CONFIG_OPENTHREAD_NRF_SECURITY_CHOICE=y

# Enable ping sender support
CONFIG_OPENTHREAD_PING_SENDER=y

CONFIG_NET_MGMT_EVENT_INFO=y

# Enable DNS-via-border-router's-upstream functionality
CONFIG_OPENTHREAD_DNS_CLIENT=y
CONFIG_OPENTHREAD_SRP_CLIENT=y
CONFIG_OPENTHREAD_ECDSA=y


# Network sockets
CONFIG_NET_SOCKETS=y
CONFIG_NET_SOCKETS_POSIX_NAMES=y
CONFIG_NET_SOCKETS_POLL_MAX=4
# Enable TCP support
CONFIG_NET_TCP=y
# Required for SOCKET STREAM
CONFIG_NET_UDP=y 
# Required for getting UTC time 
#CONFIG_NET_TCP_LOG_LEVEL_DBG=y
CONFIG_OPENTHREAD_TCP_ENABLE=n

#Socket settings
CONFIG_POSIX_MAX_FDS=6

# IP address options
CONFIG_NET_IF_UNICAST_IPV6_ADDR_COUNT=3
CONFIG_NET_IF_MCAST_IPV6_ADDR_COUNT=4
CONFIG_NET_MAX_CONTEXTS=8

# Download Client
CONFIG_FOTA_DOWNLOAD=y
CONFIG_DOWNLOAD_CLIENT=y
CONFIG_DOWNLOAD_CLIENT_HTTP_FRAG_SIZE_4096=y
CONFIG_DOWNLOAD_CLIENT_STACK_SIZE=4096
#CONFIG_DOWNLOAD_CLIENT_LOG_LEVEL_DBG=y
CONFIG_DOWNLOAD_CLIENT_BUF_SIZE=4096
CONFIG_DOWNLOAD_CLIENT_JAVAEDGE=y

CONFIG_EXTERNAL_LIBC=n
CONFIG_CJSON_LIB=y

CONFIG_DATE_TIME=y
CONFIG_DATE_TIME_NTP=y
CONFIG_DATE_TIME_LOG_LEVEL_DBG=n

CONFIG_NET_TCP_WORKQ_STACK_SIZE=4096

Parents
  • Hello,

    I need some more information from you. Please answer all of the questions below.

    Which nRF Connect SDK version are you using?

    Is your project based on a sample?

    Have you set a maximum number of retransmissions for SYN messages on your nRF52840 device? You could try adding logic to the application that restarts the SYN transmission after a timeout or on an external input, for example.

    How is your Thread border router configured?

    You have CONFIG_OPENTHREAD_TCP_ENABLE=n in your .conf file. Is your intention to use the Zephyr implementation of TCP instead of the TCP implementation in OpenThread? In Thread 1.3, support for TCP was introduced which makes TCP more efficient in an IEEE 802.15.4 network.

    Best regards,

    Maria

  • Hello,

    Were you able to look into this further? This is one of the final issues in our development process.

    I just observed an interesting transmission sequence that could help with troubleshooting. The device successfully sent a large number (~650) of locally stored messages to the server. In the middle of clearing this local message backlog, the device started to exhibit the behavior in my original post. This is interesting because the server never went offline and messages were sending successfully until the error was encountered abruptly. There were a few failed transmissions in the middle of the successful sends, but those messages were always sent successfully on the first retry. Maybe I have a buffer that is filling or sockets that aren't being closed?

    ^See original post for code that generates this error message. This error occurs for each of the DNS lookups in the bottom-most blue block of Wireshark messages

  • Hello,

    Sorry about the delay.

    Per the documentation for connect(), a return value of -1 only indicates that an error has occurred. To find out which error caused the call to fail, we need to know the value of errno after connect() has returned with an error. Make sure to save the value to another variable before making any other function calls.

    My suggestion for how to do this:

    You need to include errno.h and change the variable which is printed in the LOG_ERR on line 100 to be errno instead of err.

    You can then find the relevant error code in zephyr/lib/libc/minimal/include/errno.h.

  • Thank you for the recommendation. I have added #include <errno.h> in this .c file and changed the code around the connect() call as follows:

    int errnum;
    
    err = connect(sock, &brok, sizeof(struct sockaddr_in6));
    errnum = errno; //errnum currently unused, set just in case it's needed later on
    if (err<0)
    {
        LOG_ERR("Failed to connect to host for HTTP POST, errno %d. Returning NULL pointer.", errno);
        (void)close(sock);
        return NULL;
    }

    I also set CONFIG_NET_TCP_LOG_LEVEL_DBG=y for additional information. The resulting error is 116, which appears to be a timeout error. However, these SYN messages still disappear in the Wireshark logs after some time. I trust the Wireshark logs because the device does not reconnect to the server while these SYN messages are missing. If I restart the server while these SYN attempts are still seen in the Wireshark logs, the device reconnects successfully.

    I forgot to mention that I also have these configs set (in addition to the configs I listed in my original post) for a fall-back, non-OpenThread form of communication:

    CONFIG_MBEDTLS_CIPHER_MODE_CBC=y
    CONFIG_MBEDTLS_CIPHER_MODE_CTR=y
    CONFIG_MBEDTLS_LIBRARY_NRF_SECURITY=y
    CONFIG_MBEDTLS_ENABLE_HEAP=Y
    CONFIG_MBEDTLS_HEAP_SIZE=10240
    CONFIG_NORDIC_SECURITY_BACKEND=y

  • Hi  ,

    Did the timeout error and/or my other troubleshooting give you any ideas for next steps?

    Thanks.

  • Hello,

    Sorry, I haven't been able to find a solution yet. I have a suspicion that you need to set up your client to restart the connection request completely after the SYN messages stop. But I am not sure.

    I am looking for the right people to help me with this. Thank you for your patience.

    Best regards,

    Maria

  • I believe my client is restarting the connection request completely each time.

    After connect() returns the 116 timeout error, the socket is closed via "close(sock)". The next time a connection is attempted, the application starts from the top with "sock = socket(AF_INET6, SOCK_STREAM, IPPROTO_TCP);" followed by "err = connect(sock, &brok, sizeof(struct sockaddr_in6));".

Reply Children
Related