Skip to content

Commit d5a2836

Browse files
committed
fix(websocket): fix race condition during
abort(CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK = y)
1 parent 18f0d02 commit d5a2836

File tree

1 file changed

+55
-6
lines changed

1 file changed

+55
-6
lines changed

components/esp_websocket_client/esp_websocket_client.c

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -244,7 +244,17 @@ static esp_err_t esp_websocket_client_dispatch_event(esp_websocket_client_handle
244244
static esp_err_t esp_websocket_client_abort_connection(esp_websocket_client_handle_t client, esp_websocket_error_type_t error_type)
245245
{
246246
ESP_WS_CLIENT_STATE_CHECK(TAG, client, return ESP_FAIL);
247+
248+
// CRITICAL: Check if already closing/closed to prevent double-close
249+
if (client->state == WEBSOCKET_STATE_CLOSING || client->state == WEBSOCKET_STATE_UNKNOW) {
250+
ESP_LOGW(TAG, "Connection already closing/closed, skipping abort");
251+
return ESP_OK;
252+
}
253+
247254
esp_transport_close(client->transport);
255+
// Note: We do NOT set client->transport = NULL here because:
256+
// 1. Transport is reused on reconnect
257+
// 2. We use state instead to track if transport is valid
248258

249259
if (!client->config->auto_reconnect) {
250260
client->run = false;
@@ -256,6 +266,7 @@ static esp_err_t esp_websocket_client_abort_connection(esp_websocket_client_hand
256266
}
257267
client->error_handle.error_type = error_type;
258268
esp_websocket_client_dispatch_event(client, WEBSOCKET_EVENT_DISCONNECTED, NULL, 0);
269+
259270
return ESP_OK;
260271
}
261272

@@ -679,8 +690,18 @@ static int esp_websocket_client_send_with_exact_opcode(esp_websocket_client_hand
679690
} else {
680691
esp_websocket_client_error(client, "esp_transport_write() returned %d, errno=%d", ret, errno);
681692
}
693+
ESP_LOGD(TAG, "Calling abort_connection due to send error");
694+
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
695+
xSemaphoreGiveRecursive(client->tx_lock);
696+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY);
697+
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
698+
xSemaphoreGiveRecursive(client->lock);
699+
return ret;
700+
#else
701+
// Already holding client->lock, safe to call
682702
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
683703
goto unlock_and_return;
704+
#endif
684705
}
685706
opcode = 0;
686707
widx += wlen;
@@ -1019,7 +1040,6 @@ static esp_err_t esp_websocket_client_recv(esp_websocket_client_handle_t client)
10191040
esp_websocket_free_buf(client, false);
10201041
return ESP_OK;
10211042
}
1022-
10231043
esp_websocket_client_dispatch_event(client, WEBSOCKET_EVENT_DATA, client->rx_buffer, rlen);
10241044

10251045
client->payload_offset += rlen;
@@ -1030,9 +1050,36 @@ static esp_err_t esp_websocket_client_recv(esp_websocket_client_handle_t client)
10301050
const char *data = (client->payload_len == 0) ? NULL : client->rx_buffer;
10311051
ESP_LOGD(TAG, "Sending PONG with payload len=%d", client->payload_len);
10321052
#ifdef CONFIG_ESP_WS_CLIENT_SEPARATE_TX_LOCK
1033-
if (xSemaphoreTakeRecursive(client->tx_lock, WEBSOCKET_TX_LOCK_TIMEOUT_MS) != pdPASS) {
1034-
ESP_LOGE(TAG, "Could not lock ws-client within %d timeout", WEBSOCKET_TX_LOCK_TIMEOUT_MS);
1035-
return ESP_FAIL;
1053+
// CRITICAL: To avoid deadlock, we must follow lock ordering: tx_lock BEFORE client->lock
1054+
// Current state: We hold client->lock (acquired by caller at line 1228)
1055+
// We need: tx_lock
1056+
// Deadlock risk: Send error path holds tx_lock and waits for client->lock = circular wait!
1057+
//
1058+
// Solution: Temporarily release client->lock, acquire both locks in correct order
1059+
// 1. Release client->lock
1060+
// 2. Acquire tx_lock (now safe - no deadlock)
1061+
// 3. Re-acquire client->lock (for consistency with caller expectations)
1062+
// 4. Send PONG
1063+
// 5. Release both locks in reverse order
1064+
1065+
xSemaphoreGiveRecursive(client->lock); // Release client->lock
1066+
1067+
// Now acquire tx_lock (safe - no circular wait possible)
1068+
if (xSemaphoreTakeRecursive(client->tx_lock, portMAX_DELAY) != pdPASS) {
1069+
ESP_LOGE(TAG, "Failed to acquire tx_lock for PONG");
1070+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY); // Re-acquire client->lock before returning
1071+
return ESP_OK; // Return gracefully, caller expects client->lock to be held
1072+
}
1073+
1074+
// Re-acquire client->lock to maintain consistency
1075+
xSemaphoreTakeRecursive(client->lock, portMAX_DELAY);
1076+
1077+
// CRITICAL: Check if transport is still valid after re-acquiring lock
1078+
// Another thread may have closed it while we didn't hold client->lock
1079+
if (client->state == WEBSOCKET_STATE_CLOSING || client->state == WEBSOCKET_STATE_UNKNOW || client->transport == NULL) {
1080+
ESP_LOGW(TAG, "Transport closed while preparing PONG, skipping send");
1081+
xSemaphoreGiveRecursive(client->tx_lock);
1082+
return ESP_OK; // Caller expects client->lock to be held, which it is
10361083
}
10371084
#endif
10381085
esp_transport_ws_send_raw(client->transport, WS_TRANSPORT_OPCODES_PONG | WS_TRANSPORT_OPCODES_FIN, data, client->payload_len,
@@ -1221,12 +1268,14 @@ static void esp_websocket_client_task(void *pv)
12211268
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
12221269
xSemaphoreGiveRecursive(client->lock);
12231270
} else if (read_select > 0) {
1271+
// CRITICAL: Protect entire recv operation with client->lock
1272+
// This prevents transport from being closed while recv is in progress
1273+
xSemaphoreTakeRecursive(client->lock, lock_timeout);
12241274
if (esp_websocket_client_recv(client) == ESP_FAIL) {
12251275
ESP_LOGE(TAG, "Error receive data");
1226-
xSemaphoreTakeRecursive(client->lock, lock_timeout);
12271276
esp_websocket_client_abort_connection(client, WEBSOCKET_ERROR_TYPE_TCP_TRANSPORT);
1228-
xSemaphoreGiveRecursive(client->lock);
12291277
}
1278+
xSemaphoreGiveRecursive(client->lock);
12301279
}
12311280
} else if (WEBSOCKET_STATE_WAIT_TIMEOUT == client->state) {
12321281
// waiting for reconnection or a request to stop the client...

0 commit comments

Comments
 (0)