-
Notifications
You must be signed in to change notification settings - Fork 693
Fix WebSocket connection reliability and domain reload recovery #656
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ae6f5c3
50634e2
4a58fad
0c4c5cc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -7,7 +7,7 @@ | |
| import os | ||
| import time | ||
| import uuid | ||
| from typing import Any | ||
| from typing import Any, ClassVar | ||
|
|
||
| from starlette.endpoints import WebSocketEndpoint | ||
| from starlette.websockets import WebSocket | ||
|
|
@@ -21,6 +21,7 @@ | |
| WelcomeMessage, | ||
| RegisteredMessage, | ||
| ExecuteCommandMessage, | ||
| PingMessage, | ||
| RegisterMessage, | ||
| RegisterToolsMessage, | ||
| PongMessage, | ||
|
|
@@ -29,7 +30,7 @@ | |
| SessionDetails, | ||
| ) | ||
|
|
||
| logger = logging.getLogger("mcp-for-unity-server") | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class PluginDisconnectedError(RuntimeError): | ||
|
|
@@ -63,6 +64,10 @@ class PluginHub(WebSocketEndpoint): | |
| KEEP_ALIVE_INTERVAL = 15 | ||
| SERVER_TIMEOUT = 30 | ||
| COMMAND_TIMEOUT = 30 | ||
| # Server-side ping interval (seconds) - how often to send pings to Unity | ||
| PING_INTERVAL = 10 | ||
| # Max time (seconds) to wait for pong before considering connection dead | ||
| PING_TIMEOUT = 20 | ||
| # Timeout (seconds) for fast-fail commands like ping/read_console/get_editor_state. | ||
| # Keep short so MCP clients aren't blocked during Unity compilation/reload/unfocused throttling. | ||
| FAST_FAIL_TIMEOUT = 2.0 | ||
|
|
@@ -78,6 +83,10 @@ class PluginHub(WebSocketEndpoint): | |
| _pending: dict[str, dict[str, Any]] = {} | ||
| _lock: asyncio.Lock | None = None | ||
| _loop: asyncio.AbstractEventLoop | None = None | ||
| # session_id -> last pong timestamp (monotonic) | ||
| _last_pong: ClassVar[dict[str, float]] = {} | ||
| # session_id -> ping task | ||
| _ping_tasks: ClassVar[dict[str, asyncio.Task]] = {} | ||
|
|
||
| @classmethod | ||
| def configure( | ||
|
|
@@ -176,12 +185,20 @@ async def on_disconnect(self, websocket: WebSocket, close_code: int) -> None: | |
| (sid for sid, ws in cls._connections.items() if ws is websocket), None) | ||
| if session_id: | ||
| cls._connections.pop(session_id, None) | ||
| # Stop the ping loop for this session | ||
| ping_task = cls._ping_tasks.pop(session_id, None) | ||
| if ping_task and not ping_task.done(): | ||
| ping_task.cancel() | ||
| # Clean up last pong tracking | ||
| cls._last_pong.pop(session_id, None) | ||
| # Fail-fast any in-flight commands for this session to avoid waiting for COMMAND_TIMEOUT. | ||
| pending_ids = [ | ||
| command_id | ||
| for command_id, entry in cls._pending.items() | ||
| if entry.get("session_id") == session_id | ||
| ] | ||
| if pending_ids: | ||
| logger.debug(f"Cancelling {len(pending_ids)} pending commands for disconnected session") | ||
| for command_id in pending_ids: | ||
| entry = cls._pending.get(command_id) | ||
| future = entry.get("future") if isinstance( | ||
|
|
@@ -364,10 +381,18 @@ async def _handle_register(self, websocket: WebSocket, payload: RegisterMessage) | |
| session = await registry.register(session_id, project_name, project_hash, unity_version, project_path, user_id=user_id) | ||
| async with lock: | ||
| cls._connections[session.session_id] = websocket | ||
| # Initialize last pong time and start ping loop for this session | ||
| cls._last_pong[session_id] = time.monotonic() | ||
| # Cancel any existing ping task for this session (shouldn't happen, but be safe) | ||
| old_task = cls._ping_tasks.pop(session_id, None) | ||
| if old_task and not old_task.done(): | ||
| old_task.cancel() | ||
| # Start the server-side ping loop | ||
| ping_task = asyncio.create_task(cls._ping_loop(session_id, websocket)) | ||
| cls._ping_tasks[session_id] = ping_task | ||
|
|
||
| if user_id: | ||
| logger.info( | ||
| f"Plugin registered: {project_name} ({project_hash}) for user {user_id}") | ||
| logger.info(f"Plugin registered: {project_name} ({project_hash}) for user {user_id}") | ||
| else: | ||
| logger.info(f"Plugin registered: {project_name} ({project_hash})") | ||
|
|
||
|
|
@@ -429,11 +454,77 @@ async def _handle_command_result(self, payload: CommandResultMessage) -> None: | |
| async def _handle_pong(self, payload: PongMessage) -> None: | ||
| cls = type(self) | ||
| registry = cls._registry | ||
| lock = cls._lock | ||
| if registry is None: | ||
| return | ||
| session_id = payload.session_id | ||
| if session_id: | ||
| await registry.touch(session_id) | ||
| # Record last pong time for staleness detection (under lock for consistency) | ||
| if lock is not None: | ||
| async with lock: | ||
| cls._last_pong[session_id] = time.monotonic() | ||
|
|
||
| @classmethod | ||
| async def _ping_loop(cls, session_id: str, websocket: WebSocket) -> None: | ||
| """Server-initiated ping loop to detect dead connections. | ||
|
|
||
| Sends periodic pings to the Unity client. If no pong is received within | ||
| PING_TIMEOUT seconds, the connection is considered dead and closed. | ||
| This helps detect connections that die silently (e.g., Windows OSError 64). | ||
| """ | ||
| logger.debug(f"[Ping] Starting ping loop for session {session_id}") | ||
| try: | ||
| while True: | ||
|
Comment on lines
+469
to
+478
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (bug_risk): Consider cleaning up Relying on Suggested implementation: await registry.touch(session_id)
# Record last pong time for staleness detection
cls._last_pong[session_id] = time.monotonic()
@classmethod
def _cleanup_ping_session(cls, session_id: str) -> None:
"""Cleanup ping tracking state for a session.
This is used by the ping loop's finally block to fully manage its own
lifecycle, and can also be safely called from on_disconnect as needed.
"""
# Remove this session's ping task and staleness tracking, if present.
# Using pop(..., None) makes this idempotent and safe to call multiple times.
cls._ping_tasks.pop(session_id, None)
cls._last_pong.pop(session_id, None)
@classmethod
async def _ping_loop(cls, session_id: str, websocket: WebSocket) -> None:To fully implement your suggestion, the following additional changes are needed in the same file:
@classmethod
async def _ping_loop(cls, session_id: str, websocket: WebSocket) -> None:
"""Server-initiated ping loop to detect dead connections.
Sends periodic pings to the Unity client. If no pong is received within
PING_TIMEOUT seconds, the connection is considered dead and closed.
This helps detect connections that die silently (e.g., Windows OSError 64).
"""
logger.debug(f"[Ping] Starting ping loop for session {session_id}")
try:
while True:
await asyncio.sleep(cls.PING_INTERVAL)
# ... existing ping / timeout logic remains unchanged ...
finally:
# Ensure we don't leave stale entries if on_disconnect never fires
cls._cleanup_ping_session(session_id)
These changes ensure that the ping loop fully manages its own lifecycle and avoids stale entries even in shutdown/exception edge cases where |
||
| await asyncio.sleep(cls.PING_INTERVAL) | ||
|
|
||
| # Check if we're still supposed to be running and get last pong time (under lock) | ||
| lock = cls._lock | ||
| if lock is None: | ||
| break | ||
| async with lock: | ||
| if session_id not in cls._connections: | ||
| logger.debug(f"[Ping] Session {session_id} no longer in connections, stopping ping loop") | ||
| break | ||
| # Read last pong time under lock for consistency | ||
| last_pong = cls._last_pong.get(session_id, 0) | ||
|
|
||
| # Check staleness: has it been too long since we got a pong? | ||
| elapsed = time.monotonic() - last_pong | ||
| if elapsed > cls.PING_TIMEOUT: | ||
| logger.warning( | ||
| f"[Ping] Session {session_id} stale: no pong for {elapsed:.1f}s " | ||
| f"(timeout={cls.PING_TIMEOUT}s). Closing connection." | ||
| ) | ||
| try: | ||
| await websocket.close(code=1001) # Going away | ||
| except Exception as close_ex: | ||
| logger.debug(f"[Ping] Error closing stale websocket: {close_ex}") | ||
| break | ||
|
|
||
| # Send a ping to the client | ||
| try: | ||
| ping_msg = PingMessage() | ||
| await websocket.send_json(ping_msg.model_dump()) | ||
| logger.debug(f"[Ping] Sent ping to session {session_id}") | ||
| except Exception as send_ex: | ||
| # Send failed - connection is dead | ||
| logger.warning( | ||
| f"[Ping] Failed to send ping to session {session_id}: {send_ex}. " | ||
| "Connection likely dead." | ||
| ) | ||
| try: | ||
| await websocket.close(code=1006) # Abnormal closure | ||
| except Exception: | ||
| pass | ||
|
Comment on lines
+492
to
+519
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 🧩 Analysis chain🌐 Web query:
💡 Result: In RFC 6455, WebSocket close code 1006 (“abnormal closure”) is reserved and must not be sent on the wire:
So: you will never legitimately see a peer send a Close frame with code 1006; implementations use it to report “abnormal closure” when the closing handshake didn’t happen. [1][2] Sources: 🏁 Script executed: # Find and examine the file mentioned in the review
fd -t f "plugin_hub.py" | head -5Repository: CoplayDev/unity-mcp Length of output: 97 🏁 Script executed: # Read the specific lines mentioned in the review
sed -n '492,519p' Server/src/transport/plugin_hub.pyRepository: CoplayDev/unity-mcp Length of output: 1455 🏁 Script executed: # Let's see more context around these lines to understand the full function
sed -n '470,530p' Server/src/transport/plugin_hub.pyRepository: CoplayDev/unity-mcp Length of output: 2947 Replace close code 1006 with 1001 and log close failures. Line 517 uses close code 1006, which RFC 6455 explicitly reserves and forbids sending on the wire—it must only be used internally to report abnormal closure when no Close frame was exchanged. Sending it violates the specification and can cause protocol violations. Additionally, the exception handler at lines 518–519 silently swallows errors without logging, hiding diagnostics crucial for debugging connection failures. 🔧 Proposed fix await websocket.close(code=1006) # Abnormal closure
- except Exception:
- pass
+ await websocket.close(code=1001) # Going away
+ except Exception as close_ex:
+ logger.debug(f"[Ping] Error closing websocket after send failure: {close_ex}")🧰 Tools🪛 Ruff (0.14.14)[warning] 501-501: Do not catch blind exception: (BLE001) [warning] 510-510: Do not catch blind exception: (BLE001) [error] 518-519: (S110) [warning] 518-518: Do not catch blind exception: (BLE001) 🤖 Prompt for AI Agents |
||
| break | ||
|
|
||
| except asyncio.CancelledError: | ||
| logger.debug(f"[Ping] Ping loop cancelled for session {session_id}") | ||
| except Exception as ex: | ||
| logger.warning(f"[Ping] Ping loop error for session {session_id}: {ex}") | ||
| finally: | ||
| logger.debug(f"[Ping] Ping loop ended for session {session_id}") | ||
|
|
||
| @classmethod | ||
| async def _get_connection(cls, session_id: str) -> WebSocket: | ||
|
|
@@ -465,19 +556,30 @@ async def _resolve_session_id(cls, unity_instance: str | None, user_id: str | No | |
| if cls._registry is None: | ||
| raise RuntimeError("Plugin registry not configured") | ||
|
|
||
| # Bound waiting for Unity sessions so calls fail fast when editors are not ready. | ||
| # Bound waiting for Unity sessions. Default to 20s to handle domain reloads | ||
| # (which can take 10-20s after test runs or script changes). | ||
| # | ||
| # NOTE: This wait can impact agentic workflows where domain reloads happen | ||
| # frequently (e.g., after test runs, script compilation). The 20s default | ||
| # balances handling slow reloads vs. avoiding unnecessary delays. | ||
| # | ||
| # TODO: Make this more deterministic by detecting Unity's actual reload state | ||
| # (e.g., via status file, heartbeat, or explicit "reloading" signal from Unity) | ||
| # rather than blindly waiting up to 20s. See Issue #657. | ||
| # | ||
| # Configurable via: UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S (default: 20.0, max: 20.0) | ||
| try: | ||
| max_wait_s = float( | ||
| os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0")) | ||
| os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0")) | ||
| except ValueError as e: | ||
| raw_val = os.environ.get( | ||
| "UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0") | ||
| "UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0") | ||
| logger.warning( | ||
| "Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 2.0: %s", | ||
| "Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 20.0: %s", | ||
| raw_val, e) | ||
| max_wait_s = 2.0 | ||
| # Clamp to [0, 30] to prevent misconfiguration from causing excessive waits | ||
| max_wait_s = max(0.0, min(max_wait_s, 30.0)) | ||
| max_wait_s = 20.0 | ||
| # Clamp to [0, 20] to prevent misconfiguration from causing excessive waits | ||
| max_wait_s = max(0.0, min(max_wait_s, 20.0)) | ||
| retry_ms = float(getattr(config, "reload_retry_ms", 250)) | ||
| sleep_seconds = max(0.05, min(0.25, retry_ms / 1000.0)) | ||
|
|
||
|
|
@@ -613,7 +715,7 @@ async def send_command_for_instance( | |
| "Invalid UNITY_MCP_SESSION_READY_WAIT_SECONDS=%r, using default 6.0: %s", | ||
| raw_val, e) | ||
| max_wait_s = 6.0 | ||
| max_wait_s = max(0.0, min(max_wait_s, 30.0)) | ||
| max_wait_s = max(0.0, min(max_wait_s, 20.0)) | ||
| if max_wait_s > 0: | ||
| deadline = time.monotonic() + max_wait_s | ||
| while time.monotonic() < deadline: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Reload wait is still capped at 20s—objective calls for 30s.
Issue
#654notes reloads can take 10–30s, but the default, clamp, and log text still enforce 20s, which can prematurely abort retries. Please bump the default and cap (and update messaging) to 30s.♻️ Proposed fix
🤖 Prompt for AI Agents