CoplayDev · dsarno · Jan 31, 2026 · Jan 30, 2026 · Jan 30, 2026 · Jan 30, 2026
diff --git a/MCPForUnity/Editor/Services/Transport/TransportCommandDispatcher.cs b/MCPForUnity/Editor/Services/Transport/TransportCommandDispatcher.cs
@@ -35,13 +35,15 @@ public PendingCommand(
                 CompletionSource = completionSource;
                 CancellationToken = cancellationToken;
                 CancellationRegistration = registration;
+                QueuedAt = DateTime.UtcNow;
             }
 
             public string CommandJson { get; }
             public TaskCompletionSource<string> CompletionSource { get; }
             public CancellationToken CancellationToken { get; }
             public CancellationTokenRegistration CancellationRegistration { get; }
             public bool IsExecuting { get; set; }
+            public DateTime QueuedAt { get; }
 
             public void Dispose()
             {

diff --git a/MCPForUnity/Editor/Services/Transport/Transports/WebSocketTransportClient.cs b/MCPForUnity/Editor/Services/Transport/Transports/WebSocketTransportClient.cs
@@ -619,6 +619,7 @@ private Task SendPongAsync(CancellationToken token)
             var payload = new JObject
             {
                 ["type"] = "pong",
+                ["session_id"] = _sessionId  // Include session ID for server-side tracking
             };
             return SendJsonAsync(payload, token);
         }
@@ -652,6 +653,10 @@ private async Task SendJsonAsync(JObject payload, CancellationToken token)
 
         private async Task HandleSocketClosureAsync(string reason)
         {
+            // Capture stack trace for debugging disconnection triggers
+            var stackTrace = new System.Diagnostics.StackTrace(true);
+            McpLog.Debug($"[WebSocket] HandleSocketClosureAsync called. Reason: {reason}\nStack trace:\n{stackTrace}");
+
             if (_lifecycleCts == null || _lifecycleCts.IsCancellationRequested)
             {
                 return;

diff --git a/Server/src/transport/legacy/unity_connection.py b/Server/src/transport/legacy/unity_connection.py
@@ -306,8 +306,10 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
         for attempt in range(attempts + 1):
             try:
                 # Ensure connected (handshake occurs within connect())
+                t_conn_start = time.time()
                 if not self.sock and not self.connect():
                     raise ConnectionError("Could not connect to Unity")
+                logger.info("[TIMING-STDIO] connect took %.3fs command=%s", time.time() - t_conn_start, command_type)
 
                 # Build payload
                 if command_type == 'ping':
@@ -324,20 +326,24 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
                     with contextlib.suppress(Exception):
                         logger.debug(
                             f"send {len(payload)} bytes; mode={mode}; head={payload[:32].decode('utf-8', 'ignore')}")
+                    t_send_start = time.time()
                     if self.use_framing:
                         header = struct.pack('>Q', len(payload))
                         self.sock.sendall(header)
                         self.sock.sendall(payload)
                     else:
                         self.sock.sendall(payload)
+                    logger.info("[TIMING-STDIO] sendall took %.3fs command=%s", time.time() - t_send_start, command_type)
 
                     # During retry bursts use a short receive timeout and ensure restoration
                     restore_timeout = None
                     if attempt > 0 and last_short_timeout is None:
                         restore_timeout = self.sock.gettimeout()
                         self.sock.settimeout(1.0)
                     try:
+                        t_recv_start = time.time()
                         response_data = self.receive_full_response(self.sock)
+                        logger.info("[TIMING-STDIO] receive took %.3fs command=%s len=%d", time.time() - t_recv_start, command_type, len(response_data))
                         with contextlib.suppress(Exception):
                             logger.debug(
                                 f"recv {len(response_data)} bytes; mode={mode}")
@@ -419,7 +425,8 @@ def read_status_file(target_hash: str | None = None) -> dict | None:
 
                     # Cap backoff depending on state
                     if status and status.get('reloading'):
-                        cap = 0.8
+                        # Domain reload can take 10-20s; use longer waits
+                        cap = 5.0
                     elif fast_error:
                         cap = 0.25
                     else:
@@ -761,22 +768,36 @@ def send_command_with_retry(
     Uses config.reload_retry_ms and config.reload_max_retries by default. Preserves the
     structured failure if retries are exhausted.
     """
+    t_retry_start = time.time()
+    logger.info("[TIMING-STDIO] send_command_with_retry START command=%s", command_type)
+    t_get_conn = time.time()
     conn = get_unity_connection(instance_id)
+    logger.info("[TIMING-STDIO] get_unity_connection took %.3fs command=%s", time.time() - t_get_conn, command_type)
     if max_retries is None:
         max_retries = getattr(config, "reload_max_retries", 40)
     if retry_ms is None:
         retry_ms = getattr(config, "reload_retry_ms", 250)
+    # Default to 20s to handle domain reloads (which can take 10-20s after tests or script changes).
+    #
+    # NOTE: This wait can impact agentic workflows where domain reloads happen
+    # frequently (e.g., after test runs, script compilation). The 20s default
+    # balances handling slow reloads vs. avoiding unnecessary delays.
+    #
+    # TODO: Make this more deterministic by detecting Unity's actual reload state
+    # rather than blindly waiting up to 20s. See Issue #657.
+    #
+    # Configurable via: UNITY_MCP_RELOAD_MAX_WAIT_S (default: 20.0, max: 20.0)
     try:
         max_wait_s = float(os.environ.get(
-            "UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0"))
+            "UNITY_MCP_RELOAD_MAX_WAIT_S", "20.0"))
     except ValueError as e:
-        raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "2.0")
+        raw_val = os.environ.get("UNITY_MCP_RELOAD_MAX_WAIT_S", "20.0")
         logger.warning(
-            "Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 2.0: %s",
+            "Invalid UNITY_MCP_RELOAD_MAX_WAIT_S=%r, using default 20.0: %s",
             raw_val, e)
-        max_wait_s = 2.0
-    # Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
-    max_wait_s = max(0.0, min(max_wait_s, 30.0))
+        max_wait_s = 20.0
+    # Clamp to [0, 20] to prevent misconfiguration from causing excessive waits
+    max_wait_s = max(0.0, min(max_wait_s, 20.0))
 
     # If retry_on_reload=False, disable connection-level retries too (issue #577)
     # Commands that trigger compilation/reload shouldn't retry on disconnect
@@ -847,6 +868,7 @@ def send_command_with_retry(
             instance_id or "default",
             waited,
         )
+    logger.info("[TIMING-STDIO] send_command_with_retry DONE total=%.3fs command=%s", time.time() - t_retry_start, command_type)
     return response
 
 

diff --git a/Server/src/transport/models.py b/Server/src/transport/models.py
@@ -23,6 +23,11 @@ class ExecuteCommandMessage(BaseModel):
     params: dict[str, Any]
     timeout: float
 
+
+class PingMessage(BaseModel):
+    """Server -> plugin keep-alive probe; its only field is ``type``, which defaults to "ping"."""
+    type: str = "ping"
+
 # Incoming (Plugin -> Server)
 
 

diff --git a/Server/src/transport/plugin_hub.py b/Server/src/transport/plugin_hub.py
@@ -7,7 +7,7 @@
 import os
 import time
 import uuid
-from typing import Any
+from typing import Any, ClassVar
 
 from starlette.endpoints import WebSocketEndpoint
 from starlette.websockets import WebSocket
@@ -21,6 +21,7 @@
     WelcomeMessage,
     RegisteredMessage,
     ExecuteCommandMessage,
+    PingMessage,
     RegisterMessage,
     RegisterToolsMessage,
     PongMessage,
@@ -29,7 +30,7 @@
     SessionDetails,
 )
 
-logger = logging.getLogger("mcp-for-unity-server")
+logger = logging.getLogger(__name__)
 
 
 class PluginDisconnectedError(RuntimeError):
@@ -63,6 +64,10 @@ class PluginHub(WebSocketEndpoint):
     KEEP_ALIVE_INTERVAL = 15
     SERVER_TIMEOUT = 30
     COMMAND_TIMEOUT = 30
+    # Server-side ping interval (seconds) - how often to send pings to Unity
+    PING_INTERVAL = 10
+    # Max time (seconds) to wait for pong before considering connection dead
+    PING_TIMEOUT = 20
     # Timeout (seconds) for fast-fail commands like ping/read_console/get_editor_state.
     # Keep short so MCP clients aren't blocked during Unity compilation/reload/unfocused throttling.
     FAST_FAIL_TIMEOUT = 2.0
@@ -78,6 +83,10 @@ class PluginHub(WebSocketEndpoint):
     _pending: dict[str, dict[str, Any]] = {}
     _lock: asyncio.Lock | None = None
     _loop: asyncio.AbstractEventLoop | None = None
+    # session_id -> last pong timestamp (monotonic)
+    _last_pong: ClassVar[dict[str, float]] = {}
+    # session_id -> ping task
+    _ping_tasks: ClassVar[dict[str, asyncio.Task]] = {}
 
     @classmethod
     def configure(
@@ -176,12 +185,20 @@ async def on_disconnect(self, websocket: WebSocket, close_code: int) -> None:
                 (sid for sid, ws in cls._connections.items() if ws is websocket), None)
             if session_id:
                 cls._connections.pop(session_id, None)
+                # Stop the ping loop for this session
+                ping_task = cls._ping_tasks.pop(session_id, None)
+                if ping_task and not ping_task.done():
+                    ping_task.cancel()
+                # Clean up last pong tracking
+                cls._last_pong.pop(session_id, None)
                 # Fail-fast any in-flight commands for this session to avoid waiting for COMMAND_TIMEOUT.
                 pending_ids = [
                     command_id
                     for command_id, entry in cls._pending.items()
                     if entry.get("session_id") == session_id
                 ]
+                if pending_ids:
+                    logger.debug(f"Cancelling {len(pending_ids)} pending commands for disconnected session")
                 for command_id in pending_ids:
                     entry = cls._pending.get(command_id)
                     future = entry.get("future") if isinstance(
@@ -364,10 +381,18 @@ async def _handle_register(self, websocket: WebSocket, payload: RegisterMessage)
         session = await registry.register(session_id, project_name, project_hash, unity_version, project_path, user_id=user_id)
         async with lock:
             cls._connections[session.session_id] = websocket
+            # Initialize last pong time and start ping loop for this session
+            cls._last_pong[session_id] = time.monotonic()
+            # Cancel any existing ping task for this session (shouldn't happen, but be safe)
+            old_task = cls._ping_tasks.pop(session_id, None)
+            if old_task and not old_task.done():
+                old_task.cancel()
+            # Start the server-side ping loop
+            ping_task = asyncio.create_task(cls._ping_loop(session_id, websocket))
+            cls._ping_tasks[session_id] = ping_task
 
         if user_id:
-            logger.info(
-                f"Plugin registered: {project_name} ({project_hash}) for user {user_id}")
+            logger.info(f"Plugin registered: {project_name} ({project_hash}) for user {user_id}")
         else:
             logger.info(f"Plugin registered: {project_name} ({project_hash})")
 
@@ -429,11 +454,77 @@ async def _handle_command_result(self, payload: CommandResultMessage) -> None:
     async def _handle_pong(self, payload: PongMessage) -> None:
         cls = type(self)
         registry = cls._registry
+        lock = cls._lock
         if registry is None:
             return
         session_id = payload.session_id
         if session_id:
             await registry.touch(session_id)
+            # Record when this pong arrived (monotonic clock) for _ping_loop's PING_TIMEOUT check; skipped if no lock is set.
+            if lock is not None:
+                async with lock:
+                    cls._last_pong[session_id] = time.monotonic()
+
+    @classmethod
+    async def _ping_loop(cls, session_id: str, websocket: WebSocket) -> None:
+        """Probe one plugin connection with periodic pings until it ends or goes stale.
+
+        Every PING_INTERVAL seconds: stop if the lock is unset or the session left _connections;
+        close the socket (code 1001) if the last recorded pong is older than PING_TIMEOUT seconds;
+        otherwise send a ping. Helps detect silently dead connections (e.g., Windows OSError 64).
+
+        Args:
+            session_id: Key used to look up this session in _connections and _last_pong.
+            websocket: Socket the pings are sent on; closed here when stale or when a send fails.
+        """
+        logger.debug(f"[Ping] Starting ping loop for session {session_id}")
+        try:
+            while True:
+                await asyncio.sleep(cls.PING_INTERVAL)
+                # Check if we're still supposed to be running and get last pong time (under lock)
+                lock = cls._lock
+                if lock is None:
+                    break
+                async with lock:
+                    if session_id not in cls._connections:
+                        logger.debug(f"[Ping] Session {session_id} no longer in connections, stopping ping loop")
+                        break
+                    # Read under the lock for consistency; a missing entry reads as 0.
+                    last_pong = cls._last_pong.get(session_id, 0)
+                # Check staleness: has it been too long since we got a pong?
+                elapsed = time.monotonic() - last_pong
+                if elapsed > cls.PING_TIMEOUT:
+                    logger.warning(
+                        f"[Ping] Session {session_id} stale: no pong for {elapsed:.1f}s "
+                        f"(timeout={cls.PING_TIMEOUT}s). Closing connection."
+                    )
+                    try:
+                        await websocket.close(code=1001)  # Going away
+                    except Exception as close_ex:
+                        logger.debug(f"[Ping] Error closing stale websocket: {close_ex}")
+                    break
+
+                # Send a ping to the client; a failed send means the connection is dead.
+                try:
+                    await websocket.send_json(PingMessage().model_dump())
+                    logger.debug(f"[Ping] Sent ping to session {session_id}")
+                except Exception as send_ex:
+                    logger.warning(
+                        f"[Ping] Failed to send ping to session {session_id}: {send_ex}. "
+                        "Connection likely dead."
+                    )
+                    try:
+                        # 1006 is reserved and must not be sent in a Close frame (RFC 6455 7.4.1); use 1011.
+                        await websocket.close(code=1011)
+                    except Exception as close_ex:
+                        logger.debug(f"[Ping] Error closing websocket after failed ping: {close_ex}")
+                    break
+        except asyncio.CancelledError:
+            logger.debug(f"[Ping] Ping loop cancelled for session {session_id}")
+        except Exception as ex:
+            logger.warning(f"[Ping] Ping loop error for session {session_id}: {ex}")
+        finally:
+            logger.debug(f"[Ping] Ping loop ended for session {session_id}")
 
     @classmethod
     async def _get_connection(cls, session_id: str) -> WebSocket:
@@ -465,19 +556,30 @@ async def _resolve_session_id(cls, unity_instance: str | None, user_id: str | No
         if cls._registry is None:
             raise RuntimeError("Plugin registry not configured")
 
-        # Bound waiting for Unity sessions so calls fail fast when editors are not ready.
+        # Bound waiting for Unity sessions. Default to 20s to handle domain reloads
+        # (which can take 10-20s after test runs or script changes).
+        #
+        # NOTE: This wait can impact agentic workflows where domain reloads happen
+        # frequently (e.g., after test runs, script compilation). The 20s default
+        # balances handling slow reloads vs. avoiding unnecessary delays.
+        #
+        # TODO: Make this more deterministic by detecting Unity's actual reload state
+        # (e.g., via status file, heartbeat, or explicit "reloading" signal from Unity)
+        # rather than blindly waiting up to 20s. See Issue #657.
+        #
+        # Configurable via: UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S (default: 20.0, max: 20.0)
         try:
             max_wait_s = float(
-                os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0"))
+                os.environ.get("UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0"))
         except ValueError as e:
             raw_val = os.environ.get(
-                "UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "2.0")
+                "UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S", "20.0")
             logger.warning(
-                "Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 2.0: %s",
+                "Invalid UNITY_MCP_SESSION_RESOLVE_MAX_WAIT_S=%r, using default 20.0: %s",
                 raw_val, e)
-            max_wait_s = 2.0
-        # Clamp to [0, 30] to prevent misconfiguration from causing excessive waits
-        max_wait_s = max(0.0, min(max_wait_s, 30.0))
+            max_wait_s = 20.0
+        # Clamp to [0, 20] to prevent misconfiguration from causing excessive waits
+        max_wait_s = max(0.0, min(max_wait_s, 20.0))
         retry_ms = float(getattr(config, "reload_retry_ms", 250))
         sleep_seconds = max(0.05, min(0.25, retry_ms / 1000.0))
 
@@ -613,7 +715,7 @@ async def send_command_for_instance(
                     "Invalid UNITY_MCP_SESSION_READY_WAIT_SECONDS=%r, using default 6.0: %s",
                     raw_val, e)
                 max_wait_s = 6.0
-            max_wait_s = max(0.0, min(max_wait_s, 30.0))
+            max_wait_s = max(0.0, min(max_wait_s, 20.0))
             if max_wait_s > 0:
                 deadline = time.monotonic() + max_wait_s
                 while time.monotonic() < deadline:

diff --git a/Server/src/transport/unity_instance_middleware.py b/Server/src/transport/unity_instance_middleware.py
@@ -214,9 +214,10 @@ async def _inject_unity_instance(self, context: MiddlewareContext) -> None:
             # The 'active_instance' (Name@hash) might be valid for stdio even if PluginHub fails.
 
             session_id: str | None = None
-            # Only validate via PluginHub if we are actually using HTTP transport
-            # OR if we want to support hybrid mode. For now, let's be permissive.
-            if PluginHub.is_configured():
+            # Only validate via PluginHub if we are actually using HTTP transport.
+            # For stdio transport, skip PluginHub entirely - we only need the instance ID.
+            from transport.unity_transport import _is_http_transport
+            if _is_http_transport() and PluginHub.is_configured():
                 try:
                     # resolving session_id might fail if the plugin disconnected
                     # We only need session_id for HTTP transport routing.

diff --git a/Server/src/transport/unity_transport.py b/Server/src/transport/unity_transport.py
@@ -11,8 +11,8 @@
 from models.models import MCPResponse
 from models.unity_response import normalize_unity_response
 
+logger = logging.getLogger(__name__)
 T = TypeVar("T")
-logger = logging.getLogger("mcp-for-unity-server")
 
 
 def _is_http_transport() -> bool: