Skip to content

CancelledError from connection failures is indistinguishable from external cancellation #1830

@saintx

Description

@saintx

Initial Checks

Description

Summary

The MCP Python SDK raises asyncio.CancelledError when a server connection fails. This is structurally identical to external task cancellation (Ctrl+C, SIGTERM), making it impossible for client code to correctly handle both scenarios.

Problem

When an MCP server becomes unreachable during a request:

try:
    result = await session.list_tools()
except asyncio.CancelledError:
    # Is this:
    # A) Server died (should reconnect/retry)
    # B) Operator hit Ctrl+C (should propagate for clean shutdown)
    #
    # Cannot distinguish.

Root Cause

The SDK uses anyio for structured concurrency. Transport layers (mcp/client/sse.py, mcp/client/streamable_http.py) create task groups:

# mcp/client/sse.py (simplified)
# Both transport tasks are started in the same task group, so when one of
# them fails the task group cancels its sibling.
async with anyio.create_task_group() as tg:
    tg.start_soon(sse_reader)    # Reads from server
    tg.start_soon(post_writer)   # Writes to server
    # Hand the two streams to the caller while the tasks keep running.
    yield read_stream, write_stream

When the server connection fails:

  1. sse_reader task fails (connection lost)
  2. anyio's task group cancels sibling tasks
  3. CancelledError propagates to response_stream_reader.receive() in session.py
  4. Client code catches CancelledError

This is the same exception type raised by task.cancel() during external shutdown.

Evidence

Exception characteristics when catching CancelledError:

| Scenario | ex.args | task.cancelling() delta |
|---|---|---|
| SSE server dies | `()` | +1 |
| Streaming HTTP server dies | `('Cancelled by cancel scope...',)` | +1 |
| External task.cancel() | `()` | +1 |

SSE internal failure and external cancellation have identical characteristics.

Impact

If client converts CancelledError → ConnectionError:

  • External shutdown (Ctrl+C) raises ConnectionError instead of CancelledError
  • Retry loops may continue instead of exiting
  • asyncio's cooperative cancellation model is broken

If client propagates CancelledError:

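  • Server failures escape as a BaseException (asyncio.CancelledError has not been a subclass of Exception since Python 3.8), so a plain except Exception does not catch them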
  • Callers must use except BaseException to handle failures
  • Poor error messages ("CancelledError" vs "connection lost")

Files Involved

| File | Role |
|---|---|
| mcp/client/sse.py | SSE transport - creates task group |
| mcp/client/streamable_http.py | Streaming HTTP transport - creates task group |
| mcp/shared/session.py | response_stream_reader.receive() - where CancelledError surfaces |

Example Code

#!/usr/bin/env python3
"""
Minimal reproduction: CancelledError ambiguity in MCP SDK.

This script demonstrates that when an MCP server dies mid-request,
the client receives asyncio.CancelledError - the same exception type
raised by external task cancellation (Ctrl+C, SIGTERM).

Run: python repro_cancelled_error_ambiguity.py

Expected output:
  - Test 1 (server dies): CancelledError
  - Test 2 (external cancel): CancelledError

Both scenarios produce identical exceptions, making it impossible
for client code to distinguish server failure from intentional shutdown.
"""
import asyncio
import multiprocessing
import socket
import time
from typing import Any

import uvicorn
from starlette.applications import Starlette
from starlette.requests import Request
from starlette.responses import Response
from starlette.routing import Mount, Route

from mcp.client.session import ClientSession
from mcp.client.sse import sse_client
from mcp.server import Server
from mcp.server.sse import SseServerTransport
from mcp.server.transport_security import TransportSecuritySettings
from mcp.types import TextContent, Tool


# === Minimal MCP Server ===

class SlowToolServer(Server):
    """MCP server named ``test-server`` that exposes one slow tool.

    The tool-call handler sleeps for 10 seconds before replying, which
    leaves a window during which a request is still pending.
    """

    def __init__(self):
        super().__init__("test-server")

        # Handlers are registered through the decorators returned by
        # self.list_tools() and self.call_tool().
        @self.list_tools()
        async def handle_list_tools() -> list[Tool]:
            slow_tool = Tool(
                name="slow_tool",
                description="Takes 10 seconds",
                inputSchema={"type": "object", "properties": {}},
            )
            return [slow_tool]

        @self.call_tool()
        async def handle_call_tool(name: str, args: dict[str, Any]) -> list[TextContent]:
            # ``name`` and ``args`` are ignored: every call gets the same
            # delayed reply.
            await asyncio.sleep(10.0)
            reply = TextContent(type="text", text="Done")
            return [reply]


def run_server(port: int) -> None:
    """Serve a ``SlowToolServer`` over SSE on 127.0.0.1 at ``port``.

    Builds a Starlette app with an ``/sse`` endpoint and a ``/messages/``
    mount, then runs it with uvicorn (log level ``error``). Used as the
    target of a child process by the tests in this script.

    Args:
        port: TCP port on 127.0.0.1 for uvicorn to listen on.
    """
    sse = SseServerTransport(
        "/messages/",
        security_settings=TransportSecuritySettings(
            allowed_hosts=["127.0.0.1:*"],
            allowed_origins=["http://127.0.0.1:*"],
        ),
    )
    server = SlowToolServer()

    async def handle_sse(request: Request) -> Response:
        # Run the MCP server on the stream pair for as long as the SSE
        # connection context is open.
        async with sse.connect_sse(request.scope, request.receive, request._send) as streams:
            read_stream, write_stream = streams[0], streams[1]
            await server.run(read_stream, write_stream, server.create_initialization_options())
        return Response()

    routes = [
        Route("/sse", endpoint=handle_sse),
        Mount("/messages/", app=sse.handle_post_message),
    ]
    app = Starlette(routes=routes)
    config = uvicorn.Config(app=app, host="127.0.0.1", port=port, log_level="error")
    uvicorn.Server(config).run()


def get_free_port() -> int:
    """Return a port number the OS picked for a socket bound to 127.0.0.1.

    A throwaway socket is bound to port 0 so the OS chooses the port; the
    socket is closed before returning, so the port is not reserved for the
    caller.
    """
    with socket.socket() as probe:
        probe.bind(("127.0.0.1", 0))
        _host, port = probe.getsockname()
    return port


def wait_for_server(port: int, timeout: float = 5.0) -> None:
    """Block until a TCP connection to 127.0.0.1:``port`` succeeds.

    Connection attempts are retried every 10 ms, each with a 0.1 s socket
    timeout.

    Args:
        port: TCP port on 127.0.0.1 to probe.
        timeout: Maximum number of seconds to keep probing.

    Raises:
        TimeoutError: If no connection attempt succeeded within ``timeout``
            seconds.
    """
    # Use the monotonic clock so the deadline is unaffected by wall-clock
    # adjustments (NTP steps, manual changes) while we are polling.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.socket() as s:
                s.settimeout(0.1)
                s.connect(("127.0.0.1", port))
                return
        except OSError:
            # ConnectionRefusedError and socket timeouts are both OSError
            # subclasses, so one clause covers every failed attempt.
            time.sleep(0.01)
    raise TimeoutError(f"Server did not start within {timeout}s")


# === Test 1: Server dies mid-request ===

async def test_server_dies() -> str:
    """Kill the server while a tool call is in flight and report the outcome.

    Starts the server in a child process, opens an SSE client session,
    launches a ``slow_tool`` call as a task, kills the server process, and
    then awaits the task for up to 5 seconds.

    Returns:
        ``"CancelledError"`` if ``asyncio.CancelledError`` was raised, the
        class name of any other ``Exception`` that was raised, or ``"None"``
        if nothing was raised.
    """
    port = get_free_port()
    server_proc = multiprocessing.Process(target=run_server, kwargs={"port": port}, daemon=True)
    server_proc.start()
    wait_for_server(port)

    outcome = "None"
    try:
        async with (
            sse_client(f"http://127.0.0.1:{port}/sse") as (r, w),
            ClientSession(r, w) as session,
        ):
            await session.initialize()

            pending_call = asyncio.create_task(session.call_tool("slow_tool", {}))
            # Give the request time to reach the server before killing it.
            await asyncio.sleep(0.3)

            # Kill server while request is pending
            server_proc.kill()
            server_proc.join(timeout=1)

            await asyncio.wait_for(pending_call, timeout=5.0)

    except asyncio.CancelledError:
        # Checked first and by name: this is the case the script is probing for.
        outcome = "CancelledError"
    except Exception as ex:
        outcome = type(ex).__name__
    finally:
        # Make sure the child process is gone even if setup failed midway.
        if server_proc.is_alive():
            server_proc.kill()

    return outcome


# === Test 2: External cancellation ===

async def test_external_cancel() -> str:
    """Cancel an in-flight tool call from outside and report the outcome.

    Starts the server in a child process, opens an SSE client session,
    launches a ``slow_tool`` call as a task, calls ``task.cancel()`` on it
    (simulating Ctrl+C), and then awaits the task.

    Returns:
        ``"CancelledError"`` if ``asyncio.CancelledError`` was raised, the
        class name of any other ``Exception`` that was raised, or ``"None"``
        if nothing was raised.
    """
    port = get_free_port()
    server_proc = multiprocessing.Process(target=run_server, kwargs={"port": port}, daemon=True)
    server_proc.start()
    wait_for_server(port)

    outcome = "None"
    try:
        async with (
            sse_client(f"http://127.0.0.1:{port}/sse") as (r, w),
            ClientSession(r, w) as session,
        ):
            await session.initialize()

            pending_call = asyncio.create_task(session.call_tool("slow_tool", {}))
            # Give the request time to reach the server before cancelling.
            await asyncio.sleep(0.3)

            # External cancellation
            pending_call.cancel()

            await pending_call

    except asyncio.CancelledError:
        # Checked first and by name: this is the case the script is probing for.
        outcome = "CancelledError"
    except Exception as ex:
        outcome = type(ex).__name__
    finally:
        # The server is still running in this scenario; stop it here.
        if server_proc.is_alive():
            server_proc.kill()

    return outcome


# === Main ===

if __name__ == "__main__":
    # Each scenario runs in its own event loop via asyncio.run().
    print("Test 1: Server dies mid-request")
    result1 = asyncio.run(test_server_dies())
    print(f"  Exception: {result1}")

    print()
    print("Test 2: External cancellation (Ctrl+C simulation)")
    result2 = asyncio.run(test_external_cancel())
    print(f"  Exception: {result2}")

    print()
    print("Result:")
    both_cancelled = result1 == "CancelledError" and result2 == "CancelledError"
    if not both_cancelled:
        # The ambiguity did not reproduce; show what each scenario raised.
        print(f"  Test 1: {result1}")
        print(f"  Test 2: {result2}")
    else:
        print("  Both scenarios raise CancelledError.")
        print("  Client code cannot distinguish server failure from shutdown request.")

Python & MCP Python SDK

Python: 3.12.12
MCP SDK: 1.20.0

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions