feat: better error context (#38)

fwereade · web-flow · commit aba06bc29ea1 · 2025-09-25T09:33:43.000+01:00
### what

Wraps several hard-to-interpret error conditions in annotated errors that state the problem, its likely cause, and suggested next steps.

### why

These are the cases that have been commented on as hard to
follow/interpret.

### testing

Only the trivial unit tests; beyond that, it anecdotally seems to help.

### docs

None needed; if anything, this change is itself more documentation than code.
diff --git a/stacklet/mcp/assetdb/redash.py b/stacklet/mcp/assetdb/redash.py
@@ -20,6 +20,7 @@
 from ..lifespan import server_cached
 from ..settings import SETTINGS
 from ..stacklet_auth import StackletCredentials
+from ..utils.error import AnnotatedError
 from .models import ExportFormat, Job, Query, QueryListResponse, QueryResult, QueryUpsert
 
 
@@ -92,8 +93,18 @@ async def list_queries(
         if tags:
             params["tags"] = tags
 
-        result = await self._make_request("GET", "api/queries", params=params)
-        return QueryListResponse(**result)
+        try:
+            result = await self._make_request("GET", "api/queries", params=params)
+            return QueryListResponse(**result)
+        except httpx.HTTPStatusError as err:
+            if err.response.status_code == 400:
+                raise AnnotatedError(
+                    problem="Backend rejected request",
+                    likely_cause="the page parameter was out of bounds",
+                    next_steps="check page 1, or try a simpler search",
+                    original_error=str(err),
+                )
+            raise
 
     async def get_query(self, query_id: int) -> Query:
         """
@@ -196,11 +207,19 @@ async def _poll_job(self, job: Job, timeout: int) -> int:
             if job.query_result_id:
                 return job.query_result_id
             elif job.status.is_terminal:
-                raise RuntimeError(f"Query execution failed: {job.error or 'Unknown error.'}")
+                raise AnnotatedError(
+                    problem=f"Query execution error: {job.error or '(unknown)'}",
+                    likely_cause="the query SQL or parameters were invalid",
+                    next_steps="investigate the errors, or try a simpler query and build up",
+                )
 
             remaining_s = cutoff - time.monotonic()
             if remaining_s <= 0:
-                raise RuntimeError(f"Query execution timed out after {timeout} seconds")
+                raise AnnotatedError(
+                    problem=f"Timed out after {timeout} seconds",
+                    likely_cause="the query is still executing",
+                    next_steps="request cached results (with max_age=-1), or try a simpler query",
+                )
             await asyncio.sleep(min(interval_s, remaining_s))
             interval_s *= 2
 
diff --git a/stacklet/mcp/platform/graphql.py b/stacklet/mcp/platform/graphql.py
@@ -28,6 +28,7 @@
 from ..lifespan import server_cached
 from ..settings import SETTINGS
 from ..stacklet_auth import StackletCredentials
+from ..utils.error import AnnotatedError
 from .models import (
     ConnectionExport,
     ExportRequest,
@@ -80,7 +81,11 @@ async def query(self, query: str, variables: dict[str, Any]) -> GraphQLQueryResu
             Structured GraphQL query result
         """
         if not self.enable_mutations and has_mutations(query):
-            raise Exception("Mutations not allowed in the client")
+            raise AnnotatedError(
+                problem="Mutations disabled",
+                likely_cause="the user doesn't want you to run mutations",
+                next_steps="tell the user to set 'STACKLET_MCP_PLATFORM_ALLOW_MUTATIONS'",
+            )
 
         return await self._query(query, variables)
 
@@ -166,7 +171,11 @@ async def start_export(self, spec: ExportRequest) -> str:
         """
         result = await self._query(self.Q_START_EXPORT, {"input": spec.for_graphql()})
         if result.errors:
-            raise RuntimeError(f"Export mutation failed: {result.errors}")
+            raise AnnotatedError(
+                problem=f"Export mutation failed: {result.errors}",
+                likely_cause="what it says",
+                next_steps="check data types with 'platform_get_types'",
+            )
 
         # If no errors, data is at least guaranteed truthy.
         export = cast(dict[str, Any], result.data)["exportConnection"]["export"]
diff --git a/stacklet/mcp/utils/error.py b/stacklet/mcp/utils/error.py
@@ -0,0 +1,34 @@
+# LICENSE HEADER MANAGED BY add-license-header
+#
+# Copyright (c) 2025 Stacklet, Inc.
+#
+
+"""
+Error handling utilities for creating annotated ToolErrors with user guidance.
+"""
+
+from fastmcp.exceptions import ToolError
+
+
+class AnnotatedError(ToolError):
+    """
+    A ToolError whose message states the problem, its likely cause, and next steps.
+
+    Args:
+        problem: Clear description of what went wrong; opens the message
+        likely_cause: Most probable reason; follows "This likely means "
+        next_steps: Actionable advice; follows "Next steps: "
+        original_error: Optional underlying error text; appended only if non-empty
+    """
+
+    def __init__(
+        self,
+        problem: str,
+        likely_cause: str,
+        next_steps: str,
+        original_error: str | None = None,
+    ):
+        message = f"{problem}. This likely means {likely_cause}. Next steps: {next_steps}"
+        if original_error:  # None and "" both leave the message without a suffix
+            message += f". Original error: {original_error}"
+        super().__init__(message)
diff --git a/tests/test_tools_assetdb.py b/tests/test_tools_assetdb.py
@@ -147,8 +147,10 @@ async def test_page_missing(self):
         ):
             result = await self.assert_call({"page": 999}, error=True)
 
-        # XXX better errors might be nice, "page 999 out of range" is… likely?
-        assert result.text == "Error calling tool 'assetdb_query_list': mocked http 400"
+        assert result.text == (
+            "Backend rejected request. This likely means the page parameter was out of bounds. "
+            "Next steps: check page 1, or try a simpler search. Original error: mocked http 400"
+        )
 
     @json_guard_parametrize([5, 10])
     async def test_page_size(self, mangle, value):
@@ -220,7 +222,9 @@ async def test_not_found(self):
         ):
             result = await self.assert_call({"query_id": 999}, error=True)
 
-        # XXX better errors might be nice, "query 999 does not exist"
+        # Generally, this is enough context for the LLM to handle it fine.
+        # Annotated errors come into their own when the meaning of a raw
+        # error is not immediately obvious.
         assert result.text == "Error calling tool 'assetdb_query_get': mocked http 404"
 
 
@@ -437,7 +441,7 @@ async def assert_tool_call(self, params, *expect_http, expect_error=None):
             result = await self.assert_call(params, error=bool(expect_error))
 
         if expect_error:
-            assert result.text == f"Error calling tool '{self.tool_name}': " + expect_error
+            assert result.text == expect_error
         else:
             self.assert_tool_query_result(result)
 
@@ -549,7 +553,10 @@ async def test_job_timeout(self, mangle, value, async_sleeps):
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
-            expect_error="Query execution timed out after 60 seconds",
+            expect_error=(
+                "Timed out after 60 seconds. This likely means the query is still executing. "
+                "Next steps: request cached results (with max_age=-1), or try a simpler query"
+            ),
         )
         assert async_sleeps == [2, 4, 8, 16, 30]
 
@@ -559,7 +566,11 @@ async def test_job_failure(self):
             {"query_id": self.QUERY_ID},
             self.expect_post(self.post_data(), self.job_response(JobStatus.QUEUED)),
             self.expect_get_job(self.job_response(JobStatus.FAILED)),
-            expect_error="Query execution failed: Oh no borken",
+            expect_error=(
+                "Query execution error: Oh no borken. This likely means the query SQL or "
+                "parameters were invalid. Next steps: investigate the errors, or try a simpler "
+                "query and build up"
+            ),
         )
 
     async def test_job_cancellation(self):
@@ -570,7 +581,11 @@ async def test_job_cancellation(self):
             {"query_id": self.QUERY_ID},
             self.expect_post(self.post_data(), self.job_response(JobStatus.QUEUED)),
             self.expect_get_job(self.job_response(JobStatus.CANCELED)),
-            expect_error="Query execution failed: Unknown error.",
+            expect_error=(
+                "Query execution error: (unknown). This likely means the query SQL or "
+                "parameters were invalid. Next steps: investigate the errors, or try a simpler "
+                "query and build up"
+            ),
         )
 
 
@@ -641,7 +656,10 @@ async def test_job_timeout(self, mangle, value, async_sleeps):
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
             self.expect_get_job(self.job_response(JobStatus.STARTED)),
-            expect_error="Query execution timed out after 60 seconds",
+            expect_error=(
+                "Timed out after 60 seconds. This likely means the query is still executing. "
+                "Next steps: request cached results (with max_age=-1), or try a simpler query"
+            ),
         )
         assert async_sleeps == [2, 4, 8, 16, 30]
 
@@ -651,7 +669,11 @@ async def test_job_failure(self):
             {"query": "SELECT 1"},
             self.expect_post(self.post_data(), self.job_response(JobStatus.QUEUED)),
             self.expect_get_job(self.job_response(JobStatus.FAILED)),
-            expect_error="Query execution failed: Oh no borken",
+            expect_error=(
+                "Query execution error: Oh no borken. This likely means the query SQL or "
+                "parameters were invalid. Next steps: investigate the errors, or try a simpler "
+                "query and build up"
+            ),
         )
 
     async def test_job_cancellation(self):
@@ -662,7 +684,11 @@ async def test_job_cancellation(self):
             {"query": "SELECT 1"},
             self.expect_post(self.post_data(), self.job_response(JobStatus.QUEUED)),
             self.expect_get_job(self.job_response(JobStatus.CANCELED)),
-            expect_error="Query execution failed: Unknown error.",
+            expect_error=(
+                "Query execution error: (unknown). This likely means the query SQL or "
+                "parameters were invalid. Next steps: investigate the errors, or try a simpler "
+                "query and build up"
+            ),
         )