vectorize-io · nicoloboschi · Feb 4, 2026 · Feb 3, 2026 · Feb 3, 2026 · Feb 4, 2026
diff --git a/hindsight-api/hindsight_api/engine/providers/claude_code_llm.py b/hindsight-api/hindsight_api/engine/providers/claude_code_llm.py
@@ -291,61 +291,202 @@ async def call_with_tools(
         tool_choice: str | dict[str, Any] = "auto",
     ) -> LLMToolCallResult:
         """
-        Make an LLM API call with tool/function calling support.
+        Make an LLM API call with tool/function calling support using Claude Agent SDK.
 
-        Note: This is a simplified implementation. Full tool support would require
-        integrating with Claude Agent SDK's tool system.
+        This implementation uses ClaudeSDKClient (not query()) because custom tools via
+        SDK MCP servers are only supported with the client. Tools are converted from OpenAI
+        format to SDK MCP tools, and tool names are formatted as mcp__hindsight_tools__{name}.
 
         Args:
             messages: List of message dicts. Can include tool results with role='tool'.
             tools: List of tool definitions in OpenAI format.
-            max_completion_tokens: Maximum tokens in response.
-            temperature: Sampling temperature.
+            max_completion_tokens: Maximum tokens in response (not used by Claude Agent SDK).
+            temperature: Sampling temperature (not used by Claude Agent SDK).
             scope: Scope identifier for tracking.
             max_retries: Maximum retry attempts.
             initial_backoff: Initial backoff time in seconds.
             max_backoff: Maximum backoff time in seconds.
-            tool_choice: How to choose tools - "auto", "none", "required", or specific function.
+            tool_choice: How to choose tools (not used by Claude Agent SDK).
 
         Returns:
             LLMToolCallResult with content and/or tool_calls.
         """
-        # For now, use regular call without tools
-        # Full implementation would require mapping OpenAI tool format to Claude Agent SDK tools
-        logger.warning(
-            "Claude Code provider does not fully support tool calling yet. Falling back to regular text completion."
+        from claude_agent_sdk import (
+            AssistantMessage,
+            ClaudeAgentOptions,
+            ClaudeSDKClient,
+            SdkMcpTool,
+            TextBlock,
+            ToolUseBlock,
+            create_sdk_mcp_server,
         )
 
-        result = await self.call(
-            messages=messages,
-            response_format=None,
-            max_completion_tokens=max_completion_tokens,
-            temperature=temperature,
-            scope=scope,
-            max_retries=max_retries,
-            initial_backoff=initial_backoff,
-            max_backoff=max_backoff,
-            return_usage=True,
-        )
+        start_time = time.time()
 
-        if isinstance(result, tuple):
-            text, usage = result
-            return LLMToolCallResult(
-                content=text,
-                tool_calls=[],
-                finish_reason="stop",
-                input_tokens=usage.input_tokens,
-                output_tokens=usage.output_tokens,
-            )
-        else:
-            # Fallback if return_usage didn't work as expected
-            return LLMToolCallResult(
-                content=str(result),
-                tool_calls=[],
-                finish_reason="stop",
-                input_tokens=0,
-                output_tokens=0,
+        # Convert OpenAI tool format to Claude Agent SDK SdkMcpTool format
+        sdk_tools: list[SdkMcpTool] = []
+        tool_names: list[str] = []
+
+        for tool in tools:
+            func = tool.get("function", {})
+            tool_name = func.get("name", "")
+            tool_description = func.get("description", "")
+            parameters = func.get("parameters", {})
+
+            # Create a handler with proper closure to avoid transport issues
+            def make_handler(name: str):
+                async def handler(args: dict[str, Any]) -> dict[str, Any]:
+                    # Return immediately with success - tool execution happens externally
+                    return {
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": f"[Tool {name} called successfully]",
+                            }
+                        ]
+                    }
+
+                return handler
+
+            sdk_tools.append(
+                SdkMcpTool(
+                    name=tool_name,
+                    description=tool_description,
+                    input_schema=parameters,
+                    handler=make_handler(tool_name),
+                )
             )
+            tool_names.append(tool_name)
+
+        # Create an MCP server with the tools
+        mcp_server = create_sdk_mcp_server(
+            name="hindsight_tools",
+            version="1.0.0",
+            tools=sdk_tools if sdk_tools else None,
+        )
+
+        # Build system prompt and user content from messages
+        system_prompt = ""
+        user_content = ""
+
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+
+            if role == "system":
+                system_prompt += ("\n\n" + content) if system_prompt else content
+            elif role == "user":
+                user_content += ("\n\n" + content) if user_content else content
+            elif role == "assistant":
+                # Include previous assistant messages as context
+                user_content += f"\n\n[Previous assistant response: {content}]"
+            elif role == "tool":
+                # Fold each tool result into the user context, labelled with its tool_call_id
+                tool_call_id = msg.get("tool_call_id", "")
+                user_content += f"\n\n[Tool result for {tool_call_id}: {content}]"
+
+        # Format tool names for SDK MCP servers: mcp__{server_name}__{tool_name}
+        # This is required by the Claude Agent SDK for MCP server tools
+        allowed_tool_names = [f"mcp__hindsight_tools__{name}" for name in tool_names]
+
+        # Configure SDK options with MCP server
+        options = ClaudeAgentOptions(
+            system_prompt=system_prompt if system_prompt else None,
+            max_turns=1,  # Single-turn for API-style interactions
+            mcp_servers={"hindsight_tools": mcp_server} if sdk_tools else {},
+            allowed_tools=allowed_tool_names if allowed_tool_names else [],
+        )
+
+        # Call Claude Agent SDK with retry logic
+        last_exception = None
+        for attempt in range(max_retries + 1):
+            try:
+                full_text = ""
+                tool_calls: list[LLMToolCall] = []
+
+                # Use ClaudeSDKClient for tool calling support
+                # Note: query() does NOT support custom tools, only ClaudeSDKClient does
+                async with ClaudeSDKClient(options=options) as client:
+                    # Send the query
+                    await client.query(user_content)
+
+                    # Receive response
+                    async for message in client.receive_response():
+                        if isinstance(message, AssistantMessage):
+                            for block in message.content:
+                                if isinstance(block, TextBlock):
+                                    full_text += block.text
+                                elif isinstance(block, ToolUseBlock):
+                                    # SDK returns tool names with MCP prefix (mcp__hindsight_tools__{name})
+                                    # Strip the prefix to return original tool name expected by caller
+                                    tool_name = block.name
+                                    if tool_name.startswith("mcp__hindsight_tools__"):
+                                        tool_name = tool_name.replace("mcp__hindsight_tools__", "", 1)
+
+                                    tool_calls.append(
+                                        LLMToolCall(
+                                            id=block.id,
+                                            name=tool_name,
+                                            arguments=block.input,
+                                        )
+                                    )
+
+                # Record metrics
+                duration = time.time() - start_time
+                metrics = get_metrics_collector()
+
+                # Estimate token usage (Claude Agent SDK doesn't report exact counts)
+                estimated_input = sum(len(m.get("content", "")) for m in messages) // 4
+                estimated_output = len(full_text) // 4
+
+                metrics.record_llm_call(
+                    provider=self.provider,
+                    model=self.model,
+                    scope=scope,
+                    duration=duration,
+                    input_tokens=estimated_input,
+                    output_tokens=estimated_output,
+                    success=True,
+                )
+
+                # Log slow calls
+                if duration > 10.0:
+                    logger.info(
+                        f"slow llm call: scope={scope}, model={self.provider}/{self.model}, time={duration:.3f}s"
+                    )
+
+                return LLMToolCallResult(
+                    content=full_text if full_text else None,
+                    tool_calls=tool_calls,
+                    finish_reason="tool_calls" if tool_calls else "stop",
+                    input_tokens=estimated_input,
+                    output_tokens=estimated_output,
+                )
+
+            except Exception as e:
+                last_exception = e
+
+                # Check for authentication errors
+                error_str = str(e).lower()
+                if "auth" in error_str or "login" in error_str or "credential" in error_str:
+                    logger.error(f"Claude Code authentication error: {e}")
+                    raise RuntimeError(
+                        f"Claude Code authentication failed: {e}\n\n"
+                        "Run 'claude auth login' to authenticate with Claude Pro/Max."
+                    ) from e
+
+                if attempt < max_retries:
+                    backoff = min(initial_backoff * (2**attempt), max_backoff)
+                    logger.warning(f"Claude Code tool call error (attempt {attempt + 1}/{max_retries + 1}): {e}")
+                    await asyncio.sleep(backoff)
+                    continue
+                else:
+                    logger.error(f"Claude Code tool call error after {max_retries + 1} attempts: {e}")
+                    raise
+
+        if last_exception:
+            raise last_exception
+        raise RuntimeError("Claude Code tool call failed after all retries")
 
     async def cleanup(self) -> None:
         """Clean up resources (no HTTP client to close for Claude Agent SDK)."""

diff --git a/hindsight-api/hindsight_api/engine/providers/codex_llm.py b/hindsight-api/hindsight_api/engine/providers/codex_llm.py
@@ -177,6 +177,9 @@ async def call(
             schema_msg = f"\n\nYou must respond with valid JSON matching this schema:\n{json.dumps(schema, indent=2)}"
             system_instruction += schema_msg
 
+        # gpt-5.2-codex only supports "detailed" reasoning summary
+        reasoning_summary = "detailed" if "5.2" in self.model else self.reasoning_summary
+
         # Build Codex request payload
         payload = {
             "model": self.model,
@@ -192,7 +195,7 @@ async def call(
             "tools": [],
             "tool_choice": "auto",
             "parallel_tool_calls": True,
-            "reasoning": {"summary": self.reasoning_summary},
+            "reasoning": {"summary": reasoning_summary},
             "store": False,  # Codex uses stateless mode
             "stream": True,  # SSE streaming
             "include": ["reasoning.encrypted_content"],
@@ -283,13 +286,20 @@ async def call(
                         "Run 'codex auth login' to re-authenticate."
                     ) from e
 
+                # Capture the API's error detail (response body capped at 500 chars, else str(e)) for the logs below
+                error_detail = e.response.text[:500] if hasattr(e.response, "text") else str(e)
+
                 if attempt < max_retries:
                     backoff = min(initial_backoff * (2**attempt), max_backoff)
-                    logger.warning(f"Codex HTTP error {status_code} (attempt {attempt + 1}/{max_retries + 1})")
+                    logger.warning(
+                        f"Codex HTTP error {status_code} (attempt {attempt + 1}/{max_retries + 1}): {error_detail}"
+                    )
                     await asyncio.sleep(backoff)
                     continue
                 else:
-                    logger.error(f"Codex HTTP error after {max_retries + 1} attempts: {e}")
+                    logger.error(
+                        f"Codex HTTP error after {max_retries + 1} attempts: Status {status_code}, Detail: {error_detail}"
+                    )
                     raise
 
             except httpx.RequestError as e:
@@ -379,8 +389,22 @@ async def call_with_tools(
         """
         Make API call with tool calling support.
 
-        Note: This is a basic implementation. Full tool calling support for Codex
-        may require additional SSE event parsing.
+        Parses Codex SSE stream to extract tool calls from response.output_item.done events.
+        Tools are converted from OpenAI format to Codex format (flat structure at top level).
+
+        Args:
+            messages: List of message dicts. Can include tool results with role='tool'.
+            tools: List of tool definitions in OpenAI format.
+            max_completion_tokens: Maximum tokens in response.
+            temperature: Sampling temperature.
+            scope: Scope identifier for tracking.
+            max_retries: Maximum retry attempts.
+            initial_backoff: Initial backoff time in seconds.
+            max_backoff: Maximum backoff time in seconds.
+            tool_choice: How to choose tools - "auto", "none", "required", or specific function.
+
+        Returns:
+            LLMToolCallResult with content and/or tool_calls.
         """
         start_time = time.time()
 
@@ -413,28 +437,30 @@ async def call_with_tools(
                 )
 
         # Convert tools to Codex format
+        # Codex expects tools with type and name/description/parameters at top level
         codex_tools = []
         for tool in tools:
             func = tool.get("function", {})
             codex_tools.append(
                 {
                     "type": "function",
-                    "function": {
-                        "name": func.get("name", ""),
-                        "description": func.get("description", ""),
-                        "parameters": func.get("parameters", {}),
-                    },
+                    "name": func.get("name", ""),
+                    "description": func.get("description", ""),
+                    "parameters": func.get("parameters", {}),
                 }
             )
 
+        # gpt-5.2-codex only supports "detailed" reasoning summary
+        reasoning_summary = "detailed" if "5.2" in self.model else self.reasoning_summary
+
         payload = {
             "model": self.model,
             "instructions": system_instruction,
             "input": user_messages,
             "tools": codex_tools,
             "tool_choice": tool_choice,
             "parallel_tool_calls": True,
-            "reasoning": {"summary": self.reasoning_summary},
+            "reasoning": {"summary": reasoning_summary},
             "store": False,
             "stream": True,
             "include": ["reasoning.encrypted_content"],
@@ -451,8 +477,16 @@ async def call_with_tools(
 
         url = f"{self.base_url}/codex/responses"
 
+        # Debug logging for troubleshooting
+        logger.debug(f"Codex tool call request: url={url}, model={payload['model']}, tools={len(codex_tools)}")
+
         try:
             response = await self._client.post(url, json=payload, headers=headers, timeout=120.0)
+
+            # Log response details on error
+            if response.status_code != 200:
+                logger.error(f"Codex API error {response.status_code}: {response.text[:500]}")
+
             response.raise_for_status()
 
             # Parse SSE for tool calls and content
@@ -512,13 +546,30 @@ async def _parse_sse_tool_stream(self, response: httpx.Response) -> tuple[str |
                     if event_type == "response.text.delta" and "delta" in data:
                         content += data["delta"]
 
-                    # Extract tool calls
-                    elif event_type == "response.function_call_arguments.delta":
-                        # Handle tool call events (implementation depends on actual Codex SSE format)
-                        pass
-
-                except json.JSONDecodeError:
-                    pass
+                    # Extract completed tool calls from response.output_item.done
+                    elif event_type == "response.output_item.done":
+                        item = data.get("item", {})
+                        if item.get("type") == "function_call" and item.get("status") == "completed":
+                            tool_name = item.get("name", "")
+                            arguments_str = item.get("arguments", "{}")
+                            call_id = item.get("call_id", "")
+
+                            try:
+                                arguments = json.loads(arguments_str)
+                            except json.JSONDecodeError:
+                                logger.warning(f"Failed to parse tool arguments: {arguments_str}")
+                                arguments = {}
+
+                            tool_calls.append(
+                                LLMToolCall(
+                                    id=call_id,
+                                    name=tool_name,
+                                    arguments=arguments,
+                                )
+                            )
+
+                except json.JSONDecodeError as e:
+                    logger.warning(f"Failed to parse SSE data: {e}, data_str: {data_str[:200]}")
 
         return content if content else None, tool_calls