vectorize-io · nicoloboschi · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026 · Feb 19, 2026
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -174,6 +174,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
@@ -233,6 +234,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -361,6 +363,7 @@ jobs:
       COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
       HINDSIGHT_API_EMBEDDINGS_OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       # Prefer CPU-only PyTorch in CI (but keep PyPI for everything else)
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
@@ -417,6 +420,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       # Prefer CPU-only PyTorch in CI (but keep PyPI for everything else)
@@ -458,6 +462,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -493,6 +498,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       # Prefer CPU-only PyTorch in CI (but keep PyPI for everything else)
@@ -539,6 +545,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -574,6 +581,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       # Prefer CPU-only PyTorch in CI (but keep PyPI for everything else)
@@ -619,6 +627,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -654,6 +663,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       # Prefer CPU-only PyTorch in CI (but keep PyPI for everything else)
@@ -693,6 +703,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -732,6 +743,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       HINDSIGHT_EMBED_PACKAGE_PATH: ${{ github.workspace }}/hindsight-embed
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@@ -798,6 +810,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -833,6 +846,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
@@ -889,6 +903,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -982,6 +997,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       # Prefer CPU-only PyTorch in CI
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
 
@@ -1026,6 +1042,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       # For test_server_integration.py compatibility
       HINDSIGHT_LLM_PROVIDER: groq
       HINDSIGHT_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
@@ -1075,6 +1092,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       HINDSIGHT_API_URL: http://localhost:8888
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
@@ -1130,6 +1148,7 @@ jobs:
         HINDSIGHT_API_LLM_PROVIDER=${{ env.HINDSIGHT_API_LLM_PROVIDER }}
         HINDSIGHT_API_LLM_API_KEY=${{ env.HINDSIGHT_API_LLM_API_KEY }}
         HINDSIGHT_API_LLM_MODEL=${{ env.HINDSIGHT_API_LLM_MODEL }}
+        HINDSIGHT_API_LLM_GROQ_SERVICE_TIER=${{ env.HINDSIGHT_API_LLM_GROQ_SERVICE_TIER }}
         EOF
 
     - name: Start API server
@@ -1167,6 +1186,7 @@ jobs:
       HINDSIGHT_API_LLM_PROVIDER: groq
       HINDSIGHT_API_LLM_API_KEY: ${{ secrets.GROQ_API_KEY }}
       HINDSIGHT_API_LLM_MODEL: openai/gpt-oss-20b
+      HINDSIGHT_API_LLM_GROQ_SERVICE_TIER: flex
       GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
       UV_INDEX: pytorch=https://download.pytorch.org/whl/cpu
 

diff --git a/hindsight-api/hindsight_api/api/mcp.py b/hindsight-api/hindsight_api/api/mcp.py
@@ -78,10 +78,9 @@ def create_mcp_server(memory: MemoryEngine, multi_bank: bool = True) -> FastMCP:
                    If False, only expose bank-scoped tools without bank_id parameters.
 
     Returns:
-        Configured FastMCP server instance with stateless_http enabled
+        Configured FastMCP server instance
     """
-    # Use stateless_http=True for Claude Code compatibility
-    mcp = FastMCP("hindsight-mcp-server", stateless_http=True)
+    mcp = FastMCP("hindsight-mcp-server")
 
     # Configure and register tools using shared module
     config = MCPToolsConfig(
@@ -211,9 +210,9 @@ def __init__(
         else:
             # Create servers internally (for direct construction / tests)
             self.multi_bank_server = create_mcp_server(memory, multi_bank=True)
-            self.multi_bank_app = self.multi_bank_server.http_app(path="/")
+            self.multi_bank_app = self.multi_bank_server.http_app(path="/", stateless_http=True)
             self.single_bank_server = create_mcp_server(memory, multi_bank=False)
-            self.single_bank_app = self.single_bank_server.http_app(path="/")
+            self.single_bank_app = self.single_bank_server.http_app(path="/", stateless_http=True)
 
     def _get_header(self, scope: dict, name: str) -> str | None:
         """Extract a header value from ASGI scope."""
@@ -379,9 +378,9 @@ def create_mcp_servers(memory: MemoryEngine):
         Tuple of (multi_bank_server, single_bank_server, multi_bank_app, single_bank_app)
     """
     multi_bank_server = create_mcp_server(memory, multi_bank=True)
-    multi_bank_app = multi_bank_server.http_app(path="/")
+    multi_bank_app = multi_bank_server.http_app(path="/", stateless_http=True)
 
     single_bank_server = create_mcp_server(memory, multi_bank=False)
-    single_bank_app = single_bank_server.http_app(path="/")
+    single_bank_app = single_bank_server.http_app(path="/", stateless_http=True)
 
     return multi_bank_server, single_bank_server, multi_bank_app, single_bank_app
diff --git a/hindsight-api/hindsight_api/config.py b/hindsight-api/hindsight_api/config.py
@@ -233,8 +233,6 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 ENV_MPFP_TOP_K_NEIGHBORS = "HINDSIGHT_API_MPFP_TOP_K_NEIGHBORS"
 ENV_RECALL_MAX_CONCURRENT = "HINDSIGHT_API_RECALL_MAX_CONCURRENT"
 ENV_RECALL_CONNECTION_BUDGET = "HINDSIGHT_API_RECALL_CONNECTION_BUDGET"
-ENV_MCP_LOCAL_BANK_ID = "HINDSIGHT_API_MCP_LOCAL_BANK_ID"
-ENV_MCP_INSTRUCTIONS = "HINDSIGHT_API_MCP_INSTRUCTIONS"
 ENV_MENTAL_MODEL_REFRESH_CONCURRENCY = "HINDSIGHT_API_MENTAL_MODEL_REFRESH_CONCURRENCY"
 
 # OpenTelemetry tracing configuration
@@ -389,7 +387,6 @@ def normalize_config_dict(config: dict[str, Any]) -> dict[str, Any]:
 DEFAULT_MPFP_TOP_K_NEIGHBORS = 20  # Fan-out limit per node in MPFP graph traversal
 DEFAULT_RECALL_MAX_CONCURRENT = 32  # Max concurrent recall operations per worker
 DEFAULT_RECALL_CONNECTION_BUDGET = 4  # Max concurrent DB connections per recall operation
-DEFAULT_MCP_LOCAL_BANK_ID = "mcp"
 DEFAULT_MENTAL_MODEL_REFRESH_CONCURRENCY = 8  # Max concurrent mental model refreshes
 
 # Retain settings

diff --git a/hindsight-api/hindsight_api/mcp_local.py b/hindsight-api/hindsight_api/mcp_local.py
@@ -1,157 +1,39 @@
 """
-Local MCP server for use with Claude Code (stdio transport).
+Local MCP server entry point for use with Claude Code (HTTP transport).
 
-This runs a fully local Hindsight instance with embedded PostgreSQL (pg0).
-No external database or server required.
+This is a thin wrapper around the main hindsight-api server that pre-configures
+a sensible default for local use (embedded PostgreSQL via pg0).
+
+The full API runs on localhost:8888. Configure Claude Code's MCP settings:
+    claude mcp add --transport http hindsight http://localhost:8888/mcp/
+
+Or pin it to a specific bank (single-bank mode):
+    claude mcp add --transport http hindsight http://localhost:8888/mcp/default/
 
 Run with:
     hindsight-local-mcp
 
 Or with uvx:
     uvx hindsight-api@latest hindsight-local-mcp
 
-Configure in Claude Code's MCP settings:
-    {
-        "mcpServers": {
-            "hindsight": {
-                "command": "uvx",
-                "args": ["hindsight-api@latest", "hindsight-local-mcp"],
-                "env": {
-                    "HINDSIGHT_API_LLM_API_KEY": "your-openai-key"
-                }
-            }
-        }
-    }
-
 Environment variables:
     HINDSIGHT_API_LLM_API_KEY: Required. API key for LLM provider.
     HINDSIGHT_API_LLM_PROVIDER: Optional. LLM provider (default: "openai").
     HINDSIGHT_API_LLM_MODEL: Optional. LLM model (default: "gpt-4o-mini").
-    HINDSIGHT_API_MCP_LOCAL_BANK_ID: Optional. Memory bank ID (default: "mcp").
-    HINDSIGHT_API_LOG_LEVEL: Optional. Log level (default: "warning").
-    HINDSIGHT_API_MCP_INSTRUCTIONS: Optional. Additional instructions appended to both retain and recall tools.
-
-Example custom instructions (these are ADDED to the default behavior):
-    To also store assistant actions:
-        HINDSIGHT_API_MCP_INSTRUCTIONS="Also store every action you take, including tool calls, code written, and decisions made."
-
-    To also store conversation summaries:
-        HINDSIGHT_API_MCP_INSTRUCTIONS="Also store summaries of important conversations and their outcomes."
+    HINDSIGHT_API_DATABASE_URL: Optional. Override database URL (default: pg0://hindsight-mcp).
 """
 
-import logging
 import os
-import sys
-
-from mcp.server.fastmcp import FastMCP
-
-from hindsight_api.config import (
-    DEFAULT_MCP_LOCAL_BANK_ID,
-    DEFAULT_MCP_RECALL_DESCRIPTION,
-    DEFAULT_MCP_RETAIN_DESCRIPTION,
-    ENV_MCP_INSTRUCTIONS,
-    ENV_MCP_LOCAL_BANK_ID,
-)
-from hindsight_api.mcp_tools import MCPToolsConfig, register_mcp_tools
-
-# Configure logging - default to warning to avoid polluting stderr during MCP init
-# MCP clients interpret stderr output as errors, so we suppress INFO logs by default
-_log_level_str = os.environ.get("HINDSIGHT_API_LOG_LEVEL", "warning").lower()
-_log_level_map = {
-    "critical": logging.CRITICAL,
-    "error": logging.ERROR,
-    "warning": logging.WARNING,
-    "info": logging.INFO,
-    "debug": logging.DEBUG,
-}
-logging.basicConfig(
-    level=_log_level_map.get(_log_level_str, logging.WARNING),
-    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
-    stream=sys.stderr,  # MCP uses stdout for protocol, logs go to stderr
-)
-logger = logging.getLogger(__name__)
-
-
-def create_local_mcp_server(bank_id: str, memory=None) -> FastMCP:
-    """
-    Create a stdio MCP server with retain/recall tools.
-
-    Args:
-        bank_id: The memory bank ID to use for all operations.
-        memory: Optional MemoryEngine instance. If not provided, creates one with pg0.
-
-    Returns:
-        Configured FastMCP server instance.
-    """
-    # Import here to avoid slow startup if just checking --help
-    from hindsight_api import MemoryEngine
-
-    # Create memory engine with pg0 embedded database if not provided
-    if memory is None:
-        memory = MemoryEngine(db_url="pg0://hindsight-mcp")
-
-    # Get custom instructions from environment variable (appended to both tools)
-    extra_instructions = os.environ.get(ENV_MCP_INSTRUCTIONS, "")
-
-    retain_description = DEFAULT_MCP_RETAIN_DESCRIPTION
-    recall_description = DEFAULT_MCP_RECALL_DESCRIPTION
-
-    if extra_instructions:
-        retain_description = f"{DEFAULT_MCP_RETAIN_DESCRIPTION}\n\nAdditional instructions: {extra_instructions}"
-        recall_description = f"{DEFAULT_MCP_RECALL_DESCRIPTION}\n\nAdditional instructions: {extra_instructions}"
-
-    mcp = FastMCP("hindsight")
-
-    # Configure and register tools using shared module
-    config = MCPToolsConfig(
-        bank_id_resolver=lambda: bank_id,
-        include_bank_id_param=False,  # Local MCP uses fixed bank_id
-        tools={"retain", "recall"},  # Local MCP only has retain and recall
-        retain_description=retain_description,
-        recall_description=recall_description,
-        retain_fire_and_forget=True,  # Local MCP uses fire-and-forget pattern
-    )
-
-    register_mcp_tools(mcp, memory, config)
-
-    return mcp
-
-
-async def _initialize_and_run(bank_id: str):
-    """Initialize memory and run the MCP server."""
-    from hindsight_api import MemoryEngine
-
-    # Create and initialize memory engine with pg0 embedded database
-    # Note: We avoid printing to stderr during init as MCP clients show it as "errors"
-    memory = MemoryEngine(db_url="pg0://hindsight-mcp")
-    await memory.initialize()
-
-    # Create and run the server
-    mcp = create_local_mcp_server(bank_id, memory=memory)
-    await mcp.run_stdio_async()
-
-
-def main():
-    """Main entry point for the stdio MCP server."""
-    import asyncio
-
-    from hindsight_api.config import ENV_LLM_API_KEY, get_config
 
-    # Check for required environment variables
-    config = get_config()
-    if not config.llm_api_key:
-        print(f"Error: {ENV_LLM_API_KEY} environment variable is required", file=sys.stderr)
-        print("Set it in your MCP configuration or shell environment", file=sys.stderr)
-        sys.exit(1)
 
-    # Get bank ID from environment, default to "mcp"
-    bank_id = os.environ.get(ENV_MCP_LOCAL_BANK_ID, DEFAULT_MCP_LOCAL_BANK_ID)
+def main() -> None:
+    """Start the Hindsight API server with local defaults."""
+    # Set local defaults (only if not already configured by the user)
+    os.environ.setdefault("HINDSIGHT_API_DATABASE_URL", "pg0://hindsight-mcp")
 
-    # Note: We don't print to stderr as MCP clients display it as "error output"
-    # Use HINDSIGHT_API_LOG_LEVEL=debug for verbose startup logging
+    from hindsight_api.main import main as api_main
 
-    # Run the async initialization and server
-    asyncio.run(_initialize_and_run(bank_id))
+    api_main()
 
 
 if __name__ == "__main__":