ClickHouse · evgenii-baev · Feb 27, 2025 · Feb 27, 2025 · Feb 28, 2025 · Mar 3, 2025
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,8 @@
 .envrc
 .ruff_cache/
+.specstory/
+.venv/
+.github/
 
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -172,3 +175,5 @@ cython_debug/
 
 # PyPI configuration file
 .pypirc
+.specstory/*
+.specstory/history/*
diff --git a/mcp_clickhouse/__init__.py b/mcp_clickhouse/__init__.py
@@ -3,11 +3,13 @@
     list_databases,
     list_tables,
     run_select_query,
+    get_table_sample,
 )
 
 __all__ = [
     "list_databases",
     "list_tables",
     "run_select_query",
+    "get_table_sample",
     "create_clickhouse_client",
 ]
diff --git a/mcp_clickhouse/main.py b/mcp_clickhouse/main.py
@@ -1,7 +1,16 @@
+"""
+MCP ClickHouse - Model Context Protocol server for ClickHouse database integration.
+
+This module provides the entry point for running the MCP ClickHouse server,
+which enables AI models to interact with ClickHouse databases through a set of
+well-defined tools.
+"""
+
 from .mcp_server import mcp
 
 
 def main():
+    """Run the MCP ClickHouse server."""
     mcp.run()
 
 

diff --git a/mcp_clickhouse/mcp_env.py b/mcp_clickhouse/mcp_env.py
@@ -16,49 +16,44 @@ class ClickHouseConfig:
     This class handles all environment variable configuration with sensible defaults
     and type conversion. It provides typed methods for accessing each configuration value.
 
-    Required environment variables:
-        CLICKHOUSE_HOST: The hostname of the ClickHouse server
-        CLICKHOUSE_USER: The username for authentication
-        CLICKHOUSE_PASSWORD: The password for authentication
-
-    Optional environment variables (with defaults):
-        CLICKHOUSE_PORT: The port number (default: 8443 if secure=True, 8123 if secure=False)
-        CLICKHOUSE_SECURE: Enable HTTPS (default: true)
-        CLICKHOUSE_VERIFY: Verify SSL certificates (default: true)
-        CLICKHOUSE_CONNECT_TIMEOUT: Connection timeout in seconds (default: 30)
-        CLICKHOUSE_SEND_RECEIVE_TIMEOUT: Send/receive timeout in seconds (default: 300)
-        CLICKHOUSE_DATABASE: Default database to use (default: None)
+    Default values (if environment variables are not set):
+        CLICKHOUSE_HOST: "localhost"
+        CLICKHOUSE_USER: ""
+        CLICKHOUSE_PASSWORD: ""
+        CLICKHOUSE_PORT: 8123
+        CLICKHOUSE_SECURE: false
+        CLICKHOUSE_VERIFY: false
+        CLICKHOUSE_CONNECT_TIMEOUT: 5
+        CLICKHOUSE_SEND_RECEIVE_TIMEOUT: 300
+        CLICKHOUSE_DATABASE: None
     """
 
     def __init__(self):
         """Initialize the configuration from environment variables."""
-        self._validate_required_vars()
+        self._set_default_vars()
 
     @property
     def host(self) -> str:
         """Get the ClickHouse host."""
-        return os.environ["CLICKHOUSE_HOST"]
+        return os.environ.get("CLICKHOUSE_HOST", "localhost")
 
     @property
     def port(self) -> int:
         """Get the ClickHouse port.
 
-        Defaults to 8443 if secure=True, 8123 if secure=False.
-        Can be overridden by CLICKHOUSE_PORT environment variable.
+        Defaults to 8123 when CLICKHOUSE_PORT is unset, regardless of CLICKHOUSE_SECURE.
         """
-        if "CLICKHOUSE_PORT" in os.environ:
-            return int(os.environ["CLICKHOUSE_PORT"])
-        return 8443 if self.secure else 8123
+        return int(os.environ.get("CLICKHOUSE_PORT", "8123"))
 
     @property
     def username(self) -> str:
         """Get the ClickHouse username."""
-        return os.environ["CLICKHOUSE_USER"]
+        return os.environ.get("CLICKHOUSE_USER", "")
 
     @property
     def password(self) -> str:
         """Get the ClickHouse password."""
-        return os.environ["CLICKHOUSE_PASSWORD"]
+        return os.environ.get("CLICKHOUSE_PASSWORD", "")
 
     @property
     def database(self) -> Optional[str]:
@@ -69,25 +64,25 @@ def database(self) -> Optional[str]:
     def secure(self) -> bool:
         """Get whether HTTPS is enabled.
 
-        Default: True
+        Default: False
         """
-        return os.getenv("CLICKHOUSE_SECURE", "true").lower() == "true"
+        return os.getenv("CLICKHOUSE_SECURE", "false").lower() == "true"
 
     @property
     def verify(self) -> bool:
         """Get whether SSL certificate verification is enabled.
 
-        Default: True
+        Default: False
         """
-        return os.getenv("CLICKHOUSE_VERIFY", "true").lower() == "true"
+        return os.getenv("CLICKHOUSE_VERIFY", "false").lower() == "true"
 
     @property
     def connect_timeout(self) -> int:
         """Get the connection timeout in seconds.
 
-        Default: 30
+        Default: 5
         """
-        return int(os.getenv("CLICKHOUSE_CONNECT_TIMEOUT", "30"))
+        return int(os.getenv("CLICKHOUSE_CONNECT_TIMEOUT", "5"))
 
     @property
     def send_receive_timeout(self) -> int:
@@ -120,22 +115,23 @@ def get_client_config(self) -> dict:
 
         return config
 
-    def _validate_required_vars(self) -> None:
-        """Validate that all required environment variables are set.
+    def _set_default_vars(self) -> None:
+        """Set default values for environment variables if they are not already set."""
+        defaults = {
+            "CLICKHOUSE_HOST": "localhost",
+            "CLICKHOUSE_USER": "",
+            "CLICKHOUSE_PASSWORD": "",
+            "CLICKHOUSE_PORT": "8123",
+            "CLICKHOUSE_SECURE": "false",
+            "CLICKHOUSE_VERIFY": "false",
+            "CLICKHOUSE_CONNECT_TIMEOUT": "5",
+            "CLICKHOUSE_SEND_RECEIVE_TIMEOUT": "300",
+        }
 
-        Raises:
-            ValueError: If any required environment variable is missing.
-        """
-        missing_vars = []
-        for var in ["CLICKHOUSE_HOST", "CLICKHOUSE_USER", "CLICKHOUSE_PASSWORD"]:
+        for var, default_value in defaults.items():
             if var not in os.environ:
-                missing_vars.append(var)
-
-        if missing_vars:
-            raise ValueError(
-                f"Missing required environment variables: {', '.join(missing_vars)}"
-            )
+                os.environ[var] = default_value
 
 
 # Global instance for easy access
-config = ClickHouseConfig()
+config = ClickHouseConfig()
diff --git a/mcp_clickhouse/mcp_server.py b/mcp_clickhouse/mcp_server.py
@@ -28,16 +28,23 @@
 mcp = FastMCP(MCP_SERVER_NAME, dependencies=deps)
 
 
-@mcp.tool()
+@mcp.tool(
+    description="Lists all available databases in the ClickHouse server. Use this tool to get a complete list of databases before exploring their tables. No parameters required."
+)
 def list_databases():
     logger.info("Listing all databases")
     client = create_clickhouse_client()
     result = client.command("SHOW DATABASES")
-    logger.info(f"Found {len(result) if isinstance(result, list) else 1} databases")
+    logger.info(
+        f"Found {len(result) if isinstance(result, list) else 1} databases")
     return result
 
 
-@mcp.tool()
+@mcp.tool(
+    description="Lists tables in a ClickHouse database with detailed schema information. "
+    "Provides complete table structure including columns, types, and creation statements. "
+    "Use the 'like' parameter to filter results with SQL LIKE pattern."
+)
 def list_tables(database: str, like: str = None):
     logger.info(f"Listing tables in database '{database}'")
     client = create_clickhouse_client()
@@ -49,7 +56,8 @@ def list_tables(database: str, like: str = None):
     # Get all table comments in one query
     table_comments_query = f"SELECT name, comment FROM system.tables WHERE database = {format_query_value(database)}"
     table_comments_result = client.query(table_comments_query)
-    table_comments = {row[0]: row[1] for row in table_comments_result.result_rows}
+    table_comments = {row[0]: row[1]
+                      for row in table_comments_result.result_rows}
 
     # Get all column comments in one query
     column_comments_query = f"SELECT table, name, comment FROM system.columns WHERE database = {format_query_value(database)}"
@@ -105,7 +113,12 @@ def get_table_info(table):
     return tables
 
 
-@mcp.tool()
+@mcp.tool(
+    description="Executes a SELECT query against the ClickHouse database. "
+    "Use for custom data retrieval with your own SQL. "
+    "Queries are executed in read-only mode for safety. "
+    "Format your query without specifying database names in SQL."
+)
 def run_select_query(query: str):
     logger.info(f"Executing SELECT query: {query}")
     client = create_clickhouse_client()
@@ -125,6 +138,74 @@ def run_select_query(query: str):
         return f"error running query: {err}"
 
 
+@mcp.tool(
+    description="Retrieves a random sample of rows from a table using ORDER BY RAND(). "
+    "Perfect for data exploration and quick analysis. "
+    "Limit parameter capped at 10 rows. "
+    "Use the where parameter for filtering specific data patterns."
+)
+def get_table_sample(database: str, table: str, columns: str = "*", limit: int = 5, where: str = None):
+    """Retrieves a random sample of rows from a table with ORDER BY RAND()
+
+    Args:
+        database: The database containing the table
+        table: The table to sample data from
+        columns: Comma-separated list of columns to retrieve (default: "*" for all columns)
+        limit: Maximum number of rows to return (default: 5, max: 10)
+        where: Optional filter condition, written without the WHERE keyword
+
+    Returns:
+        List of dictionaries, each representing a random row from the table,
+        or an "error running sample query: ..." string if the query fails
+
+    Note:
+        A limit above 10 or below 1 is not rejected; it is clamped to 10 or 1
+        and a warning is logged. Query errors are logged and returned, not raised
+    """
+    # Validate limit
+    if limit > 10:
+        logger.warning(
+            f"Requested limit {limit} exceeds maximum of 10, using 10 instead")
+        limit = 10
+    elif limit < 1:
+        logger.warning(
+            f"Requested limit {limit} is less than 1, using 1 instead")
+        limit = 1
+
+    logger.info(f"Sampling {limit} random rows from {database}.{table}")
+    client = create_clickhouse_client()
+
+    try:
+        # Build the query
+        query = f"SELECT {columns} FROM {quote_identifier(database)}.{quote_identifier(table)}"
+
+        # Add WHERE clause if provided
+        if where:
+            query += f" WHERE {where}"
+
+        # Add random ordering and limit
+        query += f" ORDER BY rand() LIMIT {limit}"
+
+        logger.info(f"Executing sampling query: {query}")
+
+        # Execute query with readonly setting for safety
+        res = client.query(query, settings={"readonly": 1})
+        column_names = res.column_names
+        rows = []
+
+        for row in res.result_rows:
+            row_dict = {}
+            for i, col_name in enumerate(column_names):
+                row_dict[col_name] = row[i]
+            rows.append(row_dict)
+
+        logger.info(f"Sample query returned {len(rows)} rows")
+        return rows
+    except Exception as err:
+        logger.error(f"Error executing sample query: {err}")
+        return f"error running sample query: {err}"
+
+
 def create_clickhouse_client():
     client_config = config.get_client_config()
     logger.info(
@@ -139,7 +220,8 @@ def create_clickhouse_client():
         client = clickhouse_connect.get_client(**client_config)
         # Test the connection
         version = client.server_version
-        logger.info(f"Successfully connected to ClickHouse server version {version}")
+        logger.info(
+            f"Successfully connected to ClickHouse server version {version}")
         return client
     except Exception as e:
         logger.error(f"Failed to connect to ClickHouse: {str(e)}")

diff --git a/uv.lock b/uv.lock