Skip to content

Refactor ClickHouse MCP tools with improved documentation and functionality #18

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
.envrc
.ruff_cache/
.specstory/
.venv/
.github/

# Byte-compiled / optimized / DLL files
__pycache__/
Expand Down Expand Up @@ -172,3 +175,5 @@ cython_debug/

# PyPI configuration file
.pypirc
.specstory/*
.specstory/history/*
2 changes: 2 additions & 0 deletions mcp_clickhouse/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
list_databases,
list_tables,
run_select_query,
get_table_sample,
)

__all__ = [
"list_databases",
"list_tables",
"run_select_query",
"get_table_sample",
"create_clickhouse_client",
]
9 changes: 9 additions & 0 deletions mcp_clickhouse/main.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
"""
MCP ClickHouse - Model Context Protocol server for ClickHouse database integration.

This module provides the entry point for running the MCP ClickHouse server,
which enables AI models to interact with ClickHouse databases through a set of
well-defined tools.
"""

from .mcp_server import mcp


def main():
    """Entry point: start the MCP ClickHouse server.

    Delegates to the ``run()`` method of the ``mcp`` object imported from
    ``.mcp_server``.
    """
    mcp.run()


Expand Down
78 changes: 37 additions & 41 deletions mcp_clickhouse/mcp_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,49 +16,44 @@ class ClickHouseConfig:
This class handles all environment variable configuration with sensible defaults
and type conversion. It provides typed methods for accessing each configuration value.

Required environment variables:
CLICKHOUSE_HOST: The hostname of the ClickHouse server
CLICKHOUSE_USER: The username for authentication
CLICKHOUSE_PASSWORD: The password for authentication

Optional environment variables (with defaults):
CLICKHOUSE_PORT: The port number (default: 8443 if secure=True, 8123 if secure=False)
CLICKHOUSE_SECURE: Enable HTTPS (default: true)
CLICKHOUSE_VERIFY: Verify SSL certificates (default: true)
CLICKHOUSE_CONNECT_TIMEOUT: Connection timeout in seconds (default: 30)
CLICKHOUSE_SEND_RECEIVE_TIMEOUT: Send/receive timeout in seconds (default: 300)
CLICKHOUSE_DATABASE: Default database to use (default: None)
Default values (if environment variables are not set):
CLICKHOUSE_HOST: "localhost"
CLICKHOUSE_USER: ""
CLICKHOUSE_PASSWORD: ""
CLICKHOUSE_PORT: 8123
CLICKHOUSE_SECURE: false
CLICKHOUSE_VERIFY: false
CLICKHOUSE_CONNECT_TIMEOUT: 5
CLICKHOUSE_SEND_RECEIVE_TIMEOUT: 300
CLICKHOUSE_DATABASE: None
"""

def __init__(self):
"""Initialize the configuration from environment variables."""
self._validate_required_vars()
self._set_default_vars()

@property
def host(self) -> str:
"""Get the ClickHouse host."""
return os.environ["CLICKHOUSE_HOST"]
return os.environ.get("CLICKHOUSE_HOST", "localhost")

@property
def port(self) -> int:
"""Get the ClickHouse port.

Defaults to 8443 if secure=True, 8123 if secure=False.
Can be overridden by CLICKHOUSE_PORT environment variable.
Defaults to 8123 if not specified.
"""
if "CLICKHOUSE_PORT" in os.environ:
return int(os.environ["CLICKHOUSE_PORT"])
return 8443 if self.secure else 8123
return int(os.environ.get("CLICKHOUSE_PORT", "8123"))

@property
def username(self) -> str:
"""Get the ClickHouse username."""
return os.environ["CLICKHOUSE_USER"]
return os.environ.get("CLICKHOUSE_USER", "")

@property
def password(self) -> str:
"""Get the ClickHouse password."""
return os.environ["CLICKHOUSE_PASSWORD"]
return os.environ.get("CLICKHOUSE_PASSWORD", "")

@property
def database(self) -> Optional[str]:
Expand All @@ -69,25 +64,25 @@ def database(self) -> Optional[str]:
def secure(self) -> bool:
"""Get whether HTTPS is enabled.

Default: True
Default: False
"""
return os.getenv("CLICKHOUSE_SECURE", "true").lower() == "true"
return os.getenv("CLICKHOUSE_SECURE", "false").lower() == "true"

@property
def verify(self) -> bool:
"""Get whether SSL certificate verification is enabled.

Default: True
Default: False
"""
return os.getenv("CLICKHOUSE_VERIFY", "true").lower() == "true"
return os.getenv("CLICKHOUSE_VERIFY", "false").lower() == "true"

@property
def connect_timeout(self) -> int:
"""Get the connection timeout in seconds.

Default: 30
Default: 5
"""
return int(os.getenv("CLICKHOUSE_CONNECT_TIMEOUT", "30"))
return int(os.getenv("CLICKHOUSE_CONNECT_TIMEOUT", "5"))

@property
def send_receive_timeout(self) -> int:
Expand Down Expand Up @@ -120,22 +115,23 @@ def get_client_config(self) -> dict:

return config

def _validate_required_vars(self) -> None:
"""Validate that all required environment variables are set.
def _set_default_vars(self) -> None:
"""Set default values for environment variables if they are not already set."""
defaults = {
"CLICKHOUSE_HOST": "localhost",
"CLICKHOUSE_USER": "",
"CLICKHOUSE_PASSWORD": "",
"CLICKHOUSE_PORT": "8123",
"CLICKHOUSE_SECURE": "false",
"CLICKHOUSE_VERIFY": "false",
"CLICKHOUSE_CONNECT_TIMEOUT": "5",
"CLICKHOUSE_SEND_RECEIVE_TIMEOUT": "300",
}

Raises:
ValueError: If any required environment variable is missing.
"""
missing_vars = []
for var in ["CLICKHOUSE_HOST", "CLICKHOUSE_USER", "CLICKHOUSE_PASSWORD"]:
for var, default_value in defaults.items():
if var not in os.environ:
missing_vars.append(var)

if missing_vars:
raise ValueError(
f"Missing required environment variables: {', '.join(missing_vars)}"
)
os.environ[var] = default_value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This ends up calling putenv underneath. Ideally, we wouldn't mutate the process environment variables. It also seems unnecessary, since this change makes every getenv call supply a default anyway.



# Global instance for easy access
config = ClickHouseConfig()
config = ClickHouseConfig()
94 changes: 88 additions & 6 deletions mcp_clickhouse/mcp_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,16 +28,23 @@
mcp = FastMCP(MCP_SERVER_NAME, dependencies=deps)


@mcp.tool()
@mcp.tool(
description="Lists all available databases in the ClickHouse server. Use this tool to get a complete list of databases before exploring their tables. No parameters required."
)
def list_databases():
logger.info("Listing all databases")
client = create_clickhouse_client()
result = client.command("SHOW DATABASES")
logger.info(f"Found {len(result) if isinstance(result, list) else 1} databases")
logger.info(
f"Found {len(result) if isinstance(result, list) else 1} databases")
return result


@mcp.tool()
@mcp.tool(
description="Lists tables in a ClickHouse database with detailed schema information. "
"Provides complete table structure including columns, types, and creation statements. "
"Use the 'like' parameter to filter results with SQL LIKE pattern."
)
def list_tables(database: str, like: str = None):
logger.info(f"Listing tables in database '{database}'")
client = create_clickhouse_client()
Expand All @@ -49,7 +56,8 @@ def list_tables(database: str, like: str = None):
# Get all table comments in one query
table_comments_query = f"SELECT name, comment FROM system.tables WHERE database = {format_query_value(database)}"
table_comments_result = client.query(table_comments_query)
table_comments = {row[0]: row[1] for row in table_comments_result.result_rows}
table_comments = {row[0]: row[1]
for row in table_comments_result.result_rows}

# Get all column comments in one query
column_comments_query = f"SELECT table, name, comment FROM system.columns WHERE database = {format_query_value(database)}"
Expand Down Expand Up @@ -105,7 +113,12 @@ def get_table_info(table):
return tables


@mcp.tool()
@mcp.tool(
description="Executes a SELECT query against the ClickHouse database. "
"Use for custom data retrieval with your own SQL. "
"Queries are executed in read-only mode for safety. "
"Format your query without specifying database names in SQL."
)
def run_select_query(query: str):
logger.info(f"Executing SELECT query: {query}")
client = create_clickhouse_client()
Expand All @@ -125,6 +138,74 @@ def run_select_query(query: str):
return f"error running query: {err}"


@mcp.tool(
description="Retrieves a random sample of rows from a table using ORDER BY RAND(). "
"Perfect for data exploration and quick analysis. "
"Limit parameter capped at 10 rows. "
"Use the where parameter for filtering specific data patterns."
)
def get_table_sample(database: str, table: str, columns: str = "*", limit: int = 5, where: str = None):
"""Retrieves a random sample of rows from a table with ORDER BY RAND()

Args:
database: The database containing the table
table: The table to sample data from
columns: Comma-separated list of columns to retrieve (default: "*" for all columns)
limit: Maximum number of rows to return (default: 5, max: 10)
where: Optional WHERE clause to filter the data

Returns:
List of dictionaries, each representing a random row from the table

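Notes:
A limit above 10 or below 1 is not rejected; it is clamped to 10 or 1 and a warning is logged.
Exceptions raised while building or executing the query are caught, logged, and returned as an "error running sample query: ..." string instead of being raised.
Exceptions from create_clickhouse_client() are not caught here and propagate to the caller.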
"""
# Validate limit
if limit > 10:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why cap at 10?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since this is an MCP server and the package is meant to work with AI agents, I had to set this limit to avoid overflowing the language model’s context window. I deal with huge tables, and the AI kept trying to grab a sample that was far too large. Do you think I should increase the cap?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

10 seems low, have you checked how other MCP servers handle this?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd increase this cap! I've had a bunch of times where I want to sample a couple of tables with sparse matches between them, and with the limit of 10, it can't find matches.

logger.warning(
f"Requested limit {limit} exceeds maximum of 10, using 10 instead")
limit = 10
elif limit < 1:
logger.warning(
f"Requested limit {limit} is less than 1, using 1 instead")
limit = 1

logger.info(f"Sampling {limit} random rows from {database}.{table}")
client = create_clickhouse_client()

try:
# Build the query
query = f"SELECT {columns} FROM {quote_identifier(database)}.{quote_identifier(table)}"

# Add WHERE clause if provided
if where:
query += f" WHERE {where}"

# Add random ordering and limit
query += f" ORDER BY rand() LIMIT {limit}"

logger.info(f"Executing sampling query: {query}")

# Execute query with readonly setting for safety
res = client.query(query, settings={"readonly": 1})
column_names = res.column_names
rows = []

for row in res.result_rows:
row_dict = {}
for i, col_name in enumerate(column_names):
row_dict[col_name] = row[i]
rows.append(row_dict)

logger.info(f"Sample query returned {len(rows)} rows")
return rows
except Exception as err:
logger.error(f"Error executing sample query: {err}")
return f"error running sample query: {err}"


def create_clickhouse_client():
client_config = config.get_client_config()
logger.info(
Expand All @@ -139,7 +220,8 @@ def create_clickhouse_client():
client = clickhouse_connect.get_client(**client_config)
# Test the connection
version = client.server_version
logger.info(f"Successfully connected to ClickHouse server version {version}")
logger.info(
f"Successfully connected to ClickHouse server version {version}")
return client
except Exception as e:
logger.error(f"Failed to connect to ClickHouse: {str(e)}")
Expand Down
2 changes: 2 additions & 0 deletions uv.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.