168 changes: 167 additions & 1 deletion docs/scripts/modal_app.py
@@ -4,9 +4,16 @@
This app provides:
1. Scheduled daily crawling of cua.ai/docs, with results stored in a Modal volume
2. Database generation (LanceDB vectors + SQLite FTS) for the MCP server
3. S3 upload of generated databases for Kubernetes storage controller sync

The MCP server that queries these databases runs as a separate containerized
service (see docs/scripts/docs-mcp-server/). The S3 uploads enable the
Kubernetes deployment to sync databases from S3 to the MCP server pods.

S3 Configuration:
- Bucket: trycua-docs-mcp-data (us-west-2)
- docs_db -> s3://trycua-docs-mcp-data/docs_db/
- code_db -> s3://trycua-docs-mcp-data/code_db/

Usage:
modal deploy docs/scripts/modal_app.py
@@ -31,6 +38,13 @@
# GitHub token secret for cloning
github_secret = modal.Secret.from_name("github-secret", required_keys=["GITHUB_TOKEN"])

# AWS IAM role ARN for S3 uploads (uses Modal OIDC for authentication)
AWS_ROLE_ARN = "arn:aws:iam::296062593712:role/modal-docs-mcp-write-role"
AWS_REGION = "us-west-2"

# S3 bucket for database uploads
S3_BUCKET = "trycua-docs-mcp-data"
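# Hedged sketch of the IAM side, not shown in this diff: for the
# assume_role_with_web_identity call below to succeed, the role's trust
# policy must allow sts:AssumeRoleWithWebIdentity from Modal's OIDC provider.
# The issuer (oidc.modal.com) and the audience condition value here are
# assumptions to verify against Modal's OIDC documentation:
#
# {
#     "Version": "2012-10-17",
#     "Statement": [{
#         "Effect": "Allow",
#         "Principal": {"Federated": "arn:aws:iam::296062593712:oidc-provider/oidc.modal.com"},
#         "Action": "sts:AssumeRoleWithWebIdentity",
#         "Condition": {"StringEquals": {"oidc.modal.com:aud": "oidc.modal.com"}}
#     }]
# }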

# Define the container image with all dependencies
image = (
modal.Image.debian_slim(python_version="3.12")
@@ -45,6 +59,7 @@
"pandas>=2.0.0",
"markdown-it-py>=3.0.0",
"markitdown>=0.0.1",
"boto3>=1.26.0",
)
.run_commands("playwright install --with-deps chromium")
)
@@ -106,6 +121,134 @@ def extract_text(token_list):
return text.strip()


def upload_directory_to_s3(local_path: str, bucket: str, s3_prefix: str) -> dict:
"""Upload a directory to S3.

Args:
local_path: Path to the local directory to upload
bucket: S3 bucket name
s3_prefix: Prefix (folder path) in S3 bucket

Returns:
Dict with upload statistics
"""
import os

import boto3

# Use Modal OIDC to assume the AWS IAM role and get temporary credentials
# Modal provides the OIDC token via the MODAL_IDENTITY_TOKEN environment variable
identity_token = os.environ.get("MODAL_IDENTITY_TOKEN")
if not identity_token:
raise RuntimeError("MODAL_IDENTITY_TOKEN environment variable not set. Running outside Modal?")

# Use STS to exchange the OIDC token for temporary AWS credentials
sts_client = boto3.client("sts", region_name=AWS_REGION)
response = sts_client.assume_role_with_web_identity(
RoleArn=AWS_ROLE_ARN,
RoleSessionName="modal-docs-mcp-upload",
WebIdentityToken=identity_token,
)

creds = response["Credentials"]

s3 = boto3.client(
"s3",
aws_access_key_id=creds["AccessKeyId"],
aws_secret_access_key=creds["SecretAccessKey"],
aws_session_token=creds["SessionToken"],
region_name=AWS_REGION,
)
uploaded_files = 0
total_size = 0

for root, dirs, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
# Create S3 key by joining prefix with relative path
relative_path = os.path.relpath(local_file, local_path)
s3_key = s3_prefix.rstrip("/") + "/" + relative_path

file_size = os.path.getsize(local_file)
s3.upload_file(local_file, bucket, s3_key)
uploaded_files += 1
total_size += file_size

return {
"files_uploaded": uploaded_files,
"total_size_bytes": total_size,
"bucket": bucket,
"prefix": s3_prefix,
}
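# Minimal usage sketch: uploading a local LanceDB directory to the bucket
# defined above. The /data/docs_db path is illustrative, not a real path in
# this app:
#
#   stats = upload_directory_to_s3("/data/docs_db", S3_BUCKET, "docs_db")
#   print(f"{stats['files_uploaded']} files, {stats['total_size_bytes']} bytes")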


# =============================================================================
# S3 Upload Functions
# =============================================================================


@app.function(
image=image,
volumes={VOLUME_PATH: docs_volume},
timeout=1800, # 30 minutes
)
def upload_docs_db_to_s3() -> dict:
"""Upload the documentation database to S3.

This function uploads the docs_db directory to S3 for the Kubernetes
storage controller to sync to the MCP server pods.

Returns:
Dict with upload statistics
"""
print("Uploading docs database to S3...")

db_path = Path(DB_PATH)
if not db_path.exists():
print("No docs database found to upload")
return {"error": "No docs database found"}

result = upload_directory_to_s3(
local_path=str(db_path),
bucket=S3_BUCKET,
s3_prefix="docs_db",
)

print(f"Docs DB upload complete: {result['files_uploaded']} files, {result['total_size_bytes']} bytes")
return result


@app.function(
image=image,
volumes={CODE_VOLUME_PATH: code_volume},
timeout=1800, # 30 minutes
)
def upload_code_db_to_s3() -> dict:
"""Upload the code index database to S3.

This function uploads the code_db directory to S3 for the Kubernetes
storage controller to sync to the MCP server pods.

Returns:
Dict with upload statistics
"""
print("Uploading code database to S3...")

db_path = Path(CODE_DB_PATH)
if not db_path.exists():
print("No code database found to upload")
return {"error": "No code database found"}

result = upload_directory_to_s3(
local_path=str(db_path),
bucket=S3_BUCKET,
s3_prefix="code_db",
)

print(f"Code DB upload complete: {result['files_uploaded']} files, {result['total_size_bytes']} bytes")
return result
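# Hedged usage sketch: once the app is deployed, either upload can also be
# triggered ad hoc from the Modal CLI, e.g.:
#
#   modal run docs/scripts/modal_app.py::upload_docs_db_to_s3
#   modal run docs/scripts/modal_app.py::upload_code_db_to_s3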


# =============================================================================
# Crawling Functions
# =============================================================================
@@ -336,6 +479,11 @@ async def scheduled_crawl():
await generate_vector_db.remote.aio()
await generate_sqlite_db.remote.aio()

    # Upload docs database to S3 for Kubernetes sync (use .remote.aio() for
    # consistency with the async calls above, rather than blocking .remote())
    print("Uploading docs database to S3...")
    upload_result = await upload_docs_db_to_s3.remote.aio()
    print(f"S3 upload complete: {upload_result}")

print(f"Scheduled crawl complete: {summary}")
return summary
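# Hedged sketch of the consumer side, for context: the Kubernetes storage
# controller is a separate component, not shown in this diff, but it would
# sync a prefix down from S3 roughly like this. The /data destination path is
# an assumption:
#
#   import boto3, os
#   s3 = boto3.client("s3", region_name=AWS_REGION)
#   paginator = s3.get_paginator("list_objects_v2")
#   for page in paginator.paginate(Bucket=S3_BUCKET, Prefix="docs_db/"):
#       for obj in page.get("Contents", []):
#           dest = os.path.join("/data", obj["Key"])
#           os.makedirs(os.path.dirname(dest), exist_ok=True)
#           s3.download_file(S3_BUCKET, obj["Key"], dest)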

@@ -1457,6 +1605,12 @@ async def scheduled_code_index():
print(f"Aggregation complete: {agg_result}")
result["aggregation"] = agg_result

        # Upload code database to S3 for Kubernetes sync (use .remote.aio()
        # in this async context, matching the calls above)
        print("Uploading code database to S3...")
        upload_result = await upload_code_db_to_s3.remote.aio()
        print(f"S3 upload complete: {upload_result}")
        result["s3_upload"] = upload_result

return result
except modal.exception.FunctionTimeoutError as e:
print(f"Code indexing timed out: {e}")
@@ -1497,6 +1651,7 @@ def main(
skip_code: bool = False,
parallel: bool = True,
code_only: bool = False,
upload_to_s3: bool = False,
):
"""Run initial crawl and database generation

@@ -1505,6 +1660,7 @@
skip_code: Skip code indexing
parallel: Use parallel code indexing (default: True)
code_only: Only run code indexing (shortcut for --skip-docs)
upload_to_s3: Upload databases to S3 after generation (for K8s sync)
"""
if code_only:
skip_docs = True
@@ -1522,6 +1678,11 @@
sqlite_result = generate_sqlite_db.remote()
print(f"SQLite DB: {sqlite_result}")

if upload_to_s3:
print("Uploading docs database to S3...")
upload_result = upload_docs_db_to_s3.remote()
print(f"S3 upload: {upload_result}")

if not skip_code:
if parallel:
print("Generating code index (parallel)...")
@@ -1536,4 +1697,9 @@
agg_result = aggregate_code_databases.remote()
print(f"Aggregation: {agg_result}")

if upload_to_s3:
print("Uploading code database to S3...")
upload_result = upload_code_db_to_s3.remote()
print(f"S3 upload: {upload_result}")

print("Done! Deploy with: modal deploy docs/scripts/modal_app.py")