ambient-code · Gkrumbach07 · Dec 17, 2025 · Dec 15, 2025 · Dec 15, 2025 · Dec 16, 2025
diff --git a/.github/workflows/runner-tests.yml b/.github/workflows/runner-tests.yml
@@ -38,11 +38,11 @@ jobs:
         run: |
           # Only run standalone unit tests that don't require runner_shell runtime
           # (test_model_mapping.py and test_wrapper_vertex.py require full runtime environment)
-          pytest tests/test_observability.py tests/test_security_utils.py -v --tb=short --color=yes
+          pytest tests/test_observability.py tests/test_security_utils.py tests/test_privacy_masking.py -v --tb=short --color=yes
 
       - name: Run tests with coverage
         run: |
-          pytest tests/test_observability.py tests/test_security_utils.py --cov=observability --cov=security_utils --cov-report=term-missing --cov-report=xml
+          pytest tests/test_observability.py tests/test_security_utils.py tests/test_privacy_masking.py --cov=observability --cov=security_utils --cov-report=term-missing --cov-report=xml
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v4

diff --git a/CLAUDE.md b/CLAUDE.md
@@ -335,6 +335,86 @@ The Claude Code runner (`components/runners/claude-code-runner/`) provides:
 - **API version**: `v1alpha1` (current)
 - **RBAC**: Namespace-scoped service accounts with minimal permissions
 
+### Langfuse Observability (LLM Tracing)
+
+The platform includes optional Langfuse integration for LLM observability, tracking usage metrics while protecting user privacy.
+
+#### Privacy-First Design
+
+- **Default behavior**: User messages and assistant responses are **REDACTED** in traces
+- **Preserved data**: Usage metrics (tokens, costs), metadata (model, turn count, timestamps)
+- **Rationale**: Track costs and usage patterns without exposing potentially sensitive user data
+
+#### Configuration
+
+**Enable Langfuse** (disabled by default):
+```bash
+# In ambient-admin-langfuse-secret
+LANGFUSE_ENABLED=true
+LANGFUSE_PUBLIC_KEY=<your-key>
+LANGFUSE_SECRET_KEY=<your-secret>
+LANGFUSE_HOST=http://langfuse-web.langfuse.svc.cluster.local:3000
+```
+
+**Privacy Controls** (optional - masking enabled by default):
+```bash
+# Masking is ENABLED BY DEFAULT (no environment variable needed)
+# The runner defaults to LANGFUSE_MASK_MESSAGES=true if not set
+
+# To explicitly set (optional):
+LANGFUSE_MASK_MESSAGES=true
+
+# To disable masking (dev/testing ONLY - exposes full message content; "0" and "no" are also accepted):
+LANGFUSE_MASK_MESSAGES=false
+```
+
+#### Deployment
+
+Deploy Langfuse to your cluster:
+```bash
+# Deploy with default privacy-preserving settings
+./e2e/scripts/deploy-langfuse.sh
+
+# For OpenShift
+./e2e/scripts/deploy-langfuse.sh --openshift
+
+# For Kubernetes
+./e2e/scripts/deploy-langfuse.sh --kubernetes
+```
+
+#### Implementation
+
+- **Location**: `components/runners/claude-code-runner/observability.py`
+- **Masking function**: `_privacy_masking_function()` - redacts content while preserving metrics
+- **Test coverage**: `tests/test_privacy_masking.py` - validates masking behavior
+
+#### What Gets Logged
+
+**With Masking Enabled (Default)**:
+- ✅ Token counts (input, output, cache read, cache creation)
+- ✅ Cost calculations (USD per session)
+- ✅ Model names and versions
+- ✅ Turn counts and session durations
+- ✅ Tool usage (names, execution status)
+- ✅ Error states and completion status
+- ❌ User prompts (redacted when longer than 50 characters)
+- ❌ Assistant responses (redacted when longer than 50 characters)
+- ❌ Tool outputs with long content (redacted when longer than 50 characters)
+
+**With Masking Disabled** (dev/testing only):
+- ✅ All of the above
+- ⚠️ Full user message content (potentially sensitive!)
+- ⚠️ Full assistant response content
+- ⚠️ Complete tool outputs
+
+#### OpenTelemetry Support
+
+Langfuse supports OpenTelemetry as of 2025:
+- **Current implementation**: Langfuse Python SDK (v3, OTel-based)
+- **Alternative**: Pure OpenTelemetry SDK → Langfuse OTLP endpoint (`/api/public/otel`)
+- **Migration**: Not recommended unless vendor neutrality is required
+- **Benefit**: Current SDK already uses OTel underneath
+
 ## Backend and Operator Development Standards
 
 **IMPORTANT**: When working on backend (`components/backend/`) or operator (`components/operator/`) code, you MUST follow these strict guidelines based on established patterns in the codebase.

diff --git a/components/manifests/base/ambient-admin-langfuse-secret.yaml.example b/components/manifests/base/ambient-admin-langfuse-secret.yaml.example
@@ -16,6 +16,7 @@
 #     --from-literal=LANGFUSE_SECRET_KEY=sk-lf-YOUR-SECRET-KEY-HERE \
 #     --from-literal=LANGFUSE_HOST=http://langfuse-web.langfuse.svc.cluster.local:3000 \
 #     --from-literal=LANGFUSE_ENABLED=true \
+#     --from-literal=LANGFUSE_MASK_MESSAGES=true \
 #     -n ambient-code
 #
 # Option 2: Using this YAML file (less secure - keys visible in manifest):
@@ -43,3 +44,19 @@ stringData:
 
   # Enable Langfuse observability for all sessions
   LANGFUSE_ENABLED: "true"
+
+  # Privacy Controls: Mask user messages and assistant responses in traces
+  # Default: "true" (privacy-first - redacts message content, preserves usage metrics)
+  # Set to "false" only for dev/testing environments where full message logging is needed
+  #
+  # What gets logged with LANGFUSE_MASK_MESSAGES=true (recommended for production):
+  #   ✅ Token counts (input, output, cache read/creation)
+  #   ✅ Cost calculations (USD per session)
+  #   ✅ Model names, turn counts, session metadata
+  #   ✅ Tool names and execution status
+  #   ❌ User prompts (over 50 characters) → [REDACTED FOR PRIVACY]
+  #   ❌ Assistant responses (over 50 characters) → [REDACTED FOR PRIVACY]
+  #   ❌ Long tool outputs (over 50 characters) → [REDACTED FOR PRIVACY]
+  #
+  # NOTE: This setting is optional. If omitted, defaults to "true" (masking enabled).
+  LANGFUSE_MASK_MESSAGES: "true"
diff --git a/components/runners/claude-code-runner/observability.py b/components/runners/claude-code-runner/observability.py
@@ -55,6 +55,62 @@
 )
 
 
+def _privacy_masking_function(data: Any, **kwargs) -> Any:
+    """Mask sensitive user inputs and outputs while preserving usage metrics.
+
+    This function redacts message content (user prompts and assistant responses)
+    to prevent logging potentially sensitive data, while preserving:
+    - Usage metrics (token counts, costs)
+    - Metadata (model, turn number, timestamps)
+    - Session identifiers
+
+    Installed by the caller according to LANGFUSE_MASK_MESSAGES (not read here):
+    - "true" (default): Redact message strings longer than 50 characters
+    - "false": Allow full message logging (use only in dev/testing)
+
+    Args:
+        data: Data to potentially mask (string, dict, list, or other)
+        **kwargs: Additional context (unused but required by Langfuse API)
+
+    Returns:
+        Masked data with same structure as input
+    """
+    if isinstance(data, str):
+        # Redact string content (likely message text)
+        # Short strings (<= 50 chars) might be metadata, keep them
+        if len(data) > 50:
+            return "[REDACTED FOR PRIVACY]"
+        return data
+    elif isinstance(data, dict):
+        # Recursively process dict, preserving structure
+        masked = {}
+        for key, value in data.items():
+            # Preserve usage and metadata fields - these don't contain sensitive data
+            if key in ("usage", "usage_details", "metadata", "model", "turn",
+                      "input_tokens", "output_tokens", "cache_read_input_tokens",
+                      "cache_creation_input_tokens", "total_tokens", "cost_usd",
+                      "duration_ms", "duration_api_ms", "num_turns", "session_id",
+                      "tool_id", "tool_name", "is_error", "level"):
+                masked[key] = value
+            # Redact content fields that may contain user data
+            elif key in ("content", "text", "input", "output", "prompt", "completion"):
+                if isinstance(value, str) and len(value) > 50:
+                    masked[key] = "[REDACTED FOR PRIVACY]"
+                else:
+                    # Short values might be metadata/enums, recurse
+                    masked[key] = _privacy_masking_function(value)
+            else:
+                # Recursively process other fields
+                masked[key] = _privacy_masking_function(value)
+        return masked
+    elif isinstance(data, list):
+        # Recursively process list items
+        return [_privacy_masking_function(item) for item in data]
+    else:
+        # Preserve other types (numbers, booleans, None, etc.)
+        return data
+
+
 class ObservabilityManager:
     """Manages Langfuse observability for Claude sessions.
     """
@@ -128,9 +184,25 @@ async def initialize(self, prompt: str, namespace: str, model: str = None) -> bo
             return False
 
         try:
-            # Initialize client
+            # Determine if message masking should be enabled
+            # Default: MASK messages (privacy-first approach)
+            # Set LANGFUSE_MASK_MESSAGES=false to explicitly disable masking (dev/testing only)
+            mask_messages_env = os.getenv("LANGFUSE_MASK_MESSAGES", "true").strip().lower()
+            enable_masking = mask_messages_env not in ("false", "0", "no")
+
+            if enable_masking:
+                logging.info("Langfuse: Privacy masking ENABLED - user messages and responses will be redacted")
+                mask_fn = _privacy_masking_function
+            else:
+                logging.warning("Langfuse: Privacy masking DISABLED - full message content will be logged (use only for dev/testing)")
+                mask_fn = None
+
+            # Initialize client with optional masking
             self.langfuse_client = Langfuse(
-                public_key=public_key, secret_key=secret_key, host=host
+                public_key=public_key,
+                secret_key=secret_key,
+                host=host,
+                mask=mask_fn
             )
 
             # Build metadata with model information

diff --git a/components/runners/claude-code-runner/tests/test_observability.py b/components/runners/claude-code-runner/tests/test_observability.py
@@ -4,7 +4,7 @@
 import os
 import logging
 from unittest.mock import Mock, patch
-from observability import ObservabilityManager
+from observability import ObservabilityManager, _privacy_masking_function
 
 
 @pytest.fixture
@@ -151,10 +151,12 @@ async def test_init_successful(self, mock_langfuse_class, mock_propagate, manage
         assert manager.langfuse_client is not None
         assert manager._propagate_ctx is not None
 
+        # Verify Langfuse client was initialized with privacy masking enabled (default)
         mock_langfuse_class.assert_called_once_with(
             public_key="pk-lf-public",
             secret_key="sk-lf-secret",
             host="http://localhost:3000",
+            mask=_privacy_masking_function,
         )
 
         # Verify propagate_attributes was called