67 changes: 66 additions & 1 deletion src/lightspeed_evaluation/core/output/statistics.py
@@ -3,9 +3,37 @@
import statistics
from typing import Any

import numpy as np
import pandas as pd

from lightspeed_evaluation.core.models import EvaluationResult


def bootstrap_intervals(
s: pd.Series, confidence: int = 95, bootstrap_steps: int = 100000
) -> tuple[np.floating, np.floating, np.floating]:
"""Compute confidence interval using bootstraping, return low, mean, high."""
if not 0 <= confidence <= 100:
raise ValueError("Invalid confidence, must be between 0 and 100")

sample_n = len(s)
sample_mean = np.mean(s)

confidence_rev = 100 - confidence

rates = np.array(
[np.mean(s.sample(n=sample_n, replace=True)) for _ in range(bootstrap_steps)]
)

# Use the median of the bootstrap distribution as the central estimate
mean_boot_strap = np.median(rates)
low = np.percentile(rates - sample_mean, (confidence_rev / 2.0))
high = np.percentile(rates - sample_mean, 100 - (confidence_rev / 2.0))

# Basic (pivot) bootstrap: subtracting the upper-tail offset gives the lower bound and vice versa
return sample_mean - high, mean_boot_strap, sample_mean - low


def calculate_basic_stats(results: list[EvaluationResult]) -> dict[str, Any]:
"""Calculate basic pass/fail/error statistics from results."""
if not results:
@@ -106,14 +134,35 @@ def _finalize_metric_stats(stats: dict[str, Any]) -> None:
# Calculate statistical measures for scores
if stats["scores"]:
scores = stats["scores"]
stats["score_statistics"] = {
scores_series = pd.Series(scores)

# Calculate basic statistics
score_stats = {
"mean": statistics.mean(scores),
"median": statistics.median(scores),
"std": statistics.stdev(scores) if len(scores) > 1 else 0.0,
"min": min(scores),
"max": max(scores),
"count": len(scores),
}

# Calculate confidence intervals using bootstrap
if len(scores) > 1: # Need at least 2 samples for meaningful bootstrap
try:
ci_low, ci_mean, ci_high = bootstrap_intervals(scores_series)
score_stats["confidence_interval"] = {
"low": float(ci_low),
"mean": float(ci_mean),
"high": float(ci_high),
"confidence_level": 95, # Default confidence level
}
except (ValueError, RuntimeError):
# If bootstrap fails, set confidence interval to None
score_stats["confidence_interval"] = None
else:
score_stats["confidence_interval"] = None

stats["score_statistics"] = score_stats
else:
stats["score_statistics"] = {
"mean": 0.0,
@@ -122,6 +171,7 @@ def _finalize_metric_stats(stats: dict[str, Any]) -> None:
"min": 0.0,
"max": 0.0,
"count": 0,
"confidence_interval": None,
}


@@ -132,7 +182,22 @@ def _finalize_conversation_stats(stats: dict[str, Any]) -> None:
stats["pass_rate"] = stats["pass"] / total * 100
stats["fail_rate"] = stats["fail"] / total * 100
stats["error_rate"] = stats["error"] / total * 100

# Calculate confidence intervals for conversation rates
if total > 1: # Need at least 2 samples for meaningful bootstrap
try:
# Conversation-level CIs would need a binary series per outcome type,
# built from the original results for this conversation. Those results
# are not available here, so we skip the CIs for now. This could be
# enhanced by passing the original results into this function.
stats["confidence_intervals"] = None
except (ValueError, RuntimeError):
stats["confidence_intervals"] = None
else:
stats["confidence_intervals"] = None
else:
stats["pass_rate"] = 0.0
stats["fail_rate"] = 0.0
stats["error_rate"] = 0.0
stats["confidence_intervals"] = None
66 changes: 66 additions & 0 deletions tests/unit/core/output/test_statistics.py
@@ -1,9 +1,11 @@
"""Unit tests for core.output.statistics module."""

import pytest
import pandas as pd

from lightspeed_evaluation.core.models import EvaluationResult, EvaluationScope
from lightspeed_evaluation.core.output.statistics import (
bootstrap_intervals,
calculate_basic_stats,
calculate_detailed_stats,
)
@@ -35,6 +37,70 @@ def test_evaluation_scope_conversation_level(self):
assert scope.is_conversation is True


class TestBootstrapIntervals:
"""Unit tests for bootstrap_intervals function."""

def test_bootstrap_intervals_valid_confidence(self):
"""Test bootstrap_intervals with valid confidence levels."""
data = pd.Series([0.8, 0.9, 0.7, 0.85, 0.75])

# Test default 95% confidence
low, mean, high = bootstrap_intervals(data)
assert low <= mean <= high
assert isinstance(low, float)
assert isinstance(mean, float)
assert isinstance(high, float)

# Test 90% confidence (should be narrower)
low_90, mean_90, high_90 = bootstrap_intervals(data, confidence=90)
assert low_90 <= mean_90 <= high_90
ci_95_width = high - low
ci_90_width = high_90 - low_90
assert ci_90_width < ci_95_width

def test_bootstrap_intervals_invalid_confidence(self):
"""Test bootstrap_intervals with invalid confidence levels."""
data = pd.Series([0.8, 0.9, 0.7])

# Test negative confidence
with pytest.raises(
ValueError, match="Invalid confidence, must be between 0 and 100"
):
bootstrap_intervals(data, confidence=-5)

# Test confidence > 100
with pytest.raises(
ValueError, match="Invalid confidence, must be between 0 and 100"
):
bootstrap_intervals(data, confidence=150)

def test_bootstrap_intervals_edge_cases(self):
"""Test bootstrap_intervals with edge cases."""
# Test with single value
single_value = pd.Series([0.5])
low, mean, high = bootstrap_intervals(single_value)
assert low == mean == high == 0.5

# Test with all same values
same_values = pd.Series([0.8, 0.8, 0.8, 0.8, 0.8])
low, mean, high = bootstrap_intervals(same_values)
assert abs(low - 0.8) < 0.001
assert abs(mean - 0.8) < 0.001
assert abs(high - 0.8) < 0.001

def test_bootstrap_intervals_confidence_levels(self):
"""Test bootstrap_intervals with different confidence levels."""
data = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

# Test 0% confidence (the interval collapses to a single point)
low_0, mean_0, high_0 = bootstrap_intervals(data, confidence=0)
assert low_0 == pytest.approx(high_0)
assert isinstance(mean_0, float)

# Test 100% confidence (should be very wide)
low_100, mean_100, high_100 = bootstrap_intervals(data, confidence=100)
assert low_100 <= mean_100 <= high_100


class TestCalculateBasicStats:
"""Unit tests for calculate_basic_stats function."""

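
The inline note in _finalize_conversation_stats defers conversation-level confidence intervals because the original per-result outcomes are not available in that function. A hedged sketch of the suggested enhancement, assuming the caller can pass the outcome labels for one conversation (the helper name conversation_rate_intervals and the label strings are illustrative, not part of this PR):

    from typing import Optional

    import pandas as pd

    from lightspeed_evaluation.core.output.statistics import bootstrap_intervals


    def conversation_rate_intervals(
        outcomes: list[str],
    ) -> Optional[dict[str, dict[str, float]]]:
        """Bootstrap CIs for pass/fail/error rates of one conversation (sketch)."""
        if len(outcomes) <= 1:  # mirror the "need at least 2 samples" guard
            return None

        intervals: dict[str, dict[str, float]] = {}
        for label in ("pass", "fail", "error"):
            # Binary indicator: 1.0 where the result carries this outcome label.
            indicator = pd.Series([1.0 if o == label else 0.0 for o in outcomes])
            low, center, high = bootstrap_intervals(indicator, bootstrap_steps=10_000)
            # Scale to percentages to match pass_rate/fail_rate/error_rate.
            intervals[f"{label}_rate"] = {
                "low": float(low) * 100,
                "mean": float(center) * 100,
                "high": float(high) * 100,
            }
        return intervals

With a helper like this, _finalize_conversation_stats could populate confidence_intervals instead of hard-coding None, keeping the same total > 1 guard it already has.
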