67 changes: 66 additions & 1 deletion src/lightspeed_evaluation/core/output/statistics.py
@@ -3,9 +3,37 @@
import statistics
from typing import Any

import numpy as np
import pandas as pd

from lightspeed_evaluation.core.models import EvaluationResult


def bootstrap_intervals(
s: pd.Series, confidence: int = 95, bootstrap_steps: int = 100000
) -> tuple[np.floating, np.floating, np.floating]:
"""Compute confidence interval using bootstraping, return low, mean, high."""
if not 0 <= confidence <= 100:
raise ValueError("Invalid confidence, must be between 0 and 100")

sample_n = len(s)
sample_mean = np.mean(s)

confidence_rev = 100 - confidence

rates = np.array(
[np.mean(s.sample(n=sample_n, replace=True)) for _ in range(bootstrap_steps)]
)

# Use the median of the bootstrap distribution as the central estimate
mean_boot_strap = np.median(rates)
low = np.percentile(rates - sample_mean, (confidence_rev / 2.0))
high = np.percentile(rates - sample_mean, 100 - (confidence_rev / 2.0))

# Basic (pivot) bootstrap: subtracting the upper-tail offset gives the lower bound and vice versa
return sample_mean - high, mean_boot_strap, sample_mean - low


def calculate_basic_stats(results: list[EvaluationResult]) -> dict[str, Any]:
"""Calculate basic pass/fail/error statistics from results."""
if not results:
@@ -106,14 +134,35 @@ def _finalize_metric_stats(stats: dict[str, Any]) -> None:
# Calculate statistical measures for scores
if stats["scores"]:
scores = stats["scores"]
stats["score_statistics"] = {
scores_series = pd.Series(scores)

# Calculate basic statistics
score_stats = {
"mean": statistics.mean(scores),
"median": statistics.median(scores),
"std": statistics.stdev(scores) if len(scores) > 1 else 0.0,
"min": min(scores),
"max": max(scores),
"count": len(scores),
}

# Calculate confidence intervals using bootstrap
if len(scores) > 1: # Need at least 2 samples for meaningful bootstrap
try:
ci_low, ci_mean, ci_high = bootstrap_intervals(scores_series)
score_stats["confidence_interval"] = {
"low": float(ci_low),
"mean": float(ci_mean),
"high": float(ci_high),
"confidence_level": 95, # Default confidence level
}
except (ValueError, RuntimeError):
# If bootstrap fails, set confidence interval to None
score_stats["confidence_interval"] = None
else:
score_stats["confidence_interval"] = None

stats["score_statistics"] = score_stats
else:
stats["score_statistics"] = {
"mean": 0.0,
@@ -122,6 +171,7 @@ def _finalize_metric_stats(stats: dict[str, Any]) -> None:
"min": 0.0,
"max": 0.0,
"count": 0,
"confidence_interval": None,
}


@@ -132,7 +182,22 @@ def _finalize_conversation_stats(stats: dict[str, Any]) -> None:
stats["pass_rate"] = stats["pass"] / total * 100
stats["fail_rate"] = stats["fail"] / total * 100
stats["error_rate"] = stats["error"] / total * 100

# Calculate confidence intervals for conversation rates
if total > 1: # Need at least 2 samples for meaningful bootstrap
try:
# Conversation-level CIs would need a binary series per outcome type,
# built from the original results for this conversation. Those results
# are not available here, so we skip the CIs for now. This could be
# enhanced by passing the original results into this function.
stats["confidence_intervals"] = None
except (ValueError, RuntimeError):
stats["confidence_intervals"] = None
else:
stats["confidence_intervals"] = None
else:
stats["pass_rate"] = 0.0
stats["fail_rate"] = 0.0
stats["error_rate"] = 0.0
stats["confidence_intervals"] = None
66 changes: 66 additions & 0 deletions tests/unit/core/output/test_statistics.py
@@ -1,9 +1,11 @@
"""Unit tests for core.output.statistics module."""

import pytest
import pandas as pd

from lightspeed_evaluation.core.models import EvaluationResult, EvaluationScope
from lightspeed_evaluation.core.output.statistics import (
bootstrap_intervals,
calculate_basic_stats,
calculate_detailed_stats,
)
@@ -35,6 +37,70 @@ def test_evaluation_scope_conversation_level(self):
assert scope.is_conversation is True


class TestBootstrapIntervals:
"""Unit tests for bootstrap_intervals function."""

def test_bootstrap_intervals_valid_confidence(self):
"""Test bootstrap_intervals with valid confidence levels."""
data = pd.Series([0.8, 0.9, 0.7, 0.85, 0.75])

# Test default 95% confidence
low, mean, high = bootstrap_intervals(data)
assert low <= mean <= high
assert isinstance(low, float)
assert isinstance(mean, float)
assert isinstance(high, float)

# Test 90% confidence (should be narrower)
low_90, mean_90, high_90 = bootstrap_intervals(data, confidence=90)
assert low_90 <= mean_90 <= high_90
ci_95_width = high - low
ci_90_width = high_90 - low_90
assert ci_90_width < ci_95_width

def test_bootstrap_intervals_invalid_confidence(self):
"""Test bootstrap_intervals with invalid confidence levels."""
data = pd.Series([0.8, 0.9, 0.7])

# Test negative confidence
with pytest.raises(
ValueError, match="Invalid confidence, must be between 0 and 100"
):
bootstrap_intervals(data, confidence=-5)

# Test confidence > 100
with pytest.raises(
ValueError, match="Invalid confidence, must be between 0 and 100"
):
bootstrap_intervals(data, confidence=150)

def test_bootstrap_intervals_edge_cases(self):
"""Test bootstrap_intervals with edge cases."""
# Test with single value
single_value = pd.Series([0.5])
low, mean, high = bootstrap_intervals(single_value)
assert low == mean == high == 0.5

# Test with all same values
same_values = pd.Series([0.8, 0.8, 0.8, 0.8, 0.8])
low, mean, high = bootstrap_intervals(same_values)
assert abs(low - 0.8) < 0.001
assert abs(mean - 0.8) < 0.001
assert abs(high - 0.8) < 0.001

def test_bootstrap_intervals_confidence_levels(self):
"""Test bootstrap_intervals with different confidence levels."""
data = pd.Series([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

# Test 0% confidence (the interval collapses to a single point)
low_0, mean_0, high_0 = bootstrap_intervals(data, confidence=0)
assert low_0 == pytest.approx(high_0)
assert isinstance(mean_0, float)

# Test 100% confidence (should be very wide)
low_100, mean_100, high_100 = bootstrap_intervals(data, confidence=100)
assert low_100 <= mean_100 <= high_100


class TestCalculateBasicStats:
"""Unit tests for calculate_basic_stats function."""

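
The inline note in _finalize_conversation_stats defers conversation-level confidence intervals because the original per-result outcomes are not available in that function. A hedged sketch of the suggested enhancement, assuming the caller can pass the outcome labels for one conversation (the helper name conversation_rate_intervals and the label strings are illustrative, not part of this PR):

    from typing import Optional

    import pandas as pd

    from lightspeed_evaluation.core.output.statistics import bootstrap_intervals


    def conversation_rate_intervals(
        outcomes: list[str],
    ) -> Optional[dict[str, dict[str, float]]]:
        """Bootstrap CIs for pass/fail/error rates of one conversation (sketch)."""
        if len(outcomes) <= 1:  # mirror the "need at least 2 samples" guard
            return None

        intervals: dict[str, dict[str, float]] = {}
        for label in ("pass", "fail", "error"):
            # Binary indicator: 1.0 where the result carries this outcome label.
            indicator = pd.Series([1.0 if o == label else 0.0 for o in outcomes])
            low, center, high = bootstrap_intervals(indicator, bootstrap_steps=10_000)
            # Scale to percentages to match pass_rate/fail_rate/error_rate.
            intervals[f"{label}_rate"] = {
                "low": float(low) * 100,
                "mean": float(center) * 100,
                "high": float(high) * 100,
            }
        return intervals

With a helper like this, _finalize_conversation_stats could populate confidence_intervals instead of hard-coding None, keeping the same total > 1 guard it already has.
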