1 change: 1 addition & 0 deletions README.md
@@ -34,6 +34,7 @@ export OPENAI_API_KEY="your-key"

# Run evaluation
lightspeed-eval --system-config config/system.yaml --eval-data config/evaluation_data.yaml
```

## 📊 Supported Metrics

4 changes: 2 additions & 2 deletions archive/lightspeed_core_evaluation/taxonomy_eval.py
@@ -187,8 +187,8 @@ def _get_score(
retrieved_contexts=[data.context],
)
return scorer.single_turn_score(
-data # pyright: ignore [reportArgumentType]
-)
+data
+) # pyright: ignore [reportArgumentType]

df = self._taxonomy_df.copy()
if self._args.eval_type in ("all", "context"):
@@ -75,8 +75,7 @@ def run_evaluation(self) -> None:
conversations = self.data_manager.get_conversations()

logger.info(
"Starting Agent Goal Evaluation\n"
"Total: %d evaluations across %d conversations",
"Starting Agent Goal Evaluation\nTotal: %d evaluations across %d conversations",
self.data_manager.get_eval_count(),
len(conversations),
)
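For context, the two adjacent string literals merged above are concatenated by Python at compile time, so the logged message is unchanged; a minimal sketch of the equivalent call, using made-up counts and an illustrative logger name:

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("agent_goal_eval")  # illustrative name, not from the diff

# "Starting Agent Goal Evaluation\n" "Total: %d ..." and the merged single literal
# produce identical messages; logging also defers %-interpolation until the
# record is actually emitted.
logger.info(
    "Starting Agent Goal Evaluation\nTotal: %d evaluations across %d conversations",
    12,  # placeholder evaluation count
    3,   # placeholder conversation count
)
```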
3 changes: 1 addition & 2 deletions lsc_agent_eval/src/lsc_agent_eval/core/utils/judge.py
@@ -38,8 +38,7 @@ def _setup_litellm(self) -> None:
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
raise JudgeModelError(
"OPENAI_API_KEY environment variable is required "
"for OpenAI provider"
"OPENAI_API_KEY environment variable is required for OpenAI provider"
)
self.model_name = self.judge_model

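A standalone illustration of the guard shown above: the error message and exception name come from the hunk, but the real check lives in the judge class's `_setup_litellm` method, and `JudgeModelError` is defined elsewhere in the package, so a stand-in class and a hypothetical helper name are used here to keep the sketch runnable.

```python
import os


class JudgeModelError(Exception):
    """Stand-in for the package's JudgeModelError, declared only for this sketch."""


def require_openai_key() -> str:
    """Fail fast when the OpenAI provider is selected but no key is configured."""
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise JudgeModelError(
            "OPENAI_API_KEY environment variable is required for OpenAI provider"
        )
    return api_key
```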
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -48,6 +48,12 @@ generate_answers = "generate_answers.generate_answers:main"
# Note: torch[cpu] variant configuration removed for uv compatibility
# Modern PyTorch versions are available on PyPI directly

+[tool.black]
+line-length = 88
+
+[tool.pydocstyle]
+convention = "google"
+
[tool.mypy]
disable_error_code = ["union-attr", "return-value", "arg-type", "import-untyped"]
ignore_missing_imports = true
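The two new tool sections pin Black to its default 88-character line length and point pydocstyle at the Google docstring convention, under which the summary sits on the same line as the opening quotes; that is the pattern applied throughout the docstring changes below. A hypothetical function, not part of this repository, showing the style those checks accept:

```python
def add(a: int, b: int) -> int:
    """Return the sum of two integers.

    Args:
        a: First operand.
        b: Second operand.

    Returns:
        The sum of a and b.
    """
    return a + b
```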
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/__init__.py
@@ -1,5 +1,4 @@
"""
LightSpeed Evaluation Framework.
"""LightSpeed Evaluation Framework.

Main components:
- EvaluationDriver: Runs complete evaluation
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/core/llm/deepeval.py
@@ -6,8 +6,7 @@


class DeepEvalLLMManager:
"""
DeepEval LLM Manager - Takes LLM parameters directly.
"""DeepEval LLM Manager - Takes LLM parameters directly.

This manager focuses solely on DeepEval-specific LLM integration.
"""
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/core/llm/manager.py
@@ -11,8 +11,7 @@ class LLMError(Exception):


class LLMManager:
"""
Generic LLM Manager for all use cases (Ragas, DeepEval, Custom metrics).
"""Generic LLM Manager for all use cases (Ragas, DeepEval, Custom metrics).

Responsibilities:
- Environment validation for multiple providers
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/core/llm/ragas.py
@@ -82,8 +82,7 @@ def is_finished(self, response: LLMResult) -> bool:


class RagasLLMManager:
"""
Ragas LLM Manager - Takes LLM parameters directly.
"""Ragas LLM Manager - Takes LLM parameters directly.

This manager focuses solely on Ragas-specific LLM integration.
"""
6 changes: 2 additions & 4 deletions src/lightspeed_evaluation/core/metrics/custom.py
@@ -30,8 +30,7 @@ class CustomMetrics:
"""Handles custom metrics using LLMManager for direct LiteLLM calls."""

def __init__(self, llm_manager: LLMManager):
"""
Initialize with LLM Manager.
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
@@ -89,8 +88,7 @@ def _call_llm(self, prompt: str, system_prompt: Optional[str] = None) -> str:
raise RuntimeError(f"LiteLLM call failed: {str(e)}") from e

def _parse_score_response(self, response: str) -> Tuple[Optional[float], str]:
r"""
Parse LLM response to extract score and reason.
r"""Parse LLM response to extract score and reason.

Expected formats:
- "Score: 0.85\nReason: The answer is accurate..."
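The docstring above names the expected response shape (`"Score: 0.85\nReason: ..."`). A minimal sketch of a parser for that shape, assuming only the format quoted in the docstring; the project's actual `_parse_score_response` body is not shown in this diff and may differ.

```python
import re
from typing import Optional, Tuple


def parse_score_response(response: str) -> Tuple[Optional[float], str]:
    """Extract a numeric score and free-text reason from an LLM reply."""
    score_match = re.search(r"Score:\s*([0-9]*\.?[0-9]+)", response)
    reason_match = re.search(r"Reason:\s*(.+)", response, re.DOTALL)
    score = float(score_match.group(1)) if score_match else None
    reason = reason_match.group(1).strip() if reason_match else response.strip()
    return score, reason


print(parse_score_response("Score: 0.85\nReason: The answer is accurate."))
# -> (0.85, 'The answer is accurate.')
```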
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/core/metrics/deepeval.py
@@ -20,8 +20,7 @@ class DeepEvalMetrics:
"""Handles DeepEval metrics evaluation using LLM Manager."""

def __init__(self, llm_manager: LLMManager):
"""
Initialize with LLM Manager.
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/core/metrics/ragas.py
@@ -24,8 +24,7 @@ class RagasMetrics:
"""Handles Ragas metrics evaluation using LLM Manager."""

def __init__(self, llm_manager: LLMManager):
"""
Initialize with LLM Manager.
"""Initialize with LLM Manager.

Args:
llm_manager: Pre-configured LLMManager with validated parameters
20 changes: 8 additions & 12 deletions src/lightspeed_evaluation/drivers/evaluation.py
@@ -1,7 +1,6 @@
"""
Evaluation Driver - Main evaluation controller.
"""Evaluation Driver - Main evaluation controller.

Controls the evaluation flow through conversations & turns
Controls the evaluation flow through conversations & turns.
"""

import time
@@ -116,14 +115,13 @@ def get_supported_frameworks(self) -> List[str]:


class EvaluationDriver:
"""
Main evaluation driver - orchestrates the evaluation process.
"""Main evaluation driver - orchestrates the evaluation process.

Responsibilities:
- Data validation
- Metric routing and evaluation
- Result collection
-- Status determination (PASS/FAIL/ERROR)
+- Status determination (PASS/FAIL/ERROR).
"""

def __init__(self, config_loader: ConfigLoader):
@@ -146,14 +144,13 @@ def validate_data(self, evaluation_data: List[EvaluationData]) -> bool:
def run_evaluation(
self, evaluation_data: List[EvaluationData]
) -> List[EvaluationResult]:
"""
Run complete evaluation pipeline.
"""Run complete evaluation pipeline.

Args:
evaluation_data: List of conversation data to evaluate

Returns:
-List of evaluation results
+List of evaluation results.
"""
print("🚀 Starting evaluation...")
self.results = []
@@ -214,11 +211,10 @@ def _evaluate_conversation(self, conv_data: EvaluationData) -> None:
def _evaluate_metric(
self, request: EvaluationRequest
) -> Optional[EvaluationResult]:
"""
Evaluate single metric using context.
"""Evaluate single metric using context.

Returns:
-EvaluationResult or None if evaluation fails
+EvaluationResult or None if evaluation fails.
"""
start_time = time.time()

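Taken together, the docstrings above outline the driver's flow: validate the data, walk conversations and turns, evaluate each metric, and collect results whose status is PASS, FAIL, or ERROR. A rough sketch of that orchestration under heavy assumptions; apart from `validate_data` and `_evaluate_metric`, the method names here are invented for illustration and do not appear in this diff.

```python
from typing import Any, List


def run_evaluation_sketch(driver: Any, evaluation_data: List[Any]) -> List[Any]:
    """Approximate the pipeline described by the EvaluationDriver docstrings."""
    results: List[Any] = []
    if not driver.validate_data(evaluation_data):          # data validation
        return results
    for conv_data in evaluation_data:                      # conversations & turns
        for request in driver.build_requests(conv_data):   # hypothetical helper
            result = driver._evaluate_metric(request)      # may return None on failure
            if result is not None:                         # status is set on the result
                results.append(result)
    return results
```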
3 changes: 1 addition & 2 deletions src/lightspeed_evaluation/runner/evaluation.py
@@ -15,8 +15,7 @@
def run_evaluation(
system_config_path: str, evaluation_data_path: str, output_dir: Optional[str] = None
) -> Optional[Dict[str, int]]:
"""
Run the complete evaluation pipeline using EvaluationDriver.
"""Run the complete evaluation pipeline using EvaluationDriver.

Args:
system_config_path: Path to system.yaml