Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
232 changes: 232 additions & 0 deletions guides/BasicEvaluation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,232 @@
# Basic Evaluation Usage

The draive framework provides comprehensive evaluation capabilities to assess LLM outputs and conversational flows. The evaluation system consists of three main components: individual evaluators, scenarios that combine multiple evaluators, and evaluation suites for systematic testing.

## Simple Evaluators

The simplest way to evaluate content is using individual evaluators. Let's start with a basic custom evaluator that checks if text contains specific keywords:

```python
from draive.evaluation import evaluator, EvaluationScore
from draive.multimodal import Multimodal

@evaluator(name="keyword_presence", threshold=0.8)
async def keyword_evaluator(
content: Multimodal,
/,
required_keywords: list[str],
) -> EvaluationScore:
text = str(content).lower()
found_keywords = sum(1 for keyword in required_keywords if keyword.lower() in text)

if not required_keywords:
return EvaluationScore(
value=0,
comment="No keywords provided for evaluation",
)

score = found_keywords / len(required_keywords)
return EvaluationScore(
value=score,
comment=f"Found {found_keywords}/{len(required_keywords)} required keywords",
)
```

Using this evaluator is straightforward:

```python
from draive import ctx, load_env
from draive.openai import OpenAI, OpenAIChatConfig

load_env()

async with ctx.scope(
"evaluation_example",
OpenAI().lmm_invoking(),
OpenAIChatConfig(model="gpt-4o-mini"),
):
content = "AI and machine learning are transforming technology"

result = await keyword_evaluator(
content,
required_keywords=["AI", "machine learning", "technology"],
)

print(f"Score: {result.score.value}")
print(f"Passed: {result.passed}")
print(f"Comment: {result.score.comment}")
```

## Built-in Evaluators

Draive includes several pre-built evaluators for common use cases. Let's explore groundedness and readability evaluators:

```python
from draive.evaluators import groundedness_evaluator, readability_evaluator

# Evaluate if generated content is grounded in source material
reference_text = """
Climate change is causing rising sea levels globally.
Scientific data shows ocean levels have risen 8-9 inches since 1880.
"""

generated_text = """
Based on scientific evidence, global sea levels have increased
approximately 8-9 inches since 1880 due to climate change impacts.
"""

groundedness_result = await groundedness_evaluator(
generated_text,
reference=reference_text,
)

print(f"Groundedness: {groundedness_result.score.value}")
print(f"Comment: {groundedness_result.score.comment}")

# Evaluate text readability
complex_text = """
The utilization of sophisticated methodological approaches in the
implementation of artificial intelligence systems necessitates comprehensive
understanding of underlying algorithmic paradigms.
"""

readability_result = await readability_evaluator(complex_text)

print(f"Readability: {readability_result.score.value}")
print(f"Comment: {readability_result.score.comment}")
```

## Evaluation Scenarios

Scenarios combine multiple evaluators to assess content from different perspectives. Here's a scenario that evaluates content quality using both groundedness and readability:

```python
from draive.evaluation import evaluation_scenario, EvaluationScenarioResult
from draive.evaluators import conciseness_evaluator, readability_evaluator

@evaluation_scenario(name="content_quality")
async def content_quality_scenario(
content: str,
/,
*,
reference: str,
) -> EvaluationScenarioResult:
# Prepare evaluators with appropriate thresholds
conciseness = conciseness_evaluator.with_threshold("excellent")
readability = readability_evaluator.with_threshold("good")

# Evaluate using multiple criteria
return await EvaluationScenarioResult.evaluating(
content,
conciseness.prepared(reference=reference),
readability.prepared(),
)

# Use the scenario
scenario_result = await content_quality_scenario(
generated_text,
reference=reference_text,
)

print(f"Scenario passed: {scenario_result.passed}")
print(f"Overall score: {scenario_result.relative_score:.2f}")

for evaluation in scenario_result.evaluations:
print(f"- {evaluation.evaluator}: {evaluation.score.value:.2f} ({'✓' if evaluation.passed else '✗'})")
```

## Evaluation Suites

Evaluation suites allow systematic testing across multiple test cases. Let's create a suite to evaluate different content generation scenarios:

```python
from collections.abc import Sequence
from draive.evaluation import (
    EvaluationCaseResult,
    EvaluationSuiteCase,
    evaluation_suite,
)
from draive import TextGeneration, DataModel

class ContentTestCase(DataModel):
topic: str
required_keywords: Sequence[str]
reference_material: str

@evaluation_suite(ContentTestCase)
async def content_generation_suite(
parameters: ContentTestCase,
) -> EvaluationCaseResult[str]:
# Generate content based on test case parameters
content: str = await TextGeneration.generate(
instruction=f"Write informative content about {parameters.topic}",
input=parameters.reference_material,
)
return await EvaluationCaseResult.evaluating(
content,
content_quality_scenario.prepared(
reference=parameters.reference_material,
),
keyword_evaluator.with_threshold(0.5).prepared(
required_keywords=parameters.required_keywords
),
)

# Define test cases
test_cases = [
ContentTestCase(
topic="climate change",
required_keywords=["temperature", "emissions", "global"],
reference_material="Global temperatures have risen 1.1°C since pre-industrial times",
),
ContentTestCase(
topic="renewable energy",
required_keywords=["solar", "sustainable", "energy"],
reference_material="Solar and wind power are leading renewable energy sources",
),
]

# Prepare suite with in-memory test cases storage
suite = content_generation_suite.with_storage(test_cases)

# Execute suite evaluation
suite_results = await suite()

print(f"Suite passed: {suite_results.passed}")
print(f"Cases passed: {sum(1 for case in suite_results.cases if case.passed)}/{len(suite_results.cases)}")

for case_result in suite_results.cases:
print(f"\nCase {case_result.case.parameters.topic}:")
print(f" Generated: {case_result.value[:100]}...")
print(f" Passed: {case_result.passed}")
print(f" Score: {case_result.relative_score:.2f}")
```

## Advanced Usage

You can customize evaluators with execution contexts and metadata:

```python
# Create evaluator with custom execution context
custom_evaluator = keyword_evaluator.with_execution_context(
ctx.scope("custom_evaluation")
).with_meta({
"version": "1.0",
"author": "evaluation_team",
})

# Combine evaluators using logical operations
from draive.evaluation import Evaluator

best_evaluator = Evaluator.highest(
    conciseness_evaluator.prepared(reference=reference_text),
    readability_evaluator.prepared(),
)

# Map evaluator to work with different data structures
from draive.parameters import DataModel

class DocumentContent(DataModel):
title: str
body: str

document_evaluator = readability_evaluator.contra_map(
lambda doc: doc.body # Extract body text for evaluation
)
```

The evaluation system integrates seamlessly with draive's context management and provides detailed metrics logging for comprehensive analysis of your LLM applications.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ build-backend = "hatchling.build"
[project]
name = "draive"
description = "Framework designed to simplify and accelerate the development of LLM-based applications."
version = "0.66.5"
version = "0.67.0"
readme = "README.md"
maintainers = [
{ name = "Kacper Kaliński", email = "kacper.kalinski@miquido.com" },
Expand Down
2 changes: 2 additions & 0 deletions src/draive/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@
VolatileMemory,
VolatileVectorIndex,
prepare_instruction,
refine_instruction,
)
from draive.instructions import (
Instruction,
Expand Down Expand Up @@ -382,6 +383,7 @@
"not_missing",
"prepare_instruction",
"prompt",
"refine_instruction",
"resource",
"retry",
"setup_logging",
Expand Down
30 changes: 18 additions & 12 deletions src/draive/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,27 +71,33 @@ def report(
self,
include_details: bool = True,
) -> str:
status: str = "passed" if self.passed else "failed"
if include_details:
meta_values: str = (
f"\n{'\n'.join(f'{key}: {value}' for key, value in self.meta.items())}"
if self.meta
else "N/A"
)
comment: str = f"'{self.score.comment}'" if self.score.comment else "N/A"

return (
f"{self.evaluator} {'passed' if self.passed else 'failed'}"
f" with score {self.score.value},"
f" required {self.threshold},"
f" comment: {f"'{self.score.comment}'" or 'N/A'}"
f" meta:\n{meta_values}"
f"<evaluator name='{self.evaluator}' status='{status}'>"
f"\n<score>{self.score.value}</score>"
f"\n<threshold>{self.threshold}</threshold>"
f"\n<relative_score>{self.relative_score*100:.2f}%</relative_score>"
f"\n<comment>{comment}</comment>"
"\n</evaluator>"
)

else:
return (
f"{self.evaluator} {'passed' if self.passed else 'failed'}"
f" comment: {f"'{self.score.comment}'" or 'N/A'}"
f"{self.evaluator}: {status}, comment: {self.score.comment}"
if self.score.comment
else f"{self.evaluator}: {status}"
)

@property
def relative_score(self) -> float:
if self.threshold <= 0:
return 1

return min(1, self.score.value / self.threshold)

def __gt__(self, other: Self) -> bool:
assert isinstance(other, self.__class__) # nosec: B101
if self.evaluator != other.evaluator or self.threshold != other.threshold:
Expand Down
25 changes: 11 additions & 14 deletions src/draive/evaluation/scenario.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,23 +40,23 @@ def report(
include_passed: bool = True,
include_details: bool = True,
) -> str:
report: str = "\n- ".join(
evaluations_report: str = "\n".join(
result.report(include_details=include_details)
for result in self.evaluations
if include_passed or not result.passed
)

if report: # nonempty report
if evaluations_report: # nonempty report
if include_details:
meta_values: str = (
f"\n{'\n'.join(f'{key}: {value}' for key, value in self.meta.items())}"
if self.meta
else "N/A"
return (
f"<scenario name='{self.scenario}'>"
f"\n<relative_score>{self.relative_score*100:.2f}%</relative_score>"
f"\n<evaluations>\n{evaluations_report}\n</evaluations>"
"\n</scenario>"
)
return f"Scenario {self.scenario}, meta: {meta_values}\n---\n{report}"

else:
return f"Scenario {self.scenario}:\n{report}"
return f"Scenario {self.scenario}:\n{evaluations_report}"

elif not self.evaluations:
return f"Scenario {self.scenario} empty!"
Expand All @@ -69,12 +69,9 @@ def relative_score(self) -> float:
if not self.evaluations:
return 0

passed: int = 0
for evaluation in self.evaluations:
if evaluation.passed:
passed += 1

return passed / len(self.evaluations)
return len([evaluation for evaluation in self.evaluations if evaluation.passed]) / len(
self.evaluations
)


class EvaluationScenarioResult(DataModel):
Expand Down
Loading