2 changes: 0 additions & 2 deletions packages/phoenix-evals/src/phoenix/evals/__init__.py
@@ -14,7 +14,6 @@
create_classifier,
create_evaluator,
evaluate_dataframe,
list_evaluators,
)
from .legacy import (
CODE_FUNCTIONALITY_PROMPT_BASE_TEMPLATE,
@@ -172,7 +171,6 @@
"ToolSchema",
"SourceType",
"create_classifier",
"list_evaluators",
"create_evaluator",
"async_evaluate_dataframe",
"evaluate_dataframe",
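Together with the registry removal in evaluators.py below, dropping these two lines takes `list_evaluators` out of the public surface of `phoenix.evals`. A minimal, hypothetical check of what downstream code should expect after upgrading (not part of the diff):

    # Hypothetical upgrade check: list_evaluators (and the registry behind it)
    # is removed, so importing it from the package now fails.
    try:
        from phoenix.evals import list_evaluators
    except ImportError:
        print("list_evaluators was removed along with the evaluator registry")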
74 changes: 34 additions & 40 deletions packages/phoenix-evals/src/phoenix/evals/evaluators.py
@@ -37,7 +37,7 @@
from .llm import LLM
from .llm.types import ObjectGenerationMethod
from .templating import Template
from .utils import default_tqdm_progress_bar_formatter, remap_eval_input
from .utils import _deprecate_positional_args, default_tqdm_progress_bar_formatter, remap_eval_input

# --- Type Aliases ---
EvalInput = Dict[str, Any]
@@ -172,6 +172,7 @@ class Evaluator(ABC):

def __init__(
self,
*,
name: str,
source: SourceType,
direction: DirectionType = "maximize",
@@ -231,9 +232,9 @@ def evaluate(
input_mapping = input_mapping or self._input_mapping
required_fields = self._get_required_fields(input_mapping)
remapped_eval_input = remap_eval_input(
eval_input,
required_fields,
input_mapping,
eval_input=eval_input,
required_fields=required_fields,
input_mapping=input_mapping,
)
if self.input_schema is not None:
try:
@@ -255,9 +256,9 @@ async def async_evaluate(
input_mapping = input_mapping or self._input_mapping
required_fields = self._get_required_fields(input_mapping)
remapped_eval_input = remap_eval_input(
eval_input,
required_fields,
input_mapping,
eval_input=eval_input,
required_fields=required_fields,
input_mapping=input_mapping,
)
if self.input_schema is not None:
try:
@@ -340,6 +341,7 @@ class LLMEvaluator(Evaluator):

def __init__(
self,
*,
name: str,
llm: LLM,
prompt_template: Union[str, Template],
@@ -385,12 +387,12 @@ async def _async_evaluate(self, eval_input: EvalInput) -> List[Score]:
def evaluate(
self, eval_input: EvalInput, input_mapping: Optional[InputMappingType] = None
) -> List[Score]:
return super().evaluate(eval_input, input_mapping)
return super().evaluate(eval_input=eval_input, input_mapping=input_mapping)

async def async_evaluate(
self, eval_input: EvalInput, input_mapping: Optional[InputMappingType] = None
) -> List[Score]:
return await super().async_evaluate(eval_input, input_mapping)
return await super().async_evaluate(eval_input=eval_input, input_mapping=input_mapping)


# --- LLM ClassificationEvaluator ---
@@ -491,6 +493,7 @@ class ClassificationEvaluator(LLMEvaluator):

def __init__(
self,
*,
name: str,
llm: LLM,
prompt_template: Union[str, Template],
@@ -615,17 +618,6 @@ async def _async_evaluate(self, eval_input: EvalInput) -> List[Score]:
]


# --- Registry & simple evaluator decorator ---
_registry: Dict[str, Callable[..., List[Score]]] = {}


def list_evaluators() -> List[str]:
"""
Return a list of names of all registered evaluators.
"""
return list(_registry.keys())


def create_evaluator(
name: str, source: SourceType = "heuristic", direction: DirectionType = "maximize"
) -> Callable[[Callable[..., Any]], Evaluator]:
@@ -705,7 +697,7 @@ def word_count(text: str) -> int:
"text": ["Hello world", "This is a longer sentence", "Short"]
})

results_df = evaluate_dataframe(df, [word_count])
results_df = evaluate_dataframe(dataframe=df, evaluators=[word_count])
print(results_df["word_count_score"]) # JSON scores for each row

Notes:
@@ -855,7 +847,6 @@ async def __call__(self, *args: Any, **kwargs: Any) -> Any:

_AsyncFunctionEvaluator.__doc__ = original_docstring
evaluator_instance = _AsyncFunctionEvaluator()
_registry[name] = evaluator_instance.evaluate
return evaluator_instance
else:

@@ -900,13 +891,13 @@ def __call__(self, *args: Any, **kwargs: Any) -> Any:

_FunctionEvaluator.__doc__ = original_docstring
evaluator_instance = _FunctionEvaluator() # pyright: ignore
_registry[name] = evaluator_instance.evaluate
return evaluator_instance

return deco


# --- Factory functions ---
@_deprecate_positional_args("create_classifier")
def create_classifier(
name: str,
prompt_template: str,
@@ -995,6 +986,7 @@ def create_classifier(


# --- Bound Evaluator ---
@_deprecate_positional_args("bind_evaluator")
def bind_evaluator(
evaluator: Evaluator,
input_mapping: InputMappingType,
@@ -1026,7 +1018,7 @@ def text_length(content: str) -> int:

# Map 'message' field to 'content' parameter
mapping = {"content": "message"}
bound_evaluator = bind_evaluator(text_length, mapping)
bound_evaluator = bind_evaluator(evaluator=text_length, input_mapping=mapping)

# Now we can use 'message' instead of 'content'
result = bound_evaluator.evaluate({"message": "Hello world"})
@@ -1045,7 +1037,7 @@ def precision(retrieved_docs: list, relevant_docs: list) -> float:
"retrieved_docs": "retrieved_documents",
"relevant_docs": lambda x: [x["expected_document"]]
}
bound_evaluator = bind_evaluator(precision, mapping)
bound_evaluator = bind_evaluator(evaluator=precision, input_mapping=mapping)

data = {
"retrieved_documents": [1, 2, 3],
Expand All @@ -1072,7 +1064,7 @@ def response_quality(question: str, answer: str, context: str) -> dict:
"answer": "response.text",
"context": lambda x: " ".join(x["documents"])
}
bound_evaluator = bind_evaluator(response_quality, mapping)
bound_evaluator = bind_evaluator(evaluator=response_quality, input_mapping=mapping)

data = {
"query": "What is the capital?",
@@ -1181,6 +1173,7 @@ def _process_results_and_add_to_dataframe(
result_df[score_col] = score_list


@_deprecate_positional_args("evaluate_dataframe")
def evaluate_dataframe(
dataframe: pd.DataFrame,
evaluators: List[Evaluator],
@@ -1238,7 +1231,7 @@ def has_question(text: str) -> bool:
})

evaluators = [word_count, has_question]
results_df = evaluate_dataframe(df, evaluators, hide_tqdm_bar=True)
results_df = evaluate_dataframe(dataframe=df, evaluators=evaluators, hide_tqdm_bar=True)

# Results include original columns plus score columns
print(results_df.columns)
@@ -1255,21 +1248,21 @@ def response_length(response: str) -> int:

# Data has 'answer' column but evaluator expects 'response'
mapping = {"response": "answer"}
bound_evaluator = bind_evaluator(response_length, mapping)
bound_evaluator = bind_evaluator(evaluator=response_length, input_mapping=mapping)

df = pd.DataFrame({
"question": ["What is AI?", "How does ML work?"],
"answer": ["AI is artificial intelligence",
"ML uses algorithms to learn patterns"]
})

results_df = evaluate_dataframe(df, [bound_evaluator])
results_df = evaluate_dataframe(dataframe=df, evaluators=[bound_evaluator])

With progress bar and error handling::

results_df = evaluate_dataframe(
df,
evaluators,
dataframe=df,
evaluators=evaluators,
tqdm_bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]",
exit_on_error=False, # Continue on errors
max_retries=3
@@ -1291,6 +1284,7 @@ def response_length(response: str) -> int:
- Failed evaluations: If an evaluation fails, the failure details will be recorded
in the execution_details column and the score will be None.
"""

# Prepare common data structures
result_df, eval_inputs, task_inputs = _prepare_dataframe_evaluation(dataframe, evaluators)

Expand All @@ -1299,7 +1293,7 @@ def _task(task_input: Tuple[int, int]) -> List[Score]:
eval_input_index, evaluator_index = task_input
eval_input = eval_inputs[eval_input_index]
evaluator = evaluators[evaluator_index]
scores = evaluator.evaluate(eval_input)
scores = evaluator.evaluate(eval_input=eval_input)
return scores

# Only pass parameters that were explicitly provided, otherwise use SyncExecutor defaults
@@ -1329,6 +1323,7 @@ def _task(task_input: Tuple[int, int]) -> List[Score]:
return result_df


@_deprecate_positional_args("async_evaluate_dataframe")
async def async_evaluate_dataframe(
dataframe: pd.DataFrame,
evaluators: List[Evaluator],
@@ -1390,8 +1385,8 @@ def text_analysis(text: str) -> dict:

async def main():
results_df = await async_evaluate_dataframe(
df,
[text_analysis],
dataframe=df,
evaluators=[text_analysis],
concurrency=5,  # Process up to 5 rows concurrently
hide_tqdm_bar=True,
)
@@ -1423,8 +1418,8 @@ async def main():

async def evaluate_sentiment():
results_df = await async_evaluate_dataframe(
df,
[sentiment_evaluator],
dataframe=df,
evaluators=[sentiment_evaluator],
concurrency=2, # Limit concurrent LLM calls
tqdm_bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt}"
)
@@ -1436,8 +1431,8 @@ async def evaluate_sentiment():

async def robust_evaluation():
results_df = await async_evaluate_dataframe(
df,
evaluators,
dataframe=df,
evaluators=evaluators,
concurrency=3,
exit_on_error=False, # Continue despite errors
max_retries=5, # Retry failed evaluations
@@ -1474,7 +1469,7 @@ async def _task(task_input: Tuple[int, int]) -> List[Score]:
eval_input_index, evaluator_index = task_input
eval_input = eval_inputs[eval_input_index]
evaluator = evaluators[evaluator_index]
scores = await evaluator.async_evaluate(eval_input)
scores = await evaluator.async_evaluate(eval_input=eval_input)
return scores

# Only pass parameters that were explicitly provided, otherwise use Executor defaults
@@ -1521,7 +1516,6 @@ async def _task(task_input: Tuple[int, int]) -> List[Score]:
"Evaluator",
"LLMEvaluator",
"ClassificationEvaluator",
"list_evaluators",
"create_evaluator",
"create_classifier",
"bind_evaluator",
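Net effect of the evaluators.py changes: evaluator constructors become keyword-only (the added `*`), internal calls pass keyword arguments throughout, and the public helpers (`create_classifier`, `bind_evaluator`, `evaluate_dataframe`, `async_evaluate_dataframe`) gain a `_deprecate_positional_args` wrapper. A rough sketch of the calling convention this moves callers toward, using only names from this diff (the sample data is illustrative):

    import pandas as pd

    from phoenix.evals import create_evaluator, evaluate_dataframe

    # Heuristic evaluator defined through the decorator factory in this module.
    @create_evaluator(name="word_count", source="heuristic", direction="maximize")
    def word_count(text: str) -> int:
        return len(text.split())

    df = pd.DataFrame({"text": ["Hello world", "This is a longer sentence", "Short"]})

    # Keyword arguments: the supported calling style after this change.
    results_df = evaluate_dataframe(dataframe=df, evaluators=[word_count])

    # Positional arguments still work for now, but emit a DeprecationWarning
    # via _deprecate_positional_args and are slated for removal.
    legacy_df = evaluate_dataframe(df, [word_count])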
32 changes: 32 additions & 0 deletions packages/phoenix-evals/src/phoenix/evals/utils.py
@@ -1,5 +1,7 @@
import functools
import inspect
import json
import warnings
from typing import Any, Callable, Dict, List, Mapping, Optional, Set, Union

import pandas as pd
@@ -27,6 +29,34 @@
InputMappingType = Optional[Mapping[str, Union[str, Callable[[Mapping[str, Any]], Any]]]]


def _deprecate_positional_args(
func_name: str,
) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
"""
Decorator to issue deprecation warnings for positional argument usage.

Args:
func_name: Name of the function being decorated (for warning message)
"""

def decorator(func: Callable[..., Any]) -> Callable[..., Any]:
@functools.wraps(func)
def wrapper(*args: Any, **kwargs: Any) -> Any:
# Issue deprecation warning if called with ANY positional arguments
if len(args) > 0:
warnings.warn(
f"Positional arguments for {func_name} are deprecated and will be removed "
f"in a future version. Please use keyword arguments instead.",
DeprecationWarning,
stacklevel=2,
)
return func(*args, **kwargs)

return wrapper

return decorator


# --- Input Map/Transform Helpers ---
def _bind_mapping_function(
mapping_function: Callable[..., Any],
@@ -68,6 +98,7 @@ def _bind_mapping_function(
return mapping_function(**bound.arguments)


@_deprecate_positional_args("remap_eval_input")
def remap_eval_input(
eval_input: Mapping[str, Any],
required_fields: Set[str],
@@ -297,6 +328,7 @@ def _safe_json_load(x: Any) -> Any:
return eval_df


@_deprecate_positional_args("to_annotation_dataframe")
def to_annotation_dataframe(
dataframe: pd.DataFrame,
score_names: Optional[List[str]] = None,
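The new `_deprecate_positional_args` helper in utils.py is a plain wrapper-based decorator, so its behavior can be checked in isolation. A minimal sketch, assuming the module is importable as `phoenix.evals.utils` (the `greet` function is hypothetical):

    import warnings

    from phoenix.evals.utils import _deprecate_positional_args

    @_deprecate_positional_args("greet")
    def greet(name: str, punctuation: str = "!") -> str:
        return f"Hello, {name}{punctuation}"

    # Keyword-only calls pass through silently.
    assert greet(name="Ada") == "Hello, Ada!"

    # Any positional argument triggers a DeprecationWarning; stacklevel=2 points
    # the warning at the caller rather than at the wrapper itself.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        greet("Ada")
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)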