Skip to content

Commit

Permalink
tests + error handling
Browse files Browse the repository at this point in the history
  • Loading branch information
isahers1 committed Oct 9, 2024
1 parent 2cfa36e commit 6de1edf
Show file tree
Hide file tree
Showing 3 changed files with 162 additions and 26 deletions.
3 changes: 1 addition & 2 deletions python/langsmith/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -4214,13 +4214,12 @@ def _log_evaluation_feedback(
_executor: Optional[cf.ThreadPoolExecutor] = None,
) -> List[ls_evaluator.EvaluationResult]:
results = self._select_eval_results(evaluator_response)

def _submit_feedback(**kwargs):
if _executor:
_executor.submit(self.create_feedback, **kwargs)
else:
self.create_feedback(**kwargs)

for res in results:
source_info_ = source_info or {}
if res.evaluator_info:
Expand Down
101 changes: 77 additions & 24 deletions python/langsmith/evaluation/_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@
import threading
import uuid
import inspect
import re
import ast
import textwrap
from contextvars import copy_context
from typing import (
Awaitable,
Expand Down Expand Up @@ -84,27 +85,76 @@
],
]



# Dict entries that describe the feedback itself rather than naming a metric.
_RESERVED_FEEDBACK_DICT_KEYS = frozenset({"key", "score"})


def _literal_str(node):
    """Return the value of ``node`` if it is a string literal, else ``None``."""
    # ``ast.Constant`` covers every literal on Python 3.8+; the deprecated
    # ``ast.Str`` alias is intentionally not used.
    if isinstance(node, ast.Constant) and isinstance(node.value, str):
        return node.value
    return None


def _called_name(node):
    """Return the simple name called by ``node`` (``f(...)`` or ``m.f(...)``)."""
    if not isinstance(node, ast.Call):
        return None
    func = node.func
    if isinstance(func, ast.Name):
        return func.id
    if isinstance(func, ast.Attribute):
        return func.attr
    return None


def _keys_from_dict_literal(node):
    """Return the feedback keys implied by a returned dict literal."""
    if not isinstance(node, ast.Dict):
        return []
    explicit_key = None
    other_keys = []
    for key_node, value_node in zip(node.keys, node.values):
        name = _literal_str(key_node)
        if name is None:
            # ``**spread`` entries (key is None) and non-string keys carry no
            # usable feedback name.
            continue
        if name == "key":
            literal = _literal_str(value_node)
            if literal is not None:
                explicit_key = literal
        elif name not in _RESERVED_FEEDBACK_DICT_KEYS:
            other_keys.append(name)
    # An explicit, non-empty ``"key"`` entry wins over the other dict keys.
    return [explicit_key] if explicit_key else other_keys


def _keys_from_evaluation_result(node):
    """Return ``[key]`` for an ``EvaluationResult(key="...")`` call, else ``[]``."""
    if _called_name(node) != "EvaluationResult":
        return []
    for keyword in node.keywords:
        if keyword.arg == "key":
            literal = _literal_str(keyword.value)
            if literal is not None:
                return [literal]
    return []


def _keys_from_result_list(node):
    """Collect the keys of every ``EvaluationResult(...)`` in a list literal."""
    keys = []
    for element in node.elts:
        keys.extend(_keys_from_evaluation_result(element))
    return keys


def _keys_from_evaluation_results(node, variables):
    """Return the keys of an ``EvaluationResults(results=...)`` call."""
    if _called_name(node) != "EvaluationResults":
        return []
    for keyword in node.keywords:
        if keyword.arg != "results":
            continue
        if isinstance(keyword.value, ast.Name):
            # ``results=some_list``: use what was recorded for that variable.
            return variables.get(keyword.value.id, [])
        if isinstance(keyword.value, ast.List):
            return _keys_from_result_list(keyword.value)
    return []


def extract_code_evaluator_feedback_keys(python_code: str) -> list[str]:
    """Statically infer the feedback keys an evaluator function would emit.

    The source is dedented, parsed with ``ast`` and never executed. Every
    ``return`` statement inside the first top-level function is inspected
    for one of three shapes:

    * a dict literal: the string value of its ``"key"`` entry if present,
      otherwise every string key other than ``"key"`` and ``"score"``;
    * ``EvaluationResult(key="...")``: the literal ``key`` keyword;
    * ``EvaluationResults(results=...)``: the keys of the
      ``EvaluationResult`` calls in the list literal, or in a list literal
      previously assigned to the variable passed as ``results``.

    Args:
        python_code: Source text of a single (possibly indented) function
            definition.

    Returns:
        The inferred keys in first-seen order without duplicates. If no key
        can be inferred, a one-element list holding the function's name. An
        empty list if the source cannot be parsed or its first statement is
        not a function definition.
    """
    try:
        tree = ast.parse(textwrap.dedent(python_code))
    except (SyntaxError, ValueError):
        # ValueError: some Python versions raise it for source with NUL bytes.
        return []

    # Empty / comment-only source has no statements at all.
    if not tree.body:
        return []
    function_def = tree.body[0]
    if not isinstance(function_def, (ast.FunctionDef, ast.AsyncFunctionDef)):
        return []

    nodes = list(ast.walk(function_def))

    # Pass 1: record list-literal assignments. ``ast.walk`` is breadth-first,
    # so an assignment nested in an ``if``/``try`` is visited *after* a
    # top-level ``return``; collecting variables up front makes the lookup
    # independent of nesting depth.
    variables = {}
    for node in nodes:
        if (
            isinstance(node, ast.Assign)
            and isinstance(node.value, ast.List)
            and isinstance(node.targets[0], ast.Name)
        ):
            variables[node.targets[0].id] = _keys_from_result_list(node.value)

    # Pass 2: gather keys from every return statement.
    keys = []
    for node in nodes:
        if isinstance(node, ast.Return) and node.value is not None:
            keys.extend(_keys_from_dict_literal(node.value))
            keys.extend(_keys_from_evaluation_result(node.value))
            keys.extend(_keys_from_evaluation_results(node.value, variables))

    # Several return branches commonly share one key; report it once.
    unique_keys = list(dict.fromkeys(keys))

    # If no keys were found, fall back to the function name.
    return unique_keys if unique_keys else [function_def.name]

def evaluate(
target: TARGET_T,
Expand Down Expand Up @@ -1376,15 +1426,18 @@ def _run_evaluators(
)
)
except Exception as e:
feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func))
error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id,
comment=repr(e),extra={"error":True}) for key in feedback_keys])
eval_results["results"].extend(
# TODO: This is a hack
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
try:
feedback_keys = extract_code_evaluator_feedback_keys(inspect.getsource(evaluator.func))
error_response = EvaluationResults(results=[EvaluationResult(key=key,source_run_id=run.id,
comment=repr(e),extra={"error":True}) for key in feedback_keys])
eval_results["results"].extend(
# TODO: This is a hack
self.client._log_evaluation_feedback(
error_response, run=run, _executor=executor
)
)
)
except:
pass
logger.error(
f"Error running evaluator {repr(evaluator)} on"
f" run {run.id}: {repr(e)}",
Expand Down
84 changes: 84 additions & 0 deletions python/tests/evaluation/test_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
from typing import Callable, Sequence, Tuple, TypeVar

import pytest
import sys
import os

# Put the directory three ``dirname`` hops above this file (i.e. the parent of
# the parent of this file's directory) at the front of ``sys.path`` so that it
# is searched first by the imports that follow. That directory is expected to
# contain the ``langsmith`` package.
sys.path.insert(0, os.path.abspath(os.path.dirname(os.path.dirname(os.path.dirname(__file__)))))

from langsmith import Client, aevaluate, evaluate, expect, test
from langsmith.schemas import Example, Run
Expand Down Expand Up @@ -32,6 +37,85 @@ def wait_for(
raise ValueError(f"Callable did not return within {total_time}")


def test_error_handling_evaluators():
    """Check that failing evaluators still yield error feedback per key.

    Four evaluators that always raise are run through ``evaluate``. Each one
    keeps its (unreachable) ``return`` statement in place, and the test
    asserts that every result row carries one error entry per key named in
    those return statements -- or the evaluator's function name when the
    returned dict has no ``"key"`` entry.
    """
    Client().clone_public_dataset(
        "https://smith.langchain.com/public/419dcab2-1d66-4b94-8901-0357ead390df/d"
    )
    dataset_name = "Evaluate Examples"

    # Evaluator returning a plain dict with an explicit "key".
    def error_dict_evaluator(run: Run, example: Example):
        if True:  # always raise; the return below is never reached
            raise ValueError("Error in dict evaluator")
        return {"key": "dict_key", "score": 1}

    # Evaluator returning a single EvaluationResult.
    def error_evaluation_result(run: Run, example: Example):
        if True:  # always raise; the return below is never reached
            raise ValueError("Error in EvaluationResult evaluator")
        return EvaluationResult(key="eval_result_key", score=1)

    # Evaluator returning an EvaluationResults batch with two keys.
    def error_evaluation_results(run: Run, example: Example):
        if True:  # always raise; the return below is never reached
            raise ValueError("Error in EvaluationResults evaluator")
        return EvaluationResults(
            results=[
                EvaluationResult(key="eval_results_key1", score=1),
                EvaluationResult(key="eval_results_key2", score=2)
            ]
        )

    # Evaluator returning a dict that has no "key" entry.
    def error_dict_no_key(run: Run, example: Example):
        if True:  # always raise; the return below is never reached
            raise ValueError("Error in dict without key evaluator")
        return {"score":1}

    def predict(inputs: dict) -> dict:
        return {"output": "Yes"}

    failing_evaluators = [
        error_dict_evaluator,
        error_evaluation_result,
        error_evaluation_results,
        error_dict_no_key,
    ]
    results = evaluate(
        predict,
        data=dataset_name,
        evaluators=failing_evaluators,
        max_concurrency=1,  # To ensure deterministic order
    )

    assert len(results) == 10  # Assuming 10 examples in the dataset

    # (feedback key, fragment of the error message expected in the comment),
    # in the order the evaluators were registered.
    expected_feedback = [
        ("dict_key", "Error in dict evaluator"),
        ("eval_result_key", "Error in EvaluationResult evaluator"),
        ("eval_results_key1", "Error in EvaluationResults evaluator"),
        ("eval_results_key2", "Error in EvaluationResults evaluator"),
        ("error_dict_no_key", "Error in dict without key evaluator"),
    ]

    for row in results:
        feedback = row["evaluation_results"]["results"]
        assert len(feedback) == len(expected_feedback)

        for entry, (expected_key, expected_message) in zip(
            feedback, expected_feedback
        ):
            assert entry.key == expected_key
            assert expected_message in entry.comment
            assert entry.extra.get("error") is True


def test_evaluate():
client = Client()
_ = client.clone_public_dataset(
Expand Down

0 comments on commit 6de1edf

Please sign in to comment.