
Commit 03abfce

chore(llmobs): add base experiments classes (#13930)
Adds skeleton code for Experiments, experiment tasks, and experiment evaluator classes/decorators. Implementation of experiment run() has been left out for a follow-up PR.

Basic structure of Experiments (see the usage sketch after the checklists):
- Call LLMObs.experiment_task as a decorator to wrap a task function (must have `input` as an arg)
- Call LLMObs.experiment_evaluator as a decorator to wrap an evaluator function (must have `input/output/expected_output` as args)
- Create a Dataset
- Create an Experiment(name: str, task, dataset, evaluators, description, config)
- Call experiment.run(...)

Some concerns:
- Should experiment task/evaluator decorators support async/generator methods? Currently (and based on #13314) it only supports sync methods.
- The ExperimentTask wrapper class requires `input` as an arg name, which shadows Python builtins.

## Checklist
- [x] PR author has checked that all the criteria below are met
  - The PR description includes an overview of the change
  - The PR description articulates the motivation for the change
  - The change includes tests OR the PR description describes a testing strategy
  - The PR description notes risks associated with the change, if any
  - Newly-added code is easy to change
  - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
  - The change includes or references documentation updates if necessary
  - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met
  - Title is accurate
  - All changes are related to the pull request's stated goal
  - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes
  - Testing strategy adequately addresses listed risks
  - Newly-added code is easy to change
  - Release note makes sense to a user of the library
  - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment
  - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)
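A minimal usage sketch of the API added in this PR, based on the description above and the `LLMObs.experiment()` signature in the diff. It assumes LLMObs has already been enabled; the task, evaluator, and dataset names are illustrative, and `Experiment.run()` is still a stub in this commit, so the final call only exercises the wiring and signature validation.

```python
# Hypothetical end-to-end usage of the new skeleton API (names are made up).
from ddtrace.llmobs import LLMObs


def capitalize_task(input_data):
    # Task function: must accept an `input_data` parameter.
    return str(input_data).upper()


def exact_match_evaluator(input_data, output_data, expected_output):
    # Evaluator function: must accept `input_data`, `output_data`, and `expected_output`.
    return output_data == expected_output


dataset = LLMObs.create_dataset(name="capitalize-examples", description="toy dataset")
experiment = LLMObs.experiment(
    "capitalize-experiment",
    capitalize_task,
    dataset,
    [exact_match_evaluator],
    description="checks that the task upper-cases its input",
)
experiment.run()  # no-op in this commit; the implementation lands in a follow-up PR
```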
1 parent c109542 commit 03abfce

File tree

4 files changed: +140 -2 lines changed


ddtrace/llmobs/_experiment.py

Lines changed: 34 additions & 1 deletion
```diff
@@ -1,4 +1,5 @@
 from typing import Any
+from typing import Callable
 from typing import Dict
 from typing import List
 from typing import Optional
@@ -13,7 +14,7 @@
 
 
 class DatasetRecord(TypedDict):
-    input: NonNoneJSONType
+    input_data: NonNoneJSONType
     expected_output: JSONType
     metadata: Dict[str, Any]
     record_id: NotRequired[Optional[str]]
@@ -28,3 +29,35 @@ def __init__(self, name: str, dataset_id: str, data: List[DatasetRecord]) -> Non
         self.name = name
         self._id = dataset_id
         self._data = data
+
+
+class Experiment:
+    def __init__(
+        self,
+        name: str,
+        task: Callable[[Dict[str, NonNoneJSONType]], JSONType],
+        dataset: Dataset,
+        evaluators: List[Callable[[NonNoneJSONType, JSONType, JSONType], JSONType]],
+        description: str = "",
+        config: Optional[Dict[str, Any]] = None,
+        _llmobs: Optional[Any] = None,  # LLMObs service (cannot import here due to circular dependency)
+    ) -> None:
+        self.name = name
+        self._task = task
+        self._dataset = dataset
+        self._evaluators = evaluators
+        self._description = description
+        self._config: Dict[str, Any] = config or {}
+        self._llmobs = _llmobs
+        self._id: Optional[str] = None
+
+    def run(self, jobs: int = 1, raise_errors: bool = False, sample_size: Optional[int] = None) -> None:
+        task_results = self._run_task(jobs, raise_errors, sample_size)
+        self._run_evaluators(task_results, raise_errors=raise_errors)
+        return
+
+    def _run_task(self, jobs: int, raise_errors: bool = False, sample_size: Optional[int] = None) -> List[Any]:
+        return []
+
+    def _run_evaluators(self, task_results, raise_errors: bool = False) -> None:
+        pass
```
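For reference, here is what a record conforming to the updated `DatasetRecord` TypedDict might look like after the `input` to `input_data` rename; the field values are illustrative only and not taken from the library.

```python
from ddtrace.llmobs._experiment import DatasetRecord

# Illustrative record; only the key names come from the TypedDict above.
record: DatasetRecord = {
    "input_data": {"question": "What is 2 + 2?"},
    "expected_output": "4",
    "metadata": {"source": "docs-example"},
    # "record_id" is NotRequired and can be omitted for records not yet pushed.
}
```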

ddtrace/llmobs/_llmobs.py

Lines changed: 40 additions & 0 deletions
```diff
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
 from dataclasses import field
+import inspect
 import json
 import os
 import time
@@ -74,6 +75,9 @@
 from ddtrace.llmobs._context import LLMObsContextProvider
 from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
 from ddtrace.llmobs._experiment import Dataset
+from ddtrace.llmobs._experiment import Experiment
+from ddtrace.llmobs._experiment import JSONType
+from ddtrace.llmobs._experiment import NonNoneJSONType
 from ddtrace.llmobs._utils import AnnotationContext
 from ddtrace.llmobs._utils import LinkTracker
 from ddtrace.llmobs._utils import ToolCallTracker
@@ -573,6 +577,42 @@ def create_dataset(cls, name: str, description: str) -> Dataset:
     def _delete_dataset(cls, dataset_id: str) -> None:
         return cls._instance._dne_client.dataset_delete(dataset_id)
 
+    @classmethod
+    def experiment(
+        cls,
+        name: str,
+        task: Callable[[Dict[str, NonNoneJSONType]], JSONType],
+        dataset: Dataset,
+        evaluators: List[Callable[[NonNoneJSONType, JSONType, JSONType], JSONType]],
+        description: str = "",
+    ) -> Experiment:
+        """Initializes an Experiment to run a task on a Dataset and evaluators.
+
+        :param name: The name of the experiment.
+        :param task: The task function to run. Must accept a parameter ``input_data`` and optionally ``config``.
+        :param dataset: The dataset to run the experiment on, created with LLMObs.pull/create_dataset().
+        :param evaluators: A list of evaluator functions to evaluate the task output.
+                           Must accept parameters ``input_data``, ``output_data``, and ``expected_output``.
+        :param description: A description of the experiment.
+        """
+        if not callable(task):
+            raise TypeError("task must be a callable function.")
+        sig = inspect.signature(task)
+        params = sig.parameters
+        if "input_data" not in params:
+            raise TypeError("Task function must have an 'input_data' parameter.")
+        if not isinstance(dataset, Dataset):
+            raise TypeError("Dataset must be an LLMObs Dataset object.")
+        if not evaluators or not all(callable(evaluator) for evaluator in evaluators):
+            raise TypeError("Evaluators must be a list of callable functions.")
+        for evaluator in evaluators:
+            sig = inspect.signature(evaluator)
+            params = sig.parameters
+            required_params = ("input_data", "output_data", "expected_output")
+            if not all(param in params for param in required_params):
+                raise TypeError("Evaluator function must have parameters {}.".format(required_params))
+        return Experiment(name, task, dataset, evaluators, description=description, _llmobs=cls)
+
     @classmethod
     def register_processor(cls, processor: Optional[Callable[[LLMObsSpan], LLMObsSpan]] = None) -> None:
         """Register a processor to be called on each LLMObs span.
```

ddtrace/llmobs/_writer.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -356,7 +356,7 @@ def dataset_pull(self, name: str) -> Dataset:
             class_records.append(
                 {
                     "record_id": record["id"],
-                    "input": attrs["input"],
+                    "input_data": attrs["input"],
                     "expected_output": attrs["expected_output"],
                     "metadata": attrs.get("metadata", {}),
                 }
```

tests/llmobs/test_experiments.py

Lines changed: 65 additions & 0 deletions
```diff
@@ -11,10 +11,19 @@
 """
 
 import os
+import re
 
 import pytest
 
 
+def dummy_task(input_data):
+    return input_data
+
+
+def dummy_evaluator(input_data, output_data, expected_output):
+    return output_data == expected_output
+
+
 @pytest.fixture
 def test_dataset(llmobs):
     ds = llmobs.create_dataset(name="test-dataset", description="A test dataset")
@@ -44,3 +53,59 @@ def test_dataset_pull_non_existent(llmobs):
 def test_dataset_pull(llmobs, test_dataset):
     dataset = llmobs.pull_dataset(name=test_dataset.name)
     assert dataset._id is not None
+
+
+def test_experiment_invalid_task_type_raises(llmobs, test_dataset):
+    with pytest.raises(TypeError, match="task must be a callable function."):
+        llmobs.experiment("test_experiment", 123, test_dataset, [dummy_evaluator])
+
+
+def test_experiment_invalid_task_signature_raises(llmobs, test_dataset):
+    with pytest.raises(TypeError, match="Task function must have an 'input_data' parameter."):
+
+        def my_task(not_input):
+            pass
+
+        llmobs.experiment("test_experiment", my_task, test_dataset, [dummy_evaluator])
+
+
+def test_experiment_invalid_dataset_raises(llmobs):
+    with pytest.raises(TypeError, match="Dataset must be an LLMObs Dataset object."):
+        llmobs.experiment("test_experiment", dummy_task, 123, [dummy_evaluator])
+
+
+def test_experiment_invalid_evaluators_type_raises(llmobs, test_dataset):
+    with pytest.raises(TypeError, match="Evaluators must be a list of callable functions"):
+        llmobs.experiment("test_experiment", dummy_task, test_dataset, [])
+    with pytest.raises(TypeError, match="Evaluators must be a list of callable functions"):
+        llmobs.experiment("test_experiment", dummy_task, test_dataset, [123])
+
+
+def test_experiment_invalid_evaluator_signature_raises(llmobs, test_dataset):
+    expected_err = "Evaluator function must have parameters ('input_data', 'output_data', 'expected_output')."
+    with pytest.raises(TypeError, match=re.escape(expected_err)):
+
+        def my_evaluator_missing_expected_output(input_data, output_data):
+            pass
+
+        llmobs.experiment("test_experiment", dummy_task, test_dataset, [my_evaluator_missing_expected_output])
+    with pytest.raises(TypeError, match=re.escape(expected_err)):
+
+        def my_evaluator_missing_input(output_data, expected_output):
+            pass
+
+        llmobs.experiment("test_experiment", dummy_task, test_dataset, [my_evaluator_missing_input])
+    with pytest.raises(TypeError, match=re.escape(expected_err)):
+
+        def my_evaluator_missing_output(input_data, expected_output):
+            pass
+
+        llmobs.experiment("test_experiment", dummy_task, test_dataset, [my_evaluator_missing_output])
+
+
+def test_experiment_create(llmobs, test_dataset):
+    exp = llmobs.experiment("test_experiment", dummy_task, test_dataset, [dummy_evaluator], description="lorem ipsum")
+    assert exp.name == "test_experiment"
+    assert exp._task == dummy_task
+    assert exp._dataset == test_dataset
+    assert exp._evaluators == [dummy_evaluator]
```