Skip to content

feat(llmobs): add datasets and experiments features #13314

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ddtrace/llmobs/_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,9 @@
# Tool call arguments are used to lookup the associated tool call info.
# When there are no tool call args, we use this as a place-holder lookup key
OAI_HANDOFF_TOOL_ARG = "{}"

# Experiments related
# Span context-item key written by LLMObs._tag_expected_output; the span event
# builder copies the stored value to meta["expected_output"].
EXPECTED_OUTPUT = "_ml_obs.meta.input.expected_output"
# Span context-item key written by LLMObs._tag_experiment_io for the input of
# an experiment-kind span; copied to meta["input"] on the span event.
EXPERIMENT_INPUT = "_ml_obs.meta.input"
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

confused as to why we need new fields for input and output here but maybe it will become obvious from further reading

# Span context-item key written by LLMObs._tag_experiment_io for the output of
# an experiment-kind span; copied to meta["output"] on the span event.
EXPERIMENT_OUTPUT = "_ml_obs.meta.output"
# Baggage key under which LLMObs._experiment stores the experiment ID on the
# span context; _llmobs_tags reads it back to emit the "experiment_id" tag.
EXPERIMENT_ID_BAGGAGE_KEY = "experiment_id"
69 changes: 69 additions & 0 deletions ddtrace/llmobs/_llmobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,10 @@
from ddtrace.llmobs._constants import SPAN_LINKS
from ddtrace.llmobs._constants import SPAN_START_WHILE_DISABLED_WARNING
from ddtrace.llmobs._constants import TAGS
from ddtrace.llmobs._constants import EXPECTED_OUTPUT
from ddtrace.llmobs._constants import EXPERIMENT_INPUT
from ddtrace.llmobs._constants import EXPERIMENT_OUTPUT
from ddtrace.llmobs._constants import EXPERIMENT_ID_BAGGAGE_KEY
from ddtrace.llmobs._context import LLMObsContextProvider
from ddtrace.llmobs._evaluators.runner import EvaluatorRunner
from ddtrace.llmobs._utils import AnnotationContext
Expand Down Expand Up @@ -204,6 +208,14 @@ def _llmobs_span_event(cls, span: Span) -> Dict[str, Any]:
span._set_ctx_item(ML_APP, ml_app)
parent_id = span._get_ctx_item(PARENT_ID_KEY) or ROOT_PARENT_ID

# Experiments related
if span._get_ctx_item(EXPECTED_OUTPUT) is not None:
meta["expected_output"] = span._get_ctx_item(EXPECTED_OUTPUT)
if span._get_ctx_item(EXPERIMENT_INPUT) is not None:
meta["input"] = span._get_ctx_item(EXPERIMENT_INPUT)
if span._get_ctx_item(EXPERIMENT_OUTPUT) is not None:
meta["output"] = span._get_ctx_item(EXPERIMENT_OUTPUT)

llmobs_span_event = {
"trace_id": format_trace_id(span.trace_id),
"span_id": str(span.span_id),
Expand Down Expand Up @@ -241,6 +253,12 @@ def _llmobs_tags(span: Span, ml_app: str, session_id: Optional[str] = None) -> L
"language": "python",
"error": span.error,
}

# Add experiment_id from baggage if present
experiment_id = span.context.get_baggage_item(EXPERIMENT_ID_BAGGAGE_KEY)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we should just look generally in the context for the experiment ID to follow the same paradigm we do for parent id and mlobs trace id

if experiment_id:
tags["experiment_id"] = experiment_id

err_type = span.get_tag(ERROR_TYPE)
if err_type:
tags["error_type"] = err_type
Expand Down Expand Up @@ -775,6 +793,35 @@ def agent(
"agent", name=name, session_id=session_id, ml_app=ml_app, _decorator=_decorator
)

@classmethod
def _experiment(
    cls,
    name: Optional[str] = None,
    session_id: Optional[str] = None,
    ml_app: Optional[str] = None,
    experiment_id: Optional[str] = None,
) -> Span:
    """
    Start an "experiment"-kind LLMObs span. Only used internally by the experiments SDK.

    :param str name: The name of the traced operation, forwarded to ``_start_span``.
    :param str session_id: The ID of the underlying user session, forwarded to ``_start_span``.
    :param str ml_app: The name of the ML application, forwarded to ``_start_span``.
    :param str experiment_id: The ID of the experiment. When truthy it is stored as a baggage item
                              on the span's context under ``EXPERIMENT_ID_BAGGAGE_KEY``.

    :returns: The Span object representing the traced operation.
    """
    # NOTE(review): a reviewer suggested logging this at info level (being disabled may be
    # intentional) and inlining the message instead of going through a constant.
    if cls.enabled is False:
        log.warning(SPAN_START_WHILE_DISABLED_WARNING)

    experiment_span = cls._instance._start_span("experiment", name=name, session_id=session_id, ml_app=ml_app)

    # Nothing to propagate when no experiment ID was supplied.
    if not experiment_id:
        return experiment_span

    # NOTE(review): a reviewer suggested keeping the experiment ID in the LLMObs context
    # rather than in baggage.
    experiment_span.context.set_baggage_item(EXPERIMENT_ID_BAGGAGE_KEY, experiment_id)
    return experiment_span

@classmethod
def workflow(
cls,
Expand Down Expand Up @@ -967,11 +1014,22 @@ def annotate(
error = cls._tag_embedding_io(span, input_documents=input_data, output_text=output_data)
elif span_kind == "retrieval":
error = cls._tag_retrieval_io(span, input_text=input_data, output_documents=output_data)
elif span_kind == "experiment":
error = cls._tag_experiment_io(span, input_data=input_data, output_data=output_data)
else:
cls._tag_text_io(span, input_value=input_data, output_value=output_data)
finally:
telemetry.record_llmobs_annotate(span, error)

@staticmethod
def _tag_expected_output(span, expected_output: dict) -> None:
    """Tags a given LLMObs span with the expected output of an experiment.

    The value is stored on the span under the ``EXPECTED_OUTPUT`` context item.
    This is best-effort: a ``TypeError`` raised while storing the value is logged
    as a warning (with traceback) and not re-raised.

    :param span: The span to tag.
    :param dict expected_output: The expected output to attach to the span.
    """
    try:
        span._set_ctx_item(EXPECTED_OUTPUT, expected_output)
    except TypeError:
        log.warning("Failed to validate expected output with error: ", exc_info=True)

@classmethod
def _tag_llm_io(cls, span, input_messages=None, output_messages=None) -> Optional[str]:
"""Tags input/output messages for LLM-kind spans.
Expand Down Expand Up @@ -1048,6 +1106,17 @@ def _tag_text_io(cls, span, input_value=None, output_value=None):
if output_value is not None:
span._set_ctx_item(OUTPUT_VALUE, safe_json(output_value))

@classmethod
def _tag_experiment_io(cls, span, input_data=None, output_data=None) -> Optional[str]:
    """Tags input/output data for experiment kind spans.

    The values are stored as-is (they are not passed through ``safe_json``) under the
    ``EXPERIMENT_INPUT`` / ``EXPERIMENT_OUTPUT`` context items, which the span event
    builder copies to the span's ``meta["input"]`` / ``meta["output"]`` fields.
    Arguments left as ``None`` are skipped, so existing values are not overwritten.

    :returns: Always ``None``. The return value exists so ``annotate`` can treat this
              tagger like the other ``_tag_*_io`` helpers that report an error string.
    """
    if input_data is not None:
        span._set_ctx_item(EXPERIMENT_INPUT, input_data)
    if output_data is not None:
        span._set_ctx_item(EXPERIMENT_OUTPUT, output_data)
    return None

@staticmethod
def _set_dict_attribute(span: Span, key, value: Dict[str, Any]) -> None:
"""Sets a given LLM Obs span attribute with a dictionary key/values.
Expand Down
48 changes: 48 additions & 0 deletions ddtrace/llmobs/_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from dataclasses import dataclass
import http.client
import json
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
import urllib.request
from urllib.error import HTTPError

from ddtrace import config
from ddtrace.ext import SpanTypes
Expand Down Expand Up @@ -314,3 +317,48 @@ def on_tool_call_output_used(self, tool_id: str, llm_span: Span) -> None:
"output",
"input",
)
class HTTPResponse:
    """Uniform, read-once view over a response-like object.

    The wrapped object must provide ``read()`` and expose its status through one of
    ``status``, ``code`` or ``getcode()``. The body is fetched lazily on first access
    and cached, so ``read()``, ``text()`` and ``json()`` can be called repeatedly.
    """

    # NOTE(review): a reviewer asked whether this duplicates
    # ddtrace.internal.utils.http.Response - confirm before merging.

    def __init__(self, resp) -> None:
        """Wrap ``resp``; raises ``ValueError`` when ``resp`` is ``None``."""
        if resp is None:
            raise ValueError("Response object cannot be None")
        self._content = None  # type: Optional[bytes]
        self._resp = resp

    @property
    def status_code(self) -> int:
        """Status of the wrapped response, looked up as ``status``, ``code``, then ``getcode()``.

        Raises ``AttributeError`` when the wrapped object offers none of them.
        """
        # Plain attributes are tried first, in priority order.
        for attr_name in ("status", "code"):
            if hasattr(self._resp, attr_name):
                return getattr(self._resp, attr_name)
        if hasattr(self._resp, "getcode"):
            return self._resp.getcode()
        raise AttributeError(f"Could not find status code in response object of type {type(self._resp)}")

    def read(self) -> bytes:
        """Return the response body, reading the wrapped object only until a body is cached."""
        if self._content is not None:
            return self._content
        payload = self._resp.read()
        if payload is None:
            # A missing body is reported as empty bytes and is not cached.
            return b""
        self._content = payload
        return payload

    def text(self) -> str:
        """Return the body decoded as UTF-8."""
        raw_body = self.read()
        return raw_body.decode("utf-8")

    def json(self) -> dict:
        """Return the body parsed as JSON."""
        body_text = self.text()
        return json.loads(body_text)


def http_request(
    method: str, url: str, headers: Optional[Dict[str, str]] = None, body: Optional[bytes] = None
) -> HTTPResponse:
    """Make an HTTP request and return an HTTPResponse object.

    :param str method: HTTP method to use (e.g. ``"GET"``, ``"POST"``).
    :param str url: URL to send the request to.
    :param headers: Optional request headers.
    :param bytes body: Optional request body.

    :returns: An ``HTTPResponse`` wrapping the server response. HTTP error statuses do not
              raise: the ``HTTPError`` is wrapped and returned so callers can inspect
              ``status_code`` and the body. Other errors raised by ``urlopen`` propagate.
    """
    req = urllib.request.Request(url, data=body, method=method)
    # Register headers through add_header() instead of mutating req.headers: it stores
    # names in the normalized form that urllib's own has_header()/get_header() lookups
    # (e.g. the default Content-type check for requests with a body) expect.
    for header_name, header_value in (headers or {}).items():
        req.add_header(header_name, header_value)
    try:
        response = urllib.request.urlopen(req)
    except HTTPError as e:
        return HTTPResponse(e)
    return HTTPResponse(response)
13 changes: 9 additions & 4 deletions ddtrace/llmobs/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
from ddtrace.llmobs._constants import SPAN_SUBDOMAIN_NAME
from ddtrace.llmobs._utils import safe_json
from ddtrace.settings._agent import config as agent_config
from ddtrace.internal.utils.formats import asbool
import os


logger = get_logger(__name__)
Expand Down Expand Up @@ -294,10 +296,13 @@ def enqueue(self, event: LLMObsSpanEvent) -> None:
self._enqueue(event, truncated_event_size or raw_event_size)

def _data(self, events: List[LLMObsSpanEvent]) -> List[Dict[str, Any]]:
    """Wrap every span event in its own payload envelope.

    Each payload carries the stage, the tracer version, the event type and a
    single-element ``spans`` list. When the ``DD_EXPERIMENTS_RUNNER_ENABLED``
    environment variable is truthy, payloads are additionally marked with
    ``"_dd.scope": "experiments"``.

    :param events: The span events to wrap.
    :returns: One payload dict per event, in the same order as ``events``.
    """
    # The flag is the same for every event, so read it once per call. It is still
    # evaluated at call time so changes to the environment are picked up.
    experiments_enabled = asbool(os.getenv("DD_EXPERIMENTS_RUNNER_ENABLED"))
    payloads = []  # type: List[Dict[str, Any]]
    for event in events:
        data = {"_dd.stage": "raw", "_dd.tracer_version": ddtrace.__version__, "event_type": "span", "spans": [event]}
        if experiments_enabled:
            data["_dd.scope"] = "experiments"
        payloads.append(data)
    return payloads


def _truncate_span_event(event: LLMObsSpanEvent) -> LLMObsSpanEvent:
Expand Down
13 changes: 13 additions & 0 deletions ddtrace/llmobs/experimentation/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
"""
LLM Datasets and Experiments.
"""

from ._dataset import Dataset
from ._experiment import Experiment
from ._decorators import task
from ._decorators import evaluator
from ._decorators import summary_metric
from ._config import init


# Public API of the experimentation package; every name is imported above.
__all__ = [
    "Dataset",
    "Experiment",
    "task",
    "evaluator",
    "init",
    "summary_metric",
]
Loading