From 3bee3a351dddfd1792836f5a247199923174f3a6 Mon Sep 17 00:00:00 2001 From: William FH <13333726+hinthornw@users.noreply.github.com> Date: Mon, 29 Apr 2024 10:46:21 -0700 Subject: [PATCH] Add expect.score (#631) --- python/langsmith/_expect.py | 40 +++++++++++++++++++-- python/langsmith/_testing.py | 6 +++- python/langsmith/run_helpers.py | 26 +++++++------- python/langsmith/utils.py | 9 +++++ python/pyproject.toml | 2 +- python/tests/unit_tests/test_run_helpers.py | 11 ++++-- 6 files changed, 75 insertions(+), 19 deletions(-) diff --git a/python/langsmith/_expect.py b/python/langsmith/_expect.py index 9c9e1bbc6..fcdeeebee 100644 --- a/python/langsmith/_expect.py +++ b/python/langsmith/_expect.py @@ -36,6 +36,11 @@ def test_output_semantically_close(): expect.value(response_txt).to_contain("Hello!") # Or using a custom check expect.value(response_txt).against(lambda x: "Hello" in x) + + # You can even use this for basic metric logging within unit tests + + expect.score(0.8) + expect.score(0.7, key="similarity").to_be_greater_than(0.7) """ # noqa: E501 from __future__ import annotations @@ -72,7 +77,7 @@ def __init__( max_workers=3 ) rt = rh.get_current_run_tree() - self._run_id = rt.id if rt else run_id + self._run_id = rt.trace_id if rt else run_id def _submit_feedback(self, score: int, message: Optional[str] = None) -> None: if not ls_utils.test_tracking_is_disabled(): @@ -336,6 +341,37 @@ def value(self, value: Any) -> _Matcher: """ return _Matcher(self.client, "value", value, _executor=self.executor) + def score( + self, + score: Union[float, int], + *, + key: str = "score", + source_run_id: Optional[ls_client.ID_TYPE] = None, + comment: Optional[str] = None, + ) -> _Matcher: + """Log a numeric score to LangSmith. + + Args: + score: The score value to log. + key: The key to use for logging the score. Defaults to "score". 
+ + Examples: + >>> expect.score(0.8) # doctest: +ELLIPSIS + <langsmith._expect._Matcher object at ...> + + >>> expect.score(0.8, key="similarity").to_be_greater_than(0.7) + """ + self._submit_feedback( + key, + { + "score": score, + "source_info": {"method": "expect.score"}, + "source_run_id": source_run_id, + "comment": comment, + }, + ) + return _Matcher(self.client, key, score, _executor=self.executor) + ## Private Methods @overload @@ -354,7 +390,7 @@ def __call__( def _submit_feedback(self, key: str, results: dict): current_run = rh.get_current_run_tree() - run_id = current_run.id if current_run else None + run_id = current_run.trace_id if current_run else None if not ls_utils.test_tracking_is_disabled(): self.executor.submit( self.client.create_feedback, run_id=run_id, key=key, **results diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py index f97468182..cfbc2a7a0 100644 --- a/python/langsmith/_testing.py +++ b/python/langsmith/_testing.py @@ -359,7 +359,11 @@ def _end_tests( test_suite.client.update_project( test_suite.experiment_id, end_time=datetime.datetime.now(datetime.timezone.utc), - metadata={**git_info, "dataset_version": test_suite.get_version()}, + metadata={ + **git_info, + "dataset_version": test_suite.get_version(), + "revision_id": ls_env.get_langchain_env_var_metadata().get("revision_id"), + }, ) test_suite.wait() diff --git a/python/langsmith/run_helpers.py b/python/langsmith/run_helpers.py index db612c7da..478d119d8 100644 --- a/python/langsmith/run_helpers.py +++ b/python/langsmith/run_helpers.py @@ -9,7 +9,6 @@ import functools import inspect import logging -import traceback import uuid import warnings from contextvars import copy_context @@ -448,8 +447,7 @@ async def async_wrapper( ): function_result = await fr_coro except BaseException as e: - stacktrace = traceback.format_exc() - _container_end(run_container, error=stacktrace) + _container_end(run_container, error=e) raise e _container_end(run_container, outputs=function_result) return function_result @@ 
-521,8 +519,7 @@ async def async_generator_wrapper( except StopAsyncIteration: pass except BaseException as e: - stacktrace = traceback.format_exc() - _container_end(run_container, error=stacktrace) + _container_end(run_container, error=e) raise e if results: if reduce_fn: @@ -564,8 +561,7 @@ def wrapper( func, *args, **kwargs ) except BaseException as e: - stacktrace = traceback.format_exc() - _container_end(run_container, error=stacktrace) + _container_end(run_container, error=e) raise e _container_end(run_container, outputs=function_result) return function_result @@ -620,8 +616,7 @@ def generator_wrapper( pass except BaseException as e: - stacktrace = traceback.format_exc() - _container_end(run_container, error=stacktrace) + _container_end(run_container, error=e) raise e if results: if reduce_fn: @@ -712,7 +707,7 @@ def trace( else: new_run = run_trees.RunTree( name=name, - run_id=run_id, + id=run_id or uuid.uuid4(), reference_example_id=reference_example_id, run_type=run_type, extra=extra_outer, @@ -730,7 +725,8 @@ def trace( if exceptions_to_handle and isinstance(e, exceptions_to_handle): tb = None else: - tb = traceback.format_exc() + tb = utils._format_exc() + tb = f"{e.__class__.__name__}: {e}\n\n{tb}" new_run.end(error=tb) new_run.patch() raise e @@ -930,7 +926,7 @@ class _ContainerInput(TypedDict, total=False): def _container_end( container: _TraceableContainer, outputs: Optional[Any] = None, - error: Optional[str] = None, + error: Optional[BaseException] = None, ): """End the run.""" run_tree = container.get("new_run") @@ -938,7 +934,11 @@ def _container_end( # Tracing enabled return outputs_ = outputs if isinstance(outputs, dict) else {"output": outputs} - run_tree.end(outputs=outputs_, error=error) + error_ = None + if error: + stacktrace = utils._format_exc() + error_ = f"{repr(error)}\n\n{stacktrace}" + run_tree.end(outputs=outputs_, error=error_) run_tree.patch() if error: try: diff --git a/python/langsmith/utils.py b/python/langsmith/utils.py index 
5c3d616e7..0217ab4e4 100644 --- a/python/langsmith/utils.py +++ b/python/langsmith/utils.py @@ -7,7 +7,9 @@ import os import pathlib import subprocess +import sys import threading +import traceback from typing import ( Any, Callable, @@ -488,3 +490,10 @@ def with_optional_cache( yield else: yield + + +def _format_exc() -> str: + # Used internally to format exceptions without cluttering the traceback + tb_lines = traceback.format_exception(*sys.exc_info()) + filtered_lines = [line for line in tb_lines if "langsmith/" not in line] + return "".join(filtered_lines) diff --git a/python/pyproject.toml b/python/pyproject.toml index 292e4209a..537bb1ca3 100644 --- a/python/pyproject.toml +++ b/python/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langsmith" -version = "0.1.51" +version = "0.1.52" description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." authors = ["LangChain "] license = "MIT" diff --git a/python/tests/unit_tests/test_run_helpers.py b/python/tests/unit_tests/test_run_helpers.py index f2e164c22..61a1d2004 100644 --- a/python/tests/unit_tests/test_run_helpers.py +++ b/python/tests/unit_tests/test_run_helpers.py @@ -4,6 +4,7 @@ import json import sys import time +import uuid import warnings from typing import Any, AsyncGenerator, Generator, Optional, cast from unittest.mock import MagicMock, patch @@ -473,8 +474,12 @@ async def some_async_func(queries: list) -> AsyncGenerator[list, None]: @traceable async def another_async_func(query: str) -> str: - with langsmith.trace(name="zee-cm", inputs={"query": query}) as run_tree: + rid = uuid.uuid4() + with langsmith.trace( + name="zee-cm", inputs={"query": query}, run_id=rid + ) as run_tree: run_tree.end(outputs={"query": query}) + assert run_tree.id == rid return query @traceable @@ -848,11 +853,13 @@ def child_fn(a: int, b: int) -> int: mock_client_ = _get_mock_client() with tracing_context(enabled=True): + rid = uuid.uuid4() with langsmith.trace( - name="parent_fn", 
inputs={"a": 1, "b": 2}, client=mock_client_ + name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_, run_id=rid ) as run: result = child_fn(1, 2) run.end(outputs={"result": result}) + assert run.id == rid assert result == 3 assert run.name == "parent_fn"