From 3bee3a351dddfd1792836f5a247199923174f3a6 Mon Sep 17 00:00:00 2001
From: William FH <13333726+hinthornw@users.noreply.github.com>
Date: Mon, 29 Apr 2024 10:46:21 -0700
Subject: [PATCH] Add expect.score (#631)

---
 python/langsmith/_expect.py                 | 40 +++++++++++++++++++--
 python/langsmith/_testing.py                |  6 +++-
 python/langsmith/run_helpers.py             | 26 +++++++-------
 python/langsmith/utils.py                   |  9 +++++
 python/pyproject.toml                       |  2 +-
 python/tests/unit_tests/test_run_helpers.py | 11 ++++--
 6 files changed, 75 insertions(+), 19 deletions(-)

diff --git a/python/langsmith/_expect.py b/python/langsmith/_expect.py
index 9c9e1bbc6..fcdeeebee 100644
--- a/python/langsmith/_expect.py
+++ b/python/langsmith/_expect.py
@@ -36,6 +36,11 @@ def test_output_semantically_close():
         expect.value(response_txt).to_contain("Hello!")
         # Or using a custom check
         expect.value(response_txt).against(lambda x: "Hello" in x)
+
+        # You can even use this for basic metric logging within unit tests
+
+        expect.score(0.8)
+        expect.score(0.8, key="similarity").to_be_greater_than(0.7)
 """  # noqa: E501
 
 from __future__ import annotations
@@ -72,7 +77,7 @@ def __init__(
             max_workers=3
         )
         rt = rh.get_current_run_tree()
-        self._run_id = rt.id if rt else run_id
+        self._run_id = rt.trace_id if rt else run_id
 
     def _submit_feedback(self, score: int, message: Optional[str] = None) -> None:
         if not ls_utils.test_tracking_is_disabled():
@@ -336,6 +341,37 @@ def value(self, value: Any) -> _Matcher:
         """
         return _Matcher(self.client, "value", value, _executor=self.executor)
 
+    def score(
+        self,
+        score: Union[float, int],
+        *,
+        key: str = "score",
+        source_run_id: Optional[ls_client.ID_TYPE] = None,
+        comment: Optional[str] = None,
+    ) -> _Matcher:
+        """Log a numeric score to LangSmith.
+
+        Args:
+            score: The score value to log.
+            key: The key to use for logging the score. Defaults to "score".
+
+        Examples:
+            >>> expect.score(0.8) # doctest: +ELLIPSIS
+            <langsmith._expect._Matcher object at ...>
+
+            >>> expect.score(0.8, key="similarity").to_be_greater_than(0.7)
+        """
+        self._submit_feedback(
+            key,
+            {
+                "score": score,
+                "source_info": {"method": "expect.score"},
+                "source_run_id": source_run_id,
+                "comment": comment,
+            },
+        )
+        return _Matcher(self.client, key, score, _executor=self.executor)
+
     ## Private Methods
 
     @overload
@@ -354,7 +390,7 @@ def __call__(
 
     def _submit_feedback(self, key: str, results: dict):
         current_run = rh.get_current_run_tree()
-        run_id = current_run.id if current_run else None
+        run_id = current_run.trace_id if current_run else None
         if not ls_utils.test_tracking_is_disabled():
             self.executor.submit(
                 self.client.create_feedback, run_id=run_id, key=key, **results
diff --git a/python/langsmith/_testing.py b/python/langsmith/_testing.py
index f97468182..cfbc2a7a0 100644
--- a/python/langsmith/_testing.py
+++ b/python/langsmith/_testing.py
@@ -359,7 +359,11 @@ def _end_tests(
     test_suite.client.update_project(
         test_suite.experiment_id,
         end_time=datetime.datetime.now(datetime.timezone.utc),
-        metadata={**git_info, "dataset_version": test_suite.get_version()},
+        metadata={
+            **git_info,
+            "dataset_version": test_suite.get_version(),
+            "revision_id": ls_env.get_langchain_env_var_metadata().get("revision_id"),
+        },
     )
     test_suite.wait()
 
diff --git a/python/langsmith/run_helpers.py b/python/langsmith/run_helpers.py
index db612c7da..478d119d8 100644
--- a/python/langsmith/run_helpers.py
+++ b/python/langsmith/run_helpers.py
@@ -9,7 +9,6 @@
 import functools
 import inspect
 import logging
-import traceback
 import uuid
 import warnings
 from contextvars import copy_context
@@ -448,8 +447,7 @@ async def async_wrapper(
                     ):
                         function_result = await fr_coro
             except BaseException as e:
-                stacktrace = traceback.format_exc()
-                _container_end(run_container, error=stacktrace)
+                _container_end(run_container, error=e)
                 raise e
             _container_end(run_container, outputs=function_result)
             return function_result
@@ -521,8 +519,7 @@ async def async_generator_wrapper(
                 except StopAsyncIteration:
                     pass
             except BaseException as e:
-                stacktrace = traceback.format_exc()
-                _container_end(run_container, error=stacktrace)
+                _container_end(run_container, error=e)
                 raise e
             if results:
                 if reduce_fn:
@@ -564,8 +561,7 @@ def wrapper(
                         func, *args, **kwargs
                     )
             except BaseException as e:
-                stacktrace = traceback.format_exc()
-                _container_end(run_container, error=stacktrace)
+                _container_end(run_container, error=e)
                 raise e
             _container_end(run_container, outputs=function_result)
             return function_result
@@ -620,8 +616,7 @@ def generator_wrapper(
                     pass
 
             except BaseException as e:
-                stacktrace = traceback.format_exc()
-                _container_end(run_container, error=stacktrace)
+                _container_end(run_container, error=e)
                 raise e
             if results:
                 if reduce_fn:
@@ -712,7 +707,7 @@ def trace(
     else:
         new_run = run_trees.RunTree(
             name=name,
-            run_id=run_id,
+            id=run_id or uuid.uuid4(),
             reference_example_id=reference_example_id,
             run_type=run_type,
             extra=extra_outer,
@@ -730,7 +725,8 @@ def trace(
         if exceptions_to_handle and isinstance(e, exceptions_to_handle):
             tb = None
         else:
-            tb = traceback.format_exc()
+            tb = utils._format_exc()
+            tb = f"{e.__class__.__name__}: {e}\n\n{tb}"
         new_run.end(error=tb)
         new_run.patch()
         raise e
@@ -930,7 +926,7 @@ class _ContainerInput(TypedDict, total=False):
 def _container_end(
     container: _TraceableContainer,
     outputs: Optional[Any] = None,
-    error: Optional[str] = None,
+    error: Optional[BaseException] = None,
 ):
     """End the run."""
     run_tree = container.get("new_run")
@@ -938,7 +934,11 @@ def _container_end(
         # Tracing enabled
         return
     outputs_ = outputs if isinstance(outputs, dict) else {"output": outputs}
-    run_tree.end(outputs=outputs_, error=error)
+    error_ = None
+    if error:
+        stacktrace = utils._format_exc()
+        error_ = f"{repr(error)}\n\n{stacktrace}"
+    run_tree.end(outputs=outputs_, error=error_)
     run_tree.patch()
     if error:
         try:
diff --git a/python/langsmith/utils.py b/python/langsmith/utils.py
index 5c3d616e7..0217ab4e4 100644
--- a/python/langsmith/utils.py
+++ b/python/langsmith/utils.py
@@ -7,7 +7,9 @@
 import os
 import pathlib
 import subprocess
+import sys
 import threading
+import traceback
 from typing import (
     Any,
     Callable,
@@ -488,3 +490,10 @@ def with_optional_cache(
             yield
     else:
         yield
+
+
+def _format_exc() -> str:
+    # Format the exception being handled, dropping "langsmith/" entries to declutter it
+    tb_lines = traceback.format_exception(*sys.exc_info())
+    filtered_lines = [line for line in tb_lines if "langsmith/" not in line]
+    return "".join(filtered_lines)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index 292e4209a..537bb1ca3 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langsmith"
-version = "0.1.51"
+version = "0.1.52"
 description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform."
 authors = ["LangChain <support@langchain.dev>"]
 license = "MIT"
diff --git a/python/tests/unit_tests/test_run_helpers.py b/python/tests/unit_tests/test_run_helpers.py
index f2e164c22..61a1d2004 100644
--- a/python/tests/unit_tests/test_run_helpers.py
+++ b/python/tests/unit_tests/test_run_helpers.py
@@ -4,6 +4,7 @@
 import json
 import sys
 import time
+import uuid
 import warnings
 from typing import Any, AsyncGenerator, Generator, Optional, cast
 from unittest.mock import MagicMock, patch
@@ -473,8 +474,12 @@ async def some_async_func(queries: list) -> AsyncGenerator[list, None]:
 
     @traceable
     async def another_async_func(query: str) -> str:
-        with langsmith.trace(name="zee-cm", inputs={"query": query}) as run_tree:
+        rid = uuid.uuid4()
+        with langsmith.trace(
+            name="zee-cm", inputs={"query": query}, run_id=rid
+        ) as run_tree:
             run_tree.end(outputs={"query": query})
+            assert run_tree.id == rid
         return query
 
     @traceable
@@ -848,11 +853,13 @@ def child_fn(a: int, b: int) -> int:
 
     mock_client_ = _get_mock_client()
     with tracing_context(enabled=True):
+        rid = uuid.uuid4()
         with langsmith.trace(
-            name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_
+            name="parent_fn", inputs={"a": 1, "b": 2}, client=mock_client_, run_id=rid
         ) as run:
             result = child_fn(1, 2)
             run.end(outputs={"result": result})
+            assert run.id == rid
 
     assert result == 3
     assert run.name == "parent_fn"