2 changes: 2 additions & 0 deletions pipeline/preprocessors/link_map.py
@@ -203,6 +203,8 @@ class LinkMap(TypedDict):
"Client.aevaluate": "langsmith/observability/sdk/client/#langsmith.client.Client.aevaluate",
"Client.get_experiment_results": "langsmith/observability/sdk/client/#langsmith.client.Client.get_experiment_results",
"ExperimentResults": "langsmith/observability/sdk/evaluation/#langsmith.evaluation._runner.ExperimentResults",
"traceable": "langsmith/observability/sdk/run_helpers/#langsmith.run_helpers.traceable",
"@traceable": "langsmith/observability/sdk/run_helpers/#langsmith.run_helpers.traceable",
# LangGraph
"get_stream_writer": "langgraph/config/#langgraph.config.get_stream_writer",
"StateGraph": "langgraph/graphs/#langgraph.graph.state.StateGraph",
56 changes: 40 additions & 16 deletions src/docs.json
@@ -976,7 +976,15 @@
"group": "Feedback & evaluation",
"pages": [
"langsmith/attach-user-feedback",
"langsmith/online-evaluations"
{
"group": "Set up online evaluators",
"pages": [
"langsmith/online-evaluations-llm-as-judge",
"langsmith/online-evaluations-multi-turn",
"langsmith/online-evaluations-code",
"langsmith/online-evaluations-composite"
]
}
]
},
{
@@ -1027,27 +1035,35 @@
"pages": [
"langsmith/evaluate-llm-application",
"langsmith/run-evaluation-from-prompt-playground",
"langsmith/prebuilt-evaluators"
"langsmith/run-evals-api-only"
]
},
{
"group": "Evaluation types",
"pages": [
"langsmith/evaluation-types",
"langsmith/code-evaluator",
"langsmith/llm-as-judge",
"langsmith/composite-evaluators",
"langsmith/summary",
"langsmith/evaluate-pairwise"
]
},
{
"group": "Frameworks & integrations",
"pages": [
"langsmith/evaluation-async",
"langsmith/pytest",
"langsmith/vitest-jest",
"langsmith/run-evals-api-only"
{
"group": "UI",
"pages": [
"langsmith/llm-as-judge",
"langsmith/code-evaluator-ui",
"langsmith/composite-evaluators-ui"
]
},
{
"group": "SDK",
"pages": [
"langsmith/llm-as-judge-sdk",
"langsmith/code-evaluator-sdk",
"langsmith/composite-evaluators-sdk",
"langsmith/summary",
"langsmith/evaluate-pairwise",
"langsmith/prebuilt-evaluators",
"langsmith/evaluation-async",
"langsmith/pytest",
"langsmith/vitest-jest"
]
}
]
},
{
@@ -1867,6 +1883,14 @@
{
"source": "/oss/python/integrations/llms/google_ai",
"destination": "/oss/python/integrations/llms/google_generative_ai"
},
{
"source": "/langsmith/composite-evaluators",
"destination": "/langsmith/composite-evaluators-ui"
},
{
"source": "/langsmith/online-evaluations",
"destination": "/langsmith/online-evaluations-llm-as-judge"
}
]
}
2 changes: 1 addition & 1 deletion src/langsmith/administration-overview.mdx
@@ -233,7 +233,7 @@ When you use certain features with `base` tier traces, their data retention will

The complete list of scenarios in which a trace will upgrade:

* **Feedback** is added to any run on the trace (or any trace in the thread), whether through [manual annotation](/langsmith/annotate-traces-inline#annotate-traces-and-runs-inline), automatically with [an online evaluator](/langsmith/online-evaluations), or programmatically [via the SDK](/langsmith/attach-user-feedback#log-user-feedback-using-the-sdk).
* **Feedback** is added to any run on the trace (or any trace in the thread), whether through [manual annotation](/langsmith/annotate-traces-inline#annotate-traces-and-runs-inline), automatically with [an online evaluator](/langsmith/online-evaluations-llm-as-judge), or programmatically [via the SDK](/langsmith/attach-user-feedback#log-user-feedback-using-the-sdk).
* An **[annotation queue](/langsmith/annotation-queues#assign-runs-to-an-annotation-queue)** receives any run from the trace.
* An **[automation rule](/langsmith/rules#set-up-automation-rules)** matches any run within a trace.

2 changes: 1 addition & 1 deletion src/langsmith/alerts.mdx
@@ -39,7 +39,7 @@ LangSmith offers threshold-based alerting on three core metrics:
| Metric Type | Description | Use Case |
| ------------------ | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Errored Runs** | Track runs with an error status | Monitors for failures in an application. |
| **Feedback Score** | Measures the average feedback score | Track [feedback from end users](/langsmith/attach-user-feedback) or [online evaluation results](/langsmith/online-evaluations) to alert on regressions. |
| **Feedback Score** | Measures the average feedback score | Track [feedback from end users](/langsmith/attach-user-feedback) or [online evaluation results](/langsmith/online-evaluations-llm-as-judge) to alert on regressions. |
| **Latency** | Measures average run execution time | Tracks the latency of your application to alert on spikes and performance bottlenecks. |

Additionally, for **Errored Runs** and **Run Latency**, you can define filters to narrow down the runs that trigger alerts. For example, you might create an error alert filter for all `llm` runs tagged with `support_agent` that encounter a `RateLimitExceeded` error.
2 changes: 1 addition & 1 deletion src/langsmith/bind-evaluator-to-dataset.mdx
@@ -24,7 +24,7 @@ The process for binding evaluators to a dataset is very similar to the process f

## Custom code evaluators

The process for binding a code evaluators to a dataset is very similar to the process for configuring a code evaluator in online evaluation. View instruction for [configuring code evaluators](/langsmith/online-evaluations#configure-a-custom-code-evaluator).
The process for binding a code evaluator to a dataset is very similar to the process for configuring a code evaluator in online evaluation. View instructions for [configuring code evaluators](/langsmith/online-evaluations-code).

The only difference between configuring a code evaluator in online evaluation and binding a code evaluator to a dataset is that the custom code evaluator can reference outputs that are part of the dataset's `Example`.
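
For instance, a dataset-bound code evaluator can compare the run's actual output against the reference output stored on the example. A minimal sketch, assuming both sets of outputs contain an `answer` field (the field name is illustrative):

```python
def perform_eval(run, example):
    # The run carries the application's actual output.
    actual = run['outputs'].get('answer')
    # The dataset Example provides the reference output, which is only
    # available when the evaluator is bound to a dataset.
    expected = example['outputs'].get('answer')

    return {"exact_match": actual == expected}
```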

@@ -3,11 +3,11 @@ title: How to define a code evaluator
sidebarTitle: Code evaluator
---

<Info>
* [Evaluators](/langsmith/evaluation-concepts#evaluators)
</Info>
Code evaluators are functions that take a dataset example and the resulting application output, and return one or more metrics. These functions can be passed directly into the @[`evaluate()`][Client.evaluate] or @[`aevaluate()`][Client.aevaluate] functions.

Code evaluators are just functions that take a dataset example and the resulting application output, and return one or more metrics. These functions can be passed directly into [evaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._runner.evaluate) / [aevaluate()](https://docs.smith.langchain.com/reference/python/evaluation/langsmith.evaluation._arunner.aevaluate).
<Tip>
To define code evaluators in the LangSmith UI, refer to [How to define a code evaluator (UI)](/langsmith/code-evaluator-ui).
</Tip>

## Basic example

111 changes: 111 additions & 0 deletions src/langsmith/code-evaluator-ui.mdx
@@ -0,0 +1,111 @@
---
title: How to define a code evaluator
sidebarTitle: Code evaluator
---

Code evaluators in the [LangSmith UI](https://smith.langchain.com) allow you to write custom evaluation logic using Python or TypeScript code directly in the interface. Unlike [LLM-as-a-judge](/langsmith/llm-as-judge) evaluators that use a model to evaluate outputs, code evaluators use deterministic logic you define.

<Note>
To define code evaluators programmatically using the SDK, refer to [How to define a code evaluator (SDK)](/langsmith/code-evaluator).
</Note>

## Step 1. Create the evaluator

1. Create an evaluator from one of the following pages in the [LangSmith UI](https://smith.langchain.com):
   - In the playground or from a dataset: Select the **+ Evaluator** button.
   - From a tracing project: Select **Add rules**, configure your rule and select **Apply evaluator**.

1. Select **Create custom code evaluator** from the evaluator type options.

## Step 2. Write your evaluator code

In the **Add Custom Code Evaluator** page, define your evaluation logic using Python or TypeScript.

Your evaluator function must be named `perform_eval` and should:

1. Accept `run` and `example` parameters.
1. Access data via `run['inputs']`, `run['outputs']`, and `example['outputs']`.
1. Return a dictionary with your metric name as the key.

### Function signature

```python
def perform_eval(run, example):
    # Access the data
    inputs = run['inputs']
    outputs = run['outputs']
    reference_outputs = example['outputs']  # Optional: reference/expected outputs

    # Your evaluation logic here
    score = ...

    # Return a dict with your metric name
    return {"metric_name": score}
```

### Example: Exact match evaluator

```python
def perform_eval(run, example):
    """Check if the answer exactly matches the expected answer."""
    actual = run['outputs']['answer']
    expected = example['outputs']['answer']

    is_correct = actual == expected
    return {"exact_match": is_correct}
```

### Example: Concision evaluator

```python
def perform_eval(run, example):
    """Score how concise the answer is. 1 is most concise, 5 is least concise."""
    answer = run['outputs']['answer']
    score = min(len(answer) // 1000, 4) + 1

    return {"concision_score": score}
```

### Example: Input-based evaluator

```python
def perform_eval(run, example):
    """Check if the input text contains toxic language."""
    text = run['inputs'].get('text', '').lower()
    toxic_words = ["idiot", "stupid", "hate", "awful"]

    is_toxic = any(word in text for word in toxic_words)
    return {"is_toxic": is_toxic}
```

## Step 3. Configure the evaluator

### Name and description

Give your evaluator a clear name that describes what it measures (e.g., "Exact Match", "Concision Score").

### Feedback configuration

Configure how the score should be interpreted:

- **Boolean**: True/false feedback
- **Categorical**: String values representing categories
- **Continuous**: Numerical scoring within a range
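
As a rough illustration of how the returned value lines up with each option (the metric names and thresholds below are our own, not required by LangSmith):

```python
def perform_eval(run, example):
    answer = run['outputs'].get('answer', '')

    # Boolean feedback: a True/False value.
    is_empty = answer == ""

    # Categorical feedback: a string label.
    length_bucket = "short" if len(answer) < 100 else "long"

    # Continuous feedback: a number within the configured range.
    length_ratio = min(len(answer) / 1000, 1.0)

    # Return the metric that matches the feedback type you configured;
    # shown here as a boolean metric (swap in length_bucket or length_ratio
    # for a categorical or continuous evaluator).
    return {"is_empty": is_empty}
```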

## Step 4. Test and save

1. Preview your evaluator on example data to ensure it works as expected.
2. Click **Save** to make the evaluator available for use.

## Use your code evaluator

Once created, you can use your code evaluator:

- When running evaluations from the [playground](/langsmith/observability-concepts#prompt-playground)
- As part of a dataset to [automatically run evaluations on experiments](/langsmith/bind-evaluator-to-dataset)
- When running [online evaluations](/langsmith/online-evaluations-code)

## Related

- [LLM-as-a-judge evaluator (UI)](/langsmith/llm-as-judge): Use an LLM to evaluate outputs
- [Composite evaluators](/langsmith/composite-evaluators-ui): Combine multiple evaluator scores