diff --git a/README.md b/README.md index 3094757..594d7e8 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ print(eval_result) ```ts import { createTrajectoryLLMAsJudge, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, TRAJECTORY_ACCURACY_PROMPT, } from "agentevals"; @@ -117,7 +117,7 @@ const outputs = [ role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evalResult = await trajectoryEvaluator({ outputs, @@ -290,7 +290,7 @@ print(result) ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -312,7 +312,7 @@ const outputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in San Francisco?" }, @@ -327,7 +327,7 @@ const referenceOutputs = [ }] }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "strict", @@ -442,7 +442,7 @@ print(result) ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -470,7 +470,7 @@ const outputs = [ }, { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny, but there is nothing fun happening." 
}, -] satisifes ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF and is there anything fun happening?" }, @@ -495,7 +495,7 @@ const referenceOutputs = [ { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "In SF, it's 80˚ and sunny, but there is nothing fun happening." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "unordered", @@ -597,7 +597,7 @@ print(result) ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage + type FlexibleChatCompletionMessage } from "agentevals"; const outputs = [ @@ -620,7 +620,7 @@ const outputs = [ { role: "tool", content: "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London." }, { role: "tool", content: "Unknown." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF and London?" }, @@ -638,7 +638,7 @@ const referenceOutputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London." }, { role: "assistant", content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy." 
}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "superset", // or "subset" @@ -757,7 +757,7 @@ print(result) ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -774,7 +774,7 @@ const outputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in San Francisco?" }, @@ -789,7 +789,7 @@ const referenceOutputs = [ }] }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "strict", @@ -877,7 +877,7 @@ print(eval_result) import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const evaluator = createTrajectoryLLMAsJudge({ @@ -901,7 +901,7 @@ const outputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in SF."}, {role: "assistant", content: "The weather in SF is 80 degrees and sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const result = await evaluator({ outputs }); @@ -988,7 +988,7 @@ print(eval_result) import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const evaluator = createTrajectoryLLMAsJudge({ @@ -1012,7 +1012,7 @@ const outputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in SF."}, {role: "assistant", content: "The weather in SF is 80 degrees and 
sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ {role: "user", content: "What is the weather in SF?"}, @@ -1030,7 +1030,7 @@ const referenceOutputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in San Francisco."}, {role: "assistant", content: "The weather in SF is 80˚ and sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const result = await evaluator({ outputs, diff --git a/js/README.md b/js/README.md index c48e127..767882c 100644 --- a/js/README.md +++ b/js/README.md @@ -28,7 +28,7 @@ Once you've done this, you can run your first trajectory evaluator. We represent ```ts import { createTrajectoryLLMAsJudge, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, TRAJECTORY_ACCURACY_PROMPT, } from "agentevals"; @@ -56,7 +56,7 @@ const outputs = [ role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evalResult = await trajectoryEvaluator({ outputs, @@ -133,7 +133,7 @@ in the same order with the same tool calls. Note that it does allow for differen ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -155,7 +155,7 @@ const outputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in San Francisco?" }, @@ -170,7 +170,7 @@ const referenceOutputs = [ }] }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco." 
}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "strict", @@ -202,7 +202,7 @@ The `"unordered"` `trajectory_match_mode` compares two trajectories and ensures ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -230,7 +230,7 @@ const outputs = [ }, { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny, but there is nothing fun happening." }, -] satisifes ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF and is there anything fun happening?" }, @@ -255,7 +255,7 @@ const referenceOutputs = [ { role: "tool", content: "Nothing fun is happening, you should stay indoors and read!" }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "In SF, it's 80˚ and sunny, but there is nothing fun happening." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "unordered", @@ -287,7 +287,7 @@ The `"subset"` and `"superset"` modes match partial trajectories (ensuring that ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage + type FlexibleChatCompletionMessage } from "agentevals"; const outputs = [ @@ -310,7 +310,7 @@ const outputs = [ { role: "tool", content: "It's 80 degrees and sunny in SF, and 90 degrees and rainy in London." }, { role: "tool", content: "Unknown." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny. 
In London, it's 90 degrees and rainy."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF and London?" }, @@ -328,7 +328,7 @@ const referenceOutputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco, and 90 degrees and rainy in London." }, { role: "assistant", content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "superset", // or "subset" @@ -376,7 +376,7 @@ Here's an example that allows case insensitivity for the arguments to a tool nam ```ts import { createTrajectoryMatchEvaluator, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const outputs = [ @@ -393,7 +393,7 @@ const outputs = [ }, { role: "tool", content: "It's 80 degrees and sunny in SF." }, { role: "assistant", content: "The weather in SF is 80 degrees and sunny." }, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in San Francisco?" }, @@ -408,7 +408,7 @@ const referenceOutputs = [ }] }, { role: "tool", content: "It's 80 degrees and sunny in San Francisco." 
}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode: "strict", @@ -447,7 +447,7 @@ The LLM-as-judge trajectory evaluator that uses an LLM to evaluate the trajector import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const evaluator = createTrajectoryLLMAsJudge({ @@ -471,7 +471,7 @@ const outputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in SF."}, {role: "assistant", content: "The weather in SF is 80 degrees and sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const result = await evaluator({ outputs }); @@ -492,7 +492,7 @@ If you have a reference trajectory, you can add an extra variable to your prompt import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT_WITH_REFERENCE, - type ChatCompletionMessage, + type FlexibleChatCompletionMessage, } from "agentevals"; const evaluator = createTrajectoryLLMAsJudge({ @@ -516,7 +516,7 @@ const outputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in SF."}, {role: "assistant", content: "The weather in SF is 80 degrees and sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ {role: "user", content: "What is the weather in SF?"}, @@ -534,7 +534,7 @@ const referenceOutputs = [ }, {role: "tool", content: "It's 80 degrees and sunny in San Francisco."}, {role: "assistant", content: "The weather in SF is 80˚ and sunny."}, -] satisfies ChatCompletionMessage[]; +] satisfies FlexibleChatCompletionMessage[]; const result = await evaluator({ outputs, diff --git a/js/package.json b/js/package.json index 68be42d..c04d499 100644 --- a/js/package.json +++ b/js/package.json @@ -1,6 +1,6 @@ { "name": "agentevals", - "version": "0.0.6", + "version": "0.0.5", "packageManager": "yarn@3.5.1", 
"type": "module", "scripts": { diff --git a/js/src/trajectory/llm.ts b/js/src/trajectory/llm.ts index a5f1600..a65f0a1 100644 --- a/js/src/trajectory/llm.ts +++ b/js/src/trajectory/llm.ts @@ -5,6 +5,7 @@ import { _runEvaluator, _normalizeToOpenAIMessagesList } from "../utils.js"; import { _chatCompletionMessagesToString } from "./utils.js"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, EvaluatorResult, TrajectoryLLMAsJudgeParams, } from "../types.js"; @@ -57,12 +58,26 @@ Grade the following trajectory: function _formatInputs(params: { outputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs?: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; }): [string, string] { const { outputs, referenceOutputs } = params; const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); @@ -130,12 +145,26 @@ export const createTrajectoryLLMAsJudge = ({ }: { outputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs?: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; [key: string]: unknown; }): Promise => { const [formattedOutputs, formattedReferenceOutputs] = _formatInputs({ diff --git a/js/src/trajectory/match.ts b/js/src/trajectory/match.ts index 
5593f66..8f37345 100644 --- a/js/src/trajectory/match.ts +++ b/js/src/trajectory/match.ts @@ -1,6 +1,7 @@ import { BaseMessage } from "@langchain/core/messages"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, ToolArgsMatchMode, ToolArgsMatchOverrides, } from "../types.js"; @@ -100,12 +101,26 @@ export function createTrajectoryMatchEvaluator({ }: { outputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; [key: string]: unknown; }) { const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); diff --git a/js/src/trajectory/strict.ts b/js/src/trajectory/strict.ts index 42b468a..dfbb097 100644 --- a/js/src/trajectory/strict.ts +++ b/js/src/trajectory/strict.ts @@ -1,6 +1,7 @@ import { BaseMessage } from "@langchain/core/messages"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides, @@ -9,14 +10,8 @@ import { _normalizeToOpenAIMessagesList, _runEvaluator } from "../utils.js"; import { _getMatcherForToolName } from "./utils.js"; export async function _scorer(params: { - outputs: - | ChatCompletionMessage[] - | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; - referenceOutputs: - | ChatCompletionMessage[] - | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + outputs: ChatCompletionMessage[]; + referenceOutputs: ChatCompletionMessage[]; toolArgsMatchMode: ToolArgsMatchMode; toolArgsMatchOverrides?: ToolArgsMatchOverrides; }): Promise { @@ -26,9 +21,8 @@ export async function 
_scorer(params: { toolArgsMatchMode, toolArgsMatchOverrides, } = params; - const normalizedOutputs = _normalizeToOpenAIMessagesList(outputs); - const normalizedReferenceOutputs = - _normalizeToOpenAIMessagesList(referenceOutputs); + const normalizedOutputs = outputs; + const normalizedReferenceOutputs = referenceOutputs; if (!normalizedOutputs || !normalizedReferenceOutputs) { throw new Error( @@ -112,20 +106,40 @@ export async function _scorer(params: { export async function trajectoryStrictMatch(params: { outputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs: | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; toolCallArgsExactMatch: boolean; }): Promise { + const normalizedOutputs = _normalizeToOpenAIMessagesList(params.outputs); + const normalizedReferenceOutputs = _normalizeToOpenAIMessagesList( + params.referenceOutputs + ); + return _runEvaluator( "trajectory_strict_match", _scorer, "trajectory_strict_match", { - ...params, + outputs: normalizedOutputs, + referenceOutputs: normalizedReferenceOutputs, toolArgsMatchMode: params.toolCallArgsExactMatch ? 
"exact" : "ignore", } ); diff --git a/js/src/trajectory/subset.ts b/js/src/trajectory/subset.ts index ee96bb3..e043dee 100644 --- a/js/src/trajectory/subset.ts +++ b/js/src/trajectory/subset.ts @@ -1,6 +1,7 @@ import { BaseMessage } from "@langchain/core/messages"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides, @@ -39,13 +40,25 @@ export const _scorer = async (params: { */ export async function trajectorySubset(params: { outputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; }): Promise { const { outputs, referenceOutputs } = params; const outputsList = _normalizeToOpenAIMessagesList(outputs); diff --git a/js/src/trajectory/superset.ts b/js/src/trajectory/superset.ts index 4e0998b..4566b63 100644 --- a/js/src/trajectory/superset.ts +++ b/js/src/trajectory/superset.ts @@ -1,6 +1,7 @@ import { BaseMessage } from "@langchain/core/messages"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides, @@ -39,13 +40,25 @@ export const _scorer = async (params: { */ export async function trajectorySuperset(params: { outputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { 
messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; }): Promise { const { outputs, referenceOutputs } = params; const outputsList = _normalizeToOpenAIMessagesList(outputs); diff --git a/js/src/trajectory/tests/trajectory.test.ts b/js/src/trajectory/tests/trajectory.test.ts index f2aeef8..866d390 100644 --- a/js/src/trajectory/tests/trajectory.test.ts +++ b/js/src/trajectory/tests/trajectory.test.ts @@ -4,7 +4,7 @@ import { expect } from "vitest"; import { HumanMessage, AIMessage, ToolMessage } from "@langchain/core/messages"; import { createTrajectoryMatchEvaluator } from "../match.js"; -import { ChatCompletionMessage } from "../../types.js"; +import { FlexibleChatCompletionMessage } from "../../types.js"; ls.describe("trajectory", () => { ls.test.each([ @@ -29,64 +29,62 @@ ls.describe("trajectory", () => { feedbackKey: "trajectory_subset_match", }, ])("trajectory exact match", async ({ trajectoryMatchMode, feedbackKey }) => { - const outputs = [ - { - role: "user", - content: "What is the weather in SF?", - }, - { - role: "assistant", - content: "", - tool_calls: [ - { - function: { - name: "get_weather", - arguments: JSON.stringify({ city: "San Francisco" }), - }, - }, - ], - }, - { - role: "tool", - content: "It's 80 degrees and sunny in SF.", - }, - { - role: "assistant", - content: "The weather in SF is 80 degrees and sunny.", - }, - ] satisfies ChatCompletionMessage[]; - const referenceOutputs = [ - { - role: "user", - content: "What is the weather in SF?", - }, - { - role: "assistant", - content: "", - tool_calls: [ - { - function: { - name: "get_weather", - arguments: JSON.stringify({ city: "San Francisco" }), - }, - }, - ], - }, - { - role: "tool", - content: "It's 80˚ and sunny in San Francisco.", - }, - { - role: "assistant", - content: "The weather in San Francisco is 80˚ and sunny.", - }, - ] satisfies ChatCompletionMessage[]; const evaluator = 
createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); const result = await evaluator({ - outputs, - referenceOutputs, + outputs: [ + { + role: "user", + content: "What is the weather in SF?", + }, + { + role: "assistant", + content: "", + tool_calls: [ + { + function: { + name: "get_weather", + arguments: JSON.stringify({ city: "San Francisco" }), + }, + }, + ], + }, + { + role: "tool", + content: "It's 80 degrees and sunny in SF.", + }, + { + role: "assistant", + content: "The weather in SF is 80 degrees and sunny.", + }, + ], + referenceOutputs: [ + { + role: "user", + content: "What is the weather in SF?", + }, + { + role: "assistant", + content: "", + tool_calls: [ + { + function: { + name: "get_weather", + arguments: JSON.stringify({ city: "San Francisco" }), + }, + }, + ], + }, + { + role: "tool", + content: "It's 80˚ and sunny in San Francisco.", + }, + { + role: "assistant", + content: "The weather in San Francisco is 80˚ and sunny.", + }, + ], }); expect(result).toBeDefined(); expect(result.key).toBe(feedbackKey); @@ -154,7 +152,7 @@ ls.describe("trajectory", () => { content: "The weather in SF is 80 degrees and sunny. In London, it's 90 degrees and rainy.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", @@ -191,7 +189,7 @@ ls.describe("trajectory", () => { content: "The weather in London is 90˚ and rainy. In SF, it's 80˚ and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -276,7 +274,7 @@ ls.describe("trajectory", () => { content: "The weather in SF is 80 degrees and sunny. 
In London, it's 90 degrees and rainy.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", @@ -313,7 +311,7 @@ ls.describe("trajectory", () => { content: "The weather in London is 90˚ and rainy. In SF, it's 80˚ and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -382,7 +380,7 @@ ls.describe("trajectory", () => { content: "The weather in SF is 80 degrees and sunny. In London, it's 9000 degrees and hallucinating.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", @@ -419,7 +417,7 @@ ls.describe("trajectory", () => { content: "The weather in London is 90˚ and rainy. In SF, it's 80˚ and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -487,7 +485,7 @@ ls.describe("trajectory", () => { role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", @@ -513,7 +511,7 @@ ls.describe("trajectory", () => { role: "assistant", content: "The weather in SF is 80˚ and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -592,7 +590,7 @@ ls.describe("trajectory", () => { content: "The weather in SF is 80˚ and sunny. In London, it's 90˚ and rainy.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", @@ -620,7 +618,7 @@ ls.describe("trajectory", () => { content: "The weather in SF is 80 degrees and sunny. 
In London, it's 90 degrees and rainy.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -1009,7 +1007,7 @@ ls.describe("trajectory", () => { content: "The next flight after that is LX0112 from CDG to BSL is in 4 hours. However, we do not currently allow upgrades to first class. Confirming that I should book it for you anyway?", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "Hi there, what time is my flight?" }, @@ -1085,7 +1083,7 @@ ls.describe("trajectory", () => { content: "Ok, it looks like upgrades to first class are possible. What date would you like to change your flight to?", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluatorNoOverrides = createTrajectoryMatchEvaluator({ trajectoryMatchMode, }); @@ -1231,7 +1229,7 @@ ls.describe("trajectory", () => { content: "The next flight after that is LX0112 from CDG to BSL is in 4 hours. However, we do not currently allow upgrades to first class. Confirming that I should book it for you anyway?", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "Hi there, what time is my flight?" }, { @@ -1310,7 +1308,7 @@ ls.describe("trajectory", () => { content: "Ok, it looks like upgrades to first class are possible. What date would you like to change your flight to?", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluatorNoOverrides = createTrajectoryMatchEvaluator({ trajectoryMatchMode, @@ -1360,7 +1358,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." 
}, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "Hi there, what time is my flight?" }, { @@ -1378,7 +1376,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ toolArgsMatchMode: toolArgsMatchMode as any, }); @@ -1412,7 +1410,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "Hi there, what time is my flight?" }, { @@ -1430,7 +1428,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ toolArgsMatchMode: toolArgsMatchMode as any, }); @@ -1464,7 +1462,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "Hi there, what time is my flight?" }, { @@ -1482,7 +1480,7 @@ ls.describe("trajectory", () => { ], }, { role: "assistant", content: "Your flight is at 10:00 AM." 
}, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evaluator = createTrajectoryMatchEvaluator({ toolArgsMatchMode: toolArgsMatchMode as any, }); diff --git a/js/src/trajectory/tests/trajectory_llm.test.ts b/js/src/trajectory/tests/trajectory_llm.test.ts index 06a43ae..9e82767 100644 --- a/js/src/trajectory/tests/trajectory_llm.test.ts +++ b/js/src/trajectory/tests/trajectory_llm.test.ts @@ -5,7 +5,7 @@ import { createTrajectoryLLMAsJudge, TRAJECTORY_ACCURACY_PROMPT, } from "../llm.js"; -import { ChatCompletionMessage } from "../../types.js"; +import { FlexibleChatCompletionMessage } from "../../types.js"; ls.describe("Trajectory LLM", () => { ls.test( @@ -37,7 +37,7 @@ ls.describe("Trajectory LLM", () => { role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF?" }, @@ -58,7 +58,7 @@ ls.describe("Trajectory LLM", () => { content: "It's 80 degrees and sunny in San Francisco.", }, { role: "assistant", content: "The weather in SF is 80˚ and sunny." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evalResult = await evaluator({ inputs, @@ -127,7 +127,7 @@ ls.describe("Trajectory LLM", () => { role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evalResult = await evaluator({ outputs, }); @@ -194,7 +194,7 @@ According to this reference trajectory: role: "assistant", content: "The weather in SF is 80 degrees and sunny.", }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const referenceOutputs = [ { role: "user", content: "What is the weather in SF?" 
}, @@ -215,7 +215,7 @@ According to this reference trajectory: content: "It's 80 degrees and sunny in San Francisco.", }, { role: "assistant", content: "The weather in SF is 80˚ and sunny." }, - ] satisfies ChatCompletionMessage[]; + ] satisfies FlexibleChatCompletionMessage[]; const evalResult = await evaluator({ inputs, diff --git a/js/src/trajectory/unordered.ts b/js/src/trajectory/unordered.ts index 9cac9d6..1cc458c 100644 --- a/js/src/trajectory/unordered.ts +++ b/js/src/trajectory/unordered.ts @@ -1,6 +1,7 @@ import { BaseMessage } from "@langchain/core/messages"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, EvaluatorResult, ToolArgsMatchMode, ToolArgsMatchOverrides, @@ -46,13 +47,25 @@ export const _scorer = async (params: { */ export async function trajectoryUnorderedMatch(params: { outputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; referenceOutputs: - | ChatCompletionMessage[] + | FlexibleChatCompletionMessage[] | BaseMessage[] - | { messages: (BaseMessage | ChatCompletionMessage)[] }; + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + }; }): Promise { const { outputs, referenceOutputs } = params; const outputsList = _normalizeToOpenAIMessagesList(outputs); diff --git a/js/src/types.ts b/js/src/types.ts index 1f48a23..191d753 100644 --- a/js/src/types.ts +++ b/js/src/types.ts @@ -2,6 +2,28 @@ import { createLLMAsJudge } from "openevals/llm"; export * from "openevals/types"; +// More tolerant version of ChatCompletionMessage that allows missing tool_call_id +export type FlexibleChatCompletionMessage = Record & + ( + | { + content: any; + role: "user" | "system" | "developer"; + id?: string; + } + | { + role: "assistant"; + content: any; + tool_calls?: any[]; + id?: string; + } + | { 
+ role: "tool"; + content: any; + tool_call_id?: string; // Made optional for backward compatibility + id?: string; + } + ); + // Trajectory extracted from agent export type GraphTrajectory = { inputs?: (Record | null)[]; diff --git a/js/src/utils.ts b/js/src/utils.ts index d2ae7d4..4794006 100644 --- a/js/src/utils.ts +++ b/js/src/utils.ts @@ -6,6 +6,7 @@ import { } from "openevals/utils"; import { ChatCompletionMessage, + FlexibleChatCompletionMessage, MultiResultScorerReturnType, SingleResultScorerReturnType, } from "./types.js"; @@ -21,15 +22,48 @@ export const _convertToOpenAIMessage = ( } }; +export const _convertToChatCompletionMessage = ( + message: BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage +): ChatCompletionMessage => { + let converted: FlexibleChatCompletionMessage; + + if (isBaseMessage(message)) { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + converted = _convertMessagesToOpenAIParams([message])[0] as any; + } else { + converted = message as FlexibleChatCompletionMessage; + } + + // For tool messages without tool_call_id, generate one for compatibility + if (converted.role === "tool" && !converted.tool_call_id) { + converted = { + ...converted, + tool_call_id: "generated-" + Math.random().toString(36).substring(2), + }; + } + + return converted as ChatCompletionMessage; +}; + export const _normalizeToOpenAIMessagesList = ( messages?: - | (BaseMessage | ChatCompletionMessage)[] - | { messages: (BaseMessage | ChatCompletionMessage)[] } + | (BaseMessage | ChatCompletionMessage | FlexibleChatCompletionMessage)[] + | { + messages: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; + } ): ChatCompletionMessage[] => { if (!messages) { return []; } - let messagesList: (BaseMessage | ChatCompletionMessage)[]; + let messagesList: ( + | BaseMessage + | ChatCompletionMessage + | FlexibleChatCompletionMessage + )[]; if (!Array.isArray(messages)) { if ("messages" in messages && 
Array.isArray(messages.messages)) { messagesList = messages.messages; @@ -41,7 +75,7 @@ export const _normalizeToOpenAIMessagesList = ( } else { messagesList = messages; } - return messagesList.map(_convertToOpenAIMessage); + return messagesList.map(_convertToChatCompletionMessage); }; export const processScore = ( diff --git a/js/yarn.lock b/js/yarn.lock index d6fff1a..217217a 100644 --- a/js/yarn.lock +++ b/js/yarn.lock @@ -3404,8 +3404,8 @@ __metadata: linkType: hard "langsmith@npm:^0.3.26": - version: 0.3.29 - resolution: "langsmith@npm:0.3.29" + version: 0.3.48 + resolution: "langsmith@npm:0.3.48" dependencies: "@types/uuid": ^10.0.0 chalk: ^4.1.2 @@ -3415,11 +3415,20 @@ __metadata: semver: ^7.6.3 uuid: ^10.0.0 peerDependencies: + "@opentelemetry/api": "*" + "@opentelemetry/exporter-trace-otlp-proto": "*" + "@opentelemetry/sdk-trace-base": "*" openai: "*" peerDependenciesMeta: + "@opentelemetry/api": + optional: true + "@opentelemetry/exporter-trace-otlp-proto": + optional: true + "@opentelemetry/sdk-trace-base": + optional: true openai: optional: true - checksum: 5e714752d40bd90525436d2d977ed0e750031f946dbc6c5549a1d7ceb5546b48dfcc23a1fb1b758f18d954fe53e6089b7ca600c60f2f72d854aa24d5ff0a41fa + checksum: bb535aec6ae05cd8f07bb4f32ccffa06f91793360bcd5045cab1387e52322756c5592dbdcb67e5b5358818da1913958cd63716ce6d4d3e9e016cfd3737bd57fe languageName: node linkType: hard