Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions go/agent.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
package scenario

import (
"context"

"github.com/openai/openai-go"
)

// AgentRole identifies which conversational role an agent plays in a scenario.
type AgentRole string

// AgentReturnKind discriminates which payload field of AgentReturn is populated.
type AgentReturnKind int

const (
	AgentRoleAgent AgentRole = "agent"
	AgentRoleUser  AgentRole = "user"
	AgentRoleJudge AgentRole = "judge"
)

// The return kinds get their own const block so that iota is not shared with
// the unrelated AgentRole group above (previously that leaked iota and gave
// these constants the surprising values 3..6). Starting at iota + 1 keeps the
// zero value of AgentReturnKind invalid, so a zero-valued AgentReturn does not
// silently match any kind.
const (
	AgentReturnString AgentReturnKind = iota + 1
	AgentReturnScenarioResult
	AgentReturnMessages
	AgentReturnMessage
)

// AgentConfig holds the settings shared by the built-in agent implementations.
type AgentConfig struct {
	// Name identifies the agent.
	Name string

	// Model is the model identifier passed to the chat completions API.
	Model string
	// OpenAIClient is the client used to issue completion requests.
	OpenAIClient *openai.Client

	// Temperature, when non-nil, overrides the sampling temperature
	// (JudgeAgent falls back to 0.0 when nil).
	Temperature *float64
	// MaxTokens, when non-nil, caps the completion token count.
	MaxTokens *int64
}

// AgentInput carries everything an AgentAdapter receives for a single call.
type AgentInput struct {
	// ThreadID identifies the conversation thread.
	ThreadID string
	// Messages is the conversation history supplied to the adapter.
	Messages []openai.ChatCompletionMessageParamUnion
	// NewMessages — presumably the messages added since this adapter's
	// previous call; TODO(review): confirm against the scenario executor.
	NewMessages []openai.ChatCompletionMessageParamUnion
	// RequestedRole is the role the executor wants this call to act as.
	RequestedRole AgentRole
	// JudgmentRequest, when true, asks a judge for a final verdict
	// (see JudgeAgent.Call, which fails fast if no criteria are configured).
	JudgmentRequest bool
	// ScenarioState exposes execution progress (e.g. CurrentTurn).
	ScenarioState ExecutionState
	// ScenarioConfig is the configuration of the running scenario.
	ScenarioConfig ScenarioConfig
}

// AgentAdapter is implemented by anything that can take a turn in a scenario:
// the agent under test, a simulated user, or a judge.
type AgentAdapter interface {
	// Role reports which conversational role this adapter plays.
	Role() AgentRole
	// Call produces the adapter's contribution given the conversation so far.
	Call(ctx context.Context, input AgentInput) (*AgentReturn, error)
}

// AgentReturn is a tagged union of the possible results of an adapter call.
// Kind selects which of the *Value fields below is populated; use the
// New*AgentReturn constructors to build one consistently.
type AgentReturn struct {
	Kind AgentReturnKind

	// Exactly one of the following carries the payload, per Kind.
	StringValue         string
	ScenarioResultValue ScenarioResult
	MessagesValue       []openai.ChatCompletionMessageParamUnion
	MessageValue        openai.ChatCompletionMessageParamUnion
}

// NewStringAgentReturn wraps a plain string result in an AgentReturn.
func NewStringAgentReturn(s string) *AgentReturn {
	ret := AgentReturn{
		Kind:        AgentReturnString,
		StringValue: s,
	}
	return &ret
}
// NewScenarioResultAgentReturn wraps a final scenario verdict in an AgentReturn.
func NewScenarioResultAgentReturn(r ScenarioResult) *AgentReturn {
	ret := new(AgentReturn)
	ret.Kind = AgentReturnScenarioResult
	ret.ScenarioResultValue = r
	return ret
}
// NewMessagesAgentReturn wraps a batch of new conversation messages in an AgentReturn.
func NewMessagesAgentReturn(msgs []openai.ChatCompletionMessageParamUnion) *AgentReturn {
	result := AgentReturn{Kind: AgentReturnMessages}
	result.MessagesValue = msgs
	return &result
}
// NewEmptyAgentReturn represents "no new messages": a messages-kind return
// holding an empty (non-nil) slice.
func NewEmptyAgentReturn() *AgentReturn {
	empty := make([]openai.ChatCompletionMessageParamUnion, 0)
	return &AgentReturn{
		Kind:          AgentReturnMessages,
		MessagesValue: empty,
	}
}
// NewMessageAgentReturn wraps a single conversation message in an AgentReturn.
func NewMessageAgentReturn(msg openai.ChatCompletionMessageParamUnion) *AgentReturn {
	single := AgentReturn{
		Kind:         AgentReturnMessage,
		MessageValue: msg,
	}
	return &single
}
240 changes: 240 additions & 0 deletions go/agent_judge.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
package scenario

import (
"context"
"errors"
"fmt"
"strings"

"github.com/langwatch/scenario/go/internal"
"github.com/langwatch/scenario/go/internal/libraries/ptr"

"github.com/openai/openai-go"
"github.com/openai/openai-go/shared"
)

const (
	// judgePrompt is the default system prompt for the judge agent. The
	// {{.Description}} and {{.FormattedCriteriaList}} placeholders are filled
	// in by buildJudgePrompt via plain string replacement (not text/template,
	// despite the syntax).
	judgePrompt = `
<role>
You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
</role>

<goal>
Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
</goal>

<scenario>
{{.Description}}
</scenario>

<criteria>
{{.FormattedCriteriaList}}
</criteria>

<rules>
- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criterias.
- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
</rules>
`

	// lastMessagePrompt is appended as a user message once the scenario has
	// reached its maximum number of turns, forcing the judge to give a final
	// verdict (or declare the result inconclusive).
	lastMessagePrompt = `
System:

<finish_test>
This is the last message, conversation has reached the maximum number of turns, give your final verdict,
if you don't have enough information to make a verdict, say inconclusive with max turns reached.
</finish_test>
`
)

// buildJudgePrompt renders the default judge system prompt, substituting the
// scenario description and a numbered (1-based) list of the judging criteria
// into the judgePrompt template placeholders.
func buildJudgePrompt(criteria []string, description string) string {
	// Accumulate the numbered list in a Builder rather than += concatenation,
	// which reallocates the string on every iteration.
	var list strings.Builder
	for i, criterion := range criteria {
		fmt.Fprintf(&list, "%d. %s\n", i+1, criterion)
	}

	prompt := strings.ReplaceAll(judgePrompt, "{{.FormattedCriteriaList}}", list.String())
	return strings.ReplaceAll(prompt, "{{.Description}}", description)
}

// JudgeAgentConfig configures a JudgeAgent.
type JudgeAgentConfig struct {
	AgentConfig

	// SystemPrompt, when non-nil, replaces the default judge prompt that is
	// otherwise built from Criteria and the scenario description.
	SystemPrompt *string
	// Criteria are the success/failure criteria the judge evaluates.
	Criteria []string
}

// JudgeAgent is an AgentAdapter that watches the conversation and uses an LLM
// (forced through continue_test/finish_test tool calls) to decide whether the
// scenario criteria have been met.
type JudgeAgent struct {
	cfg JudgeAgentConfig
}

// NewJudgeAgent constructs a JudgeAgent from the given configuration.
func NewJudgeAgent(cfg JudgeAgentConfig) *JudgeAgent {
	agent := JudgeAgent{cfg: cfg}
	return &agent
}

// Role reports that this adapter acts as the scenario judge.
func (a *JudgeAgent) Role() AgentRole {
	return AgentRoleJudge
}

// Call asks the judge model whether the scenario can already be decided.
// It sends the conversation so far (prefixed with the judge system prompt) to
// the configured model, forcing an answer through one of two tools:
// continue_test (let the scenario play another turn, returned as an empty
// messages result) or finish_test (returned as a ScenarioResult verdict).
// On the final allowed turn the model is instructed that it must decide.
func (a *JudgeAgent) Call(ctx context.Context, input AgentInput) (*AgentReturn, error) {
	// Use the caller-supplied system prompt when present; otherwise build the
	// default judge prompt from the criteria and scenario description.
	var systemPrompt string
	if a.cfg.SystemPrompt != nil {
		systemPrompt = *a.cfg.SystemPrompt
	} else {
		systemPrompt = buildJudgePrompt(a.cfg.Criteria, input.ScenarioConfig.Description)
	}

	lastMessage := input.ScenarioState.CurrentTurn() >= input.ScenarioConfig.MaxTurns
	enforceJudgement := input.JudgmentRequest
	hasCriteria := len(a.cfg.Criteria) > 0
	messages := append(
		[]openai.ChatCompletionMessageParamUnion{openai.SystemMessage(systemPrompt)},
		input.Messages...,
	)

	// Max turns reached: tell the model this is its last chance to judge.
	if lastMessage {
		messages = append(messages, openai.UserMessage(lastMessagePrompt))
	}

	// A verdict was explicitly requested but there is nothing to judge against;
	// fail the scenario immediately rather than asking the model.
	if enforceJudgement && !hasCriteria {
		return NewScenarioResultAgentReturn(ScenarioResult{
			Success:       false,
			Messages:      []openai.ChatCompletionMessageParamUnion{},
			Reasoning:     ptr.Ptr("TestingAgent was called as a judge, but it has no criteria to judge against"),
			MetCriteria:   []string{},
			UnmetCriteria: []string{},
		}), nil
	}

	params := openai.ChatCompletionNewParams{
		Messages: messages,
		Model:    a.cfg.Model,
		// Default to deterministic sampling for judging unless overridden.
		Temperature: openai.Opt(ptr.ValueOrDefault(a.cfg.Temperature, 0.0)),
		Tools:       createJudgeAgentTools(a.cfg.Criteria),
	}
	if a.cfg.MaxTokens != nil {
		params.MaxCompletionTokens = openai.Opt(*a.cfg.MaxTokens)
	}

	completion, err := a.cfg.OpenAIClient.Chat.Completions.New(ctx, params)
	if err != nil {
		return nil, fmt.Errorf("judge agent completion request: %w", err)
	}

	if len(completion.Choices) == 0 {
		return nil, errors.New("judge agent had no response choices")
	}

	completionChoice := completion.Choices[0]
	if len(completionChoice.Message.ToolCalls) == 0 {
		return nil, errors.New("judge agent response has no tool calls")
	}

	// Only the first tool call is honored; the judge is expected to pick
	// exactly one of the two tools.
	toolCall := completionChoice.Message.ToolCalls[0]
	if toolCall.Type != "function" {
		return nil, errors.New("judge agent response tool call is of an unknown type")
	}

	switch toolCall.Function.Name {
	case "continue_test":
		// Not enough information yet; let the scenario play another turn.
		return NewEmptyAgentReturn(), nil

	case "finish_test":
		toolArguments, err := internal.ParseJudgeAgentFinishTestToolArguments(toolCall.Function.Arguments)
		if err != nil {
			// Wrap the cause instead of the previous errors.New(""), which
			// returned an empty error message and hid the parse failure.
			return nil, fmt.Errorf("parsing judge agent finish_test tool arguments: %w", err)
		}

		passedCriteria := []string{}
		failedCriteria := []string{}

		// Each criterion comes back as true, false, or "inconclusive"; only
		// booleans are tallied, inconclusive entries are skipped entirely.
		for key, verdict := range toolArguments.Criteria {
			passed, ok := verdict.(bool)
			if !ok {
				continue
			}

			if passed {
				passedCriteria = append(passedCriteria, key)
			} else {
				failedCriteria = append(failedCriteria, key)
			}
		}

		return NewScenarioResultAgentReturn(ScenarioResult{
			// Success requires both an overall "success" verdict and no
			// explicitly failed criteria.
			Success:       toolArguments.Verdict == "success" && len(failedCriteria) == 0,
			Messages:      messages,
			Reasoning:     ptr.Ptr(toolArguments.Reasoning),
			MetCriteria:   passedCriteria,
			UnmetCriteria: failedCriteria,
		}), nil

	default:
		return nil, errors.New("judge agent response tool call is not of a known name")
	}
}

// createJudgeAgentTools builds the two tool definitions offered to the judge
// model: continue_test (no verdict yet, no parameters) and finish_test (final
// verdict). Each criterion becomes a required property of finish_test's
// "criteria" object, accepting true, false, or "inconclusive".
func createJudgeAgentTools(criteria []string) []openai.ChatCompletionToolParam {
	criteriaMap := make(map[string]any, len(criteria))
	criteriaNames := make([]string, 0, len(criteria))
	for _, criterion := range criteria {
		paramName := criterionNameToParamName(criterion)
		criteriaNames = append(criteriaNames, paramName)
		criteriaMap[paramName] = map[string]any{
			"enum":        []any{true, false, "inconclusive"},
			"description": criterion,
		}
	}

	// All nested schema objects must be map[string]any: these values were
	// previously map[any]any, which encoding/json cannot marshal (interface{}
	// map keys are unsupported), so serializing the request would fail.
	tools := []openai.ChatCompletionToolParam{{
		Type: "function",
		Function: shared.FunctionDefinitionParam{
			Name:        "continue_test",
			Description: openai.Opt("Continue the test with the next step"),
			Strict:      openai.Opt(true),
			Parameters: openai.FunctionParameters{
				"type":                 "object",
				"properties":           map[string]any{},
				"required":             []any{},
				"additionalProperties": false,
			},
		},
	}, {
		Type: "function",
		Function: shared.FunctionDefinitionParam{
			Name:        "finish_test",
			Description: openai.Opt("Complete the test with a final verdict"),
			Strict:      openai.Opt(true),
			Parameters: openai.FunctionParameters{
				"type": "object",
				"properties": map[string]any{
					"criteria": map[string]any{
						"type":                 "object",
						"properties":           criteriaMap,
						"required":             criteriaNames,
						"additionalProperties": false,
						"description":          "Strict verdict for each criterion",
					},
					"reasoning": map[string]any{
						"type":        "string",
						"description": "Explanation of what the final verdict should be",
					},
					"verdict": map[string]any{
						"type":        "string",
						"enum":        []any{"success", "failure", "inconclusive"},
						"description": "The final verdict of the test",
					},
				},
				"required":             []any{"criteria", "reasoning", "verdict"},
				"additionalProperties": false,
			},
		},
	}}

	return tools
}
Loading