Skip to content

Commit

Permalink
Add support for test case threshold value (promptfoo#125)
Browse files Browse the repository at this point in the history
If present, the pass/fail status of a test case is determined by whether the
combined weighted score of all assertions meets or exceeds the `threshold` value.

If not present, the default behavior is that pass/fail status of a test
case is determined by whether all its assertions pass.

related to promptfoo#124
  • Loading branch information
typpo authored Sep 1, 2023
1 parent d7b86c6 commit 544d836
Show file tree
Hide file tree
Showing 3 changed files with 67 additions and 2 deletions.
16 changes: 14 additions & 2 deletions src/assertions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,10 +80,22 @@ export async function runAssertions(test: AtomicTestCase, output: string): Promi
}
}

const finalScore = totalScore / totalWeight;
let finalReason = allPass ? 'All assertions passed' : failedReason;
if (test.threshold) {
// Existence of a test threshold overrides the pass/fail status of individual assertions
allPass = finalScore >= test.threshold;
if (allPass) {
finalReason = `Aggregate score ${finalScore.toFixed(2)} ≥ ${test.threshold} threshold`;
} else {
finalReason = `Aggregate score ${finalScore.toFixed(2)} < ${test.threshold} threshold`;
}
}

return {
pass: allPass,
score: totalScore / totalWeight,
reason: allPass ? 'All assertions passed' : failedReason,
score: finalScore,
reason: finalReason,
tokensUsed,
componentResults,
assertion: null,
Expand Down
3 changes: 3 additions & 0 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,9 @@ export interface TestCase {

// Additional configuration settings for the prompt
options?: PromptConfig & OutputConfig & GradingConfig;

// The required score for this test case. If not provided, the test case is graded pass/fail.
threshold?: number;
}

export interface Scenario {
Expand Down
50 changes: 50 additions & 0 deletions test/assertions.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,56 @@ describe('runAssertions', () => {
expect(result.pass).toBeFalsy();
expect(result.reason).toBe('Expected output "Expected output"');
});

// Threshold grading, failing case. The graded output 'Hi there world' does not
// equal 'Hello world' (weight 2) but does contain 'world' (weight 1), so the
// weighted aggregate is presumably 1/3 ≈ 0.33, which is below the 0.5 threshold.
it('should fail when combined score is less than threshold', async () => {
  const result: GradingResult = await runAssertions(
    {
      threshold: 0.5,
      assert: [
        {
          type: 'equals',
          value: 'Hello world',
          weight: 2,
        },
        {
          type: 'contains',
          value: 'world',
          weight: 1,
        },
      ],
    },
    'Hi there world',
  );

  // The threshold comparison, not the individual assertion failure, supplies the reason.
  expect(result.pass).toBeFalsy();
  expect(result.reason).toBe('Aggregate score 0.33 < 0.5 threshold');
});

// Threshold grading, passing case. The 'equals' assertion (weight 2) does not
// match the graded output 'Hi there world', yet the test case still passes
// because the aggregate score (presumably 1/3 ≈ 0.33) is at least the 0.25
// threshold: a threshold overrides the all-assertions-must-pass default.
it('should pass when combined score is greater than threshold', async () => {
  const result: GradingResult = await runAssertions(
    {
      threshold: 0.25,
      assert: [
        {
          type: 'equals',
          value: 'Hello world',
          weight: 2,
        },
        {
          type: 'contains',
          value: 'world',
          weight: 1,
        },
      ],
    },
    'Hi there world',
  );

  expect(result.pass).toBeTruthy();
  expect(result.reason).toBe('Aggregate score 0.33 ≥ 0.25 threshold');
});
});

describe('runAssertion', () => {
Expand Down

0 comments on commit 544d836

Please sign in to comment.