Skip to content

Commit

Permalink
Add Prompts and Datasets pages (promptfoo#211)
Browse files Browse the repository at this point in the history
* Add Prompts and Datasets page

Also adds a `metrics` field on `Prompt`
  • Loading branch information
typpo authored Oct 9, 2023
1 parent 6f5167b commit 7ca9285
Show file tree
Hide file tree
Showing 21 changed files with 797 additions and 159 deletions.
15 changes: 15 additions & 0 deletions src/evaluator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -549,6 +549,21 @@ class Evaluator {
tokenUsage: row.response?.tokenUsage,
gradingResult: row.gradingResult,
};

table.head.prompts[colIndex].metrics = table.head.prompts[colIndex].metrics || {
score: 0,
testPassCount: 0,
testFailCount: 0,
assertPassCount: 0,
assertFailCount: 0,
};
const metrics = table.head.prompts[colIndex].metrics;
invariant(metrics, 'Expected prompt.metrics to be set');
metrics.score += row.score;
metrics.testPassCount += row.success ? 1 : 0;
metrics.testFailCount += row.success ? 0 : 1;
metrics.assertPassCount += row.gradingResult?.componentResults?.filter(r => r.pass).length || 0;
metrics.assertFailCount += row.gradingResult?.componentResults?.filter(r => !r.pass).length || 0;
},
);

Expand Down
4 changes: 3 additions & 1 deletion src/share.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ export async function createShareableUrl(
): Promise<string> {
const sharedResults: SharedResults = {
data: {
version: 1,
version: 2,
// TODO(ian): Take date from results, if applicable.
createdAt: new Date().toISOString(),
results,
config,
},
Expand Down
45 changes: 40 additions & 5 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,21 @@ export interface Prompt {
raw: string;
display: string;
function?: (context: { vars: Record<string, string | object> }) => Promise<string | object>;
metrics?: { score: number, testPassCount: number, testFailCount: number, assertPassCount: number, assertFailCount: number };
}

/**
 * One distinct prompt plus a summary of the evals it appeared in.
 * Produced when the prompts index is built from stored results files.
 */
export interface PromptWithMetadata {
  /** Identifier of the prompt group. */
  id: string;
  /** The prompt itself, as recorded in the results table header. */
  prompt: Prompt;
  /** Number of times this prompt was encountered while indexing. */
  count: number;
  /** Date of the most recent eval that used this prompt. */
  recentEvalDate: Date;
  /** Identifier of an eval that used this prompt (see `recentEvalDate`). */
  recentEvalId: string;
  /** One entry per eval occurrence of this prompt. */
  evals: Array<{
    /** Identifier of the eval. */
    id: string;
    /** Identifier of the dataset (test cases) the eval ran against. */
    datasetId: string;
    /** Metrics recorded for this prompt in that eval, if any. */
    metrics: Prompt['metrics'];
  }>;
}

export interface EvaluateResult {
Expand Down Expand Up @@ -237,6 +252,23 @@ export interface Assertion {
rubricPrompt?: GradingConfig['rubricPrompt'];
}

/**
 * A prompt that was evaluated against a given set of test cases.
 * Used when building the datasets (test cases) index from results files.
 */
export interface TestCasesWithMetadataPrompt {
  /** Identifier of the prompt. */
  id: string;
  /** Identifier of the eval in which the prompt was run. */
  evalId: string;
  /** The prompt itself. */
  prompt: Prompt;
}

/**
 * One distinct set of test cases (a "dataset") plus a summary of the evals
 * that were run against it.
 */
export interface TestCasesWithMetadata {
  /** Identifier of the dataset. */
  id: string;
  /** The test cases exactly as they appear in the eval config. */
  testCases: string | string[] | TestCase[];
  /** Number of evals that used this dataset. */
  count: number;
  /** Date of the most recent eval that used this dataset. */
  recentEvalDate: Date;
  /** Identifier of an eval that used this dataset (see `recentEvalDate`). */
  recentEvalId: string;
  /** Identifiers of every eval that used this dataset. */
  evalIds: string[];
  /** Distinct prompts that were run against this dataset. */
  prompts: TestCasesWithMetadataPrompt[];
}

// Each test case is graded pass/fail. A test case represents a unique input to the LLM after substituting `vars` in the prompt.
export interface TestCase<Vars = Record<string, string | string[] | object>> {
// Optional description of what you're testing
Expand Down Expand Up @@ -352,9 +384,12 @@ export interface EvaluateTestSuite extends TestSuiteConfig {
}

/**
 * Envelope used when sharing results: the payload is a complete results file
 * (version, creation timestamp, results and config).
 */
export interface SharedResults {
  data: ResultsFile;
}

/** Shape of a serialized eval results file. */
export interface ResultsFile {
  /** Format version of the file. */
  version: number;
  /** Creation timestamp as a string (written as an ISO-8601 string). */
  createdAt: string;
  /** Config the eval was run with; may be partial. */
  config: Partial<UnifiedConfig>;
  /** Summary of the eval's results. */
  results: EvaluateSummary;
}
195 changes: 171 additions & 24 deletions src/util.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import * as fs from 'fs';
import * as path from 'path';
import * as os from 'os';
import { createHash } from 'crypto';

import $RefParser from '@apidevtools/json-schema-ref-parser';
import yaml from 'js-yaml';
Expand All @@ -10,7 +11,7 @@ import { stringify } from 'csv-stringify/sync';
import logger from './logger';
import { getDirectory } from './esm';

import type { EvaluateSummary, EvaluateTableOutput, UnifiedConfig } from './types';
import type { EvaluateSummary, EvaluateTableOutput, UnifiedConfig, Prompt, PromptWithMetadata, TestCase, TestCasesWithMetadata, ResultsFile, TestCasesWithMetadataPrompt } from './types';

let globalConfigCache: any = null;

Expand Down Expand Up @@ -128,24 +129,19 @@ export function writeLatestResults(results: EvaluateSummary, config: Partial<Uni
const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');

// Replace colons with hyphens (Windows compatibility).
const timestamp = new Date().toISOString().replace(/:/g, '-');

const newResultsPath = path.join(resultsDirectory, `eval-${timestamp}.json`);
const filename = dateToFilename(new Date());
const newResultsPath = path.join(resultsDirectory, filename);
const latestResultsPath = getLatestResultsPath();
try {
fs.mkdirSync(resultsDirectory, { recursive: true });
fs.writeFileSync(
newResultsPath,
JSON.stringify(
{
version: 1,
config,
results,
},
null,
2,
),
);

const resultsFileData: ResultsFile = {
version: 2,
createdAt: new Date().toISOString(),
config,
results,
};
fs.writeFileSync(newResultsPath, JSON.stringify(resultsFileData, null, 2));

// Use copy instead of symlink to avoid issues with Windows permissions.
try {
Expand Down Expand Up @@ -179,25 +175,176 @@ export function cleanupOldResults(remaining = RESULT_HISTORY_LENGTH) {
}
}

export function readResult(
name: string,
): { results: EvaluateSummary; config: Partial<UnifiedConfig> } | undefined {
/**
 * Converts a results filename of the form `eval-<timestamp>.json` into a
 * human-readable, en-US formatted date/time string.
 *
 * The timestamp in the filename is an ISO timestamp whose time portion uses
 * hyphens instead of colons (see `dateToFilename`).
 *
 * @param filename Results filename, e.g. `eval-2023-10-09T10-30-00.000Z.json`.
 * @returns The formatted date, including time and a short time-zone name.
 * @throws TypeError if the filename contains no `T` separator.
 */
export function filenameToDate(filename: string) {
  const prefixLength = 'eval-'.length;
  const suffixLength = '.json'.length;
  const stamp = filename.slice(prefixLength, filename.length - suffixLength);

  // Only the time portion had its colons swapped for hyphens; the hyphens in
  // the date portion are genuine and must be left alone.
  const [datePart, hyphenatedTime] = stamp.split('T');
  const isoTimestamp = `${datePart}T${hyphenatedTime.replace(/-/g, ':')}`;

  const displayOptions: Intl.DateTimeFormatOptions = {
    year: 'numeric',
    month: 'long',
    day: 'numeric',
    hour: '2-digit',
    minute: '2-digit',
    second: '2-digit',
    timeZoneName: 'short',
  };
  return new Date(isoTimestamp).toLocaleDateString('en-US', displayOptions);
}

/**
 * Builds the results filename for an eval created at `date`.
 *
 * The ISO timestamp's colons are replaced with hyphens so the name is valid
 * on file systems that disallow colons.
 *
 * @param date Creation time of the eval.
 * @returns A name of the form `eval-<timestamp>.json`.
 */
export function dateToFilename(date: Date) {
  const fileSafeStamp = date.toISOString().split(':').join('-');
  return `eval-${fileSafeStamp}.json`;
}

/**
 * Reads one results file from the output directory.
 *
 * @param name File name inside the output directory, expected to look like
 *   `eval-<ISO timestamp with hyphens in the time part>.json`.
 * @returns The parsed file and the creation date encoded in its name, or
 *   `undefined` (after logging an error) if the file cannot be read or parsed,
 *   or if the name carries no timestamp.
 */
export function readResult(name: string): { result: ResultsFile; createdAt: Date } | undefined {
  const resultsDirectory = path.join(getConfigDirectoryPath(), 'output');
  const resultsPath = path.join(resultsDirectory, name);
  try {
    const result = JSON.parse(
      fs.readFileSync(fs.realpathSync(resultsPath), 'utf-8'),
    ) as ResultsFile;

    // Rebuild the ISO timestamp straight from the filename. Going through a
    // locale-formatted display string and re-parsing it with `new Date(...)`
    // is implementation-defined and can produce an Invalid Date.
    const stamp = name.slice('eval-'.length, name.length - '.json'.length);
    const [datePart, hyphenatedTime] = stamp.split('T');
    if (hyphenatedTime === undefined) {
      throw new Error(`Results filename does not contain a timestamp: ${name}`);
    }
    const createdAt = new Date(`${datePart}T${hyphenatedTime.replace(/-/g, ':')}`);

    return { result, createdAt };
  } catch (err) {
    logger.error(`Failed to read results from ${resultsPath}:\n${err}`);
  }
}

export function readLatestResults():
| { results: EvaluateSummary; config: Partial<UnifiedConfig> }
| undefined {
/**
 * Loads and parses the file at the latest-results path.
 *
 * Read and parse errors are not caught here; they propagate to the caller.
 */
export function readLatestResults(): ResultsFile | undefined {
  const latestPath = getLatestResultsPath();
  const serialized = fs.readFileSync(latestPath, 'utf-8');
  return JSON.parse(serialized);
}

/**
 * Finds the prompts that were evaluated against exactly these test cases.
 *
 * The test cases are identified by the SHA-256 hex digest of their JSON
 * serialization, which is then handed to `getPromptsForTestCasesHash`.
 *
 * @param testCases Test cases to look up.
 */
export function getPromptsForTestCases(testCases: TestCase[]) {
  const digest = createHash('sha256').update(JSON.stringify(testCases)).digest('hex');
  return getPromptsForTestCasesHash(digest);
}

/**
 * Finds the prompts from every stored eval whose test cases hash to the given
 * digest.
 *
 * @param testCasesSha256 SHA-256 hex digest of the JSON-serialized test cases.
 * @returns Prompts (with metadata) from the matching evals.
 */
export function getPromptsForTestCasesHash(testCasesSha256: string) {
  return getPromptsWithPredicate((result) => {
    const testsJson = JSON.stringify(result.config.tests);
    // `JSON.stringify` yields `undefined` when the config has no `tests`;
    // hashing that would throw, so such an eval simply cannot match.
    if (typeof testsJson !== 'string') {
      return false;
    }
    const hash = createHash('sha256').update(testsJson).digest('hex');
    return hash === testCasesSha256;
  });
}

/**
 * Computes the SHA-256 digest of a string.
 *
 * @param str Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
function sha256(str: string) {
  const hasher = createHash('sha256');
  hasher.update(str);
  return hasher.digest('hex');
}

/** Returns the prompts from every stored eval, with no filtering applied. */
export function getPrompts() {
  const acceptEveryResult = () => true;
  return getPromptsWithPredicate(acceptEveryResult);
}

/**
 * Builds an index of distinct prompts across the stored results files.
 *
 * Prompts are grouped by the SHA-256 of their raw text. Each group records how
 * often the prompt occurred, every eval it occurred in (with that eval's
 * dataset id and metrics), and the date and id of the most recent such eval.
 *
 * @param predicate Only results files for which this returns true are indexed.
 * @returns One entry per distinct prompt.
 */
export function getPromptsWithPredicate(
  predicate: (result: ResultsFile) => boolean,
): PromptWithMetadata[] {
  const resultsFiles = listPreviousResults();
  const groupedPrompts: { [hash: string]: PromptWithMetadata } = {};

  for (const filePath of resultsFiles) {
    const file = readResult(filePath);
    if (!file) {
      continue;
    }
    const { result, createdAt } = file;
    if (!result || !predicate(result)) {
      continue;
    }

    // The dataset is a property of the eval, not of each prompt: hash it once.
    const datasetId = result.config.tests
      ? sha256(JSON.stringify(result.config.tests))
      : 'No dataset';

    for (const prompt of result.results.table.head.prompts) {
      const promptId = sha256(prompt.raw);
      const evalEntry = { id: filePath, datasetId, metrics: prompt.metrics };
      const existing = groupedPrompts[promptId];

      if (!existing) {
        groupedPrompts[promptId] = {
          id: promptId,
          prompt,
          recentEvalDate: new Date(createdAt),
          count: 1,
          recentEvalId: filePath,
          evals: [evalEntry],
        };
        continue;
      }

      // Keep the "most recent" date and id in step with each other, whatever
      // order the results files are listed in.
      if (createdAt.getTime() > existing.recentEvalDate.getTime()) {
        existing.recentEvalDate = new Date(createdAt);
        existing.recentEvalId = filePath;
      }
      existing.count += 1;
      existing.evals.push(evalEntry);
    }
  }

  return Object.values(groupedPrompts);
}

/** Returns the test-case sets from every stored eval, with no filtering applied. */
export function getTestCases() {
  const acceptEveryResult = () => true;
  return getTestCasesWithPredicate(acceptEveryResult);
}

/**
 * Builds an index of distinct test-case sets ("datasets") across the stored
 * results files.
 *
 * Datasets are grouped by the SHA-256 of their JSON-serialized test cases.
 * Each group records how many evals used it, their ids, the distinct prompts
 * run against it, and the date and id of the most recent such eval. Results
 * files whose config has no test cases are ignored.
 *
 * @param predicate Only results files for which this returns true are indexed.
 * @returns One entry per distinct dataset.
 */
export function getTestCasesWithPredicate(
  predicate: (result: ResultsFile) => boolean,
): TestCasesWithMetadata[] {
  const resultsFiles = listPreviousResults();
  const groupedTestCases: { [hash: string]: TestCasesWithMetadata } = {};

  for (const filePath of resultsFiles) {
    const file = readResult(filePath);
    if (!file) {
      continue;
    }
    const { result, createdAt } = file;
    const testCases = result?.config?.tests;
    if (!testCases || !predicate(result)) {
      continue;
    }

    const datasetId = sha256(JSON.stringify(testCases));
    const evalPrompts: TestCasesWithMetadataPrompt[] = result.results.table.head.prompts.map(
      (prompt) => ({ id: sha256(prompt.raw), prompt, evalId: filePath }),
    );

    const existing = groupedTestCases[datasetId];
    if (!existing) {
      groupedTestCases[datasetId] = {
        id: datasetId,
        testCases,
        recentEvalDate: new Date(createdAt),
        count: 1,
        recentEvalId: filePath,
        evalIds: [filePath],
        prompts: dedupePromptsById(evalPrompts),
      };
      continue;
    }

    // Keep the "most recent" date and id in step with each other, whatever
    // order the results files are listed in.
    if (createdAt.getTime() > existing.recentEvalDate.getTime()) {
      existing.recentEvalDate = new Date(createdAt);
      existing.recentEvalId = filePath;
    }
    existing.count += 1;
    // Previously recorded prompts come first so they win over this eval's.
    existing.prompts = dedupePromptsById(existing.prompts.concat(evalPrompts));
    existing.evalIds.push(filePath);
  }

  return Object.values(groupedTestCases);
}

/** Drops prompts whose id was already seen, keeping the first occurrence of each. */
function dedupePromptsById(prompts: TestCasesWithMetadataPrompt[]): TestCasesWithMetadataPrompt[] {
  const promptsById: Record<string, TestCasesWithMetadataPrompt> = {};
  for (const prompt of prompts) {
    if (!(prompt.id in promptsById)) {
      promptsById[prompt.id] = prompt;
    }
  }
  return Object.values(promptsById);
}

export function getNunjucksEngine() {
nunjucks.configure({
autoescape: false,
Expand Down
40 changes: 40 additions & 0 deletions src/web/nextui/src/app/components/Navigation.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/* Navigation bar container: light grey strip with space below it. */
.nav {
padding: 0.25rem 0 0.25rem 1rem;
/* NOTE(review): `gap` only applies in a flex/grid container and no `display`
   is set in this rule — presumably it is set elsewhere; confirm. */
gap: 1rem;
background-color: #eee;
margin-bottom: 1rem;
}

/* Dark theme: darker bar background. */
[data-theme='dark'] .nav {
background-color: #333;
}

/* Links inside the bar: plain black text, no underline until hovered. */
.nav a {
text-decoration: none;
color: #000;
align-self: center;
}

/* Elements marked `.active` inside the bar are shown in bold. */
.nav .active {
font-weight: bold;
}

/* Dark theme: light link text. */
[data-theme='dark'] .nav a {
color: #f0f0f0;
}

/* Group pushed to the right edge of the bar via the automatic left margin. */
.nav .right-aligned {
display: flex;
align-items: center;
gap: 1rem;
margin-left: auto;
margin-right: 0.5rem;
}

/* Underline links on hover. */
.nav a:hover {
text-decoration: underline;
}

/* Dark theme: slightly dimmer link text on hover. */
[data-theme='dark'] .nav a:hover {
color: #ddd;
}
Loading

0 comments on commit 7ca9285

Please sign in to comment.