Skip to content

Commit

Permalink
Add CLI commands for listing and showing evals, prompts, and datasets (
Browse files Browse the repository at this point in the history
…promptfoo#218)

Adds the following:

```
promptfoo list evals
promptfoo list prompts
promptfoo list datasets

promptfoo show <id>
promptfoo show eval <id>
promptfoo show prompt <id>
promptfoo show dataset <id>
```
  • Loading branch information
typpo authored Oct 10, 2023
1 parent 7ca9285 commit 211c869
Show file tree
Hide file tree
Showing 10 changed files with 386 additions and 62 deletions.
83 changes: 83 additions & 0 deletions src/commands/list.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import chalk from 'chalk';
import { Command } from 'commander';

import { getEvals, getPrompts, getTestCases, printBorder, sha256 } from '../util';
import { wrapTable } from '../table';
import logger from '../logger';
import telemetry from '../telemetry';

/**
 * Registers the `list` command, with `evals`, `prompts`, and `datasets`
 * subcommands, on the given commander program.
 */
export function listCommand(program: Command) {
  const listCommand = program.command('list').description('List various resources');

  listCommand
    .command('evals')
    .description('List evaluations.')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list evals',
      });
      await telemetry.send();

      const evals = getEvals();
      // Guard: wrapTable reads the first row's keys and throws on empty input.
      if (evals.length === 0) {
        logger.info('No evals found.');
        return;
      }

      const tableData = evals.map((evl) => ({
        'Eval ID': evl.id.slice(0, 6),
        Filename: evl.filePath,
        // Prompts are identified by a short hash of their raw text.
        Prompts: evl.results.table.head.prompts.map((p) => sha256(p.raw).slice(0, 6)).join(', '),
        Vars: evl.results.table.head.vars.map((v) => v).join(', '),
      }));

      logger.info(wrapTable(tableData));
      printBorder();

      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
    });

  listCommand
    .command('prompts')
    .description('List prompts used')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list prompts',
      });
      await telemetry.send();

      // Sort a copy (most recent eval first) so the array returned by
      // getPrompts() is not reordered in place.
      const prompts = [...getPrompts()].sort((a, b) => b.recentEvalId.localeCompare(a.recentEvalId));
      if (prompts.length === 0) {
        logger.info('No prompts found.');
        return;
      }

      const tableData = prompts.map((prompt) => ({
        'Prompt ID': prompt.id.slice(0, 6),
        // Truncate long prompts so the table stays readable.
        Raw: prompt.prompt.raw.slice(0, 100) + (prompt.prompt.raw.length > 100 ? '...' : ''),
        '# evals': prompt.count,
        'Most recent eval': prompt.recentEvalId.slice(0, 6),
      }));

      logger.info(wrapTable(tableData));
      printBorder();
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
    });

  listCommand
    .command('datasets')
    .description('List datasets used')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list datasets',
      });
      await telemetry.send();

      // Sort a copy (most recent eval first) so the array returned by
      // getTestCases() is not reordered in place.
      const datasets = [...getTestCases()].sort((a, b) =>
        b.recentEvalId.localeCompare(a.recentEvalId),
      );
      if (datasets.length === 0) {
        logger.info('No datasets found.');
        return;
      }

      const tableData = datasets.map((dataset) => ({
        'Dataset ID': dataset.id.slice(0, 6),
        // Sort a copy by descending score so dataset.prompts is not mutated;
        // fall back to '-' for a dataset with no prompts.
        'Highest scoring prompt':
          [...dataset.prompts]
            .sort((a, b) => (b.prompt.metrics?.score || 0) - (a.prompt.metrics?.score || 0))[0]
            ?.id.slice(0, 6) ?? '-',
        '# evals': dataset.count,
        '# prompts': dataset.prompts.length,
        'Most recent eval': dataset.recentEvalId.slice(0, 6),
      }));

      logger.info(wrapTable(tableData));
      printBorder();
      logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
    });
}
137 changes: 137 additions & 0 deletions src/commands/show.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import chalk from 'chalk';
import { Command } from 'commander';

import { getEvalFromHash, getPromptFromHash, getDatasetFromHash, printBorder } from '../util';
import { generateTable, wrapTable } from '../table';
import logger from '../logger';
import telemetry from '../telemetry';

/**
 * Registers the `show` command and its `eval`/`prompt`/`dataset` subcommands
 * on the given commander program. The bare form (`show <id>`) probes each
 * resource store in turn to figure out what kind of thing the id refers to.
 */
export function showCommand(program: Command) {
  const show = program
    .command('show <id>')
    .description('Show details of a specific resource')
    .action(async (id: string) => {
      // No explicit resource type given — try eval, then prompt, then dataset.
      if (getEvalFromHash(id)) {
        return handleEval(id);
      }
      if (getPromptFromHash(id)) {
        return handlePrompt(id);
      }
      if (getDatasetFromHash(id)) {
        return handleDataset(id);
      }
      logger.error(`No resource found with ID ${id}`);
    });

  show.command('eval <id>').description('Show details of a specific evaluation').action(handleEval);

  show.command('prompt <id>').description('Show details of a specific prompt').action(handlePrompt);

  show.command('dataset <id>').description('Show details of a specific dataset').action(handleDataset);
}

/**
 * Prints the results table and summary stats for a single evaluation.
 * Logs an error and returns if no eval matches the given id (prefix).
 */
async function handleEval(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show eval',
  });
  await telemetry.send();

  const evl = getEvalFromHash(id);
  if (!evl) {
    logger.error(`No evaluation found with ID ${id}`);
    return;
  }

  // Named constants keep the generateTable call and the "rows not shown"
  // arithmetic below in sync (they were previously duplicated literals).
  const MAX_CELL_LENGTH = 100;
  const MAX_ROWS = 25;
  const MAX_VARS_SHOWN = 5;

  const { prompts, vars } = evl.results.table.head;
  logger.info(generateTable(evl.results, MAX_CELL_LENGTH, MAX_ROWS));
  if (evl.results.table.body.length > MAX_ROWS) {
    const rowsLeft = evl.results.table.body.length - MAX_ROWS;
    logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? '' : 's'} not shown ...\n`);
  }

  printBorder();
  logger.info(chalk.cyan(`Eval ${id}`));
  printBorder();
  // TODO(ian): List prompt ids
  logger.info(`${prompts.length} prompts`);
  logger.info(
    `${vars.length} variables: ${vars.slice(0, MAX_VARS_SHOWN).join(', ')}${
      vars.length > MAX_VARS_SHOWN ? ` (and ${vars.length - MAX_VARS_SHOWN} more...)` : ''
    }`,
  );
}

/**
 * Prints a prompt's raw text plus a table of the (up to 10 most recent)
 * evals it appeared in. Logs an error and returns if the id is unknown.
 */
async function handlePrompt(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show prompt',
  });
  await telemetry.send();

  const prompt = getPromptFromHash(id);
  if (!prompt) {
    logger.error(`Prompt with ID ${id} not found.`);
    return;
  }

  printBorder();
  logger.info(chalk.cyan(prompt.prompt.raw));
  printBorder();
  logger.info(chalk.bold(`Prompt ${id}`));
  printBorder();

  logger.info(`This prompt is used in the following evals:`);
  const table = [];
  // Sort a copy (descending id, i.e. most recent first) so prompt.evals is
  // not reordered in place.
  const recentEvals = [...prompt.evals].sort((a, b) => b.id.localeCompare(a.id)).slice(0, 10);
  for (const evl of recentEvals) {
    const { metrics } = evl;
    const testCount = metrics ? metrics.testPassCount + metrics.testFailCount : 0;
    table.push({
      'Eval ID': evl.id.slice(0, 6),
      'Dataset ID': evl.datasetId.slice(0, 6),
      // Use ?? (not ||) so a legitimate value of 0 is shown instead of '-'.
      'Raw score': metrics?.score.toFixed(2) ?? '-',
      'Pass rate':
        metrics && testCount > 0
          ? `${((metrics.testPassCount / testCount) * 100).toFixed(2)}%`
          : '-',
      'Pass count': metrics?.testPassCount ?? '-',
      'Fail count': metrics?.testFailCount ?? '-',
    });
  }
  logger.info(wrapTable(table));
  printBorder();
  logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
  logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
}

/**
 * Prints a dataset's summary plus a table of the (up to 10 most recent)
 * eval/prompt pairs that used it. Logs an error and returns if the id is
 * unknown.
 */
async function handleDataset(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show dataset',
  });
  await telemetry.send();

  const dataset = getDatasetFromHash(id);
  if (!dataset) {
    logger.error(`Dataset with ID ${id} not found.`);
    return;
  }

  printBorder();
  logger.info(chalk.bold(`Dataset ${id}`));
  printBorder();

  logger.info(`This dataset is used in the following evals:`);
  const table = [];
  // Sort a copy (descending eval id, i.e. most recent first) so
  // dataset.prompts is not reordered in place.
  const recentPrompts = [...dataset.prompts]
    .sort((a, b) => b.evalId.localeCompare(a.evalId))
    .slice(0, 10);
  for (const prompt of recentPrompts) {
    const metrics = prompt.prompt.metrics;
    const testCount = metrics ? metrics.testPassCount + metrics.testFailCount : 0;
    table.push({
      'Eval ID': prompt.evalId.slice(0, 6),
      'Prompt ID': prompt.id.slice(0, 6),
      // Use ?? (not ||) so a legitimate value of 0 is shown instead of '-'.
      'Raw score': metrics?.score.toFixed(2) ?? '-',
      'Pass rate':
        metrics && testCount > 0
          ? `${((metrics.testPassCount / testCount) * 100).toFixed(2)}%`
          : '-',
      'Pass count': metrics?.testPassCount ?? '-',
      'Fail count': metrics?.testFailCount ?? '-',
    });
  }
  logger.info(wrapTable(table));
  printBorder();
  logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
  logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
}
13 changes: 9 additions & 4 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { readTest, readTests } from './testCases';
import {
cleanupOldResults,
maybeReadConfig,
printBorder,
readConfig,
readLatestResults,
writeLatestResults,
Expand All @@ -26,6 +27,8 @@ import { getDirectory } from './esm';
import { startServer } from './web/server';
import { checkForUpdates } from './updates';
import { gatherFeedback } from './feedback';
import { listCommand } from './commands/list';
import { showCommand } from './commands/show';

import type {
CommandLineOptions,
Expand All @@ -34,7 +37,7 @@ import type {
TestSuite,
UnifiedConfig,
} from './types';
import { generateTable } from './table';
import { generateTable, wrapTable } from './table';
import { createShareableUrl } from './share';

function createDummyFiles(directory: string | null) {
Expand Down Expand Up @@ -421,8 +424,7 @@ async function main() {

telemetry.maybeShowNotice();

const border = '='.repeat((process.stdout.columns || 80) - 10);
logger.info(border);
printBorder();
if (!cmdObj.write) {
logger.info(`${chalk.green('✔')} Evaluation complete`);
} else {
Expand All @@ -443,7 +445,7 @@ async function main() {
);
}
}
logger.info(border);
printBorder();
logger.info(chalk.green.bold(`Successes: ${summary.stats.successes}`));
logger.info(chalk.red.bold(`Failures: ${summary.stats.failures}`));
logger.info(
Expand All @@ -462,6 +464,9 @@ async function main() {
}
});

listCommand(program);
showCommand(program);

program.parse(process.argv);

if (!process.argv.slice(2).length) {
Expand Down
15 changes: 15 additions & 0 deletions src/table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,18 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
}
return table;
}

/**
 * Renders an array of uniform row objects as a word-wrapped table sized to
 * the terminal width. Column headers come from the keys of the first row.
 *
 * @param rows - Row objects; all rows are expected to share the same keys.
 * @returns A table object whose toString() produces the rendered table.
 */
export function wrapTable(rows: Record<string, string | number>[]) {
  // Leave a 10-column margin; fall back to 120 when not attached to a TTY.
  const maxWidth = process.stdout.columns ? process.stdout.columns - 10 : 120;
  // Guard: Object.keys(rows[0]) would throw a TypeError on an empty array.
  const head = rows.length > 0 ? Object.keys(rows[0]) : [];
  const table = new Table({
    head,
    // Divide the available width evenly across columns (empty when no rows).
    colWidths:
      head.length > 0 ? Array(head.length).fill(Math.floor(maxWidth / head.length)) : [],
    wordWrap: true,
    wrapOnWordBoundary: true,
  });
  for (const row of rows) {
    table.push(Object.values(row));
  }
  return table;
}
13 changes: 12 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,10 @@ export interface PromptWithMetadata {
prompt: Prompt;
recentEvalDate: Date;
recentEvalId: string;
recentEvalFilepath: string;
evals: {
id: string;
filePath: string;
datasetId: string;
metrics: Prompt['metrics'];
}[];
Expand Down Expand Up @@ -257,14 +259,15 @@ export interface TestCasesWithMetadataPrompt {
prompt: Prompt;
id: string;
evalId: string;
evalFilepath: string;
}

/**
 * A group of test cases (a "dataset") aggregated across the evals that
 * used it, keyed by a content-hash id.
 */
export interface TestCasesWithMetadata {
  /** Content-hash identifier for this dataset. */
  id: string;
  /** The test cases themselves: a file path, list of paths, or inline cases. */
  testCases: string | string[] | TestCase[];
  /** Date of the most recent eval that used this dataset. */
  recentEvalDate: Date;
  /** Id of the most recent eval that used this dataset. */
  recentEvalId: string;
  // NOTE(review): this view is a diff; evalIds may have been removed in the
  // same change that added recentEvalFilepath — verify against the full file.
  evalIds: string[];
  /** Results file path of the most recent eval that used this dataset. */
  recentEvalFilepath: string;
  /** Number of evals that used this dataset. */
  count: number;
  /** Prompts that were evaluated against this dataset. */
  prompts: TestCasesWithMetadataPrompt[];
}
Expand Down Expand Up @@ -377,6 +380,14 @@ export type UnifiedConfig = TestSuiteConfig & {
commandLineOptions: Partial<CommandLineOptions>;
};

/**
 * A stored evaluation loaded from disk, with enough metadata to list and
 * display it from the CLI.
 */
export interface EvalWithMetadata {
  /** Content-hash identifier for this eval. */
  id: string;
  /** Path to the results file this eval was loaded from. */
  filePath: string;
  /** When the eval was run. */
  date: Date;
  /** The (partial) config used for the run. */
  config: Partial<UnifiedConfig>;
  /** Full results summary, including the rendered table data. */
  results: EvaluateSummary;
}

// node.js package interface
export interface EvaluateTestSuite extends TestSuiteConfig {
prompts: string[];
Expand Down
Loading

0 comments on commit 211c869

Please sign in to comment.