Skip to content

Commit

Permalink
Add CLI commands for listing and showing evals, prompts, and datasets (
Browse files Browse the repository at this point in the history
…promptfoo#218)

Adds the following:

```
promptfoo list evals
promptfoo list prompts
promptfoo list datasets

promptfoo show <id>
promptfoo show eval <id>
promptfoo show prompt <id>
promptfoo show dataset <id>
```
  • Loading branch information
typpo authored Oct 10, 2023
1 parent 7ca9285 commit 211c869
Show file tree
Hide file tree
Showing 10 changed files with 386 additions and 62 deletions.
83 changes: 83 additions & 0 deletions src/commands/list.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
import chalk from 'chalk';
import { Command } from 'commander';

import { getEvals, getPrompts, getTestCases, printBorder, sha256 } from '../util';
import { wrapTable } from '../table';
import logger from '../logger';
import telemetry from '../telemetry';

/**
 * Registers the `list` command, with `evals`, `prompts`, and `datasets`
 * subcommands, on the given commander program.
 */
export function listCommand(program: Command) {
  const listCommand = program.command('list').description('List various resources');

  listCommand
    .command('evals')
    .description('List evaluations.')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list evals',
      });
      await telemetry.send();

      const evals = getEvals();
      // Guard: wrapTable reads the first row's keys and throws on empty input.
      if (evals.length === 0) {
        logger.info('No evals found.');
        return;
      }

      const tableData = evals.map((evl) => ({
        'Eval ID': evl.id.slice(0, 6),
        Filename: evl.filePath,
        // Prompts are identified by a short hash of their raw text.
        Prompts: evl.results.table.head.prompts.map((p) => sha256(p.raw).slice(0, 6)).join(', '),
        Vars: evl.results.table.head.vars.map((v) => v).join(', '),
      }));

      logger.info(wrapTable(tableData));
      printBorder();

      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
    });

  listCommand
    .command('prompts')
    .description('List prompts used')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list prompts',
      });
      await telemetry.send();

      // Sort a copy (most recent eval first) so the array returned by
      // getPrompts() is not reordered in place.
      const prompts = [...getPrompts()].sort((a, b) => b.recentEvalId.localeCompare(a.recentEvalId));
      if (prompts.length === 0) {
        logger.info('No prompts found.');
        return;
      }

      const tableData = prompts.map((prompt) => ({
        'Prompt ID': prompt.id.slice(0, 6),
        // Truncate long prompts so the table stays readable.
        Raw: prompt.prompt.raw.slice(0, 100) + (prompt.prompt.raw.length > 100 ? '...' : ''),
        '# evals': prompt.count,
        'Most recent eval': prompt.recentEvalId.slice(0, 6),
      }));

      logger.info(wrapTable(tableData));
      printBorder();
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
    });

  listCommand
    .command('datasets')
    .description('List datasets used')
    .action(async () => {
      telemetry.maybeShowNotice();
      telemetry.record('command_used', {
        name: 'list datasets',
      });
      await telemetry.send();

      // Sort a copy (most recent eval first) so the array returned by
      // getTestCases() is not reordered in place.
      const datasets = [...getTestCases()].sort((a, b) =>
        b.recentEvalId.localeCompare(a.recentEvalId),
      );
      if (datasets.length === 0) {
        logger.info('No datasets found.');
        return;
      }

      const tableData = datasets.map((dataset) => ({
        'Dataset ID': dataset.id.slice(0, 6),
        // Sort a copy by descending score so dataset.prompts is not mutated;
        // fall back to '-' for a dataset with no prompts.
        'Highest scoring prompt':
          [...dataset.prompts]
            .sort((a, b) => (b.prompt.metrics?.score || 0) - (a.prompt.metrics?.score || 0))[0]
            ?.id.slice(0, 6) ?? '-',
        '# evals': dataset.count,
        '# prompts': dataset.prompts.length,
        'Most recent eval': dataset.recentEvalId.slice(0, 6),
      }));

      logger.info(wrapTable(tableData));
      printBorder();
      logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
      logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
      logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
    });
}
137 changes: 137 additions & 0 deletions src/commands/show.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
import chalk from 'chalk';
import { Command } from 'commander';

import { getEvalFromHash, getPromptFromHash, getDatasetFromHash, printBorder } from '../util';
import { generateTable, wrapTable } from '../table';
import logger from '../logger';
import telemetry from '../telemetry';

/**
 * Registers the `show` command and its `eval`/`prompt`/`dataset` subcommands
 * on the given commander program. The bare form (`show <id>`) probes each
 * resource store in turn to figure out what kind of thing the id refers to.
 */
export function showCommand(program: Command) {
  const show = program
    .command('show <id>')
    .description('Show details of a specific resource')
    .action(async (id: string) => {
      // No explicit resource type given — try eval, then prompt, then dataset.
      if (getEvalFromHash(id)) {
        return handleEval(id);
      }
      if (getPromptFromHash(id)) {
        return handlePrompt(id);
      }
      if (getDatasetFromHash(id)) {
        return handleDataset(id);
      }
      logger.error(`No resource found with ID ${id}`);
    });

  show.command('eval <id>').description('Show details of a specific evaluation').action(handleEval);

  show.command('prompt <id>').description('Show details of a specific prompt').action(handlePrompt);

  show.command('dataset <id>').description('Show details of a specific dataset').action(handleDataset);
}

/**
 * Prints the results table and summary stats for a single evaluation.
 * Logs an error and returns if no eval matches the given id (prefix).
 */
async function handleEval(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show eval',
  });
  await telemetry.send();

  const evl = getEvalFromHash(id);
  if (!evl) {
    logger.error(`No evaluation found with ID ${id}`);
    return;
  }

  // Named constants keep the generateTable call and the "rows not shown"
  // arithmetic below in sync (they were previously duplicated literals).
  const MAX_CELL_LENGTH = 100;
  const MAX_ROWS = 25;
  const MAX_VARS_SHOWN = 5;

  const { prompts, vars } = evl.results.table.head;
  logger.info(generateTable(evl.results, MAX_CELL_LENGTH, MAX_ROWS));
  if (evl.results.table.body.length > MAX_ROWS) {
    const rowsLeft = evl.results.table.body.length - MAX_ROWS;
    logger.info(`... ${rowsLeft} more row${rowsLeft === 1 ? '' : 's'} not shown ...\n`);
  }

  printBorder();
  logger.info(chalk.cyan(`Eval ${id}`));
  printBorder();
  // TODO(ian): List prompt ids
  logger.info(`${prompts.length} prompts`);
  logger.info(
    `${vars.length} variables: ${vars.slice(0, MAX_VARS_SHOWN).join(', ')}${
      vars.length > MAX_VARS_SHOWN ? ` (and ${vars.length - MAX_VARS_SHOWN} more...)` : ''
    }`,
  );
}

/**
 * Prints a prompt's raw text plus a table of the (up to 10 most recent)
 * evals it appeared in. Logs an error and returns if the id is unknown.
 */
async function handlePrompt(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show prompt',
  });
  await telemetry.send();

  const prompt = getPromptFromHash(id);
  if (!prompt) {
    logger.error(`Prompt with ID ${id} not found.`);
    return;
  }

  printBorder();
  logger.info(chalk.cyan(prompt.prompt.raw));
  printBorder();
  logger.info(chalk.bold(`Prompt ${id}`));
  printBorder();

  logger.info(`This prompt is used in the following evals:`);
  const table = [];
  // Sort a copy (descending id, i.e. most recent first) so prompt.evals is
  // not reordered in place.
  const recentEvals = [...prompt.evals].sort((a, b) => b.id.localeCompare(a.id)).slice(0, 10);
  for (const evl of recentEvals) {
    const { metrics } = evl;
    const testCount = metrics ? metrics.testPassCount + metrics.testFailCount : 0;
    table.push({
      'Eval ID': evl.id.slice(0, 6),
      'Dataset ID': evl.datasetId.slice(0, 6),
      // Use ?? (not ||) so a legitimate value of 0 is shown instead of '-'.
      'Raw score': metrics?.score.toFixed(2) ?? '-',
      'Pass rate':
        metrics && testCount > 0
          ? `${((metrics.testPassCount / testCount) * 100).toFixed(2)}%`
          : '-',
      'Pass count': metrics?.testPassCount ?? '-',
      'Fail count': metrics?.testFailCount ?? '-',
    });
  }
  logger.info(wrapTable(table));
  printBorder();
  logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
  logger.info(`Run ${chalk.green('promptfoo show dataset <id>')} to see details of a specific dataset.`);
}

/**
 * Prints a dataset's summary plus a table of the (up to 10 most recent)
 * eval/prompt pairs that used it. Logs an error and returns if the id is
 * unknown.
 */
async function handleDataset(id: string) {
  telemetry.maybeShowNotice();
  telemetry.record('command_used', {
    name: 'show dataset',
  });
  await telemetry.send();

  const dataset = getDatasetFromHash(id);
  if (!dataset) {
    logger.error(`Dataset with ID ${id} not found.`);
    return;
  }

  printBorder();
  logger.info(chalk.bold(`Dataset ${id}`));
  printBorder();

  logger.info(`This dataset is used in the following evals:`);
  const table = [];
  // Sort a copy (descending eval id, i.e. most recent first) so
  // dataset.prompts is not reordered in place.
  const recentPrompts = [...dataset.prompts]
    .sort((a, b) => b.evalId.localeCompare(a.evalId))
    .slice(0, 10);
  for (const prompt of recentPrompts) {
    const metrics = prompt.prompt.metrics;
    const testCount = metrics ? metrics.testPassCount + metrics.testFailCount : 0;
    table.push({
      'Eval ID': prompt.evalId.slice(0, 6),
      'Prompt ID': prompt.id.slice(0, 6),
      // Use ?? (not ||) so a legitimate value of 0 is shown instead of '-'.
      'Raw score': metrics?.score.toFixed(2) ?? '-',
      'Pass rate':
        metrics && testCount > 0
          ? `${((metrics.testPassCount / testCount) * 100).toFixed(2)}%`
          : '-',
      'Pass count': metrics?.testPassCount ?? '-',
      'Fail count': metrics?.testFailCount ?? '-',
    });
  }
  logger.info(wrapTable(table));
  printBorder();
  logger.info(`Run ${chalk.green('promptfoo show prompt <id>')} to see details of a specific prompt.`);
  logger.info(`Run ${chalk.green('promptfoo show eval <id>')} to see details of a specific evaluation.`);
}
13 changes: 9 additions & 4 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ import { readTest, readTests } from './testCases';
import {
cleanupOldResults,
maybeReadConfig,
printBorder,
readConfig,
readLatestResults,
writeLatestResults,
Expand All @@ -26,6 +27,8 @@ import { getDirectory } from './esm';
import { startServer } from './web/server';
import { checkForUpdates } from './updates';
import { gatherFeedback } from './feedback';
import { listCommand } from './commands/list';
import { showCommand } from './commands/show';

import type {
CommandLineOptions,
Expand All @@ -34,7 +37,7 @@ import type {
TestSuite,
UnifiedConfig,
} from './types';
import { generateTable } from './table';
import { generateTable, wrapTable } from './table';
import { createShareableUrl } from './share';

function createDummyFiles(directory: string | null) {
Expand Down Expand Up @@ -421,8 +424,7 @@ async function main() {

telemetry.maybeShowNotice();

const border = '='.repeat((process.stdout.columns || 80) - 10);
logger.info(border);
printBorder();
if (!cmdObj.write) {
logger.info(`${chalk.green('✔')} Evaluation complete`);
} else {
Expand All @@ -443,7 +445,7 @@ async function main() {
);
}
}
logger.info(border);
printBorder();
logger.info(chalk.green.bold(`Successes: ${summary.stats.successes}`));
logger.info(chalk.red.bold(`Failures: ${summary.stats.failures}`));
logger.info(
Expand All @@ -462,6 +464,9 @@ async function main() {
}
});

listCommand(program);
showCommand(program);

program.parse(process.argv);

if (!process.argv.slice(2).length) {
Expand Down
15 changes: 15 additions & 0 deletions src/table.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,18 @@ export function generateTable(summary: EvaluateSummary, tableCellMaxLength = 250
}
return table;
}

/**
 * Renders an array of uniform row objects as a word-wrapped table sized to
 * the terminal width. Column headers come from the keys of the first row.
 *
 * @param rows - Row objects; all rows are expected to share the same keys.
 * @returns A table object whose toString() produces the rendered table.
 */
export function wrapTable(rows: Record<string, string | number>[]) {
  // Leave a 10-column margin; fall back to 120 when not attached to a TTY.
  const maxWidth = process.stdout.columns ? process.stdout.columns - 10 : 120;
  // Guard: Object.keys(rows[0]) would throw a TypeError on an empty array.
  const head = rows.length > 0 ? Object.keys(rows[0]) : [];
  const table = new Table({
    head,
    // Divide the available width evenly across columns (empty when no rows).
    colWidths:
      head.length > 0 ? Array(head.length).fill(Math.floor(maxWidth / head.length)) : [],
    wordWrap: true,
    wrapOnWordBoundary: true,
  });
  for (const row of rows) {
    table.push(Object.values(row));
  }
  return table;
}
13 changes: 12 additions & 1 deletion src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,10 @@ export interface PromptWithMetadata {
prompt: Prompt;
recentEvalDate: Date;
recentEvalId: string;
recentEvalFilepath: string;
evals: {
id: string;
filePath: string;
datasetId: string;
metrics: Prompt['metrics'];
}[];
Expand Down Expand Up @@ -257,14 +259,15 @@ export interface TestCasesWithMetadataPrompt {
prompt: Prompt;
id: string;
evalId: string;
evalFilepath: string;
}

/**
 * A group of test cases (a "dataset") aggregated across the evals that
 * used it, keyed by a content-hash id.
 */
export interface TestCasesWithMetadata {
  /** Content-hash identifier for this dataset. */
  id: string;
  /** The test cases themselves: a file path, list of paths, or inline cases. */
  testCases: string | string[] | TestCase[];
  /** Date of the most recent eval that used this dataset. */
  recentEvalDate: Date;
  /** Id of the most recent eval that used this dataset. */
  recentEvalId: string;
  // NOTE(review): this view is a diff; evalIds may have been removed in the
  // same change that added recentEvalFilepath — verify against the full file.
  evalIds: string[];
  /** Results file path of the most recent eval that used this dataset. */
  recentEvalFilepath: string;
  /** Number of evals that used this dataset. */
  count: number;
  /** Prompts that were evaluated against this dataset. */
  prompts: TestCasesWithMetadataPrompt[];
}
Expand Down Expand Up @@ -377,6 +380,14 @@ export type UnifiedConfig = TestSuiteConfig & {
commandLineOptions: Partial<CommandLineOptions>;
};

/**
 * A stored evaluation loaded from disk, with enough metadata to list and
 * display it from the CLI.
 */
export interface EvalWithMetadata {
  /** Content-hash identifier for this eval. */
  id: string;
  /** Path to the results file this eval was loaded from. */
  filePath: string;
  /** When the eval was run. */
  date: Date;
  /** The (partial) config used for the run. */
  config: Partial<UnifiedConfig>;
  /** Full results summary, including the rendered table data. */
  results: EvaluateSummary;
}

// node.js package interface
export interface EvaluateTestSuite extends TestSuiteConfig {
prompts: string[];
Expand Down
Loading

0 comments on commit 211c869

Please sign in to comment.