Runs are per-test

autoblocksai · nicolewhite · Feb 19, 2024 · Feb 8, 2024 · Feb 8, 2024 · Feb 8, 2024
commit 452a34997c76f79b562e8a6f5041a337f41b3f13
diff --git a/python-example.py b/python-example.py
@@ -107,7 +107,7 @@ async def evaluate_output(
     )
 
   await client.post("/evals", json=dict(
-    testId=id,
+    testExternalId=id,
     testCaseHash=test_case.hash,
     evaluatorId=evaluator.id,
     score=evaluation.score,
@@ -132,7 +132,7 @@ async def run_test_case(
     output = await loop.run_in_executor(None, ctx.run, fn, test_case)
 
   await client.post("/results", json=dict(
-    testId=test_id,
+    testExternalId=test_id,
     testCaseHash=test_case.hash,
     testCaseBody=dataclasses.asdict(test_case),
     testCaseOutput=output,
@@ -170,6 +170,8 @@ async def run_test(
   ]
   await asyncio.gather(*run_tasks)
 
+  await client.post("/end", json=dict(testExternalId=test_id))
+
 
 # Sync entrypoint
 def test(
@@ -186,11 +188,11 @@ def test(
       fn=fn,
     ),
     loop,
- )
+  )
   future.result()
 
 
-
+# Example usage
 if __name__ == "__main__":
   import random
 

diff --git a/src/handlers/testing/exec/index.ts b/src/handlers/testing/exec/index.ts
@@ -8,6 +8,9 @@ import { z } from 'zod';
 import { startInteractiveCLI, interactiveEmitter } from './interactive-cli';
 import net from 'net';
 
+/**
+ * Utils
+ */
 function findAvailablePort(startPort: number): Promise<number> {
   return new Promise((resolve, reject) => {
     function tryListening(port: number) {
@@ -34,20 +37,29 @@ function findAvailablePort(startPort: number): Promise<number> {
 }
 
 /**
- * Current run utils
+ * Globals
  */
-let _currentRunId: string | undefined = undefined;
 let _currentRunMessage: string | undefined = undefined;
 let _isInteractive: boolean | undefined = undefined;
 
-async function currentRunId(): Promise<string> {
-  if (!_currentRunId) {
-    const run = await startRun();
-    _currentRunId = run.runId;
+// Map of test's external ID to its current run ID and its internal test ID
+const testExternalIdToRun: Record<string, { runId: string; testId: string }> =
+  {};
+
+async function currentRun(args: {
+  testExternalId: string;
+}): Promise<{ runId: string; testId: string }> {
+  let run = testExternalIdToRun[args.testExternalId];
+  if (!run) {
+    run = await startRun({ testExternalId: args.testExternalId });
+    testExternalIdToRun[args.testExternalId] = run;
   }
-  return _currentRunId;
+  return run;
 }
 
+/**
+ * Logger
+ */
 const logger = {
   log: (...args: unknown[]) => {
     if (_isInteractive) {
@@ -71,7 +83,7 @@ const logger = {
  * Accumulate events for the duration of the run
  */
 interface TestCaseEvent {
-  testId: string;
+  testExternalId: string;
   testCaseHash: string;
   message: string;
   traceId: string;
@@ -84,7 +96,7 @@ const testCaseEvents: TestCaseEvent[] = [];
 /**
  * Keep a map of test case hashes to their result IDs
  *
- * runId -> testId -> testCaseHash -> testCaseResultId
+ * testExternalId -> testCaseHash -> testCaseResultId
  */
 const testCaseHashToResultId: Record<string, Record<string, string>> = {};
 
@@ -111,43 +123,54 @@ function evaluationPassed(args: {
 /**
  * Public API stubs
  */
-async function startRun(): Promise<{ runId: string }> {
-  logger.log('POST /api/testing/local/runs', { message: _currentRunMessage });
-  return { runId: crypto.randomUUID() };
+async function startRun(args: {
+  testExternalId: string;
+}): Promise<{ runId: string; testId: string }> {
+  logger.log('POST /api/testing/local/runs', {
+    testExternalId: args.testExternalId,
+    message: _currentRunMessage,
+  });
+  return { runId: crypto.randomUUID(), testId: crypto.randomUUID() };
 }
 
-async function endRun(): Promise<void> {
-  const runId = await currentRunId();
+async function endRun(args: { testExternalId: string }): Promise<void> {
+  const { runId } = await currentRun(args);
   logger.log(`POST /api/testing/local/runs/${runId}/end`);
-  interactiveEmitter.emit('end');
-  _currentRunId = undefined;
+  interactiveEmitter.emit('end', { testExternalId: args.testExternalId });
+  delete testExternalIdToRun[args.testExternalId];
 }
 
 async function postTestCaseResult(args: {
-  testId: string;
+  testExternalId: string;
   testCaseHash: string;
   testCaseBody?: unknown;
   testCaseOutput?: unknown;
   testCaseEvents: TestCaseEvent[];
 }): Promise<{ testCaseResultId: string }> {
-  const runId = await currentRunId();
-  logger.log(`POST /api/testing/local/runs/${runId}/results`, args);
+  const { runId, testId } = await currentRun(args);
+  logger.log(`POST /api/testing/local/runs/${runId}/results`, {
+    ...args,
+    testId,
+  });
   return { testCaseResultId: crypto.randomUUID() };
 }
 
 async function postTestCaseEval(args: {
-  testId: string;
+  testExternalId: string;
   testCaseResultId: string;
   evaluatorId: string;
   score: number;
   passed: boolean | undefined;
   thresholdOp?: '<' | '<=' | '>' | '>=';
   thresholdValue?: number;
 }): Promise<void> {
-  const runId = await currentRunId();
+  const { runId, testId } = await currentRun(args);
   // TODO: use enums, zod schemas for passing this data to the interactive CLI
-  interactiveEmitter.emit('eval', { ...args, runId });
-  logger.log(`POST /api/testing/local/runs/${runId}/evals`, args);
+  interactiveEmitter.emit('eval', args);
+  logger.log(`POST /api/testing/local/runs/${runId}/evals`, {
+    ...args,
+    testId,
+  });
 }
 
 /**
@@ -160,7 +183,7 @@ app.post(
   zValidator(
     'json',
     z.object({
-      testId: z.string(),
+      testExternalId: z.string(),
       testCaseHash: z.string(),
       message: z.string(),
       traceId: z.string(),
@@ -180,7 +203,7 @@ app.post(
   zValidator(
     'json',
     z.object({
-      testId: z.string(),
+      testExternalId: z.string(),
       testCaseHash: z.string(),
       testCaseBody: z.unknown(),
       testCaseOutput: z.unknown(),
@@ -190,18 +213,21 @@ app.post(
     const data = c.req.valid('json');
 
     const events = testCaseEvents.filter(
-      (e) => e.testId === data.testId && e.testCaseHash === data.testCaseHash,
+      (e) =>
+        e.testExternalId === data.testExternalId &&
+        e.testCaseHash === data.testCaseHash,
     );
     const { testCaseResultId } = await postTestCaseResult({
       ...data,
       testCaseEvents: events,
     });
 
-    if (!testCaseHashToResultId[data.testId]) {
-      testCaseHashToResultId[data.testId] = {};
+    if (!testCaseHashToResultId[data.testExternalId]) {
+      testCaseHashToResultId[data.testExternalId] = {};
     }
 
-    testCaseHashToResultId[data.testId][data.testCaseHash] = testCaseResultId;
+    testCaseHashToResultId[data.testExternalId][data.testCaseHash] =
+      testCaseResultId;
 
     return c.json('ok');
   },
@@ -212,7 +238,7 @@ app.post(
   zValidator(
     'json',
     z.object({
-      testId: z.string(),
+      testExternalId: z.string(),
       testCaseHash: z.string(),
       evaluatorId: z.string(),
       score: z.number(),
@@ -233,7 +259,7 @@ app.post(
     }
 
     const testCaseResultId =
-      testCaseHashToResultId[data.testId]?.[data.testCaseHash];
+      testCaseHashToResultId[data.testExternalId]?.[data.testCaseHash];
 
     if (!testCaseResultId) {
       logger.warn(
@@ -252,6 +278,21 @@ app.post(
   },
 );
 
+app.post(
+  '/end',
+  zValidator(
+    'json',
+    z.object({
+      testExternalId: z.string(),
+    }),
+  ),
+  async (c) => {
+    const data = c.req.valid('json');
+    await endRun(data);
+    return c.json('ok');
+  },
+);
+
 /**
  * Exec command while local server is running
  */
@@ -294,7 +335,6 @@ export async function exec(args: {
         env,
         silent: args.interactive,
       }).finally(async () => {
-        await endRun();
         server?.close();
       });
     },

diff --git a/src/handlers/testing/exec/interactive-cli.tsx b/src/handlers/testing/exec/interactive-cli.tsx
@@ -7,21 +7,69 @@ export const interactiveEmitter = new EventEmitter();
 
 const Space = () => <Text> </Text>;
 
+function TestOutcomes(props: {
+  runIsOver: boolean;
+  testExternalId: string;
+  outcomes: boolean[];
+}) {
+  const passed = props.outcomes.every((x) => x);
+  return (
+    <Box alignItems="center">
+      {props.runIsOver ? (
+        <Text color="white" backgroundColor={passed ? 'green' : 'red'}>
+          <Space />
+          {passed ? 'PASSED' : 'FAILED'}
+          <Space />
+        </Text>
+      ) : (
+        <Spinner type="dots" />
+      )}
+      <Space />
+      <Text bold={true}>{props.testExternalId}</Text>
+      <Space />
+      {props.outcomes.map((passed, i) => (
+        <Text key={i} color={passed ? 'green' : 'red'}>
+          {'.'}
+        </Text>
+      ))}
+      <Spacer />
+      <Box borderStyle="single">
+        <Text>Rerun</Text>
+      </Box>
+    </Box>
+  );
+}
+
 const App = () => {
-  const [outcomes, setOutcomes] = useState<boolean[]>([]);
-  const [runIsOver, setRunIsOver] = useState<boolean>(false);
+  const [testIdToOutcomes, setTestIdToOutcomes] = useState<
+    Record<string, boolean[]>
+  >({});
+  const [testIdToRunIsOver, setTestIdToRunIsOver] = useState<
+    Record<string, boolean>
+  >({});
 
   useInput((input, key) => {
     // TODO: add interaction!
   });
 
   useEffect(() => {
-    const evalListener = (args: { passed: boolean }) => {
-      setOutcomes((prevOutcomes) => [...prevOutcomes, args.passed]);
+    const evalListener = (args: {
+      testExternalId: string;
+      passed: boolean;
+    }) => {
+      setTestIdToOutcomes((prevOutcomes) => {
+        const { testExternalId, passed } = args;
+        return {
+          ...prevOutcomes,
+          [testExternalId]: [...(prevOutcomes[testExternalId] || []), passed],
+        };
+      });
     };
 
-    const onEndListener = () => {
-      setRunIsOver(true);
+    const onEndListener = (args: { testExternalId: string }) => {
+      setTestIdToRunIsOver((prevRunIsOver) => {
+        return { ...prevRunIsOver, [args.testExternalId]: true };
+      });
     };
 
     interactiveEmitter.on('eval', evalListener);
@@ -36,34 +84,19 @@ const App = () => {
   return (
     <Box
       paddingX={1}
+      flexDirection="column"
       borderStyle="round"
       borderColor="gray"
-      alignItems="center"
+      minHeight={12}
     >
-      {runIsOver ? (
-        <Text
-          color="white"
-          backgroundColor={outcomes.every((x) => x) ? 'green' : 'red'}
-        >
-          <Space />
-          {outcomes.every((x) => x) ? 'PASSED' : 'FAILED'}
-          <Space />
-        </Text>
-      ) : (
-        <Spinner type="dots" />
-      )}
-      <Space />
-      <Text bold={true}>acme-bot</Text>
-      <Space />
-      {outcomes.map((passed, i) => (
-        <Text key={i} color={passed ? 'green' : 'red'}>
-          {'.'}
-        </Text>
+      {Object.entries(testIdToOutcomes).map(([testId, outcomes]) => (
+        <TestOutcomes
+          key={testId}
+          runIsOver={testIdToRunIsOver[testId]}
+          testExternalId={testId}
+          outcomes={outcomes}
+        />
       ))}
-      <Spacer />
-      <Box borderStyle="single">
-        <Text>Rerun</Text>
-      </Box>
     </Box>
   );
 };