@@ -23,7 +23,13 @@ import { generateExperimentName } from "./utils";
2323import { exactMatch , errorMatch } from "./scoring" ;
2424import { tasksByName , tasksConfig , getModelList } from "./taskConfig" ;
2525import { Eval , wrapAISDKModel , wrapOpenAI } from "braintrust" ;
26- import { SummaryResult , Testcase , EvalInput } from "@/types/evals" ;
26+ import {
27+ SummaryResult ,
28+ Testcase ,
29+ EvalInput ,
30+ ErrorType ,
31+ EvalOutput ,
32+ } from "@/types/evals" ;
2733import { EvalLogger } from "./logger" ;
2834import { AvailableModel , LLMClient } from "@browserbasehq/stagehand" ;
2935import { env } from "./env" ;
@@ -46,6 +52,14 @@ import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web";
4652
4753dotenv . config ( ) ;
4854
55+ process . on ( "uncaughtException" , ( err ) => { // Process-wide handler: report any uncaught exception under the [eval-runner] prefix.
56+ console . error ( "[eval-runner] Uncaught exception:" , err ) ; // Logging is the only action taken; the handler never calls process.exit.
57+ } ) ; // NOTE(review): Node's docs warn that continuing after 'uncaughtException' may leave the process in an undefined state — confirm that keeping the runner alive is intended.
58+
59+ process . on ( "unhandledRejection" , ( reason ) => { // Process-wide handler for promise rejections that nothing handled.
60+ console . error ( "[eval-runner] Unhandled rejection:" , reason ) ; // `reason` is logged as-is; it is not rethrown or otherwise acted on.
61+ } ) ;
62+
4963/**
5064 * Read max concurrency and trial count from environment variables set in args.ts.
5165 * Fall back to the defaults (20 and 5) if they are not provided.
@@ -107,20 +121,6 @@ const generateFilteredTestcases = (): Testcase[] => {
107121 ) ;
108122 }
109123
110- // Check for dataset filter from environment
111- const datasetFilter = process . env . EVAL_DATASET ;
112-
113- // If using external benchmarks via dataset filter, override category to use agent models
114- if (
115- datasetFilter &&
116- [ "gaia" , "webvoyager" , "webbench" , "osworld" ] . includes ( datasetFilter )
117- ) {
118- effectiveCategory = "external_agent_benchmarks" ;
119- console . log (
120- `Using dataset filter "${ datasetFilter } ", switching to external_agent_benchmarks category.` ,
121- ) ;
122- }
123-
124124 // Dynamically determine the MODELS based on the effective category
125125 const currentModels = getModelList ( effectiveCategory ) ;
126126
@@ -130,18 +130,15 @@ const generateFilteredTestcases = (): Testcase[] => {
130130 ) ;
131131
132132 // Special handling: fan out GAIA dataset for agent/gaia
133- const isGAIATaskIncluded =
134- taskNamesToRun . includes ( "agent/gaia" ) || datasetFilter === "gaia" ;
133+ const isGAIATaskIncluded = taskNamesToRun . includes ( "agent/gaia" ) ;
135134 // Special handling: fan out WebVoyager dataset for agent/webvoyager
136- const isWebVoyagerTaskIncluded =
137- taskNamesToRun . includes ( "agent/webvoyager" ) ||
138- datasetFilter === "webvoyager" ;
135+ const isWebVoyagerTaskIncluded = taskNamesToRun . includes ( "agent/webvoyager" ) ;
139136 // Special handling: fan out WebBench dataset for agent/webbench
140- const isWebBenchTaskIncluded =
141- taskNamesToRun . includes ( "agent/webbench" ) || datasetFilter === "webbench" ;
137+ const isWebBenchTaskIncluded = taskNamesToRun . includes ( "agent/webbench" ) ;
138+
142139 // Special handling: fan out OSWorld dataset for agent/osworld
143- const isOSWorldTaskIncluded =
144- taskNamesToRun . includes ( "agent/osworld" ) || datasetFilter === "osworld" ;
140+ const isOSWorldTaskIncluded = taskNamesToRun . includes ( "agent/osworld" ) ;
141+
145142 // Special handling: fan out Mind2Web dataset for agent/onlineMind2Web
146143 const isMind2WebTaskIncluded = taskNamesToRun . includes (
147144 "agent/onlineMind2Web" ,
@@ -150,100 +147,57 @@ const generateFilteredTestcases = (): Testcase[] => {
150147 let allTestcases : Testcase [ ] = [ ] ;
151148
152149 // If agent/gaia is among the tasks to run, replace it with the fanned-out GAIA testcases
153- if ( isGAIATaskIncluded && ( ! datasetFilter || datasetFilter === "gaia" ) ) {
150+ if ( isGAIATaskIncluded ) {
154151 taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/gaia" ) ;
155152 allTestcases . push ( ...buildGAIATestcases ( currentModels ) ) ;
156- } else if (
157- taskNamesToRun . includes ( "agent/gaia" ) &&
158- datasetFilter &&
159- datasetFilter !== "gaia"
160- ) {
161- // Remove GAIA from tasks to run if dataset filter excludes it
162- taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/gaia" ) ;
163153 }
164154
165155 // If agent/webvoyager is among the tasks to run, replace it with the fanned-out WebVoyager testcases
166- if (
167- isWebVoyagerTaskIncluded &&
168- ( ! datasetFilter || datasetFilter === "webvoyager" )
169- ) {
156+ if ( isWebVoyagerTaskIncluded ) {
170157 taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/webvoyager" ) ;
171158 allTestcases . push ( ...buildWebVoyagerTestcases ( currentModels ) ) ;
172- } else if (
173- taskNamesToRun . includes ( "agent/webvoyager" ) &&
174- datasetFilter &&
175- datasetFilter !== "webvoyager"
176- ) {
177- // Remove WebVoyager from tasks to run if dataset filter excludes it
178- taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/webvoyager" ) ;
179159 }
180160
181161 // If agent/webbench is among the tasks to run, replace it with the fanned-out WebBench testcases
182- if (
183- isWebBenchTaskIncluded &&
184- ( ! datasetFilter || datasetFilter === "webbench" )
185- ) {
162+ if ( isWebBenchTaskIncluded ) {
186163 taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/webbench" ) ;
187164 allTestcases . push ( ...buildWebBenchTestcases ( currentModels ) ) ;
188- } else if (
189- taskNamesToRun . includes ( "agent/webbench" ) &&
190- datasetFilter &&
191- datasetFilter !== "webbench"
192- ) {
193- // Remove WebBench from tasks to run if dataset filter excludes it
194- taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/webbench" ) ;
195165 }
196166
197167 // If agent/osworld is among the tasks to run, replace it with the fanned-out OSWorld testcases
198- if (
199- isOSWorldTaskIncluded &&
200- ( ! datasetFilter || datasetFilter === "osworld" )
201- ) {
168+ if ( isOSWorldTaskIncluded ) {
202169 taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/osworld" ) ;
203170 allTestcases . push ( ...buildOSWorldTestcases ( currentModels ) ) ;
204- } else if (
205- taskNamesToRun . includes ( "agent/osworld" ) &&
206- datasetFilter &&
207- datasetFilter !== "osworld"
208- ) {
209- // Remove OSWorld from tasks to run if dataset filter excludes it
210- taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/osworld" ) ;
211171 }
212172
213173 // If agent/onlineMind2Web is among the tasks to run, replace it with the fanned-out Mind2Web testcases
214- if (
215- isMind2WebTaskIncluded &&
216- ( ! datasetFilter || datasetFilter === "onlineMind2Web" )
217- ) {
174+ if ( isMind2WebTaskIncluded ) {
218175 taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/onlineMind2Web" ) ;
219176 allTestcases . push ( ...buildOnlineMind2WebTestcases ( currentModels ) ) ;
220- } else if (
221- isMind2WebTaskIncluded &&
222- datasetFilter &&
223- datasetFilter !== "onlineMind2Web"
224- ) {
225- // Remove Mind2Web from tasks to run if dataset filter excludes it
226- taskNamesToRun = taskNamesToRun . filter ( ( t ) => t !== "agent/onlineMind2Web" ) ;
227177 }
228178
229179 // Create a list of all remaining testcases using the determined task names and models
230180 const regularTestcases = currentModels . flatMap ( ( model ) =>
231- taskNamesToRun . map ( ( testName ) => ( {
232- input : { name : testName , modelName : model as AvailableModel } ,
233- name : testName ,
234- tags : [
235- model ,
236- testName ,
237- ...( tasksConfig . find ( ( t ) => t . name === testName ) ?. categories || [ ] ) . map (
238- ( x ) => `category/${ x } ` ,
239- ) ,
240- ] ,
241- metadata : {
242- model : model as AvailableModel ,
243- test : testName ,
244- } ,
245- expected : true ,
246- } ) ) ,
181+ taskNamesToRun . map ( ( testName ) => {
182+ const taskCategories =
183+ tasksConfig . find ( ( t ) => t . name === testName ) ?. categories || [ ] ;
184+ return {
185+ input : { name : testName , modelName : model as AvailableModel } ,
186+ name : testName ,
187+ tags : [
188+ model ,
189+ // Only include primary category as tag
190+ taskCategories . length > 0 ? taskCategories [ 0 ] : "uncategorized" ,
191+ ] ,
192+ metadata : {
193+ model : model as AvailableModel ,
194+ test : testName ,
195+ category : taskCategories [ 0 ] ,
196+ categories : taskCategories , // Keep all categories in metadata for filtering
197+ } ,
198+ expected : true ,
199+ } ;
200+ } ) ,
247201 ) ;
248202
249203 allTestcases = [ ...allTestcases , ...regularTestcases ] ;
@@ -312,42 +266,27 @@ const generateFilteredTestcases = (): Testcase[] => {
312266 const logger = new EvalLogger ( ) ;
313267 try {
314268 // Dynamically import the task based on its name
315- const taskModulePath = path . join (
316- __dirname ,
317- "tasks" ,
318- `${ input . name } .ts` ,
319- ) ;
269+ const basePath = path . join ( __dirname , "tasks" , `${ input . name } ` ) ;
270+ const candidatePaths = [ `${ basePath } .js` , `${ basePath } .ts` ] ;
320271
321- // Check if file exists at direct path
322272 let taskModule ;
323- try {
324- // First try to import directly (for backward compatibility)
325- taskModule = await import ( taskModulePath ) ;
326- } catch ( error ) {
327- if ( input . name . includes ( "/" ) ) {
328- // If the name includes a path separator, try to import from subdirectory
329- const subDirPath = path . join (
330- __dirname ,
331- "tasks" ,
332- `${ input . name } .ts` ,
333- ) ;
334- try {
335- taskModule = await import ( subDirPath ) ;
336- } catch ( subError ) {
337- throw new StagehandEvalError (
338- `Failed to import task module for ${ input . name } . Tried paths:\n` +
339- `- ${ taskModulePath } \n` +
340- `- ${ subDirPath } \n` +
341- `Error: ${ subError . message } ` ,
342- ) ;
343- }
344- } else {
345- throw new StagehandEvalError (
346- `Failed to import task module for ${ input . name } at path ${ taskModulePath } : ${ error . message } ` ,
347- ) ;
273+ let lastError : unknown ;
274+ for ( const candidate of candidatePaths ) {
275+ try {
276+ taskModule = await import ( candidate ) ;
277+ break ;
278+ } catch ( err ) {
279+ lastError = err ;
348280 }
349281 }
350282
283+ if ( ! taskModule ) {
284+ const tried = candidatePaths . join ( "\n- " ) ;
285+ throw new StagehandEvalError (
286+ `Failed to import task module for ${ input . name } . Tried paths:\n- ${ tried } \nError: ${ ( lastError as Error ) ?. message } ` ,
287+ ) ;
288+ }
289+
351290 // Extract the task function
352291 const taskName = input . name . includes ( "/" )
353292 ? input . name . split ( "/" ) . pop ( ) // Get the last part of the path for nested tasks
@@ -362,9 +301,6 @@ const generateFilteredTestcases = (): Testcase[] => {
362301 }
363302
364303 // Execute the task
365- console . log (
366- `🏃 Running eval: ${ input . name } with model: ${ input . modelName } ` ,
367- ) ;
368304 let taskInput : Awaited < ReturnType < typeof initStagehand > > ;
369305
370306 if ( USE_API ) {
@@ -426,6 +362,7 @@ const generateFilteredTestcases = (): Testcase[] => {
426362 }
427363 // Pass full EvalInput to the task (data-driven params available via input.params)
428364 let result ;
365+ let isStagehandClosed = false ;
429366 try {
430367 result = await taskFunction ( { ...taskInput , input } ) ;
431368 // Log result to console
@@ -435,31 +372,80 @@ const generateFilteredTestcases = (): Testcase[] => {
435372 console . log ( `❌ ${ input . name } : Failed` ) ;
436373 }
437374 } finally {
438- await taskInput . stagehand . close ( ) ;
375+ // Only close if not already closed
376+ if ( taskInput . stagehand && ! isStagehandClosed ) {
377+ try {
378+ await taskInput . stagehand . close ( ) ;
379+ isStagehandClosed = true ;
380+ } catch ( closeError ) {
381+ console . warn ( "Error closing stagehand:" , closeError ) ;
382+ }
383+ }
439384 }
440385 return result ;
441386 } catch ( error ) {
387+ // Categorize the error
388+ let errorType = ErrorType . UNKNOWN ;
389+ const errorMessage =
390+ error instanceof Error ? error . message : String ( error ) ;
391+
392+ if ( error instanceof Error ) {
393+ if (
394+ error . message . includes ( "timeout" ) ||
395+ error . message . includes ( "Timeout" )
396+ ) {
397+ errorType = ErrorType . TIMEOUT ;
398+ } else if (
399+ error . message . includes ( "network" ) ||
400+ error . message . includes ( "fetch" )
401+ ) {
402+ errorType = ErrorType . NETWORK ;
403+ } else if (
404+ error . message . includes ( "parse" ) ||
405+ error . message . includes ( "JSON" )
406+ ) {
407+ errorType = ErrorType . PARSING_ERROR ;
408+ } else if (
409+ error . message . includes ( "init" ) ||
410+ error . message . includes ( "setup" )
411+ ) {
412+ errorType = ErrorType . SETUP_ERROR ;
413+ }
414+ }
415+
442416 // Log any errors that occur during task execution
443- console . error ( `❌ ${ input . name } : Error - ${ error } ` ) ;
417+ console . error ( `❌ ${ input . name } : ${ errorType } - ${ errorMessage } ` ) ;
444418 logger . error ( {
445419 message : `Error in task ${ input . name } ` ,
446420 level : 0 ,
447421 auxiliary : {
448422 error : {
449- value : error . message ,
423+ value : errorMessage ,
424+ type : "string" ,
425+ } ,
426+ error_type : {
427+ value : errorType ,
450428 type : "string" ,
451429 } ,
452430 trace : {
453- value : error . stack ,
431+ value : error instanceof Error ? error . stack : "" ,
454432 type : "string" ,
455433 } ,
456434 } ,
457435 } ) ;
458- return {
436+
437+ const output : EvalOutput = {
459438 _success : false ,
460439 error : JSON . parse ( JSON . stringify ( error , null , 2 ) ) ,
440+ error_type : errorType ,
441+ error_message : errorMessage ,
442+ error_stack : error instanceof Error ? error . stack : undefined ,
461443 logs : logger . getLogs ( ) ,
444+ debugUrl : "" ,
445+ sessionUrl : "" ,
462446 } ;
447+
448+ return output ;
463449 }
464450 } ,
465451 // Use the scoring functions defined above
@@ -475,6 +461,10 @@ const generateFilteredTestcases = (): Testcase[] => {
475461 ? { _success : result . output }
476462 : result . output ;
477463
464+ // The full output object (including error_type, error_message, etc.)
465+ // is already captured in result.output and will be visible in Braintrust,
466+ // so we don't need to duplicate it in metadata.
467+
478468 return {
479469 input : result . input ,
480470 output,
0 commit comments