11import { AIService } from '../services/aiService' ;
22import {
3- startPluginEvaluation ,
43 submitPluginEvaluation ,
54 startPluginEvaluationWithQuestions ,
5+ getEvaluationResults ,
66} from '../services' ;
77import type { ModelInfo , PersonaInfo } from '../components/chat-header/types' ;
88import type {
@@ -11,6 +11,8 @@ import type {
1111 TestCase ,
1212 SubmissionItem ,
1313 EvaluationRun ,
14+ PersonaConfigRequest ,
15+ PersonaModelSettings ,
1416} from './evaluationViewTypes' ;
1517import { EvaluationPersistence , type PersistedEvaluationState } from './EvaluationPersistence' ;
1618
@@ -25,6 +27,7 @@ export class EvaluationService {
2527 private deps : ServiceDependencies ;
2628 private abortController : AbortController | null = null ;
2729 private currentModel : ModelInfo | null = null ;
30+ private currentLlmModel : string | null = null ;
2831 private currentPersona : PersonaInfo | null = null ;
2932 private currentCollectionId : string | null = null ;
3033 private processedQuestionIds : Set < string > = new Set ( ) ;
@@ -39,6 +42,31 @@ export class EvaluationService {
3942 this . updateShellState = updateShellState ;
4043 }
4144
45+ /**
46+ * Convert a PersonaInfo into the PersonaConfigRequest payload passed to the API: returns null for a null persona, maps falsy scalar fields to null, and layers persona.model_settings over a fixed set of default model settings.
47+ */
48+ private convertPersonaToRequest ( persona : PersonaInfo | null ) : PersonaConfigRequest | null {
49+ if ( ! persona ) return null ; // nothing to convert; callers receive null instead of an empty request
50+
51+ return {
52+ id : persona . id || null , // `||` turns any falsy value (undefined, '', 0) into null
53+ name : persona . name || null ,
54+ description : ( persona as any ) . description || null , // NOTE(review): the `as any` cast suggests PersonaInfo does not declare `description` — confirm against the type
55+ system_prompt : persona . system_prompt || null ,
56+ model_settings : { // defaults are listed first so the spread below can override them
57+ temperature : 0.7 ,
58+ top_p : 0.9 ,
59+ frequency_penalty : 0.0 ,
60+ presence_penalty : 0.0 ,
61+ context_window : 4000 ,
62+ stop_sequences : [ ] ,
63+ ...( persona . model_settings || { } ) , // persona values win over the defaults; a key explicitly set to undefined also replaces its default
64+ } as PersonaModelSettings , // type assertion: the merged object is not fully checked against PersonaModelSettings
65+ created_at : ( persona as any ) . created_at || null , // NOTE(review): read through `as any`, presumably not declared on PersonaInfo — confirm
66+ updated_at : ( persona as any ) . updated_at || null , // NOTE(review): same `as any` access as created_at
67+ } ;
68+ }
69+
4270 private updateState ( newState : Partial < EvaluationFeatureState > ) : void {
4371 this . state = { ...this . state , ...newState } ;
4472 this . updateShellState ( newState ) ;
@@ -50,25 +78,29 @@ export class EvaluationService {
5078 public runEvaluation = async (
5179 selectedModel : ModelInfo ,
5280 selectedPersona : PersonaInfo | null = null ,
53- collectionId ? : string | null ,
54- questions ? : string [ ] | null
81+ collectionId : string ,
82+ questions : string [ ]
5583 ) : Promise < void > => {
5684 this . updateState ( { isRunning : true , error : null , progress : 0 } ) ;
5785 this . abortController = new AbortController ( ) ;
5886
5987 // Save current run config
6088 this . currentModel = selectedModel ;
89+ this . currentLlmModel = selectedModel . name ;
6190 this . currentPersona = selectedPersona ;
62- this . currentCollectionId = collectionId || null ;
91+ this . currentCollectionId = collectionId ;
6392 this . processedQuestionIds . clear ( ) ;
6493
6594 try {
6695 // Step 1: Start evaluation - get test questions with context
6796 console . log ( 'Starting evaluation...' ) ;
6897
69- const { evaluation_run_id, test_data } = collectionId && questions
70- ? await startPluginEvaluationWithQuestions ( { collection_id : collectionId , questions } )
71- : await startPluginEvaluation ( ) ;
98+ const { evaluation_run_id, test_data } = await startPluginEvaluationWithQuestions ( {
99+ collection_id : collectionId ,
100+ questions,
101+ llm_model : this . currentLlmModel ,
102+ persona : this . convertPersonaToRequest ( selectedPersona ) ,
103+ } ) ;
72104 console . log ( `Evaluation started: ${ evaluation_run_id } ` ) ;
73105 console . log ( `Total questions: ${ test_data . length } ` ) ;
74106
@@ -80,10 +112,14 @@ export class EvaluationService {
80112 total_questions : test_data . length ,
81113 correct_count : 0 ,
82114 incorrect_count : 0 ,
115+ evaluated_count : 0 ,
83116 accuracy : 0 ,
84117 started_at : new Date ( ) . toISOString ( ) ,
85118 is_completed : false ,
86119 progress : 0 ,
120+ status : 'running' ,
121+ duration_seconds : null ,
122+ run_date : new Date ( ) . toISOString ( ) ,
87123 } ,
88124 } ) ;
89125
@@ -132,41 +168,67 @@ export class EvaluationService {
132168
133169 this . updateState ( { isGenerating : false } ) ;
134170
135- // Step 4: Submit batch for judging
171+ // Step 4: Submit batch for judging (returns 202 immediately)
136172 console . log ( `Submitting ${ submissions . length } answers...` ) ;
137- const result = await submitPluginEvaluation ( {
173+ const submitResponse = await submitPluginEvaluation ( {
138174 evaluation_run_id,
139175 submissions,
140176 } ) ;
141177
142- console . log ( `Batch ${ batchIndex + 1 } results:` , result ) ;
143-
144- // Update progress and results
145- this . updateState ( {
146- progress : result . progress ,
147- currentResults : result ,
148- activeRun : {
149- ...this . state . activeRun ! ,
150- progress : result . progress ,
151- correct_count : result . correct_count ,
152- incorrect_count : result . incorrect_count ,
153- accuracy : ( result . correct_count / result . total_questions ) * 100 ,
154- } ,
155- } ) ;
178+ console . log ( `Batch ${ batchIndex + 1 } submitted:` , submitResponse . message ) ;
179+
180+ // Step 5: Poll for results until batch is judged
181+ const previousEvaluatedCount = this . state . activeRun ?. evaluated_count || 0 ;
182+ const targetCount = previousEvaluatedCount + submissions . length ;
183+
184+ console . log ( `Waiting for judging... (target: ${ targetCount } )` ) ;
185+
186+ while ( true ) {
187+ // Check if aborted
188+ if ( this . abortController ?. signal . aborted ) {
189+ console . log ( 'Evaluation aborted during polling' ) ;
190+ this . updateState ( { isRunning : false } ) ;
191+ return ;
192+ }
193+
194+ // Wait 2 seconds before polling
195+ await new Promise ( resolve => setTimeout ( resolve , 2000 ) ) ;
196+
197+ // Poll for results
198+ const resultsData = await getEvaluationResults ( evaluation_run_id ) ;
199+ const evaluationRun = resultsData . evaluation_run ;
200+
201+ console . log ( `Poll result: evaluated ${ evaluationRun . evaluated_count } /${ evaluationRun . total_questions } ` ) ;
202+
203+ // Update UI with latest progress
204+ this . updateState ( {
205+ progress : evaluationRun . progress ,
206+ activeRun : {
207+ ...this . state . activeRun ! ,
208+ evaluated_count : evaluationRun . evaluated_count ,
209+ progress : evaluationRun . progress ,
210+ correct_count : evaluationRun . correct_count ,
211+ incorrect_count : evaluationRun . incorrect_count ,
212+ accuracy : evaluationRun . accuracy ,
213+ } ,
214+ } ) ;
215+
216+ // Check if batch is processed
217+ if ( evaluationRun . evaluated_count >= targetCount ) {
218+ console . log ( `Batch ${ batchIndex + 1 } judged successfully` ) ;
219+ break ;
220+ }
221+ }
156222
157223 // Save progress to persistence
158224 this . savePersistenceState ( evaluation_run_id , test_data ) ;
159225
160226 // Check if completed
161- if ( result . is_completed ) {
227+ const currentRun = this . state . activeRun ! ;
228+ if ( currentRun . evaluated_count === currentRun . total_questions ) {
162229 console . log ( 'Evaluation completed!' ) ;
163230 const completedRun : EvaluationRun = {
164- id : evaluation_run_id ,
165- total_questions : result . total_questions ,
166- correct_count : result . correct_count ,
167- incorrect_count : result . incorrect_count ,
168- accuracy : ( result . correct_count / result . total_questions ) * 100 ,
169- started_at : this . state . activeRun ! . started_at ,
231+ ...currentRun ,
170232 completed_at : new Date ( ) . toISOString ( ) ,
171233 is_completed : true ,
172234 progress : 1.0 ,
@@ -274,11 +336,12 @@ export class EvaluationService {
274336 * Save current evaluation state to persistence
275337 */
276338 private savePersistenceState ( runId : string , testCases : TestCase [ ] ) : void {
277- if ( ! this . currentModel ) return ;
339+ if ( ! this . currentModel || ! this . currentLlmModel ) return ;
278340
279341 const persistedState : PersistedEvaluationState = {
280342 runId,
281343 model : this . currentModel ,
344+ llmModel : this . currentLlmModel ,
282345 persona : this . currentPersona ,
283346 collectionId : this . currentCollectionId ,
284347 testCases,
@@ -308,6 +371,7 @@ export class EvaluationService {
308371
309372 // Restore tracking state
310373 this . currentModel = persistedState . model ;
374+ this . currentLlmModel = persistedState . llmModel ;
311375 this . currentPersona = persistedState . persona ;
312376 this . currentCollectionId = persistedState . collectionId || null ;
313377 this . processedQuestionIds = new Set ( persistedState . processedQuestionIds ) ;
@@ -341,10 +405,14 @@ export class EvaluationService {
341405 total_questions : test_data . length ,
342406 correct_count : 0 ,
343407 incorrect_count : 0 ,
408+ evaluated_count : this . processedQuestionIds . size ,
344409 accuracy : 0 ,
345410 started_at : new Date ( persistedState . timestamp ) . toISOString ( ) ,
346411 is_completed : false ,
347412 progress : this . processedQuestionIds . size / test_data . length ,
413+ status : 'running' ,
414+ duration_seconds : null ,
415+ run_date : new Date ( persistedState . timestamp ) . toISOString ( ) ,
348416 } ,
349417 } ) ;
350418
@@ -391,41 +459,67 @@ export class EvaluationService {
391459
392460 this . updateState ( { isGenerating : false } ) ;
393461
394- // Submit batch for judging
462+ // Submit batch for judging (returns 202 immediately)
395463 console . log ( `Submitting ${ submissions . length } answers...` ) ;
396- const result = await submitPluginEvaluation ( {
464+ const submitResponse = await submitPluginEvaluation ( {
397465 evaluation_run_id,
398466 submissions,
399467 } ) ;
400468
401- console . log ( `Batch ${ batchIndex + 1 } results:` , result ) ;
402-
403- // Update progress and results
404- this . updateState ( {
405- progress : result . progress ,
406- currentResults : result ,
407- activeRun : {
408- ...this . state . activeRun ! ,
409- progress : result . progress ,
410- correct_count : result . correct_count ,
411- incorrect_count : result . incorrect_count ,
412- accuracy : ( result . correct_count / result . total_questions ) * 100 ,
413- } ,
414- } ) ;
469+ console . log ( `Batch ${ batchIndex + 1 } submitted:` , submitResponse . message ) ;
470+
471+ // Poll for results until batch is judged
472+ const previousEvaluatedCount = this . state . activeRun ?. evaluated_count || 0 ;
473+ const targetCount = previousEvaluatedCount + submissions . length ;
474+
475+ console . log ( `Waiting for judging... (target: ${ targetCount } )` ) ;
476+
477+ while ( true ) {
478+ // Check if aborted
479+ if ( this . abortController ?. signal . aborted ) {
480+ console . log ( 'Evaluation aborted during polling' ) ;
481+ this . updateState ( { isRunning : false } ) ;
482+ return ;
483+ }
484+
485+ // Wait 2 seconds before polling
486+ await new Promise ( resolve => setTimeout ( resolve , 2000 ) ) ;
487+
488+ // Poll for results
489+ const resultsData = await getEvaluationResults ( evaluation_run_id ) ;
490+ const evaluationRun = resultsData . evaluation_run ;
491+
492+ console . log ( `Poll result: evaluated ${ evaluationRun . evaluated_count } /${ evaluationRun . total_questions } ` ) ;
493+
494+ // Update UI with latest progress
495+ this . updateState ( {
496+ progress : evaluationRun . progress ,
497+ activeRun : {
498+ ...this . state . activeRun ! ,
499+ evaluated_count : evaluationRun . evaluated_count ,
500+ progress : evaluationRun . progress ,
501+ correct_count : evaluationRun . correct_count ,
502+ incorrect_count : evaluationRun . incorrect_count ,
503+ accuracy : evaluationRun . accuracy ,
504+ } ,
505+ } ) ;
506+
507+ // Check if batch is processed
508+ if ( evaluationRun . evaluated_count >= targetCount ) {
509+ console . log ( `Batch ${ batchIndex + 1 } judged successfully` ) ;
510+ break ;
511+ }
512+ }
415513
416514 // Save progress to persistence
417515 this . savePersistenceState ( evaluation_run_id , test_data ) ;
418516
419517 // Check if completed
420- if ( result . is_completed ) {
518+ const currentRun = this . state . activeRun ! ;
519+ if ( currentRun . evaluated_count === currentRun . total_questions ) {
421520 console . log ( 'Evaluation completed!' ) ;
422521 const completedRun : EvaluationRun = {
423- id : evaluation_run_id ,
424- total_questions : result . total_questions ,
425- correct_count : result . correct_count ,
426- incorrect_count : result . incorrect_count ,
427- accuracy : ( result . correct_count / result . total_questions ) * 100 ,
428- started_at : this . state . activeRun ! . started_at ,
522+ ...currentRun ,
429523 completed_at : new Date ( ) . toISOString ( ) ,
430524 is_completed : true ,
431525 progress : 1.0 ,
0 commit comments