@@ -576,6 +576,8 @@ impl Conversation {
576576 let repo_ref = & self . repo_ref ;
577577 let chunks = stream:: iter ( paths)
578578 . map ( |path| async move {
579+ tracing:: debug!( ?path, "reading file" ) ;
580+
579581 let lines = ctx
580582 . app
581583 . indexes
@@ -589,20 +591,24 @@ impl Conversation {
589591 . map ( |( i, line) | format ! ( "{} {line}" , i + 1 ) )
590592 . collect :: < Vec < _ > > ( ) ;
591593
592- Result :: < _ > :: Ok ( ( lines, path) )
593- } )
594- // Buffer file loading to load multiple paths at once
595- . buffered ( 10 )
596- . and_then ( |( lines, path) : ( Vec < String > , String ) | async move {
597594 const MAX_TOKENS : usize = 3400 ;
598595 const LINE_OVERLAP : usize = 3 ;
599596
600597 let bpe = tiktoken_rs:: get_bpe_from_model ( "gpt-3.5-turbo" ) ?;
601- let iter = split_line_set_by_tokens ( lines, bpe, MAX_TOKENS , LINE_OVERLAP )
602- . map ( move |lines| Result :: < _ > :: Ok ( ( lines, path. clone ( ) ) ) ) ;
603598
604- Ok ( futures:: stream:: iter ( iter) )
599+ let iter = tokio:: task:: spawn_blocking ( || {
600+ split_line_set_by_tokens ( lines, bpe, MAX_TOKENS , LINE_OVERLAP )
601+ . collect :: < Vec < _ > > ( )
602+ } )
603+ . await
604+ . context ( "failed to split by token" ) ?
605+ . into_iter ( )
606+ . map ( move |lines| Result :: < _ > :: Ok ( ( lines, path. clone ( ) ) ) ) ;
607+
608+ Result :: < _ > :: Ok ( futures:: stream:: iter ( iter) )
605609 } )
610+ // Buffer file loading to load multiple paths at once
611+ . buffered ( 10 )
606612 . try_flatten ( )
607613 . map ( |result| async {
608614 let ( lines, path) = result?;
@@ -621,6 +627,8 @@ impl Conversation {
621627 let contents = lines. join ( "\n " ) ;
622628 let prompt = prompts:: file_explanation ( question, & path, & contents) ;
623629
630+ tracing:: debug!( ?path, "calling chat API on file" ) ;
631+
624632 let json = ctx
625633 . llm_gateway
626634 . chat ( & [ llm_gateway:: api:: Message :: system ( & prompt) ] )
@@ -1127,6 +1135,11 @@ fn split_line_set_by_tokens(
11271135 max_tokens : usize ,
11281136 line_overlap : usize ,
11291137) -> impl Iterator < Item = Vec < String > > {
1138+ let line_tokens = lines
1139+ . iter ( )
1140+ . map ( |line| bpe. encode_ordinary ( line) . len ( ) )
1141+ . collect :: < Vec < _ > > ( ) ;
1142+
11301143 let mut start = 0usize ;
11311144
11321145 std:: iter:: from_fn ( move || {
@@ -1138,14 +1151,12 @@ fn split_line_set_by_tokens(
11381151
11391152 let mut subset = Vec :: new ( ) ;
11401153
1141- loop {
1142- if start >= lines. len ( ) {
1143- break ;
1144- }
1145-
1146- let text = subset. join ( "\n " ) ;
1147-
1148- if limit_tokens ( & text, bpe. clone ( ) , max_tokens) . len ( ) < text. len ( ) {
1154+ while start < lines. len ( ) {
1155+ if line_tokens[ start - subset. len ( ) ..start]
1156+ . iter ( )
1157+ . sum :: < usize > ( )
1158+ > max_tokens
1159+ {
11491160 subset. pop ( ) ;
11501161 start -= 1 ;
11511162 break ;
0 commit comments