Commit 1b00dcc

fix
1 parent fd75d7c commit 1b00dcc

File tree

2 files changed (+4, -4 lines):
  scripts/run_textgen_benchmark_breakdown.sh
  src/pipeline.py


scripts/run_textgen_benchmark_breakdown.sh

Lines changed: 2 additions & 2 deletions

@@ -17,7 +17,7 @@ SAVE_DIR=data/benchmarks/v3
 RUN="python3 -m src.main --pipeline_class=TG_Pipeline --max_log_outputs=0 --dtype=float16 --device=cuda --custom_generate --breakdown_latency --ignore_oom --no_fast_init "
 
 
-IMPL=("flash" "causal" "vector" "bigcode")
+IMPL=("flash" "causal" "vector" "bigcode" "bigcode2")
 
 
 STEP=("" "--no_cache")
@@ -38,7 +38,7 @@ run () { # run(step, runtime, attn)
     fi
 }
 
-for impl in {0..3}
+for impl in {0..4}
 do
     if [ "${STEP_ID}" -eq "0" ]
     then
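Note that the brace range in the loop has to be kept in step with the size of IMPL by hand (here {0..3} becomes {0..4} after "bigcode2" is appended). As a hypothetical alternative, not part of this commit, iterating over the array's own indices removes that coupling:

#!/usr/bin/env bash
# Hypothetical sketch: iterate over the indices of IMPL directly, so adding
# a new implementation never requires editing a hard-coded {0..N} range.
IMPL=("flash" "causal" "vector" "bigcode" "bigcode2")

for impl in "${!IMPL[@]}"; do
    echo "benchmarking attention implementation: ${IMPL[impl]}"
done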

src/pipeline.py

Lines changed: 2 additions & 2 deletions

@@ -619,10 +619,10 @@ def _generate_textgen(
         with torch.inference_mode():
             for key_length in range(input_length, output_length, key_length_step):
                 try:
-                    if (key_length_step > 1 and key_length > key_length) or not use_cache or not do_prefill:
+                    if (key_length_step > 1 and key_length > input_length) or not use_cache or not do_prefill:
                         if not hasattr(self.model, "fast_forward"):
                             raise NotImplementedError()
-                        self.model.fast_forward(batch, key_length, use_cache)
+                        self.model.fast_forward(batch, key_length, self.dtype if use_cache else None)
                     last_time = self._get_time(breakdown_latency)
                     generated, batch = self.model.generate_token(batch)
                     t2 = self._get_time(breakdown_latency)
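Two things change here. First, the old guard compared key_length to itself, so "key_length > key_length" was always False and the fast-forward path could only trigger via the cache/prefill flags; comparing against input_length makes it fire on every step after the prefill. Second, fast_forward now receives the KV-cache dtype (or None when the cache is disabled) rather than the boolean use_cache flag, presumably to match an updated fast_forward signature. A minimal, self-contained sketch of the corrected predicate (names mirror the diff; the helper itself is hypothetical):

# Hypothetical sketch of the predicate fixed in this commit: the old guard
# compared key_length to itself, a tautological False, so fast-forwarding
# was skipped on the steps after the prefill.
def needs_fast_forward(key_length: int, input_length: int, key_length_step: int,
                       use_cache: bool, do_prefill: bool) -> bool:
    # Old (buggy): key_length > key_length  -> always False.
    # New: fast-forward on any step past the initial prefill length.
    return (key_length_step > 1 and key_length > input_length) or not use_cache or not do_prefill

# The first iteration (key_length == input_length) stays on the normal path;
# later iterations fast-forward the KV cache to the current key length.
for key_length in range(8, 16, 2):
    print(key_length, needs_fast_forward(key_length, 8, 2, True, True))
# -> 8 False, 10 True, 12 True, 14 True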
