Commit 18f4c6e

vllm-project#14: Add trace_mode option to TTWorker and TTModelRunner, update perf measurement to decode multiple tokens
Signed-off-by: Salar Hosseini <skhorasgani@tenstorrent.com>
1 parent 678885a commit 18f4c6e

File tree: 4 files changed (+74, -10 lines)

examples/offline_inference_tt.py

Lines changed: 13 additions & 5 deletions

@@ -64,7 +64,9 @@ def run_inference_perf(
     print("Measuring performance with dummy prompts of length", input_prompt_len)
     prompt_token_ids = [[0]*input_prompt_len]*max_seqs_in_batch # dummy prompts
     sampling_params = sampling_params[:max_seqs_in_batch] if isinstance(sampling_params, list) else sampling_params
-    sampling_params.max_tokens = 2 # 1 prefill output token + 1 decode output token
+
+    # Set an arbitrary max_tokens to simulate generating multiple tokens consecutively
+    sampling_params.max_tokens = 33 # 1 prefill output token + 32 decode output tokens

     # Compile run
     print("Starting compile run")

@@ -74,8 +76,8 @@ def run_inference_perf(

     # Inference runs
     print("Starting inference runs")
-    N_warmup = 5
-    N_inference = 15
+    N_warmup = 1
+    N_inference = 5
     for i in tqdm(range(N_inference), desc="Inference runs"):
         if i == N_warmup: # Reset stats after warmup
             llm.llm_engine.stat_loggers['global'].reset()

@@ -105,7 +107,13 @@ def generate_tokens(llm : LLM, prompts, sampling_params, prompt_token_ids=None,
 parser = argparse.ArgumentParser()
 parser.add_argument("--prompts_json", type=str, default="tt_metal/prompts.json", help="Path to JSON file containing prompts")
 parser.add_argument("--measure_perf", action="store_true", help="Measure performance")
-parser.add_argument("--perf_prompt_len", type=int, default=127, help="Length of dummy prompts for performance measurement")
+parser.add_argument("--perf_prompt_len", type=int, default=128, help="Length of dummy prompts for performance measurement")
+parser.add_argument("--greedy_sampling", action="store_true", help="Use greedy decoding instead of top-k/p")
 args = parser.parse_args()

-run_inference(args.prompts_json, measure_perf=args.measure_perf, perf_prompt_len=args.perf_prompt_len)
+run_inference(
+    args.prompts_json,
+    measure_perf=args.measure_perf,
+    perf_prompt_len=args.perf_prompt_len,
+    greedy_sampling=args.greedy_sampling
+)
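Note: the new --greedy_sampling flag is only plumbed into run_inference here; the hunk does not show how it becomes sampling parameters. A minimal sketch of one plausible mapping, assuming vLLM's standard SamplingParams class (temperature=0.0 gives argmax decoding); the helper name and the top-k/top-p defaults are illustrative, not part of this commit:

from vllm import SamplingParams

def make_sampling_params(greedy_sampling: bool, max_tokens: int = 33) -> SamplingParams:
    # Hypothetical helper: map the CLI flag onto vLLM sampling parameters
    if greedy_sampling:
        # temperature=0.0 makes vLLM always pick the highest-probability token
        return SamplingParams(temperature=0.0, max_tokens=max_tokens)
    # otherwise keep stochastic top-k/top-p sampling (values here are placeholders)
    return SamplingParams(temperature=1.0, top_k=10, top_p=0.9, max_tokens=max_tokens)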

tt_metal/README.md

Lines changed: 1 addition & 1 deletion

@@ -2,7 +2,7 @@
 ## vLLM and tt-metal Branches
 Git-checkout the following branches in each repo separately:
 - vLLM branch: [dev](https://github.com/tenstorrent/vllm/tree/dev) (last verified commit: [3f7beb2](https://github.com/tenstorrent/vllm/tree/3f7beb23cbaf3be2e104061905da5f91644e5a68))
-- tt-metal branch: [main](https://github.com/tenstorrent/tt-metal) (last verified commit: [f521af0](https://github.com/tenstorrent/tt-metal/tree/f521af0061bf53567942b7a27fd89aa300ec16ce))
+- tt-metal branch: [main](https://github.com/tenstorrent/tt-metal) (last verified commit: [f0b2483](https://github.com/tenstorrent/tt-metal/tree/f0b2483529a55d1101eb142ae1c70eec5260ecf7))

 ## Environment Creation

vllm/worker/tt_model_runner.py

Lines changed: 50 additions & 2 deletions

@@ -92,6 +92,7 @@ def __init__(
         device_config: DeviceConfig,
         cache_config: CacheConfig,
         load_config: LoadConfig,
+        trace_mode: bool = True,
     ):
         self.model_config = model_config
         self.parallel_config = parallel_config

@@ -105,6 +106,9 @@ def __init__(
         self.sliding_window = model_config.get_sliding_window()
         self.block_size = cache_config.block_size

+        self.trace_mode = trace_mode # whether to use ttnn tracing for model execution
+        self.execute_trace_kwargs = None # kw args for trace execution (populated during first decode execution)
+
     def load_model(self) -> None:
         # Note: using custom TT loader instead of selecting from default vllm loaders
         loader = TTModelLoader(self.load_config)
@@ -234,6 +238,13 @@ def prepare_model_input(
                 block_tables,
                 torch.zeros(batch_pad_len, block_tables.shape[1], dtype=torch.int32, device="cpu")
             ])
+
+        # Pad block_tables to max num blocks so ttnn tracing can work (requires constant shape)
+        if self.trace_mode:
+            block_tables = torch.cat([
+                block_tables,
+                torch.zeros(block_tables.shape[0], self.cache_config.num_gpu_blocks - block_tables.shape[1], dtype=torch.int32, device="cpu")
+            ], dim=1)

         return TTModelInput(input_tokens, input_positions, prompt_lens, seq_groups, block_tables, unpadded_batch_size, tt_sampling_params)
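Since ttnn trace replay requires every input tensor to keep the exact shape it had at capture time, the block table is padded out to the device's total block count on every decode step, regardless of how many blocks each sequence actually uses. A small self-contained sketch of that padding, with made-up sizes (batch of 4, 8 allocated blocks, 64 total device blocks) standing in for the runner's real block tables and cache_config.num_gpu_blocks:

import torch

num_device_blocks = 64  # assumed total number of KV-cache blocks on the device
block_tables = torch.randint(0, num_device_blocks, (4, 8), dtype=torch.int32)  # 4 seqs x 8 blocks each

# Right-pad with zeros so the width is always num_device_blocks
pad_cols = num_device_blocks - block_tables.shape[1]
padded = torch.cat([
    block_tables,
    torch.zeros(block_tables.shape[0], pad_cols, dtype=torch.int32),
], dim=1)

assert padded.shape == (4, num_device_blocks)  # shape no longer depends on per-sequence block usage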
@@ -257,7 +268,35 @@ def execute_model(
             "prompt_lens": model_input.prompt_lens,
         }

-        logits = self.model.forward(**execute_model_kwargs) # [batch_size, seq_len, vocab_size]
+        is_decode = model_input.prompt_lens is None
+
+        if self.trace_mode and is_decode: # Trace mode for decode
+            # Remove prompt_lens from execute_model_kwargs since it's not used for decode
+            execute_model_kwargs.pop("prompt_lens")
+
+            # Capture trace for the first decode execution
+            if self.execute_trace_kwargs is None:
+                logger.info("Capturing trace for first decode execution")
+                trace_id, tt_inp, rot_mat, cache_idxs_tt, tt_logits, tt_page_table = self.model.capture_trace(
+                    **execute_model_kwargs
+                )
+                self.execute_trace_kwargs = {
+                    "trace_id": trace_id,
+                    "tt_inp": tt_inp,
+                    "rot_mat": rot_mat,
+                    "cache_idxs_tt": cache_idxs_tt,
+                    "tt_logits": tt_logits,
+                    "tt_page_table": tt_page_table,
+                }
+
+            # Remove kv_cache from execute_model_kwargs since it doesn't need to be copied to device for trace execution
+            execute_model_kwargs.pop("kv_cache")
+
+            logits = self.model.decode_forward_trace(
+                **execute_model_kwargs, **self.execute_trace_kwargs
+            )
+        else:
+            logits = self.model.forward(**execute_model_kwargs) # [batch_size, seq_len, vocab_size]

         # Note: for other devices, vLLM applies vllm.model_executor.layers.logits_processor::LogitsProcessor::_apply_logits_processors on logits, we don't use this
         # Note: for other devices, vLLM applies vllm.model_executor.layers.sampler::Sampler for sampling tokens, we don't use this
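The model-side methods used above (capture_trace, decode_forward_trace, and delete_trace further down) belong to the tt-metal model wrapper and are not part of this diff. As rough orientation only, a hedged sketch of the capture-once / replay-many pattern they presumably wrap, written against ttnn's generic tracing calls (begin_trace_capture, end_trace_capture, execute_trace, release_trace, copy_host_to_device_tensor); treat those call names as assumptions, and run_decode_graph and the input-update step as placeholders rather than this model's actual implementation:

import ttnn

def capture_decode_trace(device, device_inputs):
    # Run the decode graph once inside a capture region; the commands are recorded for later replay
    trace_id = ttnn.begin_trace_capture(device, cq_id=0)
    device_logits = run_decode_graph(device_inputs)  # placeholder for the model's decode forward pass
    ttnn.end_trace_capture(device, trace_id, cq_id=0)
    return trace_id, device_logits  # keep the output tensor alive so it can be read back after each replay

def replay_decode_trace(device, trace_id, device_inputs, new_host_inputs):
    # Overwrite the persistent device tensors in place; shapes must match the captured ones
    for device_tensor, host_tensor in zip(device_inputs, new_host_inputs):
        ttnn.copy_host_to_device_tensor(host_tensor, device_tensor)
    ttnn.execute_trace(device, trace_id, cq_id=0, blocking=False)

def delete_decode_trace(device, trace_id):
    ttnn.release_trace(device, trace_id)  # free the captured command buffers (cf. delete_trace in the destructor below)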
@@ -292,4 +331,13 @@ def _validate_sampling_params(self, sampling_params):
         assert sampling_params.best_of == 1, "Currently only supporting best_of=1"
         assert not sampling_params.use_beam_search, "Currently not supporting beam search"
         assert sampling_params.logprobs is None, "Currently not supporting logprobs"
-        assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs"
+        assert sampling_params.prompt_logprobs is None, "Currently not supporting prompt_logprobs"
+
+    ## Destructor (used to delete ttnn trace if using trace mode)
+
+    def __del__(self):
+        if self.trace_mode and self.execute_trace_kwargs is not None:
+            self.model.delete_trace(self.execute_trace_kwargs["trace_id"])
+
+        if hasattr(super(TTModelRunner, self), '__del__'):
+            super().__del__()

vllm/worker/tt_worker.py

Lines changed: 10 additions & 2 deletions

@@ -181,13 +181,16 @@ def __init__(
         self.cache_dtype = STR_DTYPE_TO_TORCH_DTYPE[
             self.cache_config.cache_dtype]

+        self.trace_mode = True # whether to use ttnn tracing for model execution, TODO: make this configurable
+
         self.model_runner: TTModelRunner = TTModelRunner(
             model_config,
             parallel_config,
             scheduler_config,
             device_config,
             cache_config,
-            load_config
+            load_config,
+            trace_mode=self.trace_mode,
         )

         self.cache_engine: List[TTCacheEngine]
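trace_mode is hard-coded to True here, with a TODO to make it configurable. Purely as an illustration of that TODO, one possible wiring via an environment variable; the variable name VLLM_TT_TRACE_MODE is made up and not part of this commit:

import os

def _get_trace_mode(default: bool = True) -> bool:
    # Hypothetical: let an env var override the default instead of hard-coding True
    value = os.environ.get("VLLM_TT_TRACE_MODE")
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes")

# e.g. self.trace_mode = _get_trace_mode()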
@@ -371,7 +374,10 @@ def _get_dispatch_core_type(self):
         return dispatch_core_type

     def _open_t3k_mesh_device(self):
-        device_params = {}
+        if self.trace_mode:
+            device_params = {"trace_region_size": 14227456} # TODO: make this configurable
+        else:
+            device_params = {}
         mesh_device = ttnn.open_mesh_device(
             ttnn.MeshShape(2, 4),
             dispatch_core_type=self._get_dispatch_core_type(),
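Roughly speaking, trace_region_size reserves a region of device memory for the captured command stream, so it must be large enough to hold the traced decode graph; here it is hard-coded (with a TODO) to a value presumably sized for this model. The closing lines of the open_mesh_device call are outside the hunk; presumably the dict is forwarded as keyword arguments, along these lines (the **device_params forwarding is an assumption, not shown in the diff):

import ttnn

def open_t3k_mesh_device(trace_mode: bool, dispatch_core_type):
    device_params = {"trace_region_size": 14227456} if trace_mode else {}
    return ttnn.open_mesh_device(
        ttnn.MeshShape(2, 4),  # 2x4 T3000 mesh, as in the diff
        dispatch_core_type=dispatch_core_type,
        **device_params,  # extra device options such as trace_region_size
    )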
@@ -398,6 +404,8 @@ def _enable_async_mode(self):
     ## Destructor (used to close devices)

     def __del__(self):
+        del self.model_runner # Delete model runner first in case there are model artifacts (e.g. ttnn trace)
+
         if self.mesh_device:
             devices = self.mesh_device.get_devices()
