diff --git a/examples/offline_profile.py b/examples/offline_profile.py
index 054c438036eb0..1c95b5bed451c 100644
--- a/examples/offline_profile.py
+++ b/examples/offline_profile.py
@@ -36,7 +36,10 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
         print(f"  {key} = {value}")
 
     # Create sampling params
-    sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=8)
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=8,
+                                     ignore_eos=True)
 
     # Create LLM
     llm = LLM(
@@ -74,14 +77,14 @@ def run_profile(context: ProfileContext, csv_output: Optional[str],
         sys.exit(-1)
 
     for i in range(batch_size):
+        prompt_token_ids = torch.randint(
+            llm.llm_engine.model_config.get_vocab_size(),
+            size=(prompt_len, )).tolist()
+
         llm.llm_engine.add_request(
             request_id=f"seq{i}",
-            prompt=None,
-            prompt_token_ids=torch.randint(
-                128,  # 128 to skip over special tokens
-                llm.llm_engine.model_config.get_vocab_size() // 2,
-                size=(prompt_len, )).tolist(),
-            sampling_params=sampling_params)
+            inputs={'prompt_token_ids': prompt_token_ids},
+            params=sampling_params)
 
     with nm_profile() as prefill_prof:
         llm.llm_engine.step()  # First step is prefill
diff --git a/vllm/profiler/nm_profile.py b/vllm/profiler/nm_profile.py
index bc357978bec54..0cc98cd17f40e 100644
--- a/vllm/profiler/nm_profile.py
+++ b/vllm/profiler/nm_profile.py
@@ -191,7 +191,8 @@ def _get_kineto_gpu_event(self, node: _ModuleTreeNode):
         correlated_kineto_events = self._kineto_event_correlation_map.get(
             node.event.correlation_id, [])
         iterator = (x for x in correlated_kineto_events
-                    if x.device_type() == DeviceType.CUDA)
+                    if x.device_type() == DeviceType.CUDA
+                    and x.name() == node.event.name)
         return next(iterator, None)
 
     def _cumulative_cuda_time(self, node: _ModuleTreeNode):
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index a321eafce1a2f..ff8df9b3ea56e 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,3 +1,5 @@
+# This file has been modified by Neural Magic
+
 import gc
 import time
 import warnings
@@ -986,9 +988,13 @@ def vocab_size(self) -> int:
         return self.model_config.get_vocab_size()
 
 
-class CUDAGraphRunner:
+# NOTE: this is an nn.Module so the profiler can properly capture/group
+# kernel calls made within the graph
+class CUDAGraphRunner(nn.Module):
 
     def __init__(self, model: nn.Module):
+        super().__init__()
+
         self.model = model
         self.input_buffers: Dict[str, torch.Tensor] = {}
         self.output_buffers: Dict[str, torch.Tensor] = {}
@@ -1084,9 +1090,6 @@ def forward(
         # Return the output tensor.
         return self.output_buffers["hidden_states"]
 
-    def __call__(self, *args, **kwargs):
-        return self.forward(*args, **kwargs)
-
 
 def _get_graph_batch_size(batch_size: int) -> int:
     """Returns the padded batch size given actual batch size.
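
For context on the `examples/offline_profile.py` hunks: the request-submission code is updated to vLLM's newer `LLMEngine.add_request` signature (an `inputs` dict plus `params`, instead of separate `prompt`/`prompt_token_ids`/`sampling_params` arguments), prompts are sampled from the full vocabulary rather than an arbitrary slice, and `ignore_eos=True` keeps every sequence decoding for exactly `max_tokens` steps, which makes profiling runs comparable. A minimal sketch of the new call pattern, assuming a vLLM build with this `add_request` signature (the model name, batch size, and prompt length are placeholders):

```python
import torch
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model
batch_size, prompt_len = 4, 128       # placeholder sizes

sampling_params = SamplingParams(temperature=0.8,
                                 top_p=0.95,
                                 max_tokens=8,
                                 ignore_eos=True)  # never stop early on EOS

for i in range(batch_size):
    # Random prompt over the full vocab; token ids are passed directly,
    # so no tokenizer round-trip is needed.
    prompt_token_ids = torch.randint(
        llm.llm_engine.model_config.get_vocab_size(),
        size=(prompt_len, )).tolist()
    llm.llm_engine.add_request(
        request_id=f"seq{i}",
        inputs={"prompt_token_ids": prompt_token_ids},
        params=sampling_params)

llm.llm_engine.step()  # the first step runs prefill for all queued requests
```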
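
The `nm_profile.py` hunk tightens kineto event matching: one correlation id can map to several events (for example, the CPU-side `cudaLaunchKernel` call plus the GPU kernel it launched), so filtering on `DeviceType.CUDA` alone may return an unrelated event when several share an id. Matching the event name as well disambiguates. A self-contained sketch of that filtering logic; the `FakeKinetoEvent` stub and the event names are illustrative, not part of the PR:

```python
from dataclasses import dataclass
from enum import Enum, auto


class DeviceType(Enum):  # stand-in for torch's DeviceType
    CPU = auto()
    CUDA = auto()


@dataclass
class FakeKinetoEvent:  # stand-in for torch's kineto event objects
    _name: str
    _device: DeviceType

    def name(self) -> str:
        return self._name

    def device_type(self) -> DeviceType:
        return self._device


# Several events recorded under one correlation id.
correlated_kineto_events = [
    FakeKinetoEvent("cudaLaunchKernel", DeviceType.CPU),
    FakeKinetoEvent("some_other_kernel", DeviceType.CUDA),
    FakeKinetoEvent("gemm_kernel", DeviceType.CUDA),
]

target_name = "gemm_kernel"
# Device type alone would pick "some_other_kernel"; adding the name
# check (as in the patched _get_kineto_gpu_event) picks the right one.
iterator = (x for x in correlated_kineto_events
            if x.device_type() == DeviceType.CUDA
            and x.name() == target_name)
print(next(iterator, None))  # FakeKinetoEvent(_name='gemm_kernel', ...)
```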
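
The `model_runner.py` change works because PyTorch's profiler can only attribute ops to objects that route calls through `nn.Module.__call__`: the old `CUDAGraphRunner` was a plain class with a hand-written `__call__`, so kernels replayed inside its graph were never grouped under it. Subclassing `nn.Module` (and deleting the manual `__call__` so the inherited one runs) fixes that. A standalone illustration of the difference, not taken from the PR; it assumes a PyTorch version whose profiler records eager-mode module calls under `with_modules=True`:

```python
import torch
import torch.nn as nn
from torch.profiler import profile, ProfilerActivity


class PlainRunner:
    """Plain callable: the profiler never sees the wrapper itself."""

    def __init__(self, model: nn.Module):
        self.model = model

    def __call__(self, x):
        return self.model(x)


class ModuleRunner(nn.Module):
    """nn.Module wrapper: calls route through nn.Module.__call__,
    so the profiler can group ops under this module."""

    def __init__(self, model: nn.Module):
        super().__init__()
        self.model = model

    def forward(self, x):
        return self.model(x)


model = nn.Linear(64, 64)
x = torch.randn(8, 64)

for runner in (PlainRunner(model), ModuleRunner(model)):
    with profile(activities=[ProfilerActivity.CPU],
                 with_modules=True) as prof:
        runner(x)
    module_events = [e.name for e in prof.events()
                     if e.name.startswith("nn.Module")]
    # Both runs show the inner Linear; only ModuleRunner also appears
    # as its own "nn.Module: ModuleRunner_0"-style entry.
    print(type(runner).__name__, module_events)
```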