from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type, Union

import torch

from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, ModelConfig,
                         ParallelConfig, SchedulerConfig)
from vllm.logger import init_logger
from vllm.model_executor.layers.sampler import SamplerOutput
from vllm.model_executor.model_loader.tt_loader import TTModelLoader
from vllm.sequence import (CompletionSequenceGroupOutput, IntermediateTensors,
                           Logprob, SequenceGroupMetadata, SequenceOutput)
from vllm.worker.model_runner_base import (ModelRunnerBase,
                                           ModelRunnerInputBase)

if TYPE_CHECKING:
    from vllm.attention.backends.abstract import AttentionBackend

logger = init_logger(__name__)


@dataclass(frozen=True)
class TTModelInput(ModelRunnerInputBase):
    """
    Used by the TTModelRunner.
    """
    input_tokens: Optional[torch.Tensor] = None
    input_positions: Optional[torch.Tensor] = None
    prompt_lens: Optional[torch.Tensor] = None
    seq_groups: Optional[List[List[int]]] = None

    def as_broadcastable_tensor_dict(
            self) -> Dict[str, Union[int, torch.Tensor]]:
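        """Pack the model input into a dict that vLLM can broadcast from the
        driver worker to the other workers (standard ModelRunnerInputBase
        contract)."""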
        tensor_dict = {
            "input_tokens": self.input_tokens,
            "input_positions": self.input_positions,
            "prompt_lens": self.prompt_lens,
            "seq_groups": self.seq_groups,
        }

        return tensor_dict

    @classmethod
    def from_broadcasted_tensor_dict(
        cls: Type["TTModelInput"],
        tensor_dict: Dict[str, Any],
    ) -> "TTModelInput":
        return cls(**tensor_dict)


class TTModelRunner(ModelRunnerBase[TTModelInput]):

    def __init__(
        self,
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        cache_config: CacheConfig,
        load_config: LoadConfig,
    ):
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        # Currently, the TT worker doesn't support chunked prefill.
        assert self.scheduler_config.chunked_prefill_enabled is False
        self.device_config = device_config
        self.cache_config = cache_config
        self.load_config = load_config

        self.device = self.device_config.device

        self.sliding_window = model_config.get_sliding_window()
        self.block_size = cache_config.block_size

    def load_model(self) -> None:
        # Note: use the custom TT loader instead of selecting from the
        # default vLLM loaders.
        loader = TTModelLoader(self.load_config)
        self.model = loader.load_model(model_config=self.model_config,
                                       device_config=self.device_config,
                                       parallel_config=self.parallel_config,
                                       scheduler_config=self.scheduler_config,
                                       cache_config=self.cache_config)

    def make_model_input_from_broadcasted_tensor_dict(
        self,
        tensor_dict: Dict[str, Any],
    ) -> TTModelInput:
        return TTModelInput.from_broadcasted_tensor_dict(tensor_dict)

    def prepare_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        virtual_engine: int = 0,
        finished_requests_ids: Optional[List[str]] = None
    ) -> TTModelInput:
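        """Flatten the scheduler output into a TTModelInput.

        The batch must be homogeneous: for prefill, all prompt tokens and
        positions are flattened and prompt_lens is set; for decode, there is
        one last token and one position per sequence and prompt_lens is None.
        """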
        # NOTE: We assume the batch is homogeneous: either all sequence
        # groups are prefills or all are decodes.
        # Prefill if True, otherwise decode.
        is_prompt = seq_group_metadata_list[0].is_prompt
        assert all(
            x.is_prompt == is_prompt for x in seq_group_metadata_list
        ), "Currently only supporting all prefills or all decodes in seq group"

        batch_size = len(seq_group_metadata_list)
        assert batch_size > 0

        input_tokens: List[int] = []
        input_positions: List[int] = []
        prompt_lens: List[int] = []

        for seq_group_metadata in seq_group_metadata_list:
            seq_ids = list(seq_group_metadata.seq_data.keys())
            # Only support one sequence per request group.
            assert len(seq_ids) == 1
            seq_id = seq_ids[0]

            seq_data = seq_group_metadata.seq_data[seq_id]

            if is_prompt:
                # tokens
                prompt_tokens = seq_data.get_token_ids()
                input_tokens.extend(prompt_tokens)

                # positions
                prompt_len = len(prompt_tokens)
                prompt_lens.append(prompt_len)
                input_positions.extend(list(range(prompt_len)))
            else:
                # tokens
                generation_token = seq_data.get_last_token_id()
                input_tokens.append(generation_token)

                # positions
                position = seq_data.get_len() - 1
                input_positions.append(position)

            # TODO: Get block table using
            # seq_group_metadata.block_tables[seq_id]

        input_tokens = torch.tensor(input_tokens,
                                    dtype=torch.int32,
                                    device="cpu")
        input_positions = torch.tensor(input_positions,
                                       dtype=torch.int32,
                                       device="cpu")
        if is_prompt:
            prompt_lens = torch.tensor(prompt_lens,
                                       dtype=torch.int32,
                                       device="cpu")
        else:
            prompt_lens = None

        seq_groups = [
            list(metadata.seq_data.keys())
            for metadata in seq_group_metadata_list
        ]

        return TTModelInput(input_tokens, input_positions, prompt_lens,
                            seq_groups)

    @torch.no_grad()
    def execute_model(
        self,
        model_input: TTModelInput,
        kv_caches: List[torch.Tensor],
        intermediate_tensors: Optional[IntermediateTensors] = None,
        num_steps: int = 1,
    ) -> Optional[List[SamplerOutput]]:
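        """Run one prefill or decode step on the TT device, greedily sample
        one next token per sequence, and wrap the result in a single
        SamplerOutput.
        """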
        if num_steps > 1:
            raise ValueError(
                "TT worker does not support multi-step execution.")

        # prompt_lens is only set for prefill batches.
        is_prompt = model_input.prompt_lens is not None

        if is_prompt:
            input_position = 0
            # Currently only support the same prompt length across the batch.
            assert torch.all(
                model_input.prompt_lens == model_input.prompt_lens[0]
            ), "Currently only supporting same prompt lengths for prefill"
            batch_size = model_input.prompt_lens.shape[0]
        else:
            # Currently only support the same decode position across the
            # batch.
            input_position = model_input.input_positions[0].item()
            assert torch.all(
                model_input.input_positions == input_position
            ), "Currently only supporting same input positions for decode"
            batch_size = model_input.input_tokens.shape[0]

        input_tokens = model_input.input_tokens.view(batch_size, -1)

        execute_model_kwargs = {
            "tokens": input_tokens,
            "start_pos": input_position,
            # TODO: Add block table and maybe kv cache
        }

        # [batch_size, seq_len, vocab_size]
        logits = self.model.forward(**execute_model_kwargs)

        # Note: for other devices, vLLM applies LogitsProcessor
        # (vllm.model_executor.layers.logits_processor) to the logits and
        # samples tokens with Sampler (vllm.model_executor.layers.sampler);
        # neither is used here.
        next_logits = logits[:, -1, :]  # last-token logits per sequence
        # Convert to Python ints so the engine gets plain token ids.
        next_token_ids = self._sample_tokens(next_logits).tolist()

        # Minimal code to construct the sampler outputs, based on
        # tpu_model_runner.py. The TT backend does not support advanced
        # sampling parameters such as logprobs.
        zero_logprob = Logprob(0.0)
        sampler_outputs = []
        for batch_idx, seq_ids in enumerate(model_input.seq_groups):
            # Only support one sequence per request group.
            assert len(seq_ids) == 1
            next_token_id = next_token_ids[batch_idx]
            seq_outputs = [
                SequenceOutput(seq_ids[0], next_token_id,
                               {next_token_id: zero_logprob})
            ]
            sampler_outputs.append(
                CompletionSequenceGroupOutput(seq_outputs, None))
        return [SamplerOutput(sampler_outputs)]

    def _sample_tokens(self, logits):
        # TODO: Add other sampling methods; currently only greedy sampling.
        return torch.argmax(logits, dim=-1)
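
# Illustrative usage sketch only (not part of the original module): a TT
# worker that owns this runner would typically drive it roughly as follows.
# The names `runner`, `seq_group_metadata_list`, and the empty kv_caches list
# are placeholders/assumptions, not definitions from this file.
#
#   runner = TTModelRunner(model_config, parallel_config, scheduler_config,
#                          device_config, cache_config, load_config)
#   runner.load_model()
#   model_input = runner.prepare_model_input(seq_group_metadata_list)
#   sampler_outputs = runner.execute_model(model_input, kv_caches=[])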