[Model] MLPSpeculator speculative decoding support #4947

Merged 42 commits on Jun 21, 2024

Changes from 7 commits

Commits (42):
efb0599
initial commit of mlp_speculator and hidden_states_worker to support …
JRosenkranz May 20, 2024
7a8eeff
Merge branch 'main' into mlp_speculator
JRosenkranz May 20, 2024
667ef88
removed fms_extras import
JRosenkranz May 20, 2024
d534ef2
updated with a working non-batch version - a lot hardcoded
JRosenkranz May 21, 2024
17541b6
updated experimental with working version - eager
JRosenkranz May 22, 2024
ac5a1da
fixed bug with speculator outputs
JRosenkranz May 22, 2024
6ba9a1e
removed comments; swapped to sampling in the example
JRosenkranz May 22, 2024
cb3aacf
Introduce MLPSpeculatorWorker and corresponding refactor
tdoublep May 27, 2024
bf2f102
Fix some issues with correctness + simplify API a bit
tdoublep May 27, 2024
6af4629
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill May 31, 2024
e0309a6
Fix typing and formatting
njhill May 31, 2024
abd42e7
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 1, 2024
314f2ae
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 5, 2024
9dd1c50
Remove separate MLPSpeculatorModelRunner and other cleanup
njhill Jun 5, 2024
0d43097
Use sample_len in mlp_speculator
njhill Jun 5, 2024
9dd1608
Some more rework/simplification, still in progress
njhill Jun 6, 2024
ea677bd
Config cleanup
njhill Jun 7, 2024
b39c94f
Ignore weird mypi error only happening in CI
njhill Jun 7, 2024
ab96c2a
Try again to ignore weird ruff error
njhill Jun 7, 2024
e9af7e5
Try to ignore both ruff and mypy errs
njhill Jun 7, 2024
30dc5e6
yapf
njhill Jun 7, 2024
3a61052
Fix leftover HiddenStatesWorker references
njhill Jun 7, 2024
cc05972
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 7, 2024
693974e
Fix AutoConfig import, mlp spec worker docstring
njhill Jun 7, 2024
f1bafba
Some cleanup/simplification
njhill Jun 7, 2024
455b9a9
Rework handling of accepted tokens
njhill Jun 7, 2024
e583ae9
Filter hidden states in Top1Proposer when needed
njhill Jun 9, 2024
7bff0d1
Enable bonus token
njhill Jun 9, 2024
3d04037
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 9, 2024
bea97d7
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 11, 2024
3012553
Move hidden state logic to separate class
njhill Jun 11, 2024
b116e02
Default num_speculative_tokens based on speculator model config
njhill Jun 15, 2024
e7742e7
Move offline_inference example to separate file
njhill Jun 15, 2024
ee83331
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 15, 2024
444a709
ruff
njhill Jun 15, 2024
bb9fd32
Add comment per review
njhill Jun 15, 2024
fcc6606
Some simplification to MLPSpeculatorWorker._prepare_input_tensors
njhill Jun 15, 2024
ffc0bcf
Merge remote-tracking branch 'refs/remotes/origin/main' into mlp_spec…
njhill Jun 17, 2024
f3dc40a
Add check for TP == 1; TP support will be a fast-follow
njhill Jun 17, 2024
1b7e305
Fix test import
njhill Jun 17, 2024
46ceacd
Revert unrelated commit made by mistake
njhill Jun 17, 2024
d9ce339
Fix test mocks
njhill Jun 18, 2024
20 changes: 16 additions & 4 deletions examples/offline_inference.py
@@ -1,20 +1,32 @@
from transformers import AutoConfig

from fms_extras.models.hf.modeling_mlp_speculator import MLPSpeculatorConfig
from vllm import LLM, SamplingParams
AutoConfig.register("mlp_speculator", MLPSpeculatorConfig)

template = "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:"

# Sample prompts.
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
# "The president of the United States is",
# "The capital of France is",
# "The future of AI is",
]
prompts = [template.format(prompt) for prompt in prompts]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Create an LLM.
llm = LLM(model="facebook/opt-125m")
llm = LLM(model="ibm-granite/granite-7b-instruct", use_v2_block_manager=True, enforce_eager=True, speculative_model="ibm-granite/granite-7b-instruct-accelerator", num_speculative_tokens=5)
# Generate texts from the prompts. The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
import time
outputs = llm.generate(prompts, sampling_params)
start = time.time()
outputs = llm.generate(prompts, sampling_params)
end = time.time()
print((end-start) / sum([len(o.outputs[0].token_ids) for o in outputs]))
# Print the outputs.
for output in outputs:
prompt = output.prompt
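Since the old and new lines are interleaved above without +/- markers, the updated example reads roughly as follows. The model names, flags, and timing code are taken from the diff itself; the remaining lines are the standard offline-inference boilerplate, so treat this as a sketch rather than the exact committed file.

from transformers import AutoConfig
from fms_extras.models.hf.modeling_mlp_speculator import MLPSpeculatorConfig

from vllm import LLM, SamplingParams

# Register the speculator config class so transformers can resolve "mlp_speculator".
AutoConfig.register("mlp_speculator", MLPSpeculatorConfig)

template = ("Below is an instruction that describes a task. Write a response that "
            "appropriately completes the request.\n\n### Instruction:\n{}\n\n### Response:")

prompts = ["Hello, my name is"]
prompts = [template.format(prompt) for prompt in prompts]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Target model plus MLP speculator; tensor parallelism is limited to TP=1 at this stage of the PR.
llm = LLM(model="ibm-granite/granite-7b-instruct",
          use_v2_block_manager=True,
          enforce_eager=True,
          speculative_model="ibm-granite/granite-7b-instruct-accelerator",
          num_speculative_tokens=5)

import time
start = time.time()
outputs = llm.generate(prompts, sampling_params)
end = time.time()

# Rough per-token latency over everything that was generated.
print((end - start) / sum(len(o.outputs[0].token_ids) for o in outputs))

for output in outputs:
    print(output.prompt, output.outputs[0].text)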
2 changes: 1 addition & 1 deletion vllm/config.py
@@ -935,7 +935,7 @@ def _verify_args(self) -> None:
raise ValueError("Expected num_speculative_tokens to be greater "
f"than zero ({self.num_speculative_tokens}).")

if self.draft_model_config:
if self.draft_model_config and self.draft_model_config.hf_config.model_type != "mlp_speculator":
self.draft_model_config.verify_with_parallel_config(
self.draft_parallel_config)

1 change: 1 addition & 0 deletions vllm/model_executor/models/__init__.py
@@ -56,6 +56,7 @@
"Starcoder2ForCausalLM": ("starcoder2", "Starcoder2ForCausalLM"),
"ArcticForCausalLM": ("arctic", "ArcticForCausalLM"),
"XverseForCausalLM": ("xverse", "XverseForCausalLM"),
"MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}

_EMBEDDING_MODELS = {
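The new registry entry maps the architecture string found in the speculator checkpoint's config (MLPSpeculatorPreTrainedModel) to the module and class vLLM should load for it. Reduced to its essentials, the lazy-lookup pattern behind this table works like the sketch below; this illustrates the mechanism and is not vLLM's actual registry code.

import importlib

_MODELS = {
    # architecture name from config.json -> (module under vllm.model_executor.models, class name)
    "MLPSpeculatorPreTrainedModel": ("mlp_speculator", "MLPSpeculator"),
}

def resolve_model_cls(architecture: str):
    module_name, cls_name = _MODELS[architecture]
    module = importlib.import_module(f"vllm.model_executor.models.{module_name}")
    return getattr(module, cls_name)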
135 changes: 135 additions & 0 deletions vllm/model_executor/models/mlp_speculator.py
@@ -0,0 +1,135 @@
from typing import Optional, List, Iterable, Tuple

import torch.nn as nn
import torch
import math
from vllm.attention import AttentionMetadata
from vllm.model_executor import SamplingMetadata
from vllm.model_executor.layers.sampler import Sampler
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.sequence import SamplerOutput


class MLPSpeculatorLayerNorm(nn.Module):
"""
A L2 normalization implementation
...
Args
----
normalized_shape : int
Dimensionality of input data (size of final tensor axis)
elementwise_scale_weight : torch.Tensor
learned scaling term after normalization?
elementwise_shift_bias : torch.Tensor
learned bias term after normalization?
eps : float
Safety term to prevent division by zero. Make sure the chosen value fits in the range of your encoding scheme (i.e. fp16 requires eps >= 6e-8).
"""

def __init__(
self,
normalized_shape,
eps=1e-06,
):
super(MLPSpeculatorLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.empty(normalized_shape))
self.bias = nn.Parameter(torch.empty(normalized_shape))
self.eps = eps

def forward(self, x):
xf = x
xf = xf * torch.rsqrt(xf.pow(2).mean(-1, keepdim=True) + self.eps)
x = xf.type_as(x)
x = self.weight * x
x = x + self.bias
return x

class MLPSpeculator(nn.Module):
def __init__(
self,
config,
**kwargs
) -> None:
super().__init__()
self.current_head_index = 0
self.n_predict = config.n_predict
self.vocab_size = config.vocab_size
self.emb_dim = config.emb_dim
self.inner_dim = config.inner_dim if config.inner_dim != 0 else config.emb_dim
self.emb = nn.ModuleList([
VocabParallelEmbedding(config.vocab_size, self.inner_dim, org_num_embeddings=config.vocab_size)
for _ in range(config.n_predict)
])

self.proj = nn.ModuleList([
nn.Linear((self.emb_dim if i == 0 else self.inner_dim), self.inner_dim, bias=False)
for i in range(config.n_predict)
])

self.head = nn.ModuleList([nn.Linear(self.inner_dim, self.vocab_size, bias=False) for _ in range(config.n_predict)])
self.ln = nn.ModuleList([MLPSpeculatorLayerNorm(self.inner_dim) for _ in range(config.n_predict)])

self.state_weight = 0.5 ** (0.5 / config.n_predict)
self.emb_weight = math.sqrt((1 - self.state_weight ** 2) * (self.inner_dim / 2))
self.activation = nn.GELU()
self.config = config
self.logits_processor = LogitsProcessor(config.vocab_size, config.vocab_size, 1.0)
self.sampler = Sampler()

def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
) -> torch.Tensor:
# prune the hidden states
if self.current_head_index == 0:
if self.first_decode_step:
self.first_decode_step = False
else:
self.previous_hidden_state = self.previous_hidden_state.reshape(-1, self.n_predict + 1, self.previous_hidden_state.size(1))
self.previous_hidden_state = self.previous_hidden_state.gather(
1,
(self.accepted_token_lengths - 1)[:, None, None].expand(-1, 1, self.previous_hidden_state.size(2))
).squeeze(1) # b x d

# Project and predict
z = self.emb[self.current_head_index](input_ids[-1]) # b k d
state = self.proj[self.current_head_index](self.previous_hidden_state)
# Weighted add of state_weight*state and emb_weight*z
# Let subsequent LN take care of denominator
# state_weight is close to 1, so shouldn't be any precision issues
state = torch.add(state, z, alpha=self.emb_weight / self.state_weight)
state = self.activation(self.ln[self.current_head_index](state)) # b k d

# todo: not yet supporting top_k_tokens_per_head

self.previous_hidden_state = state
self.current_head_index += 1
return state

def compute_logits(self, hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata) -> torch.Tensor:
current_head_index = self.current_head_index - 1
logits = self.logits_processor(self.head[current_head_index].weight, hidden_states,
sampling_metadata)
return logits

def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
next_tokens = self.sampler(logits, sampling_metadata)
return next_tokens

def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
params_dict = dict(self.named_parameters())
for name, loaded_weight in weights:
param = params_dict[name.replace("speculator.", "")]
weight_loader = getattr(param, "weight_loader",
default_weight_loader)
weight_loader(param, loaded_weight)
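Stripped of the vLLM plumbing, the speculator is a chain of n_predict small heads: each head embeds the most recent token, projects the carried hidden state, combines the two with fixed weights, and applies the RMS-style norm above (x * rsqrt(mean(x^2) + eps), then scale and shift) before predicting the next draft token. A minimal sketch of that loop, with greedy sampling standing in for vLLM's Sampler (the function name and the argmax are illustrative assumptions):

import torch

def propose_draft_tokens(spec, last_token_id: torch.Tensor,
                         hidden_state: torch.Tensor) -> torch.Tensor:
    """Walk the n_predict heads of an MLPSpeculator once.

    spec          -- an MLPSpeculator as defined above
    last_token_id -- token just sampled from the target model, shape (batch,)
    hidden_state  -- the target model's final hidden state, shape (batch, emb_dim)
    """
    draft_tokens = []
    state = hidden_state
    token = last_token_id
    for i in range(spec.n_predict):
        z = spec.emb[i](token)                                   # embed the previous token
        state = spec.proj[i](state)                              # project the carried state
        # Weighted add; the LayerNorm that follows absorbs the overall scale.
        state = state + (spec.emb_weight / spec.state_weight) * z
        state = spec.activation(spec.ln[i](state))
        logits = state @ spec.head[i].weight.t()                 # per-head projection to vocab
        token = logits.argmax(dim=-1)                            # greedy, for illustration only
        draft_tokens.append(token)
    return torch.stack(draft_tokens, dim=1)                      # (batch, n_predict)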
78 changes: 78 additions & 0 deletions vllm/spec_decode/hidden_states_worker.py
@@ -0,0 +1,78 @@
from typing import List, Optional

from vllm.sequence import SequenceGroupMetadata, ExecuteModelRequest, SamplerOutput
from vllm.worker.worker import Worker
import torch

class HiddenStatesWorker(Worker):

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.speculator = None
self.prev_request_context_lengths = {}

def _get_hidden_states(
self,
seq_group_metadata_list: List[SequenceGroupMetadata],
kv_caches: List[torch.Tensor],
):

(input_tokens, input_positions, attn_metadata, sampling_metadata,
lora_requests, lora_mapping, multi_modal_input
) = self.model_runner.prepare_input_tensors(seq_group_metadata_list)

if self.model_runner.lora_config:
self.model_runner.set_active_loras(lora_requests, lora_mapping)

# Currently cuda graph is only supported by the decode phase.
prefill_meta = attn_metadata.prefill_metadata
decode_meta = attn_metadata.decode_metadata
if prefill_meta is None and decode_meta.use_cuda_graph:
graph_batch_size = input_tokens.shape[0]
model_executable = self.model_runner.graph_runners[graph_batch_size]
else:
model_executable = self.model_runner.model
execute_model_kwargs = {
"input_ids": input_tokens,
"positions": input_positions,
"kv_caches": kv_caches,
"attn_metadata": attn_metadata,
}
if self.vision_language_config:
execute_model_kwargs.update({"image_input": multi_modal_input})

# save the previous hidden states for later use
hidden_states = model_executable(**execute_model_kwargs)

# Compute the logits.
logits = self.model_runner.model.compute_logits(hidden_states, sampling_metadata)

# Only perform sampling in the driver worker.
if not self.model_runner.is_driver_worker:
return None

# Sample the next token.
output = self.model_runner.model.sample(
logits=logits,
sampling_metadata=sampling_metadata,
)

return output, hidden_states


@torch.inference_mode()
def execute_model(
self,
execute_model_req: Optional[ExecuteModelRequest] = None
) -> List[SamplerOutput]:

# reset the head to call in speculator
self.speculator.current_head_index = 0

sampler_output, hidden_states = self._get_hidden_states(execute_model_req.seq_group_metadata_list, self.gpu_cache)

# if we are executing the prompt, we need to flag the first decode step since pruning is handled differently
if execute_model_req.seq_group_metadata_list[0].is_prompt:
self.speculator.first_decode_step = True
self.speculator.previous_hidden_state = hidden_states
return [sampler_output]
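One subtle piece of state handling lives back in MLPSpeculator.forward: after a scored speculation round, each sequence carries n_predict + 1 candidate hidden states (one per proposed token plus the bonus position), and only the state belonging to the last accepted token should seed the next round. The gather that does this pruning is easy to misread, so here is a tiny self-contained illustration with made-up shapes and acceptance counts:

import torch

batch, n_predict, hidden = 2, 3, 4
# Flattened hidden states, as the model runner hands them over: (batch * (n_predict + 1), hidden).
states = torch.arange(batch * (n_predict + 1) * hidden, dtype=torch.float32)
states = states.reshape(batch * (n_predict + 1), hidden)
accepted_token_lengths = torch.tensor([1, 3])   # seq 0 accepted 1 token, seq 1 accepted 3

states = states.reshape(-1, n_predict + 1, hidden)                # (batch, n_predict + 1, hidden)
kept = states.gather(
    1,
    (accepted_token_lengths - 1)[:, None, None].expand(-1, 1, hidden),
).squeeze(1)                                                      # (batch, hidden)
print(kept.shape)   # torch.Size([2, 4])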
8 changes: 6 additions & 2 deletions vllm/spec_decode/multi_step_worker.py
@@ -8,6 +8,7 @@
SequenceGroupMetadata)
from vllm.spec_decode.interfaces import SpeculativeProposals
from vllm.spec_decode.top1_proposer import Top1Proposer
from vllm.worker.model_runner import SingleStepSpeculativeModelRunner
from vllm.worker.worker import Worker


@@ -28,6 +29,7 @@ def __init__(self, *args, **kwargs):

# Lazy initialization list.
self._proposer: Top1Proposer
self.requires_kv_cache: bool

def init_device(self):
super().init_device()
@@ -39,6 +41,8 @@ def init_device(self):
max_proposal_len=self.max_model_len,
)

self.requires_kv_cache = not isinstance(self.model_runner, SingleStepSpeculativeModelRunner)

def set_include_gpu_probs_tensor(self):
# Need include_gpu_probs_tensor for multi_step_worker
self.model_runner.model.sampler.include_gpu_probs_tensor = True
@@ -66,8 +70,8 @@ def sampler_output(
copied_seq_group_metadata_list)

# Assert enough KV space for sample_len tokens per sequence.
self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list,
sample_len)
if self.requires_kv_cache:
self._assert_enough_kv_space(execute_model_req.seq_group_metadata_list, sample_len)

# Run model sample_len times.
model_outputs = []
31 changes: 21 additions & 10 deletions vllm/spec_decode/spec_decode_worker.py
@@ -9,6 +9,7 @@
from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
SequenceGroupMetadata)
from vllm.spec_decode.batch_expansion import BatchExpansionTop1Scorer
from vllm.spec_decode.hidden_states_worker import HiddenStatesWorker
from vllm.spec_decode.interfaces import (SpeculativeProposals,
SpeculativeScorer, SpeculativeScores)
from vllm.spec_decode.metrics import AsyncMetricsCollector
@@ -32,7 +33,10 @@ def create_spec_worker(*args, **kwargs) -> "SpecDecodeWorker":
speculative_config = kwargs.get("speculative_config")
assert speculative_config is not None

target_worker = Worker(*args, **kwargs)
if speculative_config.draft_model_config.hf_config.model_type == "mlp_speculator":
target_worker = HiddenStatesWorker(*args, **kwargs)
else:
target_worker = Worker(*args, **kwargs)

draft_worker_kwargs = kwargs.copy()
# Override draft-model specific worker args.
@@ -165,6 +169,8 @@ def init_device(self) -> None:
# NOTE(cade): load_model is not part of the WorkerBase interface.
self.scorer_worker.load_model()
self.proposer_worker.load_model()
if isinstance(self.scorer_worker, HiddenStatesWorker):
self.scorer_worker.speculator = self.proposer_worker.model_runner.model

self._metrics.init_gpu_tensors(self.rank)
self.rejection_sampler.init_gpu_tensors(self.rank)
@@ -212,23 +218,27 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
num_gpu_blocks, num_cpu_blocks = (
self.scorer_worker.determine_num_available_blocks())

scorer_cache_block_size_bytes = (
self.scorer_worker.get_cache_block_size_bytes())
proposer_cache_block_size_bytes = (
self.proposer_worker.get_cache_block_size_bytes())
if not isinstance(self.scorer_worker, HiddenStatesWorker):
scorer_cache_block_size_bytes = (
self.scorer_worker.get_cache_block_size_bytes())
proposer_cache_block_size_bytes = (
self.proposer_worker.get_cache_block_size_bytes())

new_num_gpu_blocks = split_num_cache_blocks_evenly(
scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
num_gpu_blocks)
return new_num_gpu_blocks, num_cpu_blocks
num_gpu_blocks = split_num_cache_blocks_evenly(
scorer_cache_block_size_bytes, proposer_cache_block_size_bytes,
num_gpu_blocks)

return num_gpu_blocks, num_cpu_blocks

def initialize_cache(self, num_gpu_blocks: int,
num_cpu_blocks: int) -> None:
"""Initialize the cache engine of the scorer and proposer workers.
"""
self.scorer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks)
self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks,

if not isinstance(self.scorer_worker, HiddenStatesWorker):
self.proposer_worker.initialize_cache(num_gpu_blocks=num_gpu_blocks,
num_cpu_blocks=num_cpu_blocks)

def _broadcast_control_flow_decision(
Expand Down Expand Up @@ -291,6 +301,7 @@ def execute_model(
# Used for prefill.
if num_lookahead_slots == 0 or len(
execute_model_req.seq_group_metadata_list) == 0:
disable_all_speculation = disable_all_speculation or isinstance(self.scorer_worker, HiddenStatesWorker)
return self._run_no_spec(execute_model_req,
skip_proposer=disable_all_speculation)

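The determine_num_available_blocks change above is the memory-accounting side of the speculator having no KV cache: with a Transformer draft model the GPU blocks must be shared between scorer and proposer, while with the MLP speculator the split is skipped and the target model keeps everything. A rough numeric sketch; split_blocks below is a stand-in with the proportional behaviour assumed of vLLM's split_num_cache_blocks_evenly, and the byte sizes are invented:

def split_blocks(scorer_block_bytes: int, proposer_block_bytes: int,
                 total_blocks: int) -> int:
    # Shrink the block count so that scorer + proposer KV caches fit in the same memory budget.
    return int(total_blocks * scorer_block_bytes /
               (scorer_block_bytes + proposer_block_bytes))

total_blocks = 1200
# Transformer draft model: its KV cache competes with the target's.
print(split_blocks(2_097_152, 1_048_576, total_blocks))   # -> 800 blocks left for the target
# MLP speculator (this PR): no draft KV cache, the split is skipped,
# so the target model keeps all 1200 blocks.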