
Commit 08577f8

WoosukKwon authored and Mu Huai committed
[V1][Spec Decode] Optimize Rejection Sampler with Triton Kernels (vllm-project#14930)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Mu Huai <tianbowen.tbw@antgroup.com>
1 parent b07885d · commit 08577f8

File tree

8 files changed: +898, -431 lines changed


tests/v1/sample/test_rejection_sampler.py

Lines changed: 166 additions & 65 deletions
Large diffs are not rendered by default.
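The test diff is not shown here, but the central invariant a rejection-sampler test suite needs to verify is distribution preservation: accepting a draft token with probability min(1, p_target/p_draft) and resampling rejections from the normalized residual max(p_target - p_draft, 0) must reproduce the target distribution exactly. A self-contained sanity check of that property (illustrative only, with invented distributions; this is not the code in the test file):

import torch

torch.manual_seed(0)
target = torch.tensor([0.1, 0.2, 0.3, 0.4])  # p: target-model distribution
draft = torch.tensor([0.4, 0.3, 0.2, 0.1])   # q: draft-model distribution

def sample_one() -> int:
    # Propose from q; accept with probability min(1, p/q).
    tok = int(torch.multinomial(draft, 1))
    if torch.rand(()) * draft[tok] <= target[tok]:
        return tok
    # On rejection, resample from the normalized residual max(p - q, 0).
    residual = torch.clamp(target - draft, min=0)
    return int(torch.multinomial(residual / residual.sum(), 1))

counts = torch.zeros(4)
for _ in range(20_000):
    counts[sample_one()] += 1

# The empirical distribution should match the target, not the draft.
assert torch.allclose(counts / counts.sum(), target, atol=0.02)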

vllm/envs.py

Lines changed: 0 additions & 1 deletion
@@ -35,7 +35,6 @@
     VLLM_TRACE_FUNCTION: int = 0
     VLLM_ATTENTION_BACKEND: Optional[str] = None
     VLLM_USE_FLASHINFER_SAMPLER: Optional[bool] = None
-    VLLM_USE_FLASHINFER_REJECTION_SAMPLER: bool = False
     VLLM_FLASHINFER_FORCE_TENSOR_CORES: bool = False
     VLLM_PP_LAYER_PARTITION: Optional[str] = None
     VLLM_CPU_KVCACHE_SPACE: int = 0

vllm/v1/outputs.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ class SamplerOutput:
     # [num_reqs, max_num_generated_tokens]
     # Different requests can have different number of generated tokens.
     # All requests are padded to max_num_generated_tokens.
-    # INVALID_TOKEN_ID (-1 by default) is used for padding.
+    # PLACEHOLDER_TOKEN_ID (-1 by default) is used for padding.
     sampled_token_ids: torch.Tensor
     logprobs_tensors: Optional[LogprobsTensors]
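For context, the padding convention the changed comment describes looks like this (token IDs below are invented):

import torch

PLACEHOLDER_TOKEN_ID = -1  # padding value, -1 by default

# Hypothetical batch: request 0 generated 3 tokens, request 1 generated 1,
# so max_num_generated_tokens is 3 and request 1 is padded.
sampled_token_ids = torch.tensor([
    [1024, 90, 7],                                      # request 0
    [512, PLACEHOLDER_TOKEN_ID, PLACEHOLDER_TOKEN_ID],  # request 1
])

# Recover each request's valid tokens by masking out the padding.
mask = sampled_token_ids != PLACEHOLDER_TOKEN_ID
print(sampled_token_ids[1][mask[1]])  # tensor([512])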

vllm/v1/sample/ops/utils.py

Lines changed: 30 additions & 0 deletions
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Union
+
+import torch
+
+
+def compiled_softmax(
+    logits: torch.Tensor,
+    temperature: Union[float, torch.Tensor] = 1.0,
+) -> torch.Tensor:
+    """Faster softmax kernel generated by torch.compile.
+
+    Args:
+        logits: [n, vocab_size]
+        temperature: [n] or float
+    """
+    # NOTE(woosuk): Avoid recompilation by marking the first dim as dynamic.
+    torch._dynamo.mark_dynamic(logits, index=0)
+    if isinstance(temperature, torch.Tensor):
+        torch._dynamo.mark_dynamic(temperature, index=0)
+    return _softmax(logits, temperature)
+
+
+@torch.compile
+def _softmax(
+    logits: torch.Tensor,
+    temperature: Union[float, torch.Tensor],
+) -> torch.Tensor:
+    logits = logits / temperature
+    return torch.softmax(logits, dim=-1, dtype=torch.float32)
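A minimal usage sketch (shapes and values invented): because the first dimension is marked dynamic, calls with different batch sizes reuse the same compiled kernel instead of triggering a recompile per shape.

import torch

from vllm.v1.sample.ops.utils import compiled_softmax

logits = torch.randn(4, 8)  # 4 token positions over a vocab of 8

# First call triggers compilation; the second reuses the compiled kernel
# even though the batch size differs, since dim 0 is marked dynamic.
probs = compiled_softmax(logits, temperature=0.7)
more_probs = compiled_softmax(torch.randn(16, 8), temperature=0.7)

assert probs.dtype == torch.float32
assert torch.allclose(probs.sum(dim=-1), torch.ones(4))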

vllm/v1/sample/rejection_sampler.py

Lines changed: 506 additions & 292 deletions
Large diffs are not rendered by default.
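The full diff is not rendered, but the logic being moved into Triton kernels is the standard speculative-decoding acceptance rule, applied to a variable-length run of draft tokens per request (see the cu_num_draft_tokens bookkeeping in vllm/v1/spec_decode/metadata.py below), which is awkward to express efficiently with padded PyTorch ops. A plain-PyTorch sketch of the per-request acceptance step (an illustration of the algorithm, not the Triton code in this commit):

import torch

def accept_prefix(
    target_probs: torch.Tensor,     # [num_draft, vocab_size], target model
    draft_probs: torch.Tensor,      # [num_draft, vocab_size], draft model
    draft_token_ids: torch.Tensor,  # [num_draft]
) -> torch.Tensor:
    """Return a bool mask over draft tokens: True for the accepted prefix.

    Draft token t is accepted with probability min(1, p_target(t) / p_draft(t));
    everything from the first rejection onward is discarded.
    """
    pos = torch.arange(draft_token_ids.shape[0])
    p_target = target_probs[pos, draft_token_ids]
    p_draft = draft_probs[pos, draft_token_ids]
    u = torch.rand_like(p_target)
    # u * q <= p is equivalent to u <= p / q but avoids dividing by zero.
    accepted = u * p_draft <= p_target
    # Keep only the tokens strictly before the first rejection.
    return (~accepted).int().cumsum(dim=0) == 0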

vllm/v1/spec_decode/metadata.py

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass
+
+import numpy as np
+import torch
+
+
+@dataclass
+class SpecDecodeMetadata:
+
+    # [num_tokens]
+    draft_token_ids: torch.Tensor
+    # [batch_size]
+    num_draft_tokens: list[int]
+    # [batch_size]
+    cu_num_draft_tokens: torch.Tensor
+    # [num_tokens]
+    target_logits_indices: torch.Tensor
+    # [batch_size]
+    bonus_logits_indices: torch.Tensor
+    # [num_tokens + batch_size]
+    logits_indices: torch.Tensor
+
+    def __post_init__(self):
+        self.max_spec_len = max(self.num_draft_tokens)
+
+    @classmethod
+    def make_dummy(
+        cls,
+        draft_token_ids: list[list[int]],
+        device: torch.device,
+    ) -> "SpecDecodeMetadata":
+        batch_size = len(draft_token_ids)
+        num_draft_tokens = [len(ids) for ids in draft_token_ids]
+        flattened_draft_token_ids = sum(draft_token_ids, [])
+        num_tokens = len(flattened_draft_token_ids)
+
+        draft_token_ids_tensor = torch.tensor(flattened_draft_token_ids,
+                                              dtype=torch.int32,
+                                              device=device)
+        cu_num_draft_tokens = np.cumsum(num_draft_tokens, dtype=np.int32)
+        cu_num_draft_tokens_tensor = torch.from_numpy(cu_num_draft_tokens).to(
+            device)
+
+        target_logits_indices = torch.zeros(num_tokens,
+                                            dtype=torch.int32,
+                                            device=device)
+        bonus_logits_indices = torch.zeros(batch_size,
+                                           dtype=torch.int32,
+                                           device=device)
+        logits_indices = torch.zeros(num_tokens + batch_size,
+                                     dtype=torch.int32,
+                                     device=device)
+        return cls(
+            draft_token_ids=draft_token_ids_tensor,
+            num_draft_tokens=num_draft_tokens,
+            cu_num_draft_tokens=cu_num_draft_tokens_tensor,
+            target_logits_indices=target_logits_indices,
+            bonus_logits_indices=bonus_logits_indices,
+            logits_indices=logits_indices,
+        )
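A quick sketch of the flattened layout this metadata describes, using a hypothetical two-request batch (make_dummy zero-fills the three index tensors, so only the draft-token fields are meaningful here; token IDs invented):

import torch

from vllm.v1.spec_decode.metadata import SpecDecodeMetadata

# Request 0 proposes 3 draft tokens, request 1 proposes 2.
meta = SpecDecodeMetadata.make_dummy(
    [[11, 12, 13], [21, 22]],
    device=torch.device("cpu"),
)

print(meta.num_draft_tokens)     # [3, 2]
print(meta.max_spec_len)         # 3
print(meta.draft_token_ids)      # tensor([11, 12, 13, 21, 22], dtype=torch.int32)
print(meta.cu_num_draft_tokens)  # tensor([3, 5], dtype=torch.int32)

# Request i's tokens occupy [cu[i-1]:cu[i]] of the flattened tensor.
print(meta.draft_token_ids[3:5])  # tensor([21, 22], dtype=torch.int32), request 1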

vllm/v1/spec_decode/utils.py

Lines changed: 0 additions & 1 deletion
@@ -1,5 +1,4 @@
 # SPDX-License-Identifier: Apache-2.0
-from vllm.v1.sample.ops.topk_topp_sampler import random_sample  # noqa
 from vllm.v1.worker.gpu_input_batch import InputBatch