.

vllm-project · rkooo567 · May 15, 2024 · May 8, 2024 · May 8, 2024 · May 8, 2024
commit b42b43d3553743381ad3ead27ac527843b2f9895
diff --git a/tests/spec_decode/e2e/test_multistep_correctness.py b/tests/spec_decode/e2e/test_multistep_correctness.py
@@ -253,50 +253,50 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
                                          force_output_len=True)
 
 
-# @pytest.mark.parametrize(
-#     "common_llm_kwargs",
-#     [{
-#         # Skip cuda graph recording for fast test.
-#         "enforce_eager": True,
-
-#         # Required for spec decode.
-#         "use_v2_block_manager": True
-#     }])
-# @pytest.mark.parametrize(
-#     "per_test_common_llm_kwargs",
-#     [
-#         # Try two different tiny base models.
-#         # Note that one is equal to the draft model, another isn't.
-#         {
-#             "model": "JackFram/llama-68m",
-#         },
-#         {
-#             "model": "JackFram/llama-160m",
-#         },
-#     ])
-# @pytest.mark.parametrize("baseline_llm_kwargs", [{}])
-# @pytest.mark.parametrize("test_llm_kwargs", [
-#     {
-#         "speculative_model": "JackFram/llama-68m",
-#         "num_speculative_tokens": 5,
-#     },
-# ])
-# @pytest.mark.parametrize("max_output_len", [
-#     256,
-# ])
-# @pytest.mark.parametrize("batch_size", [32])
-# @pytest.mark.parametrize("seed", [1])
-# def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
-#         baseline_llm_generator, test_llm_generator, batch_size: int,
-#         max_output_len: int):
-#     """Verify greedy equality on a tiny model, with a large batch size, and when
-#     sampling respects the EOS token.
-#     """
-#     run_greedy_equality_correctness_test(baseline_llm_generator,
-#                                          test_llm_generator,
-#                                          batch_size,
-#                                          max_output_len,
-#                                          force_output_len=False)
+@pytest.mark.parametrize(
+    "common_llm_kwargs",
+    [{
+        # Skip cuda graph recording for fast test.
+        "enforce_eager": True,
+
+        # Required for spec decode.
+        "use_v2_block_manager": True
+    }])
+@pytest.mark.parametrize(
+    "per_test_common_llm_kwargs",
+    [
+        # Try two different tiny base models.
+        # Note that one is equal to the draft model, another isn't.
+        {
+            "model": "JackFram/llama-68m",
+        },
+        {
+            "model": "JackFram/llama-160m",
+        },
+    ])
+@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
+@pytest.mark.parametrize("test_llm_kwargs", [
+    {
+        "speculative_model": "JackFram/llama-68m",
+        "num_speculative_tokens": 5,
+    },
+])
+@pytest.mark.parametrize("max_output_len", [
+    256,
+])
+@pytest.mark.parametrize("batch_size", [32])
+@pytest.mark.parametrize("seed", [1])
+def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs_diff_output_len(
+        baseline_llm_generator, test_llm_generator, batch_size: int,
+        max_output_len: int):
+    """Verify greedy equality on a tiny model, with a large batch size, and when
+    sampling respects the EOS token.
+    """
+    run_greedy_equality_correctness_test(baseline_llm_generator,
+                                         test_llm_generator,
+                                         batch_size,
+                                         max_output_len,
+                                         force_output_len=False)
 
 
 @pytest.mark.parametrize(

diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
@@ -348,6 +348,7 @@ def forward(
             #     print(f"SANG-TODO {decode_meta.block_tables=}")
             #     # print(f"SANG-TODO {attn_metadata.slot_mapping=}")
             #     print(f"SANG-TODO {decode_meta.seq_lens_tensor=}")
+            #     print(f"SANG-TODO decode query: {decode_query=}")
             output[num_prefill_tokens:] = flash_attn_with_kvcache(
                 decode_query.unsqueeze(1),
                 key_cache,

diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
@@ -266,16 +266,16 @@ def _run_speculative_decoding_step(
         """
 
         # Generate proposals using draft worker.
-        # print("SANG-TODO draft")
+        print("SANG-TODO draft")
         proposals = self.proposer_worker.get_spec_proposals(execute_model_req)
 
-        # print("SANG-TODO target")
+        print("SANG-TODO target")
         proposal_scores = self.scorer.score_proposals(
             execute_model_req,
             proposals,
         )
 
-        # print("SANG-TODO score")
+        print("SANG-TODO score")
         accepted_token_ids, target_logprobs = self._verify_tokens(
             execute_model_req.seq_group_metadata_list, proposal_scores,
             proposals, execute_model_req.num_lookahead_slots)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
@@ -502,6 +502,7 @@ def _prepare_model_input(
         slot_mapping_tensor = torch.tensor(slot_mapping,
                                            dtype=torch.long,
                                            device=self.device)
+        print(f"SANG-TODO {seq_lens_tensor=} {block_tables=}")
 
         if self.attn_backend.get_name() == "flashinfer":
             if not hasattr(self, "flashinfer_workspace_buffer"):