
[Bugfix][2/n] Fix speculative decoding CI - Fix test_ngram_e2e_greedy_correctness #19644


Merged
2 commits merged on Jun 15, 2025
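The recurring addition across the test files below pins `"dtype": "float32"` in `common_llm_kwargs`, matching the in-diff comment that the tiny JackFram checkpoints are float32. The motivation is that casting a float32 model to a half-precision runtime dtype can perturb near-tied logits enough to flip the greedy argmax, which breaks the exact-equality checks these tests rely on. A minimal, self-contained illustration (not part of the PR):

```python
import torch

# Two nearly tied logits: distinguishable in float32, but the gap is far below
# the float16 spacing near 10.0 (~0.0078), so both round to the same value.
logits_fp32 = torch.tensor([10.0001, 10.0002])
logits_fp16 = logits_fp32.to(torch.float16)

print(logits_fp32.argmax().item())             # 1
print(logits_fp16.argmax().item())             # likely 0: both entries round to 10.0
print(bool(logits_fp16[0] == logits_fp16[1]))  # True: the ordering information is gone
```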
11 changes: 10 additions & 1 deletion tests/spec_decode/e2e/test_integration.py
@@ -14,10 +14,13 @@
@pytest.mark.parametrize(
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-68m",

# Verify equality when cuda graphs allowed.
"enforce_eager": False,
"model_name": "JackFram/llama-68m",

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [])
@pytest.mark.parametrize(
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
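Each remaining test file receives the same two-line addition to its `common_llm_kwargs` parametrization. A representative sketch of the resulting decorator stack, with a hypothetical test body standing in for the shared e2e correctness helpers that the real tests call:

```python
import pytest


@pytest.mark.parametrize(
    "common_llm_kwargs",
    [{
        "model_name": "JackFram/llama-68m",

        # Skip cuda graph recording for fast test.
        "enforce_eager": True,

        # The original model is float32, keep it for numerical stability.
        "dtype": "float32",
    }])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
def test_dtype_is_pinned(common_llm_kwargs, per_test_common_llm_kwargs,
                         baseline_llm_kwargs):
    # Hypothetical assertion; the real tests compare baseline vs. speculative outputs.
    assert common_llm_kwargs["dtype"] == "float32"
```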
18 changes: 17 additions & 1 deletion tests/spec_decode/e2e/test_logprobs.py
@@ -17,7 +17,10 @@
"model_name": "JackFram/llama-160m",

# Skip cuda graph recording for fast test.
"enforce_eager": True
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"common_llm_kwargs",
[{
"model_name": "JackFram/llama-160m",

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
3 changes: 3 additions & 0 deletions tests/spec_decode/e2e/test_mlp_correctness.py
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# Precision
"dtype": PRECISION,
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
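Unlike the other files, test_mlp_correctness.py reuses its module-level PRECISION constant instead of a hard-coded string, so the new entry stays in sync with the precision used elsewhere in that file. A small sketch of the pattern, assuming PRECISION is defined as "float32" near the top of the module:

```python
PRECISION = "float32"  # assumption: actual value is defined once at the top of the file

common_llm_kwargs = {
    # Skip cuda graph recording for fast test.
    "enforce_eager": True,

    # Precision
    "dtype": PRECISION,
}
```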
24 changes: 24 additions & 0 deletions tests/spec_decode/e2e/test_multistep_correctness.py
@@ -57,6 +57,9 @@

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,

# Print spec metrics.
"disable_log_stats": False,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(

# Print spec metrics.
"disable_log_stats": False,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
[{
# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
Comment on lines +479 to +480
Contributor


medium

For consistency with other additions in this PR (e.g., at lines 61-62 in this file, and in tests/spec_decode/e2e/test_ngram_correctness.py), consider adding a blank line before this comment block. This improves readability.

This suggestion also applies to similar additions at lines 540-541, 608-609, 676-677, 729-730, and 788-789 in this file.

Suggested change:

        # The original model is float32, keep it for numerical stability.
        "dtype": "float32"

}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
@@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize(
"per_test_common_llm_kwargs",
@@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,
# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
18 changes: 18 additions & 0 deletions tests/spec_decode/e2e/test_ngram_correctness.py
@@ -40,6 +40,9 @@

# Print spec metrics.
"disable_log_stats": False,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,

# Print spec metrics.
"disable_log_stats": False,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [
{
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,

# Skip cuda graph recording for fast test.
"enforce_eager": True,

# The original model is float32, keep it for numerical stability.
"dtype": "float32",
}])
@pytest.mark.parametrize("per_test_common_llm_kwargs", [{}])
@pytest.mark.parametrize("baseline_llm_kwargs", [{}])
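The layered parametrizations (common, per-test, baseline, and test-specific kwargs) are presumably merged into one set of LLM constructor arguments by the shared e2e helpers, so pinning the dtype in `common_llm_kwargs` applies to both the baseline and the speculative run. A simplified sketch of that composition, with a placeholder for the ngram-specific options (the exact keys are not shown in this diff):

```python
common_llm_kwargs = {
    "model_name": "JackFram/llama-68m",
    "enforce_eager": True,
    # The original model is float32, keep it for numerical stability.
    "dtype": "float32",
}
per_test_common_llm_kwargs: dict = {}
baseline_llm_kwargs: dict = {}
test_llm_kwargs = {"speculative_option": "placeholder"}  # hypothetical key

# Later dicts win on key collisions, so a specific case could still override the dtype.
baseline_kwargs = {**common_llm_kwargs, **per_test_common_llm_kwargs, **baseline_llm_kwargs}
spec_kwargs = {**common_llm_kwargs, **per_test_common_llm_kwargs, **test_llm_kwargs}

assert baseline_kwargs["dtype"] == spec_kwargs["dtype"] == "float32"
```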
3 changes: 2 additions & 1 deletion vllm/model_executor/models/eagle.py
@@ -74,6 +74,7 @@ class EAGLE(nn.Module):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
super().__init__()
config = vllm_config.model_config.hf_config
self.dtype = vllm_config.model_config.dtype
self.config = config

architectures = getattr(self.config.model, "architectures", [])
@@ -250,7 +251,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
lm_head_weight = torch.zeros(
self.lm_head.org_vocab_size,
self.lm_head.embedding_dim,
dtype=self.config.torch_dtype,
dtype=self.dtype,
)

weight_loader = getattr(self.lm_head.weight, "weight_loader",
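The eagle.py change complements the test fixes: the zero-initialized lm_head placeholder is now allocated in the dtype the engine actually runs with (`vllm_config.model_config.dtype`) rather than the checkpoint's `torch_dtype`, so it matches the rest of the loaded weights when the two differ. A standalone sketch of the before/after behaviour, using illustrative dtypes and shapes rather than vLLM objects:

```python
import torch

checkpoint_dtype = torch.float16  # stand-in for config.torch_dtype from the HF config
runtime_dtype = torch.float32     # stand-in for vllm_config.model_config.dtype
vocab_size, hidden_size = 32000, 2048  # illustrative shapes

# Before: the placeholder followed the checkpoint dtype, which could disagree
# with the dtype the rest of the model was cast to at load time.
lm_head_before = torch.zeros(vocab_size, hidden_size, dtype=checkpoint_dtype)

# After: the placeholder follows the runtime dtype, keeping weight loading and
# the subsequent matmuls at a consistent precision.
lm_head_after = torch.zeros(vocab_size, hidden_size, dtype=runtime_dtype)

print(lm_head_before.dtype, lm_head_after.dtype)  # torch.float16 torch.float32
```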