
Commit cd3ecad

[WIP][V1][Metrics] Speculative decoding metrics

Fixes #13990, part of #10582

Signed-off-by: Mark McLoughlin <markmc@redhat.com>

1 parent 86c6239

9 files changed: +251 −22 lines

tests/v1/sample/test_rejection_sampler.py (+43 −6)
@@ -95,6 +95,10 @@ def test_perfect_match(rejection_sampler):
                             device=logits.device)
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 3
+    assert rejection_sampler.stats.num_accepted_tokens == 3
+    assert rejection_sampler.stats.num_emitted_tokens == 4
+
 
 def test_early_mismatch(rejection_sampler):
     """Test when there's an early mismatch in tokens"""

@@ -122,6 +126,10 @@ def test_early_mismatch(rejection_sampler):
     )
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 3
+    assert rejection_sampler.stats.num_accepted_tokens == 1
+    assert rejection_sampler.stats.num_emitted_tokens == 2
+
 
 def test_multiple_sequences(rejection_sampler):
     """Test handling multiple sequences of speculated tokens"""

@@ -148,6 +156,10 @@ def test_multiple_sequences(rejection_sampler):
                             device=logits.device)
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 3
+    assert rejection_sampler.stats.num_accepted_tokens == 3
+    assert rejection_sampler.stats.num_emitted_tokens == 5
+
 
 def test_single_token_sequence(rejection_sampler):
     """Test handling sequences with single token"""

@@ -171,6 +183,10 @@ def test_single_token_sequence(rejection_sampler):
     expected = torch.tensor([[1, 2]], dtype=torch.int, device=logits.device)
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 1
+    assert rejection_sampler.stats.num_accepted_tokens == 1
+    assert rejection_sampler.stats.num_emitted_tokens == 2
+
 
 def test_empty_sequence(rejection_sampler):
     """Test handling empty sequence of speculated tokens"""

@@ -194,6 +210,10 @@ def test_empty_sequence(rejection_sampler):
     expected = torch.tensor([[5]], dtype=torch.int, device=logits.device)
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 0
+    assert rejection_sampler.stats.num_accepted_tokens == 0
+    assert rejection_sampler.stats.num_emitted_tokens == 1
+
 
 def test_multiple_mismatches(rejection_sampler):
     """Test handling multiple sequences with mismatches"""

@@ -223,17 +243,24 @@ def test_multiple_mismatches(rejection_sampler):
     )
     assert torch.equal(output, expected)
 
+    assert rejection_sampler.stats.num_draft_tokens == 6
+    assert rejection_sampler.stats.num_accepted_tokens == 3
+    assert rejection_sampler.stats.num_emitted_tokens == 5
+
 
 @pytest.mark.parametrize(
-    "spec_tokens,output_tokens,expected",
+    "spec_tokens,output_tokens,expected,expected_stats",
     [
-        ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]]),  # Perfect match with bonus
-        ([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]]),  # First mismatch
-        ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]],
-         [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]]),  # Mixed matches
+        ([[1, 2]], [[1, 2, 3]], [[1, 2, 3]],
+         (2, 2, 3)),  # Perfect match with bonus
+        ([[1]], [[2, 3]], [[2, PLACEHOLDER_TOKEN_ID]],
+         (1, 0, 1)),  # First mismatch
+        ([[1, 2], [3, 4]], [[1, 5, 6], [3, 4, 7]
+          ], [[1, 5, PLACEHOLDER_TOKEN_ID], [3, 4, 7]],
+         (4, 3, 5)),  # Mixed matches
     ])
 def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens,
-                            expected):
+                            expected, expected_stats):
     """Parametrized test for various matching scenarios"""
     metadata = create_sampling_metadata(all_greedy=True)
     logits = create_logits_tensor(output_tokens)

@@ -254,6 +281,10 @@ def test_parametrized_cases(rejection_sampler, spec_tokens, output_tokens,
                                    device=logits.device)
     assert torch.equal(output, expected_tensor)
 
+    assert rejection_sampler.stats.num_draft_tokens == expected_stats[0]
+    assert rejection_sampler.stats.num_accepted_tokens == expected_stats[1]
+    assert rejection_sampler.stats.num_emitted_tokens == expected_stats[2]
+
 
 ########################### Tests for Random Sampling ###################
 @pytest.mark.parametrize("k", [1, 3, 5])

@@ -314,6 +345,12 @@ def test_deterministic_when_seeded(
 
         results.append(rep_result)
 
+    stats = rejection_sampler.stats.take()
+    assert stats.num_draft_tokens == num_tokens
+    assert stats.num_emitted_tokens >= batch_size
+    assert (stats.num_emitted_tokens -
+            batch_size) == stats.num_accepted_tokens
+
     for i in range(batch_size):
         if seeded_mask[i]:
             for j in range(1, n_rep):
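The assertions above rely on a `stats` object exposed by the rejection sampler, with `num_draft_tokens`, `num_accepted_tokens`, and `num_emitted_tokens` counters plus a `take()` method. That object comes from the new `vllm/v1/spec_decode/metrics.py` module, which is part of this commit but not included in this excerpt. A minimal sketch of what the tests imply it looks like, inferred only from the assertions above and not taken from the actual implementation:

```python
# Hypothetical sketch of SpecDecodingStats, inferred only from the test
# assertions above; the real class in vllm/v1/spec_decode/metrics.py may
# differ in shape and behaviour.
from dataclasses import dataclass


@dataclass
class SpecDecodingStats:
    num_draft_tokens: int = 0      # tokens proposed by the draft model
    num_accepted_tokens: int = 0   # draft tokens accepted by rejection sampling
    num_emitted_tokens: int = 0    # tokens actually emitted (accepted + bonus)

    def take(self) -> "SpecDecodingStats":
        # Assumed semantics: return a snapshot of the counters and reset them,
        # matching the drain-style use in test_deterministic_when_seeded().
        snapshot = SpecDecodingStats(self.num_draft_tokens,
                                     self.num_accepted_tokens,
                                     self.num_emitted_tokens)
        self.num_draft_tokens = 0
        self.num_accepted_tokens = 0
        self.num_emitted_tokens = 0
        return snapshot
```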

vllm/v1/core/scheduler.py (+8 −2)
@@ -20,6 +20,7 @@
 from vllm.v1.metrics.stats import SchedulerStats
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request, RequestStatus
+from vllm.v1.spec_decode.metrics import SpecDecodingStats
 from vllm.v1.structured_output import StructuredOutputManager
 
 logger = init_logger(__name__)

@@ -533,6 +534,7 @@ def update_from_output(
         spec_token_ids = model_runner_output.spec_token_ids
         logprobs = model_runner_output.logprobs
         prompt_logprobs_dict = model_runner_output.prompt_logprobs_dict
+        spec_decoding_stats = model_runner_output.spec_decoding_stats
         num_scheduled_tokens = scheduler_output.num_scheduled_tokens
 
         new_running: list[Request] = []

@@ -645,7 +647,7 @@ def update_from_output(
         self.running = new_running
         return EngineCoreOutputs(
             outputs=outputs,
-            scheduler_stats=self.make_stats(),
+            scheduler_stats=self.make_stats(spec_decoding_stats),
         )
 
     def _check_stop(self, request: Request) -> bool:

@@ -733,12 +735,16 @@ def get_num_unscheduled_requests(self) -> int:
     def reset_prefix_cache(self) -> bool:
         return self.kv_cache_manager.reset_prefix_cache()
 
-    def make_stats(self) -> Optional[SchedulerStats]:
+    def make_stats(
+        self,
+        spec_decoding_stats: Optional[SpecDecodingStats] = None,
+    ) -> Optional[SchedulerStats]:
         if not self.log_stats:
             return None
         return SchedulerStats(
             num_running_reqs=len(self.running),
             num_waiting_reqs=len(self.waiting),
             gpu_cache_usage=self.kv_cache_manager.usage,
             prefix_cache_stats=self.kv_cache_manager.make_prefix_cache_stats(),
+            spec_decoding_stats=spec_decoding_stats,
         )
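For context, a small usage sketch of the extended `make_stats()` signature. This is illustrative only: the `SpecDecodingStats` keyword arguments are assumed field names, and the helper function and its name are hypothetical, not code from this commit.

```python
# Illustrative only: how update_from_output() now threads the per-step
# speculative decoding stats into SchedulerStats via make_stats().
from typing import Optional

from vllm.v1.core.scheduler import Scheduler
from vllm.v1.metrics.stats import SchedulerStats
from vllm.v1.spec_decode.metrics import SpecDecodingStats


def stats_for_step(scheduler: Scheduler) -> Optional[SchedulerStats]:
    # Hypothetical per-step counters; field names are assumed.
    step_stats = SpecDecodingStats(num_draft_tokens=8,
                                   num_accepted_tokens=6,
                                   num_emitted_tokens=7)
    # Returns None when the scheduler was built with log_stats=False.
    # Omitting the argument keeps the old behaviour: the field defaults to None.
    return scheduler.make_stats(spec_decoding_stats=step_stats)
```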

vllm/v1/engine/async_llm.py (+1 −1)
@@ -66,7 +66,7 @@ def __init__(
         self.stat_loggers: list[StatLoggerBase] = []
         if self.log_stats:
             if logger.isEnabledFor(logging.INFO):
-                self.stat_loggers.append(LoggingStatLogger())
+                self.stat_loggers.append(LoggingStatLogger(vllm_config))
             self.stat_loggers.append(PrometheusStatLogger(vllm_config))
 
         # Tokenizer (+ ensure liveness if running in another process).

vllm/v1/metrics/loggers.py (+39 −1)
@@ -12,6 +12,7 @@
 from vllm.v1.core.kv_cache_utils import PrefixCachingMetrics
 from vllm.v1.engine import FinishReason
 from vllm.v1.metrics.stats import IterationStats, SchedulerStats
+from vllm.v1.spec_decode.metrics import SpecDecodingMetrics
 
 logger = init_logger(__name__)
 

@@ -31,12 +32,14 @@ def log(self): # noqa
 
 class LoggingStatLogger(StatLoggerBase):
 
-    def __init__(self):
+    def __init__(self, vllm_config: VllmConfig):
         self._reset(time.monotonic())
         self.last_scheduler_stats = SchedulerStats()
         # Prefix cache metrics. This cannot be reset.
         # TODO: Make the interval configurable.
         self.prefix_caching_metrics = PrefixCachingMetrics()
+        self.spec_decoding_metrics = SpecDecodingMetrics(
+            vllm_config.speculative_config)
 
     def _reset(self, now):
         self.last_log_time = now

@@ -64,6 +67,10 @@ def record(self, scheduler_stats: SchedulerStats,
 
         self.prefix_caching_metrics.observe(scheduler_stats.prefix_cache_stats)
 
+        if scheduler_stats.spec_decoding_stats is not None:
+            self.spec_decoding_metrics.observe(
+                scheduler_stats.spec_decoding_stats)
+
         self.last_scheduler_stats = scheduler_stats
 
     def log(self):

@@ -91,6 +98,9 @@ def log(self):
             self.prefix_caching_metrics.hit_rate * 100,
         )
 
+        if scheduler_stats.spec_decoding_stats is not None:
+            self.spec_decoding_metrics.log()
+
 
 class PrometheusStatLogger(StatLoggerBase):
 

@@ -296,6 +306,26 @@ def __init__(self, vllm_config: VllmConfig):
                     self.labelname_running_lora_adapters,
                 ])
 
+        #
+        # Speculative Decoding metrics
+        # FIXME: add note on acceptance rate and system efficiency
+        #
+        self.counter_spec_decode_num_draft_tokens = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_draft_tokens_total",
+                documentation="Number of draft tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+        self.counter_spec_decode_num_accepted_tokens = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_accepted_tokens_total",
+                documentation="Number of accepted tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+        self.counter_spec_decode_num_emitted_tokens = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_emitted_tokens_total",
+                documentation="Number of emitted tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+
         #
         # Cache config info metric
         #

@@ -332,6 +362,14 @@ def record(self, scheduler_stats: SchedulerStats,
         self.counter_gpu_prefix_cache_hits.inc(
             scheduler_stats.prefix_cache_stats.hits)
 
+        if scheduler_stats.spec_decoding_stats is not None:
+            self.counter_spec_decode_num_draft_tokens.inc(
+                scheduler_stats.spec_decoding_stats.num_draft_tokens)
+            self.counter_spec_decode_num_accepted_tokens.inc(
+                scheduler_stats.spec_decoding_stats.num_accepted_tokens)
+            self.counter_spec_decode_num_emitted_tokens.inc(
+                scheduler_stats.spec_decoding_stats.num_emitted_tokens)
+
         if iteration_stats is None:
             return
 
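The three counters are enough to derive the two ratios the FIXME above refers to. A worked example using the figures asserted in `test_multiple_mismatches` (two sequences, three speculated tokens each); the definitions below are the conventional speculative-decoding ones and an assumption here, not documentation added by this commit:

```python
# Worked example of ratios derivable from the new counters, using the numbers
# from test_multiple_mismatches above. The formulas assume every sequence
# speculates exactly k tokens per step.
num_draft_tokens = 6      # proposed by the draft model
num_accepted_tokens = 3   # survived rejection sampling
num_emitted_tokens = 5    # returned to the user (accepted + bonus/recovered)

k = 3                                  # speculated tokens per sequence
num_spec_seqs = num_draft_tokens // k  # sequences that ran a speculative step

# Draft acceptance rate: fraction of proposed tokens that were accepted.
draft_acceptance_rate = num_accepted_tokens / num_draft_tokens      # 0.5

# System efficiency: emitted tokens relative to the ideal k + 1 per sequence.
system_efficiency = num_emitted_tokens / (num_spec_seqs * (k + 1))  # 0.625
```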

vllm/v1/metrics/stats.py (+4 −0)
@@ -4,6 +4,8 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional
 
+from vllm.v1.spec_decode.metrics import SpecDecodingStats
+
 if TYPE_CHECKING:
     from vllm.v1.engine import EngineCoreEvent, EngineCoreOutput, FinishReason
     from vllm.v1.output_processor import RequestState

@@ -35,6 +37,8 @@ class SchedulerStats:
     prefix_cache_stats: PrefixCacheStats = field(
         default_factory=PrefixCacheStats)
 
+    spec_decoding_stats: Optional[SpecDecodingStats] = None
+
 
 @dataclass
 class LoRAStats:

vllm/v1/outputs.py (+7 −0)
@@ -5,6 +5,8 @@
 
 import torch
 
+from vllm.v1.spec_decode.metrics import SpecDecodingStats
+
 
 class LogprobsLists(NamedTuple):
 

@@ -50,6 +52,8 @@ class SamplerOutput:
     sampled_token_ids: torch.Tensor
     logprobs_tensors: Optional[LogprobsTensors]
 
+    spec_decoding_stats: Optional[SpecDecodingStats] = None
+
 
 # ModelRunnerOutput is serialized and sent to the scheduler process.
 # This is expensive for torch.Tensor so prefer to use list instead.

@@ -81,6 +85,8 @@ class ModelRunnerOutput:
     # [prompt_len]
     prompt_logprobs_dict: dict[str, Optional[LogprobsTensors]]
 
+    spec_decoding_stats: Optional[SpecDecodingStats] = None
+
 
 EMPTY_MODEL_RUNNER_OUTPUT = ModelRunnerOutput(
     req_ids=[],

@@ -89,4 +95,5 @@ class ModelRunnerOutput:
     spec_token_ids=None,
     logprobs=None,
     prompt_logprobs_dict={},
+    spec_decoding_stats=None,
 )
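End to end, the counters flow from the rejection sampler into `SamplerOutput` and `ModelRunnerOutput`, are attached to `SchedulerStats` by the scheduler, and are consumed by both stat loggers. The `SpecDecodingMetrics` aggregator used by `LoggingStatLogger` lives in the same new `vllm/v1/spec_decode/metrics.py` module that is not shown in this excerpt; the sketch below is inferred purely from its call sites above (constructed with `vllm_config.speculative_config`, then `observe()` per step and `log()` per interval) and is not the committed implementation.

```python
# Hypothetical sketch of SpecDecodingMetrics, inferred from its call sites in
# loggers.py above; the real class in vllm/v1/spec_decode/metrics.py may differ.
from vllm.config import SpeculativeConfig  # assumed location of the config type
from vllm.logger import init_logger
from vllm.v1.spec_decode.metrics import SpecDecodingStats

logger = init_logger(__name__)


class SpecDecodingMetrics:

    def __init__(self, speculative_config: SpeculativeConfig):
        self.speculative_config = speculative_config
        self.reset()

    def reset(self):
        self.num_draft_tokens = 0
        self.num_accepted_tokens = 0
        self.num_emitted_tokens = 0

    def observe(self, spec_decoding_stats: SpecDecodingStats):
        # Accumulate the per-iteration counters until the next log() call.
        self.num_draft_tokens += spec_decoding_stats.num_draft_tokens
        self.num_accepted_tokens += spec_decoding_stats.num_accepted_tokens
        self.num_emitted_tokens += spec_decoding_stats.num_emitted_tokens

    def log(self):
        draft_acceptance_rate = (self.num_accepted_tokens /
                                 self.num_draft_tokens
                                 if self.num_draft_tokens > 0 else float("nan"))
        logger.info(
            "Speculative decoding: accepted %d of %d draft tokens "
            "(acceptance rate %.1f%%), emitted %d tokens.",
            self.num_accepted_tokens, self.num_draft_tokens,
            draft_acceptance_rate * 100, self.num_emitted_tokens)
        self.reset()
```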
