correct restart count tracking to use original request ID

Vasilis Kontonis · Vasilis Kontonis · commit 2c2156b7f771 · 2025-11-19T16:52:06.000Z
- Move the original_req_id lookup before the restart count check
- Use original_req_id for the restart count lookup instead of the current req_id
- Fixes request ID collisions (was creating duplicate IDs like 0_0_restart_1)
- Fixes the restart counter (was always showing restart #1)
- Fixes a KV cache leak (duplicate IDs prevented proper cleanup)
- Enhanced debug logging to show both the original and current request IDs
diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py
@@ -597,8 +597,11 @@ def process_outputs(
             # 4) Check for restart condition (BlockPhi3 specific)
             should_restart = False
             if pooling_output is None and self._should_restart(new_token_ids):
-                # Check restart limits
-                current_restart_count = self.restart_count.get(req_id, 0)
+                # Get the original request ID FIRST (in case this is already a restarted request)
+                original_req_id = self.original_request_id.get(req_id, req_id)
+                
+                # Check restart limits using ORIGINAL request ID
+                current_restart_count = self.restart_count.get(original_req_id, 0)
                 if current_restart_count < self.max_restarts_per_request:
                     # Extract full generated text
                     assert req_state.detokenizer is not None
@@ -609,9 +612,6 @@ def process_outputs(
                     if extraction:
                         block_content, summary_text = extraction
                         
-                        # Get the original request ID (in case this is already a restarted request)
-                        original_req_id = self.original_request_id.get(req_id, req_id)
-                        
                         # Store this block+summary in accumulated history
                         if original_req_id not in self.accumulated_blocks:
                             self.accumulated_blocks[original_req_id] = []
@@ -652,7 +652,7 @@ def process_outputs(
                             try:
                                 with open("/tmp/blockphi3_restart_debug.log", "a") as f:
                                     f.write(
-                                        f"[RESTART] Request {req_id} restart #{current_restart_count + 1}\n"
+                                        f"[RESTART] Request {original_req_id} (current: {req_id}) restart #{current_restart_count + 1}\n"
                                         f"  Block length: {len(block_content)}\n"
                                         f"  Summary length: {len(summary_text)}\n"
                                         f"  New prompt tokens: {len(new_prompt_tokens)}\n"
@@ -664,7 +664,7 @@ def process_outputs(
                         try:
                             with open("/tmp/blockphi3_restart_debug.log", "a") as f:
                                 f.write(
-                                    f"[RESTART_LIMIT] Request {req_id} reached max restarts "
+                                    f"[RESTART_LIMIT] Request {original_req_id} (current: {req_id}) reached max restarts "
                                     f"({self.max_restarts_per_request})\n"
                                 )
                         except Exception: