Add interface with is_reasoning_end_streaming for reasoning parsers

hdlj-h · hdlj-h · commit c7d58f4c31a3 · 2025-12-05T14:45:52.000+01:00
Signed-off-by: hdlj-h <hubert@hcompany.ai>
diff --git a/docs/features/reasoning_outputs.md b/docs/features/reasoning_outputs.md
@@ -299,6 +299,9 @@ Additionally, to enable structured output, you'll need to create a new `Reasoner
 
         def is_reasoning_end(self, input_ids: list[int]) -> bool:
             return self.end_token_id in input_ids
+
+        def is_reasoning_end_streaming(self, input_ids: list[int], delta_ids: list[int]) -> bool:
+            return self.end_token_id in delta_ids  # look only at the tokens passed as delta_ids
         ...
     ```
 
diff --git a/tests/reasoning/test_base_thinking_reasoning_parser.py b/tests/reasoning/test_base_thinking_reasoning_parser.py
@@ -132,6 +132,41 @@ def test_is_reasoning_end(self, test_tokenizer):
             is False
         )
 
+    def test_is_reasoning_end_streaming(self, test_tokenizer):
+        """Test is_reasoning_end_streaming against the tokens in the delta."""
+        parser = TestThinkingReasoningParser(test_tokenizer)
+        end_token_id = parser.end_token_id
+        start_token_id = parser.start_token_id
+
+        assert (
+            parser.is_reasoning_end_streaming([1, 2, end_token_id], [end_token_id])
+            is True
+        )
+        assert parser.is_reasoning_end_streaming([1, 2, 3, 4], [4]) is False
+        assert parser.is_reasoning_end_streaming([], []) is False  # empty input
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id], [end_token_id]
+            )
+            is True  # delta is exactly the end token
+        )
+        assert (
+            parser.is_reasoning_end_streaming([1, start_token_id, 2, 3], [3]) is False
+        )
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id, 2, start_token_id],
+                [end_token_id, 2, start_token_id],
+            )
+            is False  # delta: end token followed by a start token
+        )
+        assert (
+            parser.is_reasoning_end_streaming(
+                [1, start_token_id, 2, end_token_id, 2, 2], [2]
+            )
+            is False  # end token is in input_ids but not in the delta
+        )
+
     def test_extract_content_ids(self, test_tokenizer):
         """Test the extract_content_ids method."""
         parser = TestThinkingReasoningParser(test_tokenizer)
diff --git a/tests/v1/structured_output/test_reasoning_structured_output.py b/tests/v1/structured_output/test_reasoning_structured_output.py
@@ -70,7 +70,7 @@ def mock_request_with_structured_output(self):
         request.use_structured_output = True
         request.prompt_token_ids = [1, 2, 3, 4, 5]
         request.all_token_ids = [1, 2, 3, 4, 5, 6, 7, 8]
-        request.num_computed_tokens = 3
+        request.num_computed_tokens = 5
         return request
 
     def test_should_fill_bitmask_with_enable_in_reasoning(
diff --git a/vllm/reasoning/abs_reasoning_parsers.py b/vllm/reasoning/abs_reasoning_parsers.py
@@ -52,9 +52,7 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
         Check if the reasoning content ends in the input_ids.
 
         It is used in structured engines like `xgrammar` to check if the
-        reasoning content ends in the model output. `input_ids` can be
-        either the entire model output or the last few computed tokens of
-        the model output (like during a decode step).
+        reasoning content ends in the model output.
 
         Parameters:
         input_ids: list[int]
@@ -65,6 +63,31 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
             True if the reasoning content ends in the input_ids.
         """
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        """
+        Check if the reasoning content ends during the current decode
+        step.
+
+        It is used in structured engines like `xgrammar` to check if the
+        reasoning content ends in the model output during a decode step.
+        `input_ids` is the entire model output and `delta_ids` are the
+        tokens computed in the current decode step.
+
+        Parameters:
+        input_ids: list[int]
+            The entire model output.
+        delta_ids: list[int]
+            The tokens computed at the current decode step.
+
+        Returns:
+        bool
+            This default implementation ignores `delta_ids` and returns
+            the result of `is_reasoning_end(input_ids)`.
+        """
+        return self.is_reasoning_end(input_ids)
+
     @abstractmethod
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
diff --git a/vllm/reasoning/basic_parsers.py b/vllm/reasoning/basic_parsers.py
@@ -74,6 +74,11 @@ def is_reasoning_end(self, input_ids: list[int]) -> bool:
                 return True
         return False
 
+    def is_reasoning_end_streaming(
+        self, input_ids: list[int], delta_ids: list[int]
+    ) -> bool:
+        return self.is_reasoning_end(delta_ids)  # only delta_ids is checked
+
     def extract_content_ids(self, input_ids: list[int]) -> list[int]:
         """
         Extract the content after the end tokens
diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py
@@ -339,8 +339,8 @@ def should_advance(self, request: Request) -> bool:
             return True
 
         # Check if reasoning ends in *this* step
-        if self.reasoner.is_reasoning_end(
-            request.all_token_ids[request.num_computed_tokens :]
+        if self.reasoner.is_reasoning_end_streaming(
+            request.all_token_ids, request.all_token_ids[request.num_computed_tokens :]
         ):
             # Reasoning just ended, so we shouldn't advance til
             # next pass