# SPDX-License-Identifier: Apache-2.0

import re
from collections.abc import Sequence
from typing import Optional, Union

from transformers import PreTrainedTokenizerBase

from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
                                              DeltaMessage)
from vllm.logger import init_logger
from vllm.reasoning import ReasoningParser, ReasoningParserManager

logger = init_logger(__name__)


@ReasoningParserManager.register_module("qwen3")
class Qwen3ReasoningParser(ReasoningParser):
    """
    Reasoning parser for the Qwen3 model.

    The Qwen3 model uses <think>...</think> tokens to denote reasoning text
    within its output. The model provides a strict switch to disable reasoning
    output via the 'enable_thinking=False' parameter. This parser extracts the
    reasoning content enclosed by <think> and </think> tokens from the model's
    output.
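
    For illustration (the example text is invented, not a fixed output
    format), a complete model output such as

        <think>The user asks for 2 + 2, which is 4.</think>The answer is 4.

    is split by extract_reasoning_content() into

        reasoning_content = "The user asks for 2 + 2, which is 4."
        content = "The answer is 4."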
| 27 | + """ |
| 28 | + |
| 29 | + def __init__(self, tokenizer: PreTrainedTokenizerBase): |
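        """
        Record the <think>/</think> marker strings, compile the regex used
        for non-streaming extraction, and resolve the marker token IDs that
        the streaming path relies on.
        """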
        super().__init__(tokenizer)
        self.think_start_token = "<think>"
        self.think_end_token = "</think>"

        self.reasoning_regex = re.compile(
            rf"{self.think_start_token}(.*?){self.think_end_token}", re.DOTALL)

        if not self.model_tokenizer:
            raise ValueError(
                "The model tokenizer must be passed to the ReasoningParser "
                "constructor.")

        self.think_start_token_id = self.vocab.get(self.think_start_token)
        self.think_end_token_id = self.vocab.get(self.think_end_token)
        if (self.think_start_token_id is None
                or self.think_end_token_id is None):
            raise RuntimeError(
                "Qwen3 reasoning parser could not locate think start/end "
                "tokens in the tokenizer!")

    def extract_reasoning_content_streaming(
        self,
        previous_text: str,
        current_text: str,
        delta_text: str,
        previous_token_ids: Sequence[int],
        current_token_ids: Sequence[int],
        delta_token_ids: Sequence[int],
    ) -> Union[DeltaMessage, None]:
        """
        Extract reasoning content from a delta message.
        Handles streaming output where previous + delta = current.
        Uses token IDs for faster processing.
        For text <think>abc</think>xyz:
        - 'abc' goes to reasoning_content
        - 'xyz' goes to content
        Returns None when the delta is only the <think> or </think> marker
        itself, so no message is emitted for the marker token.
        """
        # Skip single special tokens
        if len(delta_token_ids) == 1 and (delta_token_ids[0] in [
                self.think_start_token_id, self.think_end_token_id
        ]):
            return None

        if self.think_start_token_id in previous_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in previous, </think> in delta,
                # extract reasoning content
                end_index = delta_text.find(self.think_end_token)
                reasoning_content = delta_text[:end_index]
                content = delta_text[end_index + len(self.think_end_token):]
                return DeltaMessage(reasoning_content=reasoning_content,
                                    content=content if content else None)
            elif self.think_end_token_id in previous_token_ids:
                # <think> in previous, </think> also in previous:
                # reasoning has ended, so this delta is response content
                return DeltaMessage(content=delta_text)
            else:
                # <think> in previous, no </think> in previous or delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        elif self.think_start_token_id in delta_token_ids:
            if self.think_end_token_id in delta_token_ids:
                # <think> in delta, </think> in delta, extract reasoning content
                start_index = delta_text.find(self.think_start_token)
                end_index = delta_text.find(self.think_end_token)
                reasoning_content = delta_text[
                    start_index + len(self.think_start_token):end_index]
                content = delta_text[end_index + len(self.think_end_token):]
                return DeltaMessage(reasoning_content=reasoning_content,
                                    content=content if content else None)
            else:
                # <think> in delta, no </think> in delta,
                # reasoning content continues
                return DeltaMessage(reasoning_content=delta_text)
        else:
            # thinking is disabled, just content
            return DeltaMessage(content=delta_text)

    def extract_reasoning_content(
            self, model_output: str, request: ChatCompletionRequest
    ) -> tuple[Optional[str], Optional[str]]:
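        """
        Extract reasoning content from a complete (non-streaming) model
        output.

        Returns a (reasoning_content, content) tuple. When the output contains
        no complete <think>...</think> block, reasoning_content is None and
        the whole output is returned as content; when nothing follows
        </think>, content is None. The request argument is part of the parser
        interface and is not used here.
        """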

        # Check if the model output contains the <think> tokens.
        if (self.think_start_token not in model_output
                or self.think_end_token not in model_output):
            return None, model_output
        else:
            # Use a regex to find the reasoning content
            reasoning_content = self.reasoning_regex.findall(model_output)[0]

            # Remove the reasoning content from the model output.
            # <think> is usually at the start of the output, but that is not
            # guaranteed, so locate it explicitly and splice out everything
            # from <think> through </think>.
            start_index = model_output.find(self.think_start_token)
            if start_index != -1:
                end_index = start_index + len(
                    f"{self.think_start_token}{reasoning_content}{self.think_end_token}"
                )
                model_output = model_output[:start_index] + \
                    model_output[end_index:]

            if len(model_output) == 0:
                return reasoning_content, None

            return reasoning_content, model_output
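

# Minimal usage sketch (illustrative only; the model name is an assumption,
# and request is passed as None because this parser never reads it). Any
# tokenizer whose vocabulary contains the <think> and </think> tokens will do:
#
#     from transformers import AutoTokenizer
#
#     tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
#     parser = Qwen3ReasoningParser(tokenizer)
#     reasoning, content = parser.extract_reasoning_content(
#         "<think>2 + 2 = 4</think>The answer is 4.", request=None)
#     # reasoning == "2 + 2 = 4", content == "The answer is 4."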