[Model] tool calling support for ibm-granite/granite-20b-functioncalling #8339

Merged: 35 commits, Oct 29, 2024. The diff below shows the changes from one commit (113fbb6).
Commits (35):

- 58e468d: initial commit (wseaton, Sep 10, 2024)
- d4cc66b: remove original part of template (wseaton, Sep 10, 2024)
- 742704f: clean up debug logging (wseaton, Sep 10, 2024)
- 410ff88: update docs; raise not implemented (wseaton, Sep 10, 2024)
- a5e9a1f: fix lints (wseaton, Sep 10, 2024)
- 3d28b6d: sort imports (wseaton, Sep 10, 2024)
- 74c8cc7: yapf fixes (wseaton, Sep 10, 2024)
- 23a4ca3: another format change (wseaton, Sep 10, 2024)
- 1659236: update example prompt to be conversational instead of single turn (wseaton, Sep 10, 2024)
- b1e09a8: update docs for template; link paper (wseaton, Sep 10, 2024)
- e82b2a6: Merge remote-tracking branch 'upstream/main' into granite-fc (wseaton, Sep 27, 2024)
- 6b0eebb: add granite to test config (wseaton, Sep 27, 2024)
- 346d554: fixup json (wseaton, Sep 27, 2024)
- 24e49b8: Add stream support for Granite 20b Tool Use (maxdebayser, Sep 27, 2024)
- 86dead8: fix docs (maxdebayser, Sep 27, 2024)
- 113fbb6: more robust whispace handling (maxdebayser, Sep 28, 2024)
- acecb6d: remove reference to defunct granite parser (wseaton, Oct 2, 2024)
- 86e8466: remove old template (wseaton, Oct 2, 2024)
- 43c8078: Update tests/tool_use/utils.py to remove dupe (wseaton, Oct 7, 2024)
- 6bf4a41: Merge remote-tracking branch 'upstream/main' into granite-fc (wseaton, Oct 7, 2024)
- 2e969c7: fix double import (wseaton, Oct 7, 2024)
- e18219c: add completion request arg to abstract method (wseaton, Oct 7, 2024)
- 9a0321b: formatting fixes (wseaton, Oct 7, 2024)
- 078ab85: import sorts (wseaton, Oct 7, 2024)
- 0a031bf: appease yapf (wseaton, Oct 7, 2024)
- c6a6b56: Apply suggestions from code review (wseaton, Oct 16, 2024)
- defed52: remove redudant indents; add type hints to utils (wseaton, Oct 17, 2024)
- 5b78cea: Merge branch 'granite-fc' of github.com:wseaton/vllm into granite-fc (wseaton, Oct 17, 2024)
- 2d3b8fe: formatting churn (wseaton, Oct 17, 2024)
- fe13b72: Merge branch 'main' into granite-fc (wseaton, Oct 21, 2024)
- 84e93bf: change to old style type aliasing (wseaton, Oct 21, 2024)
- ae55760: Merge branch 'granite-fc' of github.com:wseaton/vllm into granite-fc (wseaton, Oct 21, 2024)
- 1277f0b: Doc reformat, add back missing line (wseaton, Oct 25, 2024)
- 738d003: Temporarily disable the granite20b-fc test task (wseaton, Oct 25, 2024)
- a6e1bf9: Merge branch 'vllm-project:main' into granite-fc (wseaton, Oct 28, 2024)
Commit 113fbb6f3f2fd208acbbc177d8061bc0207a2c4d: more robust whispace handling
Signed-off-by: Max de Bayser <mbayser@br.ibm.com>
maxdebayser authored and wseaton committed Oct 2, 2024
12 changes: 1 addition & 11 deletions docs/source/serving/openai_compatible_server.md

@@ -227,14 +227,4 @@ Supported models:
 * `ibm-granite/granite-20b-functioncalling`
 
 Flags: `--tool-call-parser granite-20b-fc`
-`examples/tool_chat_template_granite20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
-
-Supported models:
-* `ibm-granite/granite-20b-functioncalling`
-
-Flags: `--tool-call-parser granite`
-
-Known issues:
-1. Tool call parsing is not yet supported in streaming mode.
-
-* `examples/tool_chat_template_granite_response.jinja` - this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
+`examples/tool_chat_template_granite_20b_fc.jinja`: this is a modified chat template from the original on Huggingface, which is not vLLM compatible. It blends function description elements from the Hermes template and follows the same system prompt as "Response Generation" mode from [the paper](https://arxiv.org/abs/2407.00121). Parallel function calls are supported.
4 changes: 2 additions & 2 deletions tests/tool_use/utils.py

@@ -100,8 +100,8 @@ def ensure_system_prompt(messages: List[Dict[str, Any]],
         "model":
         "ibm-granite/granite-20b-functioncalling",
         "arguments": [
-            "--tool-call-parser", "granite20b-fc", "--chat-template",
-            str(VLLM_PATH / "examples/tool_chat_template_granite.jinja")
+            "--tool-call-parser", "granite-20b-fc", "--chat-template",
+            str(VLLM_PATH / "examples/tool_chat_template_granite_20b_fc.jinja")
         ],
     }
 }
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/tool_parsers/__init__.py

@@ -1,5 +1,5 @@
 from .abstract_tool_parser import ToolParser
-from .granite_20bfc_tool_parser import Granite20bFCToolParser
+from .granite_20b_fc_tool_parser import Granite20bFCToolParser
 from .hermes_tool_parser import Hermes2ProToolParser
 from .llama_tool_parser import Llama3JsonToolParser
 from .mistral_tool_parser import MistralToolParser
vllm/entrypoints/openai/tool_parsers/granite_20b_fc_tool_parser.py

@@ -12,7 +12,8 @@
     FunctionCall, ToolCall)
 from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import (
     ToolParser)
-from vllm.entrypoints.openai.tool_parsers.utils import (find_common_prefix,
+from vllm.entrypoints.openai.tool_parsers.utils import (consume_space,
+                                                        find_common_prefix,
                                                         is_complete_json,
                                                         partial_json_loads)
 from vllm.logger import init_logger

@@ -121,14 +122,19 @@ def extract_tool_calls_streaming(
         is_complete = []
         try:
             start_idx = len(self.bot_token)
+            start_idx = consume_space(start_idx, current_text)
 
             while start_idx < len(current_text):
                 (obj,
                  end_idx) = partial_json_loads(current_text[start_idx:],
                                                flags)
                 is_complete.append(
                     is_complete_json(current_text[start_idx:start_idx +
                                                   end_idx]))
-                start_idx += end_idx + len(self.bot_token) + 1
+                start_idx += end_idx
+                start_idx = consume_space(start_idx, current_text)
+                start_idx += len(self.bot_token)
+                start_idx = consume_space(start_idx, current_text)
                 tool_call_arr.append(obj)
         except partial_json_parser.core.exceptions.MalformedJSON:
             logger.debug('not enough tokens to parse into JSON yet')
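The change above replaces the old fixed-width skip (`end_idx + len(self.bot_token) + 1`) with explicit whitespace consumption around the separator token, so the parser no longer breaks when the model emits extra or missing whitespace between tool calls. The loop can be sketched in a simplified, stdlib-only form. This is a hedged sketch, not the PR's code: `BOT_TOKEN` is an assumed stand-in for the parser's `self.bot_token`, and `json.JSONDecoder.raw_decode` stands in for `partial_json_loads` (so, unlike the real streaming parser, this version only accepts complete JSON objects).

```python
import json

# Assumption: the separator the model emits before each tool call; the real
# parser reads this from self.bot_token.
BOT_TOKEN = "<function_call>"


def consume_space(i: int, s: str) -> int:
    # Advance index i past consecutive whitespace in s (same helper as in
    # the diff above).
    while i < len(s) and s[i].isspace():
        i += 1
    return i


def parse_tool_calls(current_text: str) -> list:
    """Parse a sequence of JSON tool calls separated by BOT_TOKEN,
    tolerating arbitrary whitespace before, between, and after the
    separator token."""
    decoder = json.JSONDecoder()
    tool_call_arr = []
    # Skip the leading bot token and any whitespace after it.
    start_idx = len(BOT_TOKEN)
    start_idx = consume_space(start_idx, current_text)
    while start_idx < len(current_text):
        # raw_decode returns the parsed object and the offset just past it.
        obj, end_idx = decoder.raw_decode(current_text[start_idx:])
        start_idx += end_idx
        # Mirror the patched loop: whitespace, separator, whitespace.
        start_idx = consume_space(start_idx, current_text)
        start_idx += len(BOT_TOKEN)
        start_idx = consume_space(start_idx, current_text)
        tool_call_arr.append(obj)
    return tool_call_arr
```

Because every skip is whitespace-aware, `"<function_call>{...}<function_call> {...}"` and `"<function_call> {...}\n<function_call>\n{...}"` parse identically, which the old `+ 1` offset could not guarantee.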
6 changes: 6 additions & 0 deletions vllm/entrypoints/openai/tool_parsers/utils.py

@@ -112,3 +112,9 @@ def is_complete_json(input_str):
         return True
     except JSONDecodeError:
         return False
+
+
+def consume_space(i, s):
+    while i < len(s) and s[i].isspace():
+        i += 1
+    return i
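The new helper advances an index past any run of whitespace instead of assuming a fixed one-character gap; an index already at a non-space character (or at the end of the string) is returned unchanged. A quick check of its behavior, with the function reproduced from the diff above:

```python
def consume_space(i, s):
    # Advance i past consecutive whitespace characters in s.
    while i < len(s) and s[i].isspace():
        i += 1
    return i

# Leading spaces are skipped; a non-space position and the empty
# string are left where they are.
print(consume_space(0, "   {"))    # → 3
print(consume_space(3, "   {"))    # → 3
print(consume_space(0, ""))        # → 0
print(consume_space(0, " \n\t{"))  # → 3
```

Returning the index (rather than a stripped string) lets the streaming loop keep all offsets relative to the original `current_text`, which it needs for slicing out each partial JSON object.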