Skip tokenize/detokenize when it is disabled by arg --skip-tokenizer-init (#367)

maleksan85 · maleksan85 · web-flow · commit b5839a17b5c9 · 2025-01-22T09:23:50.000-08:00
* switching detokenize flag to be False

* detokenize = False for benchmarks

* restoring default in main vllm code for detokenize

* removing extra spaces

* moving detokenize to flag

* adding support for token ids

---------

Co-authored-by: maleksan85 &lt;maleksan@amd.com&gt;
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
@@ -272,7 +272,8 @@ def main(args: argparse.Namespace):
         args.tokenizer, trust_remote_code=args.trust_remote_code)
     if args.dataset is None:
         # Synthesize a prompt with the given input length.
-        prompt = "hi" * (args.input_len - 1)
+        prompt = { "prompt_token_ids" : [42] * (args.input_len - 1) } \
+            if args.skip_tokenizer_init else "hi" * (args.input_len - 1)
         requests = [(prompt, args.input_len, args.output_len)
                     for _ in range(args.num_prompts)]
     else:
diff --git a/vllm/engine/output_processor/multi_step.py b/vllm/engine/output_processor/multi_step.py
@@ -144,7 +144,7 @@ def process_outputs(self,
     def _process_decode_and_stop(self, seq: Sequence,
                                  sampling_params: SamplingParams) -> None:
         new_char_count = 0
-        if sampling_params.detokenize:
+        if sampling_params.detokenize and self.detokenizer:
             new_char_count = self.detokenizer.decode_sequence_inplace(
                 seq, sampling_params)