Commit 998d9d1

[Tokenizer] Add tokenizer mode (#298)
1 parent: 425040d

6 files changed: +38 -5 lines changed


vllm/config.py

Lines changed: 14 additions & 1 deletion

@@ -17,6 +17,8 @@ class ModelConfig:
     Args:
         model: Name or path of the huggingface model to use.
         tokenizer: Name or path of the huggingface tokenizer to use.
+        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+            available, and "slow" will always use the slow tokenizer.
         download_dir: Directory to download and load the weights, default to the
             default cache directory of huggingface.
         use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -31,7 +33,8 @@ class ModelConfig:
     def __init__(
         self,
         model: str,
-        tokenizer: Optional[str],
+        tokenizer: str,
+        tokenizer_mode: str,
         download_dir: Optional[str],
         use_np_weights: bool,
         use_dummy_weights: bool,
@@ -40,13 +43,23 @@ def __init__(
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
+        self.tokenizer_mode = tokenizer_mode
         self.download_dir = download_dir
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
         self.seed = seed

         self.hf_config: PretrainedConfig = AutoConfig.from_pretrained(model)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self._verify_tokenizer_mode()
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = self.tokenizer_mode.lower()
+        if tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                "either 'auto' or 'slow'.")
+        self.tokenizer_mode = tokenizer_mode

     def verify_with_parallel_config(
         self,
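
The new _verify_tokenizer_mode check normalizes the mode to lowercase and rejects anything other than "auto" or "slow". A minimal sketch of the behavior, assuming an illustrative OPT checkpoint (note that constructing ModelConfig also fetches the model's Huggingface config):

from vllm.config import ModelConfig

# The mode is case-insensitive: "AUTO" is normalized to "auto".
config = ModelConfig(
    model="facebook/opt-125m",       # illustrative model name
    tokenizer="facebook/opt-125m",
    tokenizer_mode="AUTO",
    download_dir=None,
    use_np_weights=False,
    use_dummy_weights=False,
    dtype="auto",
    seed=0,
)
assert config.tokenizer_mode == "auto"

# Anything else fails fast:
#   tokenizer_mode="fast"
#   -> ValueError: Unknown tokenizer mode: fast. Must be either 'auto' or 'slow'.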

vllm/engine/arg_utils.py

Lines changed: 9 additions & 2 deletions

@@ -12,6 +12,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    tokenizer_mode: str = "auto"
     download_dir: Optional[str] = None
     use_np_weights: bool = False
     use_dummy_weights: bool = False
@@ -42,6 +43,12 @@ def add_cli_args(
                             help='name or path of the huggingface model to use')
         parser.add_argument('--tokenizer', type=str, default=EngineArgs.tokenizer,
                             help='name or path of the huggingface tokenizer to use')
+        parser.add_argument('--tokenizer-mode', type=str,
+                            default=EngineArgs.tokenizer_mode,
+                            choices=['auto', 'slow'],
+                            help='tokenizer mode. "auto" will use the fast '
+                                 'tokenizer if available, and "slow" will '
+                                 'always use the slow tokenizer.')
         parser.add_argument('--download-dir', type=str,
                             default=EngineArgs.download_dir,
                             help='directory to download and load the weights, '
@@ -109,8 +116,8 @@ def create_engine_configs(
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.download_dir, self.use_np_weights,
-            self.use_dummy_weights, self.dtype, self.seed)
+            self.model, self.tokenizer, self.tokenizer_mode, self.download_dir,
+            self.use_np_weights, self.use_dummy_weights, self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                    self.swap_space)
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
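
Since EngineArgs is a dataclass, the new field can also be set programmatically rather than via the CLI. A minimal sketch (the model name is illustrative, and create_engine_configs fetches the model's Huggingface config):

from vllm.engine.arg_utils import EngineArgs

# Programmatic equivalent of passing --tokenizer-mode slow on the command line.
engine_args = EngineArgs(model="facebook/opt-125m", tokenizer_mode="slow")
model_config, cache_config, parallel_config, scheduler_config = (
    engine_args.create_engine_configs())
assert model_config.tokenizer_mode == "slow"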

vllm/engine/llm_engine.py

Lines changed: 3 additions & 1 deletion

@@ -61,6 +61,7 @@ def __init__(
             "Initializing an LLM engine with config: "
             f"model={model_config.model!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"dtype={model_config.dtype}, "
             f"use_dummy_weights={model_config.use_dummy_weights}, "
             f"download_dir={model_config.download_dir!r}, "
@@ -77,7 +78,8 @@ def __init__(
         self.log_stats = log_stats
         self._verify_args()

-        self.tokenizer = get_tokenizer(model_config.tokenizer)
+        self.tokenizer = get_tokenizer(model_config.tokenizer,
+                                       model_config.tokenizer_mode)
         self.seq_counter = Counter()

         # Create the parallel GPU workers.

vllm/entrypoints/llm.py

Lines changed: 4 additions & 0 deletions

@@ -26,6 +26,8 @@ class LLM:
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
+        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+            if available, and "slow" will always use the slow tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@@ -40,6 +42,7 @@ def __init__(
         self,
         model: str,
         tokenizer: Optional[str] = None,
+        tokenizer_mode: str = "auto",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         seed: int = 0,
@@ -50,6 +53,7 @@ def __init__(
         engine_args = EngineArgs(
             model=model,
             tokenizer=tokenizer,
+            tokenizer_mode=tokenizer_mode,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             seed=seed,
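
For end users, the mode is one more keyword argument on LLM. A brief sketch; the LLaMA checkpoint name is illustrative, chosen because the fast/slow distinction matters most for LLaMA-style tokenizers (running this needs a GPU and downloads the weights):

from vllm import LLM

# "auto" (the default) prefers the fast tokenizer when one is available;
# "slow" always uses the original (e.g. sentencepiece-based) tokenizer.
llm = LLM(model="huggyllama/llama-7b", tokenizer_mode="slow")
outputs = llm.generate("Hello, my name is")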

vllm/entrypoints/openai/api_server.py

Lines changed: 1 addition & 1 deletion

@@ -313,7 +313,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     engine = AsyncLLMEngine.from_engine_args(engine_args)

     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(engine_args.tokenizer, engine_args.tokenizer_mode)

     uvicorn.run(app, host=args.host, port=args.port, log_level="info",
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE)

Besides threading the mode through, this fixes the server's standalone detokenizer, which previously always loaded from args.model and therefore ignored a custom --tokenizer.

vllm/transformers_utils/tokenizer.py

Lines changed: 7 additions & 0 deletions

@@ -13,10 +13,17 @@

 def get_tokenizer(
     tokenizer_name: str,
+    tokenizer_mode: str = "auto",
     *args,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
     if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
         logger.info(
             "For some LLaMA-based models, initializing the fast tokenizer may "
