
[Tokenizer] Add tokenizer mode #298


Merged (1 commit) on Jun 28, 2023
15 changes: 14 additions & 1 deletion vllm/config.py
```diff
@@ -17,6 +17,8 @@ class ModelConfig:
     Args:
         model: Name or path of the huggingface model to use.
         tokenizer: Name or path of the huggingface tokenizer to use.
+        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
+            available, and "slow" will always use the slow tokenizer.
         download_dir: Directory to download and load the weights, default to the
             default cache directory of huggingface.
         use_np_weights: Save a numpy copy of model weights for faster loading.
@@ -31,7 +33,8 @@ class ModelConfig:
     def __init__(
         self,
         model: str,
-        tokenizer: Optional[str],
+        tokenizer: str,
+        tokenizer_mode: str,
         download_dir: Optional[str],
         use_np_weights: bool,
         use_dummy_weights: bool,
@@ -40,13 +43,23 @@ def __init__(
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
+        self.tokenizer_mode = tokenizer_mode
         self.download_dir = download_dir
         self.use_np_weights = use_np_weights
         self.use_dummy_weights = use_dummy_weights
         self.seed = seed
 
         self.hf_config: PretrainedConfig = AutoConfig.from_pretrained(model)
         self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
+        self._verify_tokenizer_mode()
+
+    def _verify_tokenizer_mode(self) -> None:
+        tokenizer_mode = self.tokenizer_mode.lower()
+        if tokenizer_mode not in ["auto", "slow"]:
+            raise ValueError(
+                f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
+                "either 'auto' or 'slow'.")
+        self.tokenizer_mode = tokenizer_mode
 
     def verify_with_parallel_config(
         self,
```
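Note that the validation is case-insensitive: the mode is lowercased before the membership check, so values like "AUTO" or "Slow" are accepted and normalized. A minimal standalone sketch of the same logic (a plain function for illustration, not the vLLM class itself):

```python
# Standalone sketch of the check added as ModelConfig._verify_tokenizer_mode.
def verify_tokenizer_mode(tokenizer_mode: str) -> str:
    """Normalize and validate a tokenizer mode string."""
    mode = tokenizer_mode.lower()  # accept any casing, e.g. "AUTO" or "Slow"
    if mode not in ["auto", "slow"]:
        raise ValueError(
            f"Unknown tokenizer mode: {tokenizer_mode}. Must be "
            "either 'auto' or 'slow'.")
    return mode

print(verify_tokenizer_mode("Slow"))  # -> "slow"
# verify_tokenizer_mode("fast")       # -> ValueError: Unknown tokenizer mode: fast
```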
11 changes: 9 additions & 2 deletions vllm/engine/arg_utils.py
```diff
@@ -12,6 +12,7 @@ class EngineArgs:
     """Arguments for vLLM engine."""
     model: str
     tokenizer: Optional[str] = None
+    tokenizer_mode: str = "auto"
     download_dir: Optional[str] = None
     use_np_weights: bool = False
     use_dummy_weights: bool = False
@@ -42,6 +43,12 @@ def add_cli_args(
                             help='name or path of the huggingface model to use')
         parser.add_argument('--tokenizer', type=str, default=EngineArgs.tokenizer,
                             help='name or path of the huggingface tokenizer to use')
+        parser.add_argument('--tokenizer-mode', type=str,
+                            default=EngineArgs.tokenizer_mode,
+                            choices=['auto', 'slow'],
+                            help='tokenizer mode. "auto" will use the fast '
+                                 'tokenizer if available, and "slow" will '
+                                 'always use the slow tokenizer.')
         parser.add_argument('--download-dir', type=str,
                             default=EngineArgs.download_dir,
                             help='directory to download and load the weights, '
@@ -109,8 +116,8 @@ def create_engine_configs(
     ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig]:
         # Initialize the configs.
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.download_dir, self.use_np_weights,
-            self.use_dummy_weights, self.dtype, self.seed)
+            self.model, self.tokenizer, self.tokenizer_mode, self.download_dir,
+            self.use_np_weights, self.use_dummy_weights, self.dtype, self.seed)
         cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                    self.swap_space)
         parallel_config = ParallelConfig(self.pipeline_parallel_size,
```
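Because the flag is registered with choices=['auto', 'slow'], argparse rejects invalid values before ModelConfig._verify_tokenizer_mode ever runs. A quick sketch of the CLI path, assuming the from_cli_args helper from the same module and using a placeholder model name:

```python
import argparse
from vllm.engine.arg_utils import EngineArgs

parser = argparse.ArgumentParser()
parser = EngineArgs.add_cli_args(parser)  # add_cli_args returns the parser

# "facebook/opt-125m" is only a placeholder model name.
args = parser.parse_args(["--model", "facebook/opt-125m",
                          "--tokenizer-mode", "slow"])
engine_args = EngineArgs.from_cli_args(args)
print(engine_args.tokenizer_mode)  # -> "slow"

# An invalid value never reaches ModelConfig; argparse itself rejects it:
# parser.parse_args(["--model", "m", "--tokenizer-mode", "fast"])  # SystemExit
```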
4 changes: 3 additions & 1 deletion vllm/engine/llm_engine.py
```diff
@@ -61,6 +61,7 @@ def __init__(
             "Initializing an LLM engine with config: "
             f"model={model_config.model!r}, "
             f"tokenizer={model_config.tokenizer!r}, "
+            f"tokenizer_mode={model_config.tokenizer_mode}, "
             f"dtype={model_config.dtype}, "
             f"use_dummy_weights={model_config.use_dummy_weights}, "
             f"download_dir={model_config.download_dir!r}, "
@@ -77,7 +78,8 @@
         self.log_stats = log_stats
         self._verify_args()
 
-        self.tokenizer = get_tokenizer(model_config.tokenizer)
+        self.tokenizer = get_tokenizer(model_config.tokenizer,
+                                       model_config.tokenizer_mode)
         self.seq_counter = Counter()
 
         # Create the parallel GPU workers.
```
4 changes: 4 additions & 0 deletions vllm/entrypoints/llm.py
```diff
@@ -26,6 +26,8 @@ class LLM:
     Args:
         model: The name or path of a HuggingFace Transformers model.
         tokenizer: The name or path of a HuggingFace Transformers tokenizer.
+        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
+            if available, and "slow" will always use the slow tokenizer.
         tensor_parallel_size: The number of GPUs to use for distributed
             execution with tensor parallelism.
         dtype: The data type for the model weights and activations. Currently,
@@ -40,6 +42,7 @@ def __init__(
         self,
         model: str,
         tokenizer: Optional[str] = None,
+        tokenizer_mode: str = "auto",
         tensor_parallel_size: int = 1,
         dtype: str = "auto",
         seed: int = 0,
@@ -50,6 +53,7 @@ def __init__(
         engine_args = EngineArgs(
             model=model,
             tokenizer=tokenizer,
+            tokenizer_mode=tokenizer_mode,
             tensor_parallel_size=tensor_parallel_size,
             dtype=dtype,
             seed=seed,
```
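With the plumbing above in place, opting into the slow tokenizer from the offline entry point is a one-argument change. A small usage sketch against the public API of this era (the model name is a placeholder):

```python
from vllm import LLM, SamplingParams

# Placeholder model; any HuggingFace model supported by vLLM works.
llm = LLM(model="facebook/opt-125m", tokenizer_mode="slow")

outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.8, max_tokens=16))
print(outputs[0].outputs[0].text)
```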
2 changes: 1 addition & 1 deletion vllm/entrypoints/openai/api_server.py
```diff
@@ -313,7 +313,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:
     engine = AsyncLLMEngine.from_engine_args(engine_args)
 
     # A separate tokenizer to map token IDs to strings.
-    tokenizer = get_tokenizer(args.model)
+    tokenizer = get_tokenizer(engine_args.tokenizer, engine_args.tokenizer_mode)
 
     uvicorn.run(app, host=args.host, port=args.port, log_level="info",
                 timeout_keep_alive=TIMEOUT_KEEP_ALIVE)
```
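This also fixes a small bug: the server previously built its separate detokenizer from args.model, silently ignoring a custom --tokenizer. Passing engine_args.tokenizer appears safe even when --tokenizer is omitted, assuming EngineArgs falls back to the model path in that case (an assumption about the surrounding code, not shown in this diff):

```python
from vllm.engine.arg_utils import AsyncEngineArgs

# Placeholder model name for illustration.
engine_args = AsyncEngineArgs(model="facebook/opt-125m")
# --tokenizer omitted: assumed to fall back to the model path at init time.
print(engine_args.tokenizer)       # -> "facebook/opt-125m" (assumed)
print(engine_args.tokenizer_mode)  # -> "auto" (the default)
```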
7 changes: 7 additions & 0 deletions vllm/transformers_utils/tokenizer.py
```diff
@@ -13,10 +13,17 @@
 
 def get_tokenizer(
     tokenizer_name: str,
+    tokenizer_mode: str = "auto",
     *args,
     **kwargs,
 ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
     """Gets a tokenizer for the given model name via Huggingface."""
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
     if "llama" in tokenizer_name.lower() and kwargs.get("use_fast", True):
         logger.info(
             "For some LLaMA-based models, initializing the fast tokenizer may "
```
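The interaction with an explicit use_fast kwarg is the subtle part: in "slow" mode, use_fast=True is a hard error rather than being silently overridden, while an explicit use_fast=False is redundant but allowed. A usage sketch (placeholder model name; loading downloads the tokenizer from the HuggingFace Hub):

```python
from vllm.transformers_utils.tokenizer import get_tokenizer

# Placeholder model; slow mode forces use_fast=False under the hood.
tok = get_tokenizer("facebook/opt-125m", tokenizer_mode="slow")
print(tok.is_fast)  # -> False

# Conflicting kwargs are rejected rather than silently overridden:
# get_tokenizer("facebook/opt-125m", tokenizer_mode="slow", use_fast=True)
# -> ValueError: Cannot use the fast tokenizer in slow tokenizer mode.
```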