[Quality] Add code formatter and linter #326

Merged
17 commits merged on Jul 3, 2023
fix all simple errors
zhuohan123 committed Jul 1, 2023
commit 6deb2fa962c68d2299c90a7f3d58ec56bc058849
2 changes: 1 addition & 1 deletion .pylintrc
@@ -8,7 +8,7 @@
[MASTER]

# Files or directories to be skipped. They should be base names, not paths.
ignore=docs
ignore=docs,parallel_utils

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
1 change: 1 addition & 0 deletions format.sh
@@ -44,6 +44,7 @@ YAPF_FLAGS=(

YAPF_EXCLUDES=(
'--exclude' 'build/**'
'--exclude' 'vllm/model_executor/parallel_utils/**'
)

# Format specified files
3 changes: 3 additions & 0 deletions vllm/__init__.py
@@ -1,3 +1,6 @@
"""vLLM is a high-throughput and memory-efficient inference and serving engine
for LLMs"""

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
12 changes: 6 additions & 6 deletions vllm/config.py
@@ -8,7 +8,7 @@

logger = init_logger(__name__)

_GiB = 1 << 30
_GB = 1 << 30


class ModelConfig:
@@ -115,7 +115,7 @@ def __init__(
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GiB
self.swap_space_bytes = swap_space * _GB
self._verify_args()

# Will be set after profiling.
@@ -138,13 +138,13 @@ def verify_with_parallel_config(
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

msg = (f"{cpu_memory_usage / _GiB:.2f} GiB out of "
f"the {total_cpu_memory / _GiB:.2f} GiB total CPU memory is "
msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
"allocated for the swap space.")
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warn("Possibly too large swap space. " + msg)
logger.warning("Possibly too large swap space. " + msg)


class ParallelConfig:
@@ -239,7 +239,7 @@ def _get_and_verify_dtype(
pass
else:
# Casting between float16 and bfloat16 is allowed with a warning.
logger.warn(f"Casting {config_dtype} to {torch_dtype}.")
logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

# Check if the GPU supports the dtype.
if torch_dtype == torch.bfloat16:
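Note: for readers of this hunk, a minimal standalone sketch of the swap-space check touched above (not part of the diff; swap_space_gib and total_cpu_memory are assumed inputs):

_GB = 1 << 30

def check_swap_space(swap_space_gib: int, tensor_parallel_size: int,
                     total_cpu_memory: int) -> None:
    # Each tensor-parallel worker on a node reserves its own swap space.
    cpu_memory_usage = swap_space_gib * _GB * tensor_parallel_size
    msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
           f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
           "allocated for the swap space.")
    if cpu_memory_usage > 0.7 * total_cpu_memory:
        raise ValueError("Too large swap space. " + msg)
    if cpu_memory_usage > 0.4 * total_cpu_memory:
        print("Possibly too large swap space. " + msg)

check_swap_space(swap_space_gib=4, tensor_parallel_size=4,
                 total_cpu_memory=32 * _GB)  # warns: 16 GiB > 0.4 * 32 GiB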
6 changes: 4 additions & 2 deletions vllm/core/block_manager.py
@@ -85,10 +85,12 @@ def can_allocate(self, seq_group: SequenceGroup) -> bool:
num_required_blocks = len(seq.logical_token_blocks)
num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
# Use watermark to avoid frequent cache eviction.
return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks
return (num_free_gpu_blocks - num_required_blocks >=
self.watermark_blocks)

def allocate(self, seq_group: SequenceGroup) -> None:
# NOTE: Here we assume that all sequences in the group have the same prompt.
# NOTE: Here we assume that all sequences in the group have the same
# prompt.
seq = seq_group.get_seqs()[0]

# Allocate new physical token blocks that will store the prompt tokens.
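Note: a small sketch of the watermark test reformatted above, with assumed numbers, to show why the free-block reserve matters:

def can_allocate(num_required_blocks: int, num_free_gpu_blocks: int,
                 watermark_blocks: int) -> bool:
    # Keep a reserve of free blocks to avoid thrashing the GPU cache.
    return num_free_gpu_blocks - num_required_blocks >= watermark_blocks

# 100 free blocks, a 90-block prompt, and a 16-block watermark: refused,
# because 100 - 90 = 10 < 16.
print(can_allocate(90, 100, 16))  # False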
18 changes: 10 additions & 8 deletions vllm/core/scheduler.py
@@ -60,7 +60,7 @@ def __init__(
self.log_stats = log_stats

# Instantiate the scheduling policy.
self.policy = PolicyFactory.get_policy(policy_name='fcfs')
self.policy = PolicyFactory.get_policy(policy_name="fcfs")
# Create the block space manager.
self.block_manager = BlockSpaceManager(
block_size=self.cache_config.block_size,
@@ -158,7 +158,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
num_curr_seqs = sum(
seq_group.num_seqs(status=SequenceStatus.RUNNING)
for seq_group in self.running)
if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break

seq_group = self.swapped.pop(0)
@@ -202,7 +203,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
num_curr_seqs = sum(
seq_group.num_seqs(status=SequenceStatus.RUNNING)
for seq_group in self.running)
if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break

seq_group = self.waiting.pop(0)
@@ -243,8 +245,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:

total_num_cpu_blocks = self.cache_config.num_cpu_blocks
if total_num_cpu_blocks > 0:
num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks(
)
num_free_cpu_blocks = (
self.block_manager.get_num_free_cpu_blocks())
num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
else:
@@ -296,8 +298,8 @@ def update(
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
output = seq_outputs[seq.seq_id]
if seq.seq_id != output.parent_seq_id:
# The sequence is a fork of the parent sequence (beam search).
# Free the current sequence.
# The sequence is a fork of the parent sequence (beam
# search). Free the current sequence.
self.block_manager.free(seq)
# Fork the parent sequence.
parent_seq = seq_group.find(output.parent_seq_id)
@@ -370,7 +372,7 @@ def _preempt(
elif preemption_mode == PreemptionMode.SWAP:
self._preempt_by_swap(seq_group, blocks_to_swap_out)
else:
assert False, 'Invalid preemption mode.'
assert False, "Invalid preemption mode."

def _preempt_by_recompute(
self,
6 changes: 3 additions & 3 deletions vllm/engine/arg_utils.py
@@ -12,11 +12,11 @@ class EngineArgs:
"""Arguments for vLLM engine."""
model: str
tokenizer: Optional[str] = None
tokenizer_mode: str = "auto"
tokenizer_mode: str = 'auto'
download_dir: Optional[str] = None
use_np_weights: bool = False
use_dummy_weights: bool = False
dtype: str = "auto"
dtype: str = 'auto'
seed: int = 0
worker_use_ray: bool = False
pipeline_parallel_size: int = 1
@@ -129,7 +129,7 @@ def add_cli_args(
return parser

@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
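Note: the quote-style change above sits inside the dataclass-from-argparse pattern; a toy, self-contained illustration of that pattern (ToyArgs is hypothetical, not vLLM code):

import argparse
import dataclasses

@dataclasses.dataclass
class ToyArgs:
    model: str
    dtype: str = "auto"

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "ToyArgs":
        # Pick out exactly the dataclass fields from the parsed namespace.
        attrs = [f.name for f in dataclasses.fields(cls)]
        return cls(**{a: getattr(args, a) for a in attrs})

ns = argparse.Namespace(model="facebook/opt-125m", dtype="auto", verbose=True)
print(ToyArgs.from_cli_args(ns))  # ToyArgs(model='facebook/opt-125m', dtype='auto')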
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
@@ -39,8 +39,8 @@ class AsyncLLMEngine:
def __init__(self,
worker_use_ray: bool,
engine_use_ray: bool,
log_requests: bool = True,
*args,
log_requests: bool = True,
**kwargs) -> None:
self.worker_use_ray = worker_use_ray
self.engine_use_ray = engine_use_ray
@@ -219,9 +219,9 @@ def from_engine_args(cls,
# Create the async LLM engine.
engine = cls(engine_args.worker_use_ray,
engine_args.engine_use_ray,
not engine_args.disable_log_requests,
*engine_configs,
distributed_init_method,
devices,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats)
return engine
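Note: moving log_requests after *args makes it effectively keyword-only, so positional engine configs can no longer be swallowed by it. A toy illustration (before/after are hypothetical functions, not vLLM code):

def before(a, b, log_requests=True, *args):
    return log_requests, args

def after(a, b, *args, log_requests=True):
    return log_requests, args

print(before(1, 2, "config0", "config1"))  # ('config0', ('config1',)) - a config leaks into the flag
print(after(1, 2, "config0", "config1"))   # (True, ('config0', 'config1'))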
10 changes: 5 additions & 5 deletions vllm/engine/llm_engine.py
@@ -77,8 +77,8 @@ def __init__(
self.log_stats = log_stats
self._verify_args()

self.tokenizer = get_tokenizer(model_config.tokenizer,
model_config.tokenizer_mode)
self.tokenizer = get_tokenizer(
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
self.seq_counter = Counter()

# Create the parallel GPU workers.
@@ -128,8 +128,8 @@ def _init_cache(self) -> None:
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
# FIXME(woosuk): Change to debug log.
logger.info(f'# GPU blocks: {num_gpu_blocks}, '
f'# CPU blocks: {num_cpu_blocks}')
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")

if num_gpu_blocks <= 0:
raise ValueError("No available memory for the cache blocks. "
@@ -304,8 +304,8 @@ def _stop_sequences(self, seq_groups: List[SequenceGroup]) -> None:
def _run_workers(
self,
method: str,
get_all_outputs: bool = False,
*args,
get_all_outputs: bool = False,
**kwargs,
) -> Any:
"""Runs the given method on all workers."""
10 changes: 5 additions & 5 deletions vllm/engine/ray_utils.py
@@ -54,15 +54,15 @@ def initialize_cluster(
valid_node_resources = []
num_devices_per_node = None
for node in ray.nodes():
if (not node['Alive']) or node['Resources']['GPU'] <= 0:
if (not node["Alive"]) or node["Resources"]["GPU"] <= 0:
continue
if num_devices_per_node is None:
num_devices_per_node = node['Resources']['GPU']
num_devices_per_node = node["Resources"]["GPU"]
else:
assert num_devices_per_node == node['Resources']['GPU'], (
assert num_devices_per_node == node["Resources"]["GPU"], (
"The number of GPUs per node is not uniform.")
for key in node['Resources']:
if key.startswith('node:'):
for key in node["Resources"]:
if key.startswith("node:"):
valid_node_resources.append(key)

# Verify the parallel config.
2 changes: 1 addition & 1 deletion vllm/entrypoints/llm.py
@@ -63,7 +63,7 @@ def __init__(
self.request_counter = Counter()

def get_tokenizer(
self, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
return self.llm_engine.tokenizer

def set_tokenizer(
7 changes: 4 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -1,4 +1,5 @@
# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py

import argparse
from http import HTTPStatus
@@ -40,7 +41,7 @@ def create_error_response(status_code: HTTPStatus,


@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
async def validation_exception_handler(request, exc): # pylint: disable=unused-argument
return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))


@@ -329,7 +330,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:

# A separate tokenizer to map token IDs to strings.
tokenizer = get_tokenizer(engine_args.tokenizer,
engine_args.tokenizer_mode)
tokenizer_mode=engine_args.tokenizer_mode)

uvicorn.run(app,
host=args.host,
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/protocol.py
@@ -1,4 +1,5 @@
# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from typing import Dict, List, Literal, Optional, Union

5 changes: 3 additions & 2 deletions vllm/logger.py
@@ -1,5 +1,6 @@
# Adapted from https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py

# Adapted from
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
import logging
import sys

3 changes: 0 additions & 3 deletions vllm/model_executor/layers/activation.py
@@ -30,9 +30,6 @@ class SiluAndMul(nn.Module):
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2.
"""

def __init__(self):
super().__init__()

def forward(
self,
x: torch.Tensor, # (num_tokens, 2 * d)
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/attention.py
@@ -14,6 +14,7 @@


class PagedAttention(nn.Module):
# pylint: disable=line-too-long
"""GPT-style multi-head PagedAttention.

This class takes flattened 1D query, key, and value tensors as input. The
@@ -183,7 +184,7 @@ def __init__(
# Create the cos and sin cache.
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
t = torch.arange(max_position).float()
freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
8 changes: 4 additions & 4 deletions vllm/model_executor/layers/sampler.py
@@ -411,16 +411,16 @@ def _sample(

# Get top-k log probabilities for the next tokens.
next_logprobs: Dict[int, Dict[int, float]] = {}
for i, seq_id in enumerate(seq_ids):
for j, seq_id in enumerate(seq_ids):
next_logprobs[seq_id] = _get_topk_logprobs(
logprob[i], sampling_params.logprobs)
logprob[j], sampling_params.logprobs)

# Build the output.
for seq_id, parent_seq_id, next_token_id in zip(
seq_ids, parent_seq_ids, next_token_ids):
i = seq_ids.index(parent_seq_id)
j = seq_ids.index(parent_seq_id)
output_logprobs = next_logprobs[parent_seq_id].copy()
output_logprobs[next_token_id] = logprob[i,
output_logprobs[next_token_id] = logprob[j,
next_token_id].item()
seq_outputs[seq_id] = SequenceOutputs(
seq_id,
18 changes: 11 additions & 7 deletions vllm/model_executor/models/gpt2.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
@@ -117,7 +118,8 @@ class GPT2Block(nn.Module):
def __init__(self, config: GPT2Config):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
inner_dim = (config.n_inner if config.n_inner is not None else 4 *
hidden_size)

self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPT2Attention(config)
@@ -155,9 +157,9 @@ class GPT2Model(nn.Module):
def __init__(self, config: GPT2Config):
super().__init__()
self.config = config
assert config.add_cross_attention == False
assert config.scale_attn_by_inverse_layer_idx == False
assert config.reorder_and_upcast_attn == False
assert not config.add_cross_attention
assert not config.scale_attn_by_inverse_layer_idx
assert not config.reorder_and_upcast_attn
self.embed_dim = config.hidden_size

# Optimization: While the vocab size of GPT-2 is 50257, we extend it
@@ -270,8 +272,10 @@ def load_weights(self,

# For the fused QKV linear layer, manually shard the weights.
if "c_attn" in name:
# GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along the head dimension.
# GPT-2's fused QKV has the shape of
# [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along
# the head dimension.
total_num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // total_num_heads
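Note: a sketch of the head-dimension sharding described in the rewrapped comment above, using assumed toy sizes rather than the real loader:

import torch

total_num_heads, head_size, hidden_size = 12, 64, 768
tp_size, tp_rank = 4, 1
qkv = torch.randn(3 * total_num_heads * head_size, hidden_size)

num_heads = total_num_heads // tp_size
# Group rows as (qkv, head, head_size), keep this rank's heads, flatten back.
shard = (qkv.view(3, total_num_heads, head_size, hidden_size)
            [:, tp_rank * num_heads:(tp_rank + 1) * num_heads]
            .reshape(-1, hidden_size))
print(shard.shape)  # torch.Size([576, 768]) = 3 * 3 local heads * 64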
25 changes: 16 additions & 9 deletions vllm/model_executor/models/gpt_bigcode.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2023 CTranslate2, and Michael Feil
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
@@ -119,7 +120,8 @@ class GPTBigCodeBlock(nn.Module):
def __init__(self, config: GPTBigCodeConfig):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
inner_dim = (config.n_inner if config.n_inner is not None else 4 *
hidden_size)

self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTBigCodeAttention(config)
@@ -157,7 +159,7 @@ class GPTBigCodeModel(nn.Module):
def __init__(self, config: GPTBigCodeConfig):
super().__init__()
self.config = config
assert config.add_cross_attention == False
assert not config.add_cross_attention

self.embed_dim = config.hidden_size

@@ -272,26 +274,31 @@ def _expand_mqa_mha(qkv_array, n_head, head_dim):
qkv_array = qkv_array.numpy()

dims_q = n_head * head_dim
# pylint: disable=unbalanced-tuple-unpacking
q, k, v = np.split(qkv_array, (dims_q, dims_q + head_dim),
axis=0)
# q is fine, but k & v have not replicated shape along the first axis
# as long as MQA is not nativly supported, increase memory and replicated
# (head_dim, hidden_dim) to (n_heads * head_dim, hidden_dim)
# q is fine, but k & v have not replicated shape along the first
# axis as long as MQA is not nativly supported, increase memory
# and replicated (head_dim, hidden_dim) to
# (n_heads * head_dim, hidden_dim)
if k.ndim == 2 and v.ndim == 2:
replication = (n_head, 1) # weights
else:
replication = n_head # biases
# replicate n_head times for q, v
k, v = np.tile(k, replication), np.tile(v, replication)
# concat q, k, v along the first axis (n_heads * head_dim, hidden_dim)
# concat q, k, v along the first axis
# (n_heads * head_dim, hidden_dim)
# to (3 * n_heads * head_dim, hidden_dim)
qkv_array = np.concatenate((q, k, v), axis=0)
return torch.from_numpy(qkv_array)

# For the fused QKV linear layer, manually shard the weights.
if "c_attn" in name:
# GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along the head dimension.
# GPT-2's fused QKV has the shape of
# [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along
# the head dimension.
total_num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // total_num_heads
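Note: a toy-sized sketch of the MQA-to-MHA expansion described in the rewrapped comment above (all shapes are assumptions for illustration):

import numpy as np

n_head, head_dim, hidden = 4, 2, 8
# MQA checkpoint layout: n_head query heads but a single shared key and value.
qkv = np.random.randn(n_head * head_dim + 2 * head_dim, hidden)

dims_q = n_head * head_dim
q, k, v = np.split(qkv, (dims_q, dims_q + head_dim), axis=0)
# Replicate the shared k and v once per head so the layer looks like MHA.
k, v = np.tile(k, (n_head, 1)), np.tile(v, (n_head, 1))
qkv_mha = np.concatenate((q, k, v), axis=0)
print(qkv_mha.shape)  # (24, 8) == (3 * n_head * head_dim, hidden)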
10 changes: 6 additions & 4 deletions vllm/model_executor/models/gpt_neox.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
#
@@ -51,7 +52,8 @@ def __init__(self, config: GPTNeoXConfig):
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
)
assert self.total_num_heads % tensor_model_parallel_world_size == 0
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)

self.query_key_value = ColumnParallelLinear(
config.hidden_size,
@@ -252,12 +254,12 @@ def load_weights(self,
num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // num_heads
if 'query_key_value.weight' in name:
if "query_key_value.weight" in name:
loaded_weight = loaded_weight.view(-1, 3, head_size,
hidden_size)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1, hidden_size)
elif 'query_key_value.bias' in name:
elif "query_key_value.bias" in name:
loaded_weight = loaded_weight.view(-1, 3, head_size)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1)
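Note: a toy-sized sketch of the view/transpose/reshape above, which regroups GPT-NeoX's per-head interleaved QKV rows into separate Q, K and V blocks (sizes are assumptions):

import torch

num_heads, head_size, hidden_size = 2, 3, 6
w = torch.arange(num_heads * 3 * head_size * hidden_size,
                 dtype=torch.float32).view(-1, hidden_size)

regrouped = (w.view(-1, 3, head_size, hidden_size)  # (head, qkv, head_size, hidden)
              .transpose(0, 1)                      # (qkv, head, head_size, hidden)
              .reshape(-1, hidden_size))            # all Q rows, then K, then V
print(regrouped.shape)  # torch.Size([18, 6])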
15 changes: 8 additions & 7 deletions vllm/model_executor/models/llama.py
@@ -1,9 +1,10 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# This code is based on EleutherAI"s GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
@@ -30,7 +31,6 @@
from torch import nn
from transformers import LlamaConfig

from vllm.sequence import SequenceOutputs
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -66,9 +66,9 @@ def __init__(
bias=False,
input_is_parallel=True,
perform_initialization=False)
if hidden_act != 'silu':
raise ValueError(f'Unsupported activation: {hidden_act}. '
'Only silu is supported for now.')
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
self.act_fn = SiluAndMul()

def forward(self, x):
@@ -91,7 +91,8 @@ def __init__(
)
self.total_num_heads = num_heads
assert self.total_num_heads % tensor_model_parallel_world_size == 0
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)
self.head_dim = hidden_size // self.total_num_heads
self.scaling = self.head_dim**-0.5

16 changes: 10 additions & 6 deletions vllm/model_executor/models/opt.py
@@ -1,7 +1,9 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -43,8 +45,9 @@
class OPTLearnedPositionalEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int):
# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
# OPT is set up so that if padding_idx is specified then offset the
# embedding ids by 2 and adjust num_embeddings appropriately. Other
# models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
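Note: the rewrapped comment above describes OPT's legacy offset; a toy sketch of what it implies. The forward pass is not shown in this hunk, so the shifted lookup is an assumption:

import torch.nn as nn

offset = 2
emb = nn.Embedding(2048 + offset, 16)  # num_embeddings + offset, embedding_dim
# Position ids are expected to be shifted by the same offset before lookup,
# leaving rows 0 and 1 unused (a leftover of OPT's padding_idx handling).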

@@ -199,8 +202,9 @@ def __init__(self, config: OPTConfig):
else:
self.project_in = None

# Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
# with checkpoints that have been fine-tuned before transformers v4.20.1
# Note that the only purpose of `config._remove_final_layer_norm` is to
# keep backward compatibility with checkpoints that have been fine-tuned
# before transformers v4.20.1
# see https://github.com/facebookresearch/metaseq/pull/164
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(
58 changes: 21 additions & 37 deletions vllm/model_executor/parallel_utils/parallel_state.py
@@ -1,6 +1,7 @@
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Model and data parallel groups."""

import torch
@@ -45,7 +46,6 @@

_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None


def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
@@ -83,8 +83,7 @@ def initialize_model_parallel(
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()

if world_size % (tensor_model_parallel_size *
pipeline_model_parallel_size) != 0:
if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
raise RuntimeError(
f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
@@ -93,15 +92,14 @@ def initialize_model_parallel(
data_parallel_size: int = world_size // (tensor_model_parallel_size *
pipeline_model_parallel_size)

num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
num_data_parallel_groups: int = world_size // data_parallel_size

if virtual_pipeline_model_parallel_size is not None:
if not pipeline_model_parallel_size > 2:
raise RuntimeError(
"pipeline-model-parallel size should be greater than 2 with "
"interleaved schedule")
raise RuntimeError("pipeline-model-parallel size should be greater than 2 with "
"interleaved schedule")
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
@@ -133,10 +131,8 @@ def initialize_model_parallel(
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
for i in range(data_parallel_size):
ranks = [
data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks
]
ranks = [data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks]
group = torch.distributed.new_group(ranks)
if rank in ranks:
_MODEL_PARALLEL_GROUP = group
@@ -177,17 +173,13 @@ def initialize_model_parallel(
embedding_ranks = [ranks[0], ranks[-1]]
position_embedding_ranks = [ranks[0]]
if pipeline_model_parallel_split_rank is not None:
if ranks[
pipeline_model_parallel_split_rank] not in embedding_ranks:
embedding_ranks = [
ranks[0], ranks[pipeline_model_parallel_split_rank],
ranks[-1]
]
if ranks[
pipeline_model_parallel_split_rank] not in position_embedding_ranks:
position_embedding_ranks = [
ranks[0], ranks[pipeline_model_parallel_split_rank]
]
if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
embedding_ranks = [ranks[0],
ranks[pipeline_model_parallel_split_rank],
ranks[-1]]
if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
position_embedding_ranks = [ranks[0],
ranks[pipeline_model_parallel_split_rank]]
else:
embedding_ranks = ranks
position_embedding_ranks = ranks
@@ -204,7 +196,6 @@ def initialize_model_parallel(
if rank in ranks:
_POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks


def initialize_all_reduce_launcher(
max_num_tokens: int,
hidden_size: int,
@@ -219,7 +210,6 @@ def initialize_all_reduce_launcher(
disable_graph=disable_graph,
)


def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_MODEL_PARALLEL_GROUP is None or \
@@ -288,17 +278,15 @@ def get_tensor_model_parallel_world_size():
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(
group=get_tensor_model_parallel_group())
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(
group=get_pipeline_model_parallel_group())
return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())


def set_tensor_model_parallel_rank(rank):
@@ -332,8 +320,8 @@ def get_pipeline_model_parallel_rank():
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(
group=get_pipeline_model_parallel_group())
return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())



def is_pipeline_first_stage(ignore_virtual=False):
@@ -470,7 +458,6 @@ def get_pipeline_model_parallel_last_rank():
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PIPELINE_GLOBAL_RANKS[last_rank_local]


def get_pipeline_model_parallel_next_rank():
"""Return the global rank that follows the caller in the pipeline"""
assert _PIPELINE_GLOBAL_RANKS is not None, \
@@ -498,12 +485,10 @@ def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())


def get_all_reduce_launcher() -> 'GraphAllReduce':
assert _ALL_REDUCE_LAUNCHER is not None, 'all reduce launcher is not initialized'
return _ALL_REDUCE_LAUNCHER


def destroy_model_parallel():
"""Set the groups to none."""
global _MODEL_PARALLEL_GROUP
@@ -564,15 +549,14 @@ def __init__(

def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
# Warm up.
torch.distributed.all_reduce(self.buffer[:num_tokens],
group=self.group)
torch.distributed.all_reduce(self.buffer[:num_tokens], group=self.group)
torch.cuda.synchronize()

# Build graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
torch.distributed.all_reduce(self.buffer[:num_tokens],
group=self.group)
torch.distributed.all_reduce(
self.buffer[:num_tokens], group=self.group)
torch.cuda.synchronize()
return graph
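Note: a condensed sketch of the graph-capture pattern in the hunk above; it assumes an initialized NCCL process group and a CUDA device, so it is not runnable standalone:

import torch
import torch.distributed as dist

def build_allreduce_graph(buffer: torch.Tensor, num_tokens: int,
                          group=None) -> torch.cuda.CUDAGraph:
    # Warm up outside the graph so NCCL finishes lazy initialization.
    dist.all_reduce(buffer[:num_tokens], group=group)
    torch.cuda.synchronize()
    # Capture a single all-reduce on the fixed buffer; replay it later.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        dist.all_reduce(buffer[:num_tokens], group=group)
    torch.cuda.synchronize()
    return graph

# Usage: copy activations into buffer[:num_tokens], then graph.replay().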

vllm/model_executor/parallel_utils/tensor_parallel/__init__.py
@@ -22,7 +22,8 @@
)

from .utils import (
split_tensor_along_last_dim, )
split_tensor_along_last_dim,
)

__all__ = [
#layers.py
@@ -37,7 +38,7 @@
"copy_to_tensor_model_parallel_region",
"gather_from_tensor_model_parallel_region",
"gather_from_sequence_parallel_region",
# "reduce_from_tensor_model_parallel_region",
# "reduce_from_tensor_model_parallel_region",
"scatter_to_tensor_model_parallel_region",
"scatter_to_sequence_parallel_region",
# random.py
223 changes: 81 additions & 142 deletions vllm/model_executor/parallel_utils/tensor_parallel/layers.py
@@ -5,6 +5,7 @@
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch


import torch
import torch.nn.functional as F
import torch.nn.init as init
@@ -28,17 +29,14 @@
VocabUtility,
)

_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {
'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1
}

_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1}

def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, 'tensor_model_parallel') and
param.tensor_model_parallel) or (get_tensor_model_parallel_rank()
== 0)
param.tensor_model_parallel) or (
get_tensor_model_parallel_rank() == 0)


def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
@@ -52,30 +50,24 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):


def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):

def maybe_set(attribute, value):
if not hasattr(tensor, attribute):
setattr(tensor, attribute, value)

for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])


def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):

def maybe_copy(attribute):
if hasattr(source_tensor, attribute):
setattr(destination_tensor, attribute,
getattr(source_tensor, attribute))

for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_copy(attribute)


def _initialize_affine_weight_gpu(weight,
init_method,
partition_dim,
stride=1):
def _initialize_affine_weight_gpu(weight, init_method,
partition_dim, stride=1):
"""Initialize affine weight for model parallel on GPU."""

set_tensor_model_parallel_attributes(tensor=weight,
@@ -87,16 +79,11 @@ def _initialize_affine_weight_gpu(weight,
init_method(weight)


def _initialize_affine_weight_cpu(weight,
output_size,
input_size,
per_partition_size,
partition_dim,
init_method,
stride=1,
def _initialize_affine_weight_cpu(weight, output_size, input_size,
per_partition_size, partition_dim,
init_method, stride=1,
return_master_weight=False,
*,
params_dtype=None):
*, params_dtype=None):
"""Initialize affine weight for model parallel.
Build the master weight on all processes and scatter
@@ -111,17 +98,15 @@ def _initialize_affine_weight_cpu(weight,
params_dtype = torch.get_default_dtype()

# Initialize master weight
master_weight = torch.empty(output_size,
input_size,
master_weight = torch.empty(output_size, input_size,
dtype=torch.float,
requires_grad=False)
init_method(master_weight)
master_weight = master_weight.to(dtype=params_dtype)

# Split and copy
per_partition_per_stride_size = divide(per_partition_size, stride)
weight_list = torch.split(master_weight,
per_partition_per_stride_size,
weight_list = torch.split(master_weight, per_partition_per_stride_size,
dim=partition_dim)
rank = get_tensor_model_parallel_rank()
world_size = get_tensor_model_parallel_world_size()
@@ -150,14 +135,11 @@ class VocabParallelEmbedding(torch.nn.Module):
perform_initialization
"""

def __init__(self,
num_embeddings: int,
embedding_dim: int,
*,
def __init__(self, num_embeddings: int, embedding_dim: int, *,
init_method=init.xavier_normal_,
params_dtype: torch.dtype = None,
use_cpu_initialization: bool = False,
perform_initialization: bool = True):
params_dtype: torch.dtype=None,
use_cpu_initialization: bool=False,
perform_initialization: bool=True):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
self.num_embeddings = num_embeddings
@@ -172,8 +154,7 @@ def __init__(self,
self.scale_grad_by_freq = False
self.sparse = False
self._weight = None
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size(
)
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocaburaly dimension.
self.vocab_start_index, self.vocab_end_index = \
VocabUtility.vocab_range_from_global_vocab_size(
@@ -184,30 +165,21 @@ def __init__(self,

# Allocate weights and initialize.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.num_embeddings_per_partition,
self.embedding_dim,
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_cpu(
self.weight,
self.num_embeddings,
self.embedding_dim,
self.num_embeddings_per_partition,
0,
init_method,
self.weight, self.num_embeddings, self.embedding_dim,
self.num_embeddings_per_partition, 0, init_method,
params_dtype=params_dtype)
else:
self.weight = Parameter(
torch.empty(self.num_embeddings_per_partition,
self.embedding_dim,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=0,
stride=1)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=1)

def forward(self, input_):
if self.tensor_model_parallel_size > 1:
@@ -260,21 +232,15 @@ class ColumnParallelLinear(torch.nn.Module):
use_cpu_initialization:
"""

def __init__(
self,
input_size,
output_size,
*,
bias=True,
gather_output=True,
init_method=init.xavier_normal_,
stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
def __init__(self, input_size, output_size, *,
bias=True, gather_output=True,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
super(ColumnParallelLinear, self).__init__()

# Keep input parameters
@@ -294,49 +260,39 @@ def __init__(
# we allocate the transpose.
# Initialize weight.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.output_size_per_partition,
self.input_size,
dtype=params_dtype))
self.weight = Parameter(torch.empty(self.output_size_per_partition,
self.input_size,
dtype=params_dtype))
if perform_initialization:
self.master_weight = _initialize_affine_weight_cpu(
self.weight,
self.output_size,
self.input_size,
self.output_size_per_partition,
0,
init_method,
stride=stride,
return_master_weight=keep_master_weight_for_test)
self.weight, self.output_size, self.input_size,
self.output_size_per_partition, 0, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(
torch.empty(self.output_size_per_partition,
self.input_size,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.output_size_per_partition, self.input_size,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=0,
stride=stride)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=stride)

if bias:
if use_cpu_initialization:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size_per_partition, dtype=params_dtype))
else:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)


def forward(self, input_):
"""Forward of ColumnParallelLinear
@@ -396,21 +352,15 @@ class RowParallelLinear(torch.nn.Module):
perform_initialization:
"""

def __init__(
self,
input_size,
output_size,
*,
bias=True,
input_is_parallel=False,
init_method=init.xavier_normal_,
stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
def __init__(self, input_size, output_size, *,
bias=True, input_is_parallel=False,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
super(RowParallelLinear, self).__init__()

# Keep input parameters
@@ -430,41 +380,30 @@ def __init__(
# we allocate the transpose.
# Initialize weight.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.output_size,
self.input_size_per_partition,
dtype=params_dtype))
self.weight = Parameter(torch.empty(self.output_size,
self.input_size_per_partition,
dtype=params_dtype))
if perform_initialization:
self.master_weight = _initialize_affine_weight_cpu(
self.weight,
self.output_size,
self.input_size,
self.input_size_per_partition,
1,
init_method,
stride=stride,
return_master_weight=keep_master_weight_for_test,
self.weight, self.output_size, self.input_size,
self.input_size_per_partition, 1, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test,
params_dtype=params_dtype)
else:
self.weight = Parameter(
torch.empty(self.output_size,
self.input_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.output_size, self.input_size_per_partition,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=1,
stride=stride)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=1, stride=stride)
if bias:
if use_cpu_initialization:
self.bias = Parameter(
torch.empty(self.output_size, dtype=params_dtype))
self.bias = Parameter(torch.empty(self.output_size,
dtype=params_dtype))
else:
self.bias = Parameter(
torch.empty(self.output_size,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size, device=torch.cuda.current_device(),
dtype=params_dtype))

# Always initialize bias to zero.
with torch.no_grad():
33 changes: 12 additions & 21 deletions vllm/model_executor/parallel_utils/tensor_parallel/mappings.py
@@ -16,12 +16,11 @@ def _reduce(input_):
"""All-reduce the input tensor across model parallel group."""

# Bypass the function if we are using only 1 GPU.
if get_tensor_model_parallel_world_size() == 1:
if get_tensor_model_parallel_world_size()==1:
return input_

# All-reduce.
torch.distributed.all_reduce(input_,
group=get_tensor_model_parallel_group())
torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())

return input_

@@ -62,7 +61,7 @@ def _split_along_first_dim(input_):
rank = get_tensor_model_parallel_rank()
dim_offset = rank * local_dim_size

output = input_[dim_offset:dim_offset + local_dim_size].contiguous()
output = input_[dim_offset:dim_offset+local_dim_size].contiguous()

return output

@@ -81,9 +80,7 @@ def _gather_along_last_dim(input_):

tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
tensor_list[rank] = input_
torch.distributed.all_gather(tensor_list,
input_,
group=get_tensor_model_parallel_group())
torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())

# Note: torch.cat already creates a contiguous tensor.
output = torch.cat(tensor_list, dim=last_dim).contiguous()
@@ -102,16 +99,13 @@ def _gather_along_first_dim(input_):
dim_size = list(input_.size())
dim_size[0] = dim_size[0] * world_size

output = torch.empty(dim_size,
dtype=input_.dtype,
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._all_gather_base(output,
input_.contiguous(),
torch.distributed._all_gather_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())

return output


def _reduce_scatter_along_first_dim(input_):
"""Reduce-scatter the input tensor across model parallel group."""
world_size = get_tensor_model_parallel_world_size()
@@ -125,11 +119,10 @@ def _reduce_scatter_along_first_dim(input_):

dim_size[0] = dim_size[0] // world_size

output = torch.empty(dim_size,
dtype=input_.dtype,
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._reduce_scatter_base(
output, input_.contiguous(), group=get_tensor_model_parallel_group())
torch.distributed._reduce_scatter_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())
return output


@@ -259,7 +252,6 @@ def backward(ctx, grad_output):
# Helper functions.
# -----------------


def copy_to_tensor_model_parallel_region(input_):
return _CopyToModelParallelRegion.apply(input_)

@@ -280,11 +272,10 @@ def scatter_to_sequence_parallel_region(input_):
return _ScatterToSequenceParallelRegion.apply(input_)


def gather_from_sequence_parallel_region(input_,
tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(
input_, tensor_parallel_output_grad)
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)


def reduce_scatter_to_sequence_parallel_region(input_):
return _ReduceScatterToSequenceParallelRegion.apply(input_)

4 changes: 3 additions & 1 deletion vllm/model_executor/parallel_utils/tensor_parallel/random.py
@@ -12,7 +12,8 @@
from torch.cuda import _lazy_call, device as device_ctx_manager

from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, )
get_tensor_model_parallel_rank,
)

# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
@@ -51,6 +52,7 @@ def cb():
_lazy_call(cb)



class CudaRNGStatesTracker:
"""Tracker for the cuda RNG states.
14 changes: 7 additions & 7 deletions vllm/model_executor/parallel_utils/tensor_parallel/utils.py
@@ -5,11 +5,11 @@
import torch
from typing import List, Sequence


def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator)
numerator, denominator
)


def divide(numerator, denominator):
@@ -56,15 +56,15 @@ class VocabUtility:

@staticmethod
def vocab_range_from_per_partition_vocab_size(
per_partition_vocab_size: int, rank,
world_size: int) -> Sequence[int]:
per_partition_vocab_size: int, rank, world_size: int
) -> Sequence[int]:
index_f = rank * per_partition_vocab_size
index_l = index_f + per_partition_vocab_size
return index_f, index_l

@staticmethod
def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
world_size: int) -> Sequence[int]:
def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
per_partition_vocab_size = divide(global_vocab_size, world_size)
return VocabUtility.vocab_range_from_per_partition_vocab_size(
per_partition_vocab_size, rank, world_size)
per_partition_vocab_size, rank, world_size
)
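Note: a small worked example of the vocab partitioning helpers above (divide() additionally asserts divisibility; here it is folded into a comment):

def vocab_range(global_vocab_size: int, rank: int, world_size: int):
    # global_vocab_size must be divisible by world_size.
    per_partition = global_vocab_size // world_size
    start = rank * per_partition
    return start, start + per_partition

# A 50304-token vocab split across 4 tensor-parallel ranks:
print([vocab_range(50304, r, 4) for r in range(4)])
# [(0, 12576), (12576, 25152), (25152, 37728), (37728, 50304)]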
8 changes: 4 additions & 4 deletions vllm/model_executor/weight_utils.py
@@ -44,9 +44,9 @@ def hf_model_weights_iterator(
if use_np_cache:
# Convert the model weights from torch tensors to numpy arrays for
# faster loading.
np_folder = os.path.join(hf_folder, 'np')
np_folder = os.path.join(hf_folder, "np")
os.makedirs(np_folder, exist_ok=True)
weight_names_file = os.path.join(np_folder, 'weight_names.json')
weight_names_file = os.path.join(np_folder, "weight_names.json")
with lock:
if not os.path.exists(weight_names_file):
weight_names = []
@@ -57,10 +57,10 @@ def hf_model_weights_iterator(
with open(param_path, "wb") as f:
np.save(f, param.cpu().detach().numpy())
weight_names.append(name)
with open(weight_names_file, 'w') as f:
with open(weight_names_file, "w") as f:
json.dump(weight_names, f)

with open(weight_names_file, 'r') as f:
with open(weight_names_file, "r") as f:
weight_names = json.load(f)

for name in weight_names:
4 changes: 2 additions & 2 deletions vllm/sampling_params.py
@@ -1,5 +1,5 @@
"""Sampling parameters for text generation."""
from typing import List, Optional, Union
from typing import Optional, Sequence, Union

_SAMPLING_EPS = 1e-5

@@ -52,7 +52,7 @@ def __init__(
top_p: float = 1.0,
top_k: int = -1,
use_beam_search: bool = False,
stop: Union[str, List[str]] = [],
stop: Union[str, Sequence[str]] = (),
ignore_eos: bool = False,
max_tokens: int = 16,
logprobs: Optional[int] = None,
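Note: the default for stop changes from a mutable [] to an immutable (), avoiding Python's shared-mutable-default pitfall; a minimal illustration (bad/good are hypothetical functions, not vLLM code):

def bad(stop=[]):
    stop.append("</s>")
    return stop

print(bad())  # ['</s>']
print(bad())  # ['</s>', '</s>'] - the same list object is reused across calls

def good(stop=()):
    return list(stop) + ["</s>"]

print(good())  # ['</s>']
print(good())  # ['</s>']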
19 changes: 9 additions & 10 deletions vllm/sequence.py
@@ -145,17 +145,16 @@ def get_cumulative_logprob(self) -> float:
def is_finished(self) -> bool:
return SequenceStatus.is_finished(self.status)

def fork(self, child_seq: 'Sequence') -> None:
def fork(self, child_seq: "Sequence") -> None:
child_seq.logical_token_blocks = copy.deepcopy(
self.logical_token_blocks)
child_seq.output_logprobs = copy.deepcopy(self.output_logprobs)
child_seq.data = copy.deepcopy(self.data)
return None

def __repr__(self) -> str:
return (f'Sequence(seq_id={self.seq_id}, '
f'status={self.status.name}, '
f'num_blocks={len(self.logical_token_blocks)})')
return (f"Sequence(seq_id={self.seq_id}, "
f"status={self.status.name}, "
f"num_blocks={len(self.logical_token_blocks)})")


class SequenceGroup:
@@ -188,7 +187,7 @@ def find(self, seq_id: int) -> Sequence:
for seq in self.seqs:
if seq.seq_id == seq_id:
return seq
raise ValueError(f'Sequence {seq_id} not found.')
raise ValueError(f"Sequence {seq_id} not found.")

def is_finished(self) -> bool:
return all(seq.is_finished() for seq in self.seqs)
@@ -233,10 +232,10 @@ def __init__(
self.logprobs = logprobs

def __repr__(self) -> str:
return (f'SequenceOutputs(seq_id={self.seq_id}, '
f'parent_seq_id={self.parent_seq_id}, '
f'output_token={self.output_token}), '
f'logprobs={self.logprobs}')
return (f"SequenceOutputs(seq_id={self.seq_id}, "
f"parent_seq_id={self.parent_seq_id}, "
f"output_token={self.output_token}), "
f"logprobs={self.logprobs}")

def __eq__(self, other: object) -> bool:
if not isinstance(other, SequenceOutputs):
7 changes: 4 additions & 3 deletions vllm/transformers_utils/tokenizer.py
@@ -13,8 +13,8 @@

def get_tokenizer(
tokenizer_name: str,
tokenizer_mode: str = "auto",
*args,
tokenizer_mode: str = "auto",
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
"""Gets a tokenizer for the given model name via Huggingface."""
@@ -42,7 +42,7 @@ def get_tokenizer(
raise RuntimeError(err_msg) from e

if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warn(
        logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
return tokenizer
@@ -73,7 +73,8 @@ def detokenize_incrementally(
output_text = tokenizer.convert_tokens_to_string(output_tokens)
return new_token, output_text

# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# NOTE(woosuk): The following code is slow because it runs a for loop over
# the output_tokens. In Python, running a for loop over a list can be slow
# even when the loop body is very simple.
4 changes: 2 additions & 2 deletions vllm/utils.py
@@ -17,9 +17,9 @@ def __init__(self, start: int = 0) -> None:
self.counter = start

def __next__(self) -> int:
id = self.counter
i = self.counter
self.counter += 1
return id
return i

def reset(self) -> None:
self.counter = 0
4 changes: 2 additions & 2 deletions vllm/worker/cache_engine.py
@@ -93,8 +93,8 @@ def allocate_cpu_cache(self) -> List[KVCache]:
if not pin_memory:
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
logger.warn("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
logger.warning("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_cpu_blocks, *key_block_shape),