[Quality] Add code formatter and linter #326

Merged
17 commits merged on Jul 3, 2023
fix all simple errors
zhuohan123 committed Jul 1, 2023
commit 6deb2fa962c68d2299c90a7f3d58ec56bc058849
2 changes: 1 addition & 1 deletion .pylintrc
@@ -8,7 +8,7 @@
[MASTER]

# Files or directories to be skipped. They should be base names, not paths.
ignore=docs
ignore=docs,parallel_utils

# Files or directories matching the regex patterns are skipped. The regex
# matches against base names, not paths.
1 change: 1 addition & 0 deletions format.sh
@@ -44,6 +44,7 @@ YAPF_FLAGS=(

YAPF_EXCLUDES=(
'--exclude' 'build/**'
'--exclude' 'vllm/model_executor/parallel_utils/**'
)

# Format specified files
3 changes: 3 additions & 0 deletions vllm/__init__.py
@@ -1,3 +1,6 @@
"""vLLM is a high-throughput and memory-efficient inference and serving engine
for LLMs"""

from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.engine.llm_engine import LLMEngine
12 changes: 6 additions & 6 deletions vllm/config.py
@@ -8,7 +8,7 @@

logger = init_logger(__name__)

_GiB = 1 << 30
_GB = 1 << 30


class ModelConfig:
@@ -115,7 +115,7 @@ def __init__(
) -> None:
self.block_size = block_size
self.gpu_memory_utilization = gpu_memory_utilization
self.swap_space_bytes = swap_space * _GiB
self.swap_space_bytes = swap_space * _GB
self._verify_args()

# Will be set after profiling.
@@ -138,13 +138,13 @@ def verify_with_parallel_config(
num_gpus_per_node = parallel_config.tensor_parallel_size
cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

msg = (f"{cpu_memory_usage / _GiB:.2f} GiB out of "
f"the {total_cpu_memory / _GiB:.2f} GiB total CPU memory is "
msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
"allocated for the swap space.")
if cpu_memory_usage > 0.7 * total_cpu_memory:
raise ValueError("Too large swap space. " + msg)
elif cpu_memory_usage > 0.4 * total_cpu_memory:
logger.warn("Possibly too large swap space. " + msg)
logger.warning("Possibly too large swap space. " + msg)


class ParallelConfig:
@@ -239,7 +239,7 @@ def _get_and_verify_dtype(
pass
else:
# Casting between float16 and bfloat16 is allowed with a warning.
logger.warn(f"Casting {config_dtype} to {torch_dtype}.")
logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

# Check if the GPU supports the dtype.
if torch_dtype == torch.bfloat16:
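Note: for readers of this hunk, a minimal standalone sketch of the swap-space check touched above (not part of the diff; swap_space_gib and total_cpu_memory are assumed inputs):

_GB = 1 << 30

def check_swap_space(swap_space_gib: int, tensor_parallel_size: int,
                     total_cpu_memory: int) -> None:
    # Each tensor-parallel worker on a node reserves its own swap space.
    cpu_memory_usage = swap_space_gib * _GB * tensor_parallel_size
    msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
           f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
           "allocated for the swap space.")
    if cpu_memory_usage > 0.7 * total_cpu_memory:
        raise ValueError("Too large swap space. " + msg)
    if cpu_memory_usage > 0.4 * total_cpu_memory:
        print("Possibly too large swap space. " + msg)

check_swap_space(swap_space_gib=4, tensor_parallel_size=4,
                 total_cpu_memory=32 * _GB)  # warns: 16 GiB > 0.4 * 32 GiB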
6 changes: 4 additions & 2 deletions vllm/core/block_manager.py
@@ -85,10 +85,12 @@ def can_allocate(self, seq_group: SequenceGroup) -> bool:
num_required_blocks = len(seq.logical_token_blocks)
num_free_gpu_blocks = self.gpu_allocator.get_num_free_blocks()
# Use watermark to avoid frequent cache eviction.
return num_free_gpu_blocks - num_required_blocks >= self.watermark_blocks
return (num_free_gpu_blocks - num_required_blocks >=
self.watermark_blocks)

def allocate(self, seq_group: SequenceGroup) -> None:
# NOTE: Here we assume that all sequences in the group have the same prompt.
# NOTE: Here we assume that all sequences in the group have the same
# prompt.
seq = seq_group.get_seqs()[0]

# Allocate new physical token blocks that will store the prompt tokens.
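Note: a small sketch of the watermark test reformatted above, with assumed numbers, to show why the free-block reserve matters:

def can_allocate(num_required_blocks: int, num_free_gpu_blocks: int,
                 watermark_blocks: int) -> bool:
    # Keep a reserve of free blocks to avoid thrashing the GPU cache.
    return num_free_gpu_blocks - num_required_blocks >= watermark_blocks

# 100 free blocks, a 90-block prompt, and a 16-block watermark: refused,
# because 100 - 90 = 10 < 16.
print(can_allocate(90, 100, 16))  # False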
18 changes: 10 additions & 8 deletions vllm/core/scheduler.py
@@ -60,7 +60,7 @@ def __init__(
self.log_stats = log_stats

# Instantiate the scheduling policy.
self.policy = PolicyFactory.get_policy(policy_name='fcfs')
self.policy = PolicyFactory.get_policy(policy_name="fcfs")
# Create the block space manager.
self.block_manager = BlockSpaceManager(
block_size=self.cache_config.block_size,
@@ -158,7 +158,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
num_curr_seqs = sum(
seq_group.num_seqs(status=SequenceStatus.RUNNING)
for seq_group in self.running)
if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break

seq_group = self.swapped.pop(0)
@@ -202,7 +203,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:
num_curr_seqs = sum(
seq_group.num_seqs(status=SequenceStatus.RUNNING)
for seq_group in self.running)
if num_curr_seqs + num_new_seqs > self.scheduler_config.max_num_seqs:
if (num_curr_seqs + num_new_seqs >
self.scheduler_config.max_num_seqs):
break

seq_group = self.waiting.pop(0)
@@ -243,8 +245,8 @@ def _schedule(self) -> Tuple[SchedulerOutputs, List[str]]:

total_num_cpu_blocks = self.cache_config.num_cpu_blocks
if total_num_cpu_blocks > 0:
num_free_cpu_blocks = self.block_manager.get_num_free_cpu_blocks(
)
num_free_cpu_blocks = (
self.block_manager.get_num_free_cpu_blocks())
num_used_cpu_blocks = total_num_cpu_blocks - num_free_cpu_blocks
cpu_cache_usage = num_used_cpu_blocks / total_num_cpu_blocks
else:
@@ -296,8 +298,8 @@ def update(
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
output = seq_outputs[seq.seq_id]
if seq.seq_id != output.parent_seq_id:
# The sequence is a fork of the parent sequence (beam search).
# Free the current sequence.
# The sequence is a fork of the parent sequence (beam
# search). Free the current sequence.
self.block_manager.free(seq)
# Fork the parent sequence.
parent_seq = seq_group.find(output.parent_seq_id)
@@ -370,7 +372,7 @@ def _preempt(
elif preemption_mode == PreemptionMode.SWAP:
self._preempt_by_swap(seq_group, blocks_to_swap_out)
else:
assert False, 'Invalid preemption mode.'
assert False, "Invalid preemption mode."

def _preempt_by_recompute(
self,
6 changes: 3 additions & 3 deletions vllm/engine/arg_utils.py
@@ -12,11 +12,11 @@ class EngineArgs:
"""Arguments for vLLM engine."""
model: str
tokenizer: Optional[str] = None
tokenizer_mode: str = "auto"
tokenizer_mode: str = 'auto'
download_dir: Optional[str] = None
use_np_weights: bool = False
use_dummy_weights: bool = False
dtype: str = "auto"
dtype: str = 'auto'
seed: int = 0
worker_use_ray: bool = False
pipeline_parallel_size: int = 1
@@ -129,7 +129,7 @@ def add_cli_args(
return parser

@classmethod
def from_cli_args(cls, args: argparse.Namespace) -> "EngineArgs":
def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
# Get the list of attributes of this dataclass.
attrs = [attr.name for attr in dataclasses.fields(cls)]
# Set the attributes from the parsed arguments.
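Note: the quote-style change above sits inside the dataclass-from-argparse pattern; a toy, self-contained illustration of that pattern (ToyArgs is hypothetical, not vLLM code):

import argparse
import dataclasses

@dataclasses.dataclass
class ToyArgs:
    model: str
    dtype: str = "auto"

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "ToyArgs":
        # Pick out exactly the dataclass fields from the parsed namespace.
        attrs = [f.name for f in dataclasses.fields(cls)]
        return cls(**{a: getattr(args, a) for a in attrs})

ns = argparse.Namespace(model="facebook/opt-125m", dtype="auto", verbose=True)
print(ToyArgs.from_cli_args(ns))  # ToyArgs(model='facebook/opt-125m', dtype='auto')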
4 changes: 2 additions & 2 deletions vllm/engine/async_llm_engine.py
@@ -39,8 +39,8 @@ class AsyncLLMEngine:
def __init__(self,
worker_use_ray: bool,
engine_use_ray: bool,
log_requests: bool = True,
*args,
log_requests: bool = True,
**kwargs) -> None:
self.worker_use_ray = worker_use_ray
self.engine_use_ray = engine_use_ray
@@ -219,9 +219,9 @@ def from_engine_args(cls,
# Create the async LLM engine.
engine = cls(engine_args.worker_use_ray,
engine_args.engine_use_ray,
not engine_args.disable_log_requests,
*engine_configs,
distributed_init_method,
devices,
log_requests=not engine_args.disable_log_requests,
log_stats=not engine_args.disable_log_stats)
return engine
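Note: moving log_requests after *args makes it effectively keyword-only, so positional engine configs can no longer be swallowed by it. A toy illustration (before/after are hypothetical functions, not vLLM code):

def before(a, b, log_requests=True, *args):
    return log_requests, args

def after(a, b, *args, log_requests=True):
    return log_requests, args

print(before(1, 2, "config0", "config1"))  # ('config0', ('config1',)) - a config leaks into the flag
print(after(1, 2, "config0", "config1"))   # (True, ('config0', 'config1'))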
10 changes: 5 additions & 5 deletions vllm/engine/llm_engine.py
@@ -77,8 +77,8 @@ def __init__(
self.log_stats = log_stats
self._verify_args()

self.tokenizer = get_tokenizer(model_config.tokenizer,
model_config.tokenizer_mode)
self.tokenizer = get_tokenizer(
model_config.tokenizer, tokenizer_mode=model_config.tokenizer_mode)
self.seq_counter = Counter()

# Create the parallel GPU workers.
@@ -128,8 +128,8 @@ def _init_cache(self) -> None:
num_gpu_blocks = min(b[0] for b in num_blocks)
num_cpu_blocks = min(b[1] for b in num_blocks)
# FIXME(woosuk): Change to debug log.
logger.info(f'# GPU blocks: {num_gpu_blocks}, '
f'# CPU blocks: {num_cpu_blocks}')
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
f"# CPU blocks: {num_cpu_blocks}")

if num_gpu_blocks <= 0:
raise ValueError("No available memory for the cache blocks. "
@@ -304,8 +304,8 @@ def _stop_sequences(self, seq_groups: List[SequenceGroup]) -> None:
def _run_workers(
self,
method: str,
get_all_outputs: bool = False,
*args,
get_all_outputs: bool = False,
**kwargs,
) -> Any:
"""Runs the given method on all workers."""
10 changes: 5 additions & 5 deletions vllm/engine/ray_utils.py
@@ -54,15 +54,15 @@ def initialize_cluster(
valid_node_resources = []
num_devices_per_node = None
for node in ray.nodes():
if (not node['Alive']) or node['Resources']['GPU'] <= 0:
if (not node["Alive"]) or node["Resources"]["GPU"] <= 0:
continue
if num_devices_per_node is None:
num_devices_per_node = node['Resources']['GPU']
num_devices_per_node = node["Resources"]["GPU"]
else:
assert num_devices_per_node == node['Resources']['GPU'], (
assert num_devices_per_node == node["Resources"]["GPU"], (
"The number of GPUs per node is not uniform.")
for key in node['Resources']:
if key.startswith('node:'):
for key in node["Resources"]:
if key.startswith("node:"):
valid_node_resources.append(key)

# Verify the parallel config.
2 changes: 1 addition & 1 deletion vllm/entrypoints/llm.py
@@ -63,7 +63,7 @@ def __init__(
self.request_counter = Counter()

def get_tokenizer(
self, ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
return self.llm_engine.tokenizer

def set_tokenizer(
7 changes: 4 additions & 3 deletions vllm/entrypoints/openai/api_server.py
@@ -1,4 +1,5 @@
# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/serve/openai_api_server.py

import argparse
from http import HTTPStatus
@@ -40,7 +41,7 @@ def create_error_response(status_code: HTTPStatus,


@app.exception_handler(RequestValidationError)
async def validation_exception_handler(request, exc):
async def validation_exception_handler(request, exc): # pylint: disable=unused-argument
return create_error_response(HTTPStatus.BAD_REQUEST, str(exc))


@@ -329,7 +330,7 @@ async def fake_stream_generator() -> AsyncGenerator[str, None]:

# A separate tokenizer to map token IDs to strings.
tokenizer = get_tokenizer(engine_args.tokenizer,
engine_args.tokenizer_mode)
tokenizer_mode=engine_args.tokenizer_mode)

uvicorn.run(app,
host=args.host,
3 changes: 2 additions & 1 deletion vllm/entrypoints/openai/protocol.py
@@ -1,4 +1,5 @@
# Adapted from https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
# Adapted from
# https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py
import time
from typing import Dict, List, Literal, Optional, Union

5 changes: 3 additions & 2 deletions vllm/logger.py
@@ -1,5 +1,6 @@
# Adapted from https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py

# Adapted from
# https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py
"""Logging configuration for vLLM."""
import logging
import sys

3 changes: 0 additions & 3 deletions vllm/model_executor/layers/activation.py
@@ -30,9 +30,6 @@ class SiluAndMul(nn.Module):
The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2.
"""

def __init__(self):
super().__init__()

def forward(
self,
x: torch.Tensor, # (num_tokens, 2 * d)
3 changes: 2 additions & 1 deletion vllm/model_executor/layers/attention.py
@@ -14,6 +14,7 @@


class PagedAttention(nn.Module):
# pylint: disable=line-too-long
"""GPT-style multi-head PagedAttention.

This class takes flattened 1D query, key, and value tensors as input. The
@@ -183,7 +184,7 @@ def __init__(
# Create the cos and sin cache.
inv_freq = 1.0 / (base**(torch.arange(0, rotary_dim, 2) / rotary_dim))
t = torch.arange(max_position).float()
freqs = torch.einsum('i,j -> ij', t, inv_freq.float())
freqs = torch.einsum("i,j -> ij", t, inv_freq.float())
cos = freqs.cos()
sin = freqs.sin()
cache = torch.cat((cos, sin), dim=-1)
8 changes: 4 additions & 4 deletions vllm/model_executor/layers/sampler.py
@@ -411,16 +411,16 @@ def _sample(

# Get top-k log probabilities for the next tokens.
next_logprobs: Dict[int, Dict[int, float]] = {}
for i, seq_id in enumerate(seq_ids):
for j, seq_id in enumerate(seq_ids):
next_logprobs[seq_id] = _get_topk_logprobs(
logprob[i], sampling_params.logprobs)
logprob[j], sampling_params.logprobs)

# Build the output.
for seq_id, parent_seq_id, next_token_id in zip(
seq_ids, parent_seq_ids, next_token_ids):
i = seq_ids.index(parent_seq_id)
j = seq_ids.index(parent_seq_id)
output_logprobs = next_logprobs[parent_seq_id].copy()
output_logprobs[next_token_id] = logprob[i,
output_logprobs[next_token_id] = logprob[j,
next_token_id].item()
seq_outputs[seq_id] = SequenceOutputs(
seq_id,
18 changes: 11 additions & 7 deletions vllm/model_executor/models/gpt2.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
@@ -117,7 +118,8 @@ class GPT2Block(nn.Module):
def __init__(self, config: GPT2Config):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
inner_dim = (config.n_inner if config.n_inner is not None else 4 *
hidden_size)

self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPT2Attention(config)
@@ -155,9 +157,9 @@ class GPT2Model(nn.Module):
def __init__(self, config: GPT2Config):
super().__init__()
self.config = config
assert config.add_cross_attention == False
assert config.scale_attn_by_inverse_layer_idx == False
assert config.reorder_and_upcast_attn == False
assert not config.add_cross_attention
assert not config.scale_attn_by_inverse_layer_idx
assert not config.reorder_and_upcast_attn
self.embed_dim = config.hidden_size

# Optimization: While the vocab size of GPT-2 is 50257, we extend it
@@ -270,8 +272,10 @@ def load_weights(self,

# For the fused QKV linear layer, manually shard the weights.
if "c_attn" in name:
# GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along the head dimension.
# GPT-2's fused QKV has the shape of
# [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along
# the head dimension.
total_num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // total_num_heads
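Note: a sketch of the head-dimension sharding described in the rewrapped comment above, using assumed toy sizes rather than the real loader:

import torch

total_num_heads, head_size, hidden_size = 12, 64, 768
tp_size, tp_rank = 4, 1
qkv = torch.randn(3 * total_num_heads * head_size, hidden_size)

num_heads = total_num_heads // tp_size
# Group rows as (qkv, head, head_size), keep this rank's heads, flatten back.
shard = (qkv.view(3, total_num_heads, head_size, hidden_size)
            [:, tp_rank * num_heads:(tp_rank + 1) * num_heads]
            .reshape(-1, hidden_size))
print(shard.shape)  # torch.Size([576, 768]) = 3 * 3 local heads * 64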
25 changes: 16 additions & 9 deletions vllm/model_executor/models/gpt_bigcode.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt2/modeling_gpt2.py
# Copyright 2023 The vLLM team.
# Copyright 2023 CTranslate2, and Michael Feil
# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
@@ -119,7 +120,8 @@ class GPTBigCodeBlock(nn.Module):
def __init__(self, config: GPTBigCodeConfig):
super().__init__()
hidden_size = config.hidden_size
inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
inner_dim = (config.n_inner if config.n_inner is not None else 4 *
hidden_size)

self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
self.attn = GPTBigCodeAttention(config)
@@ -157,7 +159,7 @@ class GPTBigCodeModel(nn.Module):
def __init__(self, config: GPTBigCodeConfig):
super().__init__()
self.config = config
assert config.add_cross_attention == False
assert not config.add_cross_attention

self.embed_dim = config.hidden_size

@@ -272,26 +274,31 @@ def _expand_mqa_mha(qkv_array, n_head, head_dim):
qkv_array = qkv_array.numpy()

dims_q = n_head * head_dim
# pylint: disable=unbalanced-tuple-unpacking
q, k, v = np.split(qkv_array, (dims_q, dims_q + head_dim),
axis=0)
# q is fine, but k & v have not replicated shape along the first axis
# as long as MQA is not nativly supported, increase memory and replicated
# (head_dim, hidden_dim) to (n_heads * head_dim, hidden_dim)
# q is fine, but k & v have not replicated shape along the first
# axis as long as MQA is not nativly supported, increase memory
# and replicated (head_dim, hidden_dim) to
# (n_heads * head_dim, hidden_dim)
if k.ndim == 2 and v.ndim == 2:
replication = (n_head, 1) # weights
else:
replication = n_head # biases
# replicate n_head times for q, v
k, v = np.tile(k, replication), np.tile(v, replication)
# concat q, k, v along the first axis (n_heads * head_dim, hidden_dim)
# concat q, k, v along the first axis
# (n_heads * head_dim, hidden_dim)
# to (3 * n_heads * head_dim, hidden_dim)
qkv_array = np.concatenate((q, k, v), axis=0)
return torch.from_numpy(qkv_array)

# For the fused QKV linear layer, manually shard the weights.
if "c_attn" in name:
# GPT-2's fused QKV has the shape of [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along the head dimension.
# GPT-2's fused QKV has the shape of
# [3 * num_heads * head_size, hidden_size].
# When tensor parallelism is used, we shard the weights along
# the head dimension.
total_num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // total_num_heads
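Note: a toy-sized sketch of the MQA-to-MHA expansion described in the rewrapped comment above (all shapes are assumptions for illustration):

import numpy as np

n_head, head_dim, hidden = 4, 2, 8
# MQA checkpoint layout: n_head query heads but a single shared key and value.
qkv = np.random.randn(n_head * head_dim + 2 * head_dim, hidden)

dims_q = n_head * head_dim
q, k, v = np.split(qkv, (dims_q, dims_q + head_dim), axis=0)
# Replicate the shared k and v once per head so the layer looks like MHA.
k, v = np.tile(k, (n_head, 1)), np.tile(v, (n_head, 1))
qkv_mha = np.concatenate((q, k, v), axis=0)
print(qkv_mha.shape)  # (24, 8) == (3 * n_head * head_dim, hidden)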
10 changes: 6 additions & 4 deletions vllm/model_executor/models/gpt_neox.py
@@ -1,5 +1,6 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/gpt_neox/modeling_gpt_neox.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI The HuggingFace Inc. team. All rights reserved.
#
@@ -51,7 +52,8 @@ def __init__(self, config: GPTNeoXConfig):
tensor_model_parallel_world_size = get_tensor_model_parallel_world_size(
)
assert self.total_num_heads % tensor_model_parallel_world_size == 0
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)

self.query_key_value = ColumnParallelLinear(
config.hidden_size,
@@ -252,12 +254,12 @@ def load_weights(self,
num_heads = self.config.num_attention_heads
hidden_size = self.config.hidden_size
head_size = hidden_size // num_heads
if 'query_key_value.weight' in name:
if "query_key_value.weight" in name:
loaded_weight = loaded_weight.view(-1, 3, head_size,
hidden_size)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1, hidden_size)
elif 'query_key_value.bias' in name:
elif "query_key_value.bias" in name:
loaded_weight = loaded_weight.view(-1, 3, head_size)
loaded_weight = loaded_weight.transpose(0, 1)
loaded_weight = loaded_weight.reshape(-1)
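Note: a toy-sized sketch of the view/transpose/reshape above, which regroups GPT-NeoX's per-head interleaved QKV rows into separate Q, K and V blocks (sizes are assumptions):

import torch

num_heads, head_size, hidden_size = 2, 3, 6
w = torch.arange(num_heads * 3 * head_size * hidden_size,
                 dtype=torch.float32).view(-1, hidden_size)

regrouped = (w.view(-1, 3, head_size, hidden_size)  # (head, qkv, head_size, hidden)
              .transpose(0, 1)                      # (qkv, head, head_size, hidden)
              .reshape(-1, hidden_size))            # all Q rows, then K, then V
print(regrouped.shape)  # torch.Size([18, 6])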
15 changes: 8 additions & 7 deletions vllm/model_executor/models/llama.py
@@ -1,9 +1,10 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py
# Copyright 2023 The vLLM team.
# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
#
# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
# This code is based on EleutherAI"s GPT-NeoX library and the GPT-NeoX
# and OPT implementations in this library. It has been modified from its
# original forms to accommodate minor architectural differences compared
# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
@@ -30,7 +31,6 @@
from torch import nn
from transformers import LlamaConfig

from vllm.sequence import SequenceOutputs
from vllm.model_executor.input_metadata import InputMetadata
from vllm.model_executor.layers.activation import SiluAndMul
from vllm.model_executor.layers.layernorm import RMSNorm
@@ -66,9 +66,9 @@ def __init__(
bias=False,
input_is_parallel=True,
perform_initialization=False)
if hidden_act != 'silu':
raise ValueError(f'Unsupported activation: {hidden_act}. '
'Only silu is supported for now.')
if hidden_act != "silu":
raise ValueError(f"Unsupported activation: {hidden_act}. "
"Only silu is supported for now.")
self.act_fn = SiluAndMul()

def forward(self, x):
@@ -91,7 +91,8 @@ def __init__(
)
self.total_num_heads = num_heads
assert self.total_num_heads % tensor_model_parallel_world_size == 0
self.num_heads = self.total_num_heads // tensor_model_parallel_world_size
self.num_heads = (self.total_num_heads //
tensor_model_parallel_world_size)
self.head_dim = hidden_size // self.total_num_heads
self.scaling = self.head_dim**-0.5

16 changes: 10 additions & 6 deletions vllm/model_executor/models/opt.py
@@ -1,7 +1,9 @@
# coding=utf-8
# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/opt/modeling_opt.py
# Copyright 2023 The vLLM team.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights
# reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -43,8 +45,9 @@
class OPTLearnedPositionalEmbedding(nn.Embedding):

def __init__(self, num_embeddings: int, embedding_dim: int):
# OPT is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
# OPT is set up so that if padding_idx is specified then offset the
# embedding ids by 2 and adjust num_embeddings appropriately. Other
# models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
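Note: the rewrapped comment above describes OPT's legacy offset; a toy sketch of what it implies. The forward pass is not shown in this hunk, so the shifted lookup is an assumption:

import torch.nn as nn

offset = 2
emb = nn.Embedding(2048 + offset, 16)  # num_embeddings + offset, embedding_dim
# Position ids are expected to be shifted by the same offset before lookup,
# leaving rows 0 and 1 unused (a leftover of OPT's padding_idx handling).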

@@ -199,8 +202,9 @@ def __init__(self, config: OPTConfig):
else:
self.project_in = None

# Note that the only purpose of `config._remove_final_layer_norm` is to keep backward compatibility
# with checkpoints that have been fine-tuned before transformers v4.20.1
# Note that the only purpose of `config._remove_final_layer_norm` is to
# keep backward compatibility with checkpoints that have been fine-tuned
# before transformers v4.20.1
# see https://github.com/facebookresearch/metaseq/pull/164
if config.do_layer_norm_before and not config._remove_final_layer_norm:
self.final_layer_norm = nn.LayerNorm(
58 changes: 21 additions & 37 deletions vllm/model_executor/parallel_utils/parallel_state.py
@@ -1,6 +1,7 @@
# Copyright 2023 The vLLM team.
# Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.

"""Model and data parallel groups."""

import torch
@@ -45,7 +46,6 @@

_ALL_REDUCE_LAUNCHER: Optional['GraphAllReduce'] = None


def initialize_model_parallel(
tensor_model_parallel_size: int = 1,
pipeline_model_parallel_size: int = 1,
@@ -83,8 +83,7 @@ def initialize_model_parallel(
assert torch.distributed.is_initialized()
world_size: int = torch.distributed.get_world_size()

if world_size % (tensor_model_parallel_size *
pipeline_model_parallel_size) != 0:
if world_size % (tensor_model_parallel_size * pipeline_model_parallel_size) != 0:
raise RuntimeError(
f"world_size ({world_size}) is not divisible by tensor_model_parallel_size "
f"({tensor_model_parallel_size}) x pipeline_model_parallel_size ({pipeline_model_parallel_size})"
@@ -93,15 +92,14 @@ def initialize_model_parallel(
data_parallel_size: int = world_size // (tensor_model_parallel_size *
pipeline_model_parallel_size)

num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
num_tensor_model_parallel_groups: int = world_size // tensor_model_parallel_size
num_pipeline_model_parallel_groups: int = world_size // pipeline_model_parallel_size
num_data_parallel_groups: int = world_size // data_parallel_size

if virtual_pipeline_model_parallel_size is not None:
if not pipeline_model_parallel_size > 2:
raise RuntimeError(
"pipeline-model-parallel size should be greater than 2 with "
"interleaved schedule")
raise RuntimeError("pipeline-model-parallel size should be greater than 2 with "
"interleaved schedule")
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK
global _VIRTUAL_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
_VIRTUAL_PIPELINE_MODEL_PARALLEL_RANK = 0
@@ -133,10 +131,8 @@ def initialize_model_parallel(
global _MODEL_PARALLEL_GROUP
assert _MODEL_PARALLEL_GROUP is None, 'model parallel group is already initialized'
for i in range(data_parallel_size):
ranks = [
data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks
]
ranks = [data_parallel_group_ranks[i]
for data_parallel_group_ranks in all_data_parallel_group_ranks]
group = torch.distributed.new_group(ranks)
if rank in ranks:
_MODEL_PARALLEL_GROUP = group
@@ -177,17 +173,13 @@ def initialize_model_parallel(
embedding_ranks = [ranks[0], ranks[-1]]
position_embedding_ranks = [ranks[0]]
if pipeline_model_parallel_split_rank is not None:
if ranks[
pipeline_model_parallel_split_rank] not in embedding_ranks:
embedding_ranks = [
ranks[0], ranks[pipeline_model_parallel_split_rank],
ranks[-1]
]
if ranks[
pipeline_model_parallel_split_rank] not in position_embedding_ranks:
position_embedding_ranks = [
ranks[0], ranks[pipeline_model_parallel_split_rank]
]
if ranks[pipeline_model_parallel_split_rank] not in embedding_ranks:
embedding_ranks = [ranks[0],
ranks[pipeline_model_parallel_split_rank],
ranks[-1]]
if ranks[pipeline_model_parallel_split_rank] not in position_embedding_ranks:
position_embedding_ranks = [ranks[0],
ranks[pipeline_model_parallel_split_rank]]
else:
embedding_ranks = ranks
position_embedding_ranks = ranks
@@ -204,7 +196,6 @@ def initialize_model_parallel(
if rank in ranks:
_POSITION_EMBEDDING_GLOBAL_RANKS = position_embedding_ranks


def initialize_all_reduce_launcher(
max_num_tokens: int,
hidden_size: int,
@@ -219,7 +210,6 @@ def initialize_all_reduce_launcher(
disable_graph=disable_graph,
)


def model_parallel_is_initialized():
"""Check if model and data parallel groups are initialized."""
if _TENSOR_MODEL_PARALLEL_GROUP is None or \
@@ -288,17 +278,15 @@ def get_tensor_model_parallel_world_size():
global _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
if _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_TENSOR_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(
group=get_tensor_model_parallel_group())
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())


def get_pipeline_model_parallel_world_size():
"""Return world size for the pipeline model parallel group."""
global _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
if _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_WORLD_SIZE
return torch.distributed.get_world_size(
group=get_pipeline_model_parallel_group())
return torch.distributed.get_world_size(group=get_pipeline_model_parallel_group())


def set_tensor_model_parallel_rank(rank):
@@ -332,8 +320,8 @@ def get_pipeline_model_parallel_rank():
global _MPU_PIPELINE_MODEL_PARALLEL_RANK
if _MPU_PIPELINE_MODEL_PARALLEL_RANK is not None:
return _MPU_PIPELINE_MODEL_PARALLEL_RANK
return torch.distributed.get_rank(
group=get_pipeline_model_parallel_group())
return torch.distributed.get_rank(group=get_pipeline_model_parallel_group())



def is_pipeline_first_stage(ignore_virtual=False):
@@ -470,7 +458,6 @@ def get_pipeline_model_parallel_last_rank():
last_rank_local = get_pipeline_model_parallel_world_size() - 1
return _PIPELINE_GLOBAL_RANKS[last_rank_local]


def get_pipeline_model_parallel_next_rank():
"""Return the global rank that follows the caller in the pipeline"""
assert _PIPELINE_GLOBAL_RANKS is not None, \
@@ -498,12 +485,10 @@ def get_data_parallel_rank():
"""Return my rank for the data parallel group."""
return torch.distributed.get_rank(group=get_data_parallel_group())


def get_all_reduce_launcher() -> 'GraphAllReduce':
assert _ALL_REDUCE_LAUNCHER is not None, 'all reduce launcher is not initialized'
return _ALL_REDUCE_LAUNCHER


def destroy_model_parallel():
"""Set the groups to none."""
global _MODEL_PARALLEL_GROUP
@@ -564,15 +549,14 @@ def __init__(

def _build_graph(self, num_tokens: int) -> torch.cuda.CUDAGraph:
# Warm up.
torch.distributed.all_reduce(self.buffer[:num_tokens],
group=self.group)
torch.distributed.all_reduce(self.buffer[:num_tokens], group=self.group)
torch.cuda.synchronize()

# Build graph.
graph = torch.cuda.CUDAGraph()
with torch.cuda.graph(graph):
torch.distributed.all_reduce(self.buffer[:num_tokens],
group=self.group)
torch.distributed.all_reduce(
self.buffer[:num_tokens], group=self.group)
torch.cuda.synchronize()
return graph
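Note: a condensed sketch of the graph-capture pattern in the hunk above; it assumes an initialized NCCL process group and a CUDA device, so it is not runnable standalone:

import torch
import torch.distributed as dist

def build_allreduce_graph(buffer: torch.Tensor, num_tokens: int,
                          group=None) -> torch.cuda.CUDAGraph:
    # Warm up outside the graph so NCCL finishes lazy initialization.
    dist.all_reduce(buffer[:num_tokens], group=group)
    torch.cuda.synchronize()
    # Capture a single all-reduce on the fixed buffer; replay it later.
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        dist.all_reduce(buffer[:num_tokens], group=group)
    torch.cuda.synchronize()
    return graph

# Usage: copy activations into buffer[:num_tokens], then graph.replay().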

vllm/model_executor/parallel_utils/tensor_parallel/__init__.py
@@ -22,7 +22,8 @@
)

from .utils import (
split_tensor_along_last_dim, )
split_tensor_along_last_dim,
)

__all__ = [
#layers.py
@@ -37,7 +38,7 @@
"copy_to_tensor_model_parallel_region",
"gather_from_tensor_model_parallel_region",
"gather_from_sequence_parallel_region",
# "reduce_from_tensor_model_parallel_region",
# "reduce_from_tensor_model_parallel_region",
"scatter_to_tensor_model_parallel_region",
"scatter_to_sequence_parallel_region",
# random.py
223 changes: 81 additions & 142 deletions vllm/model_executor/parallel_utils/tensor_parallel/layers.py
@@ -5,6 +5,7 @@
# Parts of the code here are adapted from PyTorch
# repo: https://github.com/pytorch/pytorch


import torch
import torch.nn.functional as F
import torch.nn.init as init
@@ -28,17 +29,14 @@
VocabUtility,
)

_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {
'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1
}

_MODEL_PARALLEL_ATTRIBUTE_DEFAULTS = {'tensor_model_parallel': False,
'partition_dim': -1,
'partition_stride': 1}

def param_is_not_tensor_parallel_duplicate(param):
return (hasattr(param, 'tensor_model_parallel') and
param.tensor_model_parallel) or (get_tensor_model_parallel_rank()
== 0)
param.tensor_model_parallel) or (
get_tensor_model_parallel_rank() == 0)


def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):
@@ -52,30 +50,24 @@ def set_tensor_model_parallel_attributes(tensor, is_parallel, dim, stride):


def set_defaults_if_not_set_tensor_model_parallel_attributes(tensor):

def maybe_set(attribute, value):
if not hasattr(tensor, attribute):
setattr(tensor, attribute, value)

for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_set(attribute, _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS[attribute])


def copy_tensor_model_parallel_attributes(destination_tensor, source_tensor):

def maybe_copy(attribute):
if hasattr(source_tensor, attribute):
setattr(destination_tensor, attribute,
getattr(source_tensor, attribute))

for attribute in _MODEL_PARALLEL_ATTRIBUTE_DEFAULTS:
maybe_copy(attribute)


def _initialize_affine_weight_gpu(weight,
init_method,
partition_dim,
stride=1):
def _initialize_affine_weight_gpu(weight, init_method,
partition_dim, stride=1):
"""Initialize affine weight for model parallel on GPU."""

set_tensor_model_parallel_attributes(tensor=weight,
@@ -87,16 +79,11 @@ def _initialize_affine_weight_gpu(weight,
init_method(weight)


def _initialize_affine_weight_cpu(weight,
output_size,
input_size,
per_partition_size,
partition_dim,
init_method,
stride=1,
def _initialize_affine_weight_cpu(weight, output_size, input_size,
per_partition_size, partition_dim,
init_method, stride=1,
return_master_weight=False,
*,
params_dtype=None):
*, params_dtype=None):
"""Initialize affine weight for model parallel.
Build the master weight on all processes and scatter
@@ -111,17 +98,15 @@ def _initialize_affine_weight_cpu(weight,
params_dtype = torch.get_default_dtype()

# Initialize master weight
master_weight = torch.empty(output_size,
input_size,
master_weight = torch.empty(output_size, input_size,
dtype=torch.float,
requires_grad=False)
init_method(master_weight)
master_weight = master_weight.to(dtype=params_dtype)

# Split and copy
per_partition_per_stride_size = divide(per_partition_size, stride)
weight_list = torch.split(master_weight,
per_partition_per_stride_size,
weight_list = torch.split(master_weight, per_partition_per_stride_size,
dim=partition_dim)
rank = get_tensor_model_parallel_rank()
world_size = get_tensor_model_parallel_world_size()
@@ -150,14 +135,11 @@ class VocabParallelEmbedding(torch.nn.Module):
perform_initialization
"""

def __init__(self,
num_embeddings: int,
embedding_dim: int,
*,
def __init__(self, num_embeddings: int, embedding_dim: int, *,
init_method=init.xavier_normal_,
params_dtype: torch.dtype = None,
use_cpu_initialization: bool = False,
perform_initialization: bool = True):
params_dtype: torch.dtype=None,
use_cpu_initialization: bool=False,
perform_initialization: bool=True):
super(VocabParallelEmbedding, self).__init__()
# Keep the input dimensions.
self.num_embeddings = num_embeddings
@@ -172,8 +154,7 @@ def __init__(self,
self.scale_grad_by_freq = False
self.sparse = False
self._weight = None
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size(
)
self.tensor_model_parallel_size = get_tensor_model_parallel_world_size()
# Divide the weight matrix along the vocaburaly dimension.
self.vocab_start_index, self.vocab_end_index = \
VocabUtility.vocab_range_from_global_vocab_size(
@@ -184,30 +165,21 @@ def __init__(self,

# Allocate weights and initialize.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.num_embeddings_per_partition,
self.embedding_dim,
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_cpu(
self.weight,
self.num_embeddings,
self.embedding_dim,
self.num_embeddings_per_partition,
0,
init_method,
self.weight, self.num_embeddings, self.embedding_dim,
self.num_embeddings_per_partition, 0, init_method,
params_dtype=params_dtype)
else:
self.weight = Parameter(
torch.empty(self.num_embeddings_per_partition,
self.embedding_dim,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.num_embeddings_per_partition, self.embedding_dim,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=0,
stride=1)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=1)

def forward(self, input_):
if self.tensor_model_parallel_size > 1:
@@ -260,21 +232,15 @@ class ColumnParallelLinear(torch.nn.Module):
use_cpu_initialization:
"""

def __init__(
self,
input_size,
output_size,
*,
bias=True,
gather_output=True,
init_method=init.xavier_normal_,
stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
def __init__(self, input_size, output_size, *,
bias=True, gather_output=True,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
super(ColumnParallelLinear, self).__init__()

# Keep input parameters
@@ -294,49 +260,39 @@ def __init__(
# we allocate the transpose.
# Initialize weight.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.output_size_per_partition,
self.input_size,
dtype=params_dtype))
self.weight = Parameter(torch.empty(self.output_size_per_partition,
self.input_size,
dtype=params_dtype))
if perform_initialization:
self.master_weight = _initialize_affine_weight_cpu(
self.weight,
self.output_size,
self.input_size,
self.output_size_per_partition,
0,
init_method,
stride=stride,
return_master_weight=keep_master_weight_for_test)
self.weight, self.output_size, self.input_size,
self.output_size_per_partition, 0, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test)
else:
self.weight = Parameter(
torch.empty(self.output_size_per_partition,
self.input_size,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.output_size_per_partition, self.input_size,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=0,
stride=stride)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=0, stride=stride)

if bias:
if use_cpu_initialization:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size_per_partition, dtype=params_dtype))
else:
self.bias = Parameter(
torch.empty(self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
set_tensor_model_parallel_attributes(self.bias, True, 0, stride)
# Always initialize bias to zero.
with torch.no_grad():
self.bias.zero_()
else:
self.register_parameter('bias', None)


def forward(self, input_):
"""Forward of ColumnParallelLinear
@@ -396,21 +352,15 @@ class RowParallelLinear(torch.nn.Module):
perform_initialization:
"""

def __init__(
self,
input_size,
output_size,
*,
bias=True,
input_is_parallel=False,
init_method=init.xavier_normal_,
stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
def __init__(self, input_size, output_size, *,
bias=True, input_is_parallel=False,
init_method=init.xavier_normal_, stride=1,
keep_master_weight_for_test=False,
skip_bias_add=False,
params_dtype=None,
use_cpu_initialization=False,
perform_initialization=True,
):
super(RowParallelLinear, self).__init__()

# Keep input parameters
@@ -430,41 +380,30 @@ def __init__(
# we allocate the transpose.
# Initialize weight.
if use_cpu_initialization:
self.weight = Parameter(
torch.empty(self.output_size,
self.input_size_per_partition,
dtype=params_dtype))
self.weight = Parameter(torch.empty(self.output_size,
self.input_size_per_partition,
dtype=params_dtype))
if perform_initialization:
self.master_weight = _initialize_affine_weight_cpu(
self.weight,
self.output_size,
self.input_size,
self.input_size_per_partition,
1,
init_method,
stride=stride,
return_master_weight=keep_master_weight_for_test,
self.weight, self.output_size, self.input_size,
self.input_size_per_partition, 1, init_method,
stride=stride, return_master_weight=keep_master_weight_for_test,
params_dtype=params_dtype)
else:
self.weight = Parameter(
torch.empty(self.output_size,
self.input_size_per_partition,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.weight = Parameter(torch.empty(
self.output_size, self.input_size_per_partition,
device=torch.cuda.current_device(), dtype=params_dtype))
if perform_initialization:
_initialize_affine_weight_gpu(self.weight,
init_method,
partition_dim=1,
stride=stride)
_initialize_affine_weight_gpu(self.weight, init_method,
partition_dim=1, stride=stride)
if bias:
if use_cpu_initialization:
self.bias = Parameter(
torch.empty(self.output_size, dtype=params_dtype))
self.bias = Parameter(torch.empty(self.output_size,
dtype=params_dtype))
else:
self.bias = Parameter(
torch.empty(self.output_size,
device=torch.cuda.current_device(),
dtype=params_dtype))
self.bias = Parameter(torch.empty(
self.output_size, device=torch.cuda.current_device(),
dtype=params_dtype))

# Always initialize bias to zero.
with torch.no_grad():
33 changes: 12 additions & 21 deletions vllm/model_executor/parallel_utils/tensor_parallel/mappings.py
@@ -16,12 +16,11 @@ def _reduce(input_):
"""All-reduce the input tensor across model parallel group."""

# Bypass the function if we are using only 1 GPU.
if get_tensor_model_parallel_world_size() == 1:
if get_tensor_model_parallel_world_size()==1:
return input_

# All-reduce.
torch.distributed.all_reduce(input_,
group=get_tensor_model_parallel_group())
torch.distributed.all_reduce(input_, group=get_tensor_model_parallel_group())

return input_

@@ -62,7 +61,7 @@ def _split_along_first_dim(input_):
rank = get_tensor_model_parallel_rank()
dim_offset = rank * local_dim_size

output = input_[dim_offset:dim_offset + local_dim_size].contiguous()
output = input_[dim_offset:dim_offset+local_dim_size].contiguous()

return output

@@ -81,9 +80,7 @@ def _gather_along_last_dim(input_):

tensor_list = [torch.empty_like(input_) for _ in range(world_size)]
tensor_list[rank] = input_
torch.distributed.all_gather(tensor_list,
input_,
group=get_tensor_model_parallel_group())
torch.distributed.all_gather(tensor_list, input_, group=get_tensor_model_parallel_group())

# Note: torch.cat already creates a contiguous tensor.
output = torch.cat(tensor_list, dim=last_dim).contiguous()
@@ -102,16 +99,13 @@ def _gather_along_first_dim(input_):
dim_size = list(input_.size())
dim_size[0] = dim_size[0] * world_size

output = torch.empty(dim_size,
dtype=input_.dtype,
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._all_gather_base(output,
input_.contiguous(),
torch.distributed._all_gather_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())

return output


def _reduce_scatter_along_first_dim(input_):
"""Reduce-scatter the input tensor across model parallel group."""
world_size = get_tensor_model_parallel_world_size()
@@ -125,11 +119,10 @@ def _reduce_scatter_along_first_dim(input_):

dim_size[0] = dim_size[0] // world_size

output = torch.empty(dim_size,
dtype=input_.dtype,
output = torch.empty(dim_size, dtype=input_.dtype,
device=torch.cuda.current_device())
torch.distributed._reduce_scatter_base(
output, input_.contiguous(), group=get_tensor_model_parallel_group())
torch.distributed._reduce_scatter_base(output, input_.contiguous(),
group=get_tensor_model_parallel_group())
return output


@@ -259,7 +252,6 @@ def backward(ctx, grad_output):
# Helper functions.
# -----------------


def copy_to_tensor_model_parallel_region(input_):
return _CopyToModelParallelRegion.apply(input_)

@@ -280,11 +272,10 @@ def scatter_to_sequence_parallel_region(input_):
return _ScatterToSequenceParallelRegion.apply(input_)


def gather_from_sequence_parallel_region(input_,
tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(
input_, tensor_parallel_output_grad)
def gather_from_sequence_parallel_region(input_, tensor_parallel_output_grad=True):
return _GatherFromSequenceParallelRegion.apply(input_, tensor_parallel_output_grad)


def reduce_scatter_to_sequence_parallel_region(input_):
return _ReduceScatterToSequenceParallelRegion.apply(input_)

4 changes: 3 additions & 1 deletion vllm/model_executor/parallel_utils/tensor_parallel/random.py
@@ -12,7 +12,8 @@
from torch.cuda import _lazy_call, device as device_ctx_manager

from vllm.model_executor.parallel_utils.parallel_state import (
get_tensor_model_parallel_rank, )
get_tensor_model_parallel_rank,
)

# Default name for the model parallel rng tracker.
_MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng'
@@ -51,6 +52,7 @@ def cb():
_lazy_call(cb)



class CudaRNGStatesTracker:
"""Tracker for the cuda RNG states.
14 changes: 7 additions & 7 deletions vllm/model_executor/parallel_utils/tensor_parallel/utils.py
@@ -5,11 +5,11 @@
import torch
from typing import List, Sequence


def ensure_divisibility(numerator, denominator):
"""Ensure that numerator is divisible by the denominator."""
assert numerator % denominator == 0, "{} is not divisible by {}".format(
numerator, denominator)
numerator, denominator
)


def divide(numerator, denominator):
@@ -56,15 +56,15 @@ class VocabUtility:

@staticmethod
def vocab_range_from_per_partition_vocab_size(
per_partition_vocab_size: int, rank,
world_size: int) -> Sequence[int]:
per_partition_vocab_size: int, rank, world_size: int
) -> Sequence[int]:
index_f = rank * per_partition_vocab_size
index_l = index_f + per_partition_vocab_size
return index_f, index_l

@staticmethod
def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
world_size: int) -> Sequence[int]:
def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]:
per_partition_vocab_size = divide(global_vocab_size, world_size)
return VocabUtility.vocab_range_from_per_partition_vocab_size(
per_partition_vocab_size, rank, world_size)
per_partition_vocab_size, rank, world_size
)
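Note: a small worked example of the vocab partitioning helpers above (divide() additionally asserts divisibility; here it is folded into a comment):

def vocab_range(global_vocab_size: int, rank: int, world_size: int):
    # global_vocab_size must be divisible by world_size.
    per_partition = global_vocab_size // world_size
    start = rank * per_partition
    return start, start + per_partition

# A 50304-token vocab split across 4 tensor-parallel ranks:
print([vocab_range(50304, r, 4) for r in range(4)])
# [(0, 12576), (12576, 25152), (25152, 37728), (37728, 50304)]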
8 changes: 4 additions & 4 deletions vllm/model_executor/weight_utils.py
@@ -44,9 +44,9 @@ def hf_model_weights_iterator(
if use_np_cache:
# Convert the model weights from torch tensors to numpy arrays for
# faster loading.
np_folder = os.path.join(hf_folder, 'np')
np_folder = os.path.join(hf_folder, "np")
os.makedirs(np_folder, exist_ok=True)
weight_names_file = os.path.join(np_folder, 'weight_names.json')
weight_names_file = os.path.join(np_folder, "weight_names.json")
with lock:
if not os.path.exists(weight_names_file):
weight_names = []
@@ -57,10 +57,10 @@ def hf_model_weights_iterator(
with open(param_path, "wb") as f:
np.save(f, param.cpu().detach().numpy())
weight_names.append(name)
with open(weight_names_file, 'w') as f:
with open(weight_names_file, "w") as f:
json.dump(weight_names, f)

with open(weight_names_file, 'r') as f:
with open(weight_names_file, "r") as f:
weight_names = json.load(f)

for name in weight_names:
4 changes: 2 additions & 2 deletions vllm/sampling_params.py
@@ -1,5 +1,5 @@
"""Sampling parameters for text generation."""
from typing import List, Optional, Union
from typing import Optional, Sequence, Union

_SAMPLING_EPS = 1e-5

@@ -52,7 +52,7 @@ def __init__(
top_p: float = 1.0,
top_k: int = -1,
use_beam_search: bool = False,
stop: Union[str, List[str]] = [],
stop: Union[str, Sequence[str]] = (),
ignore_eos: bool = False,
max_tokens: int = 16,
logprobs: Optional[int] = None,
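Note: the default for stop changes from a mutable [] to an immutable (), avoiding Python's shared-mutable-default pitfall; a minimal illustration (bad/good are hypothetical functions, not vLLM code):

def bad(stop=[]):
    stop.append("</s>")
    return stop

print(bad())  # ['</s>']
print(bad())  # ['</s>', '</s>'] - the same list object is reused across calls

def good(stop=()):
    return list(stop) + ["</s>"]

print(good())  # ['</s>']
print(good())  # ['</s>']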
19 changes: 9 additions & 10 deletions vllm/sequence.py
@@ -145,17 +145,16 @@ def get_cumulative_logprob(self) -> float:
def is_finished(self) -> bool:
return SequenceStatus.is_finished(self.status)

def fork(self, child_seq: 'Sequence') -> None:
def fork(self, child_seq: "Sequence") -> None:
child_seq.logical_token_blocks = copy.deepcopy(
self.logical_token_blocks)
child_seq.output_logprobs = copy.deepcopy(self.output_logprobs)
child_seq.data = copy.deepcopy(self.data)
return None

def __repr__(self) -> str:
return (f'Sequence(seq_id={self.seq_id}, '
f'status={self.status.name}, '
f'num_blocks={len(self.logical_token_blocks)})')
return (f"Sequence(seq_id={self.seq_id}, "
f"status={self.status.name}, "
f"num_blocks={len(self.logical_token_blocks)})")


class SequenceGroup:
@@ -188,7 +187,7 @@ def find(self, seq_id: int) -> Sequence:
for seq in self.seqs:
if seq.seq_id == seq_id:
return seq
raise ValueError(f'Sequence {seq_id} not found.')
raise ValueError(f"Sequence {seq_id} not found.")

def is_finished(self) -> bool:
return all(seq.is_finished() for seq in self.seqs)
@@ -233,10 +232,10 @@ def __init__(
self.logprobs = logprobs

def __repr__(self) -> str:
return (f'SequenceOutputs(seq_id={self.seq_id}, '
f'parent_seq_id={self.parent_seq_id}, '
f'output_token={self.output_token}), '
f'logprobs={self.logprobs}')
return (f"SequenceOutputs(seq_id={self.seq_id}, "
f"parent_seq_id={self.parent_seq_id}, "
f"output_token={self.output_token}), "
f"logprobs={self.logprobs}")

def __eq__(self, other: object) -> bool:
if not isinstance(other, SequenceOutputs):
7 changes: 4 additions & 3 deletions vllm/transformers_utils/tokenizer.py
@@ -13,8 +13,8 @@

def get_tokenizer(
tokenizer_name: str,
tokenizer_mode: str = "auto",
*args,
tokenizer_mode: str = "auto",
**kwargs,
) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
"""Gets a tokenizer for the given model name via Huggingface."""
@@ -42,7 +42,7 @@ def get_tokenizer(
raise RuntimeError(err_msg) from e

if not isinstance(tokenizer, PreTrainedTokenizerFast):
        logger.warn(
        logger.warning(
"Using a slow tokenizer. This might cause a significant "
"slowdown. Consider using a fast tokenizer instead.")
return tokenizer
@@ -73,7 +73,8 @@ def detokenize_incrementally(
output_text = tokenizer.convert_tokens_to_string(output_tokens)
return new_token, output_text

# Adapted from https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# Adapted from
# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
# NOTE(woosuk): The following code is slow because it runs a for loop over
# the output_tokens. In Python, running a for loop over a list can be slow
# even when the loop body is very simple.
4 changes: 2 additions & 2 deletions vllm/utils.py
@@ -17,9 +17,9 @@ def __init__(self, start: int = 0) -> None:
self.counter = start

def __next__(self) -> int:
id = self.counter
i = self.counter
self.counter += 1
return id
return i

def reset(self) -> None:
self.counter = 0
4 changes: 2 additions & 2 deletions vllm/worker/cache_engine.py
@@ -93,8 +93,8 @@ def allocate_cpu_cache(self) -> List[KVCache]:
if not pin_memory:
# Pinning memory in WSL is not supported.
# https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications
logger.warn("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
logger.warning("Using 'pin_memory=False' as WSL is detected. "
"This may slow down the performance.")
for _ in range(self.num_layers):
key_blocks = torch.empty(
size=(self.num_cpu_blocks, *key_block_shape),