[Models] Add remaining model PP support #7168

Merged Oct 4, 2024 · 62 commits
Changes from 1 commit

Commits (62)
4d358bb
Model PP support
andoorve Aug 5, 2024
6685431
Format
andoorve Aug 5, 2024
44f3537
Format
andoorve Aug 5, 2024
a4aeba1
Merge branch 'main' of github.com:andoorve/vllm into qwen-pp
andoorve Sep 3, 2024
aff35f9
Format
andoorve Sep 3, 2024
3632dc6
Merge branch 'main' of github.com:andoorve/vllm into qwen-pp
andoorve Sep 24, 2024
6c16347
Merge branch 'main' of github.com:andoorve/vllm into qwen-pp
andoorve Sep 24, 2024
cfcba73
Format
andoorve Sep 24, 2024
674e28f
Merge branch 'main' of github.com:andoorve/vllm into qwen-pp
andoorve Sep 27, 2024
bc7385e
Merge branch 'main' into qwen-pp
DarkLight1337 Sep 30, 2024
332d66c
fix wrong type
DarkLight1337 Sep 30, 2024
18841f3
fix typo
DarkLight1337 Sep 30, 2024
313d04c
Merge branch 'main' into qwen-pp
DarkLight1337 Oct 1, 2024
eea3fc5
Add SupportsPP interface and stateless protocol check
DarkLight1337 Oct 1, 2024
b4ce5f7
Subclass SupportsPP in relevant models
DarkLight1337 Oct 1, 2024
30e454a
Remove hardcoded list
DarkLight1337 Oct 1, 2024
e9ea5b7
Remove unused import
DarkLight1337 Oct 1, 2024
8b40176
Check using function
DarkLight1337 Oct 1, 2024
ec4c6b3
Update docstring
DarkLight1337 Oct 1, 2024
cdc4dbe
Simplify
DarkLight1337 Oct 1, 2024
dcc2a49
Add tests
DarkLight1337 Oct 1, 2024
7280766
Test CUDA initialization
DarkLight1337 Oct 1, 2024
37cc51b
Add platform guard
DarkLight1337 Oct 1, 2024
3814246
Trigger CI
DarkLight1337 Oct 1, 2024
cf91f7b
Fix OOT registration
DarkLight1337 Oct 1, 2024
38b090a
Update docstring
DarkLight1337 Oct 1, 2024
d394985
Remove unnecessary global
DarkLight1337 Oct 1, 2024
1404e92
Merge branch 'main' into qwen-pp
DarkLight1337 Oct 2, 2024
6a4287a
Update interfaces
DarkLight1337 Oct 3, 2024
1e010c7
format
DarkLight1337 Oct 3, 2024
a6b99c3
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
1e0baba
Fix error check
DarkLight1337 Oct 3, 2024
9ef69de
Make `prefix` required
DarkLight1337 Oct 3, 2024
76355d9
Inherit from `SupportsPP`
DarkLight1337 Oct 3, 2024
7be7ac2
Merge branch 'main' into qwen-pp
DarkLight1337 Oct 3, 2024
c3f3d4a
Inherit from `SupportsPP`
DarkLight1337 Oct 3, 2024
9cc78ae
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
7c2a922
Fix PP for language models
DarkLight1337 Oct 3, 2024
a36f7ed
Fix environment variables not being copied over
DarkLight1337 Oct 3, 2024
5b960bc
Merge branch 'main' into supports-pp
DarkLight1337 Oct 3, 2024
66a634e
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
f9cae12
Use inferred type
DarkLight1337 Oct 3, 2024
591bf85
Add missing type annotations
DarkLight1337 Oct 3, 2024
addc8cd
Add PP support for more multimodal models
DarkLight1337 Oct 3, 2024
ed669a5
Fix the real problem, which is that modelscope is not installed
DarkLight1337 Oct 3, 2024
9ac8a99
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
d211003
Update tests
DarkLight1337 Oct 3, 2024
beb609c
Update docs
DarkLight1337 Oct 3, 2024
4cb66d5
Fix missing `SupportsPP`; support PP for olmoe
DarkLight1337 Oct 3, 2024
e01d59f
Fix type annotations
DarkLight1337 Oct 3, 2024
b8958a9
Move modelscope installation into regression test
DarkLight1337 Oct 3, 2024
ec0f4e0
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
8fdcaa0
format
DarkLight1337 Oct 3, 2024
3ed8a8b
Update LoRA support in docs
DarkLight1337 Oct 3, 2024
ba174b6
PP support for phimoe
DarkLight1337 Oct 3, 2024
5da7ff1
Fix capitalization
DarkLight1337 Oct 3, 2024
e9f0601
Fix `LLMWrapper`
DarkLight1337 Oct 3, 2024
fda3b66
Merge branch 'supports-pp' into qwen-pp
DarkLight1337 Oct 3, 2024
b65813c
Update test configs
DarkLight1337 Oct 3, 2024
99e653e
Add more tp to pixtral
andoorve Oct 3, 2024
62f1980
Update test_pipeline_parallel.py
andoorve Oct 3, 2024
7c7251e
Fix gpt_j.py
andoorve Oct 4, 2024
Viewing changes from commit 44f35378f6566e7110d0c8350a6b48c8816da6ce ("Format")
andoorve committed Aug 5, 2024
Signed-off-by: Muralidhar Andoorveedu <muralidhar.andoorveedu@centml.ai>
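Every file in this commit repeats one pattern: the inner transformer module assigns self.make_empty_intermediate_tensors from make_empty_intermediate_tensors_factory(keys, hidden_size), and the top-level *ForCausalLM class re-exports the inner module's callable. The commit only rewraps those assignments and sorts import lists; behavior is unchanged. The factory itself (it lives in vllm/model_executor/models/utils.py) is not shown in this diff, so the following is a minimal sketch of what it plausibly does, assuming it returns a builder of zero-filled placeholder tensors, one per named key; treat the exact signature and shapes as assumptions, not the library's verbatim code:

from typing import Callable, List

import torch

from vllm.sequence import IntermediateTensors


def make_empty_intermediate_tensors_factory(
        keys: List[str],
        hidden_size: int) -> Callable[..., IntermediateTensors]:
    """Sketch of the helper the diffs below call; the real implementation
    in vllm/model_executor/models/utils.py may differ in details."""

    def make_empty_intermediate_tensors(
            batch_size: int, dtype: torch.dtype,
            device: torch.device) -> IntermediateTensors:
        # One zero tensor per key, e.g. ["hidden_states", "residual"],
        # shaped like the activations a previous pipeline stage would send.
        return IntermediateTensors({
            key: torch.zeros((batch_size, hidden_size),
                             dtype=dtype,
                             device=device)
            for key in keys
        })

    return make_empty_intermediate_tensors

Under that reading, a pipeline stage other than the first can allocate correctly shaped receive buffers before the previous stage sends its activations, which is why each model advertises its tensor keys ("hidden_states", plus "residual" where applicable) through this factory.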
11 changes: 7 additions & 4 deletions vllm/model_executor/models/arctic.py
@@ -32,7 +32,8 @@
 from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs.arctic import ArcticConfig
 
-from .utils import is_pp_missing_parameter, make_layers, make_empty_intermediate_tensors_factory
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 logger = init_logger(__name__)
 
@@ -382,8 +383,9 @@ def __init__(
             prefix=f"{prefix}.layers")
         self._attn_implementation = config._attn_implementation
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    config.hidden_size))
 
     def forward(
         self,
@@ -431,7 +433,8 @@ def __init__(self,
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
 
     def forward(
         self,
16 changes: 9 additions & 7 deletions vllm/model_executor/models/baichuan.py
@@ -27,9 +27,8 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              get_pp_group)
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -47,7 +46,8 @@
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
 from .interfaces import SupportsLoRA
-from .utils import is_pp_missing_parameter, make_layers, make_empty_intermediate_tensors_factory
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -275,8 +275,9 @@ def __init__(self,
             prefix=f"{prefix}.layers",
         )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
 
     def forward(
         self,
@@ -350,7 +351,8 @@ def __init__(
             quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
 
     def forward(
         self,
5 changes: 4 additions & 1 deletion vllm/model_executor/models/blip2.py
@@ -21,7 +21,7 @@
 from .blip import (BlipVisionModel, dummy_image_for_blip,
                    get_max_blip_image_tokens)
 from .interfaces import SupportsVision
-from .utils import merge_vision_embeddings, is_pp_missing_parameter, make_empty_intermediate_tensors_factory
+from .utils import is_pp_missing_parameter, merge_vision_embeddings
 
 _KEYS_TO_MODIFY_MAPPING = {
     "language_model.lm_head": "lm_head",
@@ -487,6 +487,9 @@ def __init__(self,
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size)
         self.sampler = Sampler()
 
+        self.make_empty_intermediate_tensors = (
+            self.language_model.make_empty_intermediate_tensors)
+
     def get_lm_head(self):
         return self.language_model.decoder.embed_tokens
 
16 changes: 9 additions & 7 deletions vllm/model_executor/models/bloom.py
@@ -25,9 +25,8 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              get_pp_group)
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                QKVParallelLinear,
@@ -42,7 +41,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
-from .utils import make_empty_intermediate_tensors_factory, make_layers, is_pp_missing_parameter
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 def _get_alibi_slopes(total_num_heads: int) -> torch.Tensor:
@@ -246,8 +246,9 @@ def __init__(
 
         # Final Layer Norm
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    config.hidden_size))
 
     def forward(
         self,
@@ -292,7 +293,8 @@ def __init__(
         self.lm_head = self.transformer.word_embeddings
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.transformer.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.transformer.make_empty_intermediate_tensors)
 
     def forward(
         self,
14 changes: 8 additions & 6 deletions vllm/model_executor/models/chameleon.py
@@ -10,7 +10,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, MultiModalConfig
-from vllm.distributed import get_tensor_model_parallel_world_size, get_pp_group
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.activation import SiluAndMul
@@ -33,9 +33,9 @@
 from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
 from vllm.utils import print_warning_once
 
-from .utils import is_pp_missing_parameter, make_empty_intermediate_tensors_factory, make_layers
-
 from .interfaces import SupportsVision
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 logger = init_logger(__name__)
 
@@ -837,8 +837,9 @@ def __init__(
 
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.vqmodel = ChameleonVQVAE(config.vq_config)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
 
     def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
         return self.embed_tokens(input_ids)
@@ -921,7 +922,8 @@ def __init__(
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size, logit_scale)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
 
     def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
 
20 changes: 11 additions & 9 deletions vllm/model_executor/models/chatglm.py
@@ -10,7 +10,7 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import get_tensor_model_parallel_world_size, get_pp_group
+from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
@@ -29,7 +29,8 @@
 from vllm.transformers_utils.configs import ChatGLMConfig
 
 from .interfaces import SupportsLoRA
-from .utils import make_layers, make_empty_intermediate_tensors_factory, is_pp_missing_parameter
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 class GLMAttention(nn.Module):
@@ -280,10 +281,9 @@ def forward(
                 kv_cache=kv_caches[i - self.start_layer],
                 attn_metadata=attn_metadata,
             )
-        if get_pp_group().is_last_rank:
-            # Final layer norm.
-            if self.post_layer_norm:
-                hidden_states = self.final_layernorm(hidden_states)
+        # Final layer norm.
+        if get_pp_group().is_last_rank and self.post_layer_norm:
+            hidden_states = self.final_layernorm(hidden_states)
 
         return hidden_states
 
@@ -309,8 +309,9 @@ def __init__(
         self.output_layer = ParallelLMHead(config.padded_vocab_size,
                                            config.hidden_size,
                                            quant_config=quant_config)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    config.hidden_size))
 
     def forward(
         self, input_ids: torch.Tensor, position_ids: torch.Tensor,
@@ -369,7 +370,8 @@ def __init__(
         self.lm_head = self.transformer.output_layer
         self.logits_processor = LogitsProcessor(config.padded_vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.transformer.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.transformer.make_empty_intermediate_tensors)
 
     def forward(
         self,
3 changes: 1 addition & 2 deletions vllm/model_executor/models/clip.py
@@ -1,6 +1,6 @@
 """Minimal implementation of CLIPVisionModel intended to be only used
 within a vision language model."""
-from typing import Optional, Union
+from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -17,7 +17,6 @@
 from vllm.multimodal.image import (cached_get_tokenizer,
                                    repeat_and_pad_image_tokens)
 from vllm.sequence import SequenceData
-from .utils import make_layers, make_empty_intermediate_tensors_factory, is_pp_missing_parameter
 
 
 def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
16 changes: 9 additions & 7 deletions vllm/model_executor/models/commandr.py
@@ -30,9 +30,8 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig, LoRAConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
-                              get_tensor_model_parallel_world_size,
-                              get_pp_group)
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
+                              get_tensor_model_parallel_world_size)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
                                                QKVParallelLinear,
@@ -49,7 +48,8 @@
 from vllm.model_executor.utils import set_weight_attrs
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
-from .utils import make_empty_intermediate_tensors_factory, make_layers, is_pp_missing_parameter
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 @torch.compile
@@ -286,8 +286,9 @@ def __init__(
             prefix=f"{prefix}.layers")
         self.norm = LayerNorm(param_shape=(config.hidden_size),
                               eps=config.layer_norm_eps)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
 
     def forward(
         self, input_ids: torch.Tensor, positions: torch.Tensor,
@@ -360,7 +361,8 @@ def __init__(
                                  quant_config,
                                  lora_config=lora_config)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
 
     @torch.no_grad()
     def forward(
15 changes: 9 additions & 6 deletions vllm/model_executor/models/dbrx.py
@@ -6,9 +6,9 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce, get_pp_group)
+                              tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                ReplicatedLinear,
@@ -26,7 +26,8 @@
 from vllm.sequence import IntermediateTensors, SamplerOutput
 from vllm.transformers_utils.configs.dbrx import DbrxConfig
 
-from .utils import make_empty_intermediate_tensors_factory, make_layers, is_pp_missing_parameter
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 class DbrxRouter(nn.Module):
@@ -335,8 +336,9 @@ def __init__(
                                                       nn.Parameter):
                 # Remove the bias term in Linear and LayerNorm.
                 module.register_parameter("bias", None)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states"], config.d_model)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(["hidden_states"],
+                                                    config.d_model))
 
     def forward(
         self, input_ids: torch.Tensor, position_ids: torch.Tensor,
@@ -385,7 +387,8 @@ def __init__(
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
                                                 config.vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.transformer.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.transformer.make_empty_intermediate_tensors)
 
     def forward(
         self,
2 changes: 1 addition & 1 deletion vllm/model_executor/models/decilm.py
@@ -23,7 +23,7 @@
 # limitations under the License.
 """Inference-only DeciLM model compatible with HuggingFace weights."""
 
-from typing import Iterable, Optional, Tuple, Union
+from typing import Iterable, Optional, Tuple
 
 import torch
 from transformers import LlamaConfig
15 changes: 9 additions & 6 deletions vllm/model_executor/models/deepseek.py
@@ -29,9 +29,9 @@
 
 from vllm.attention import Attention, AttentionMetadata
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_rank,
+from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce, get_pp_group)
+                              tensor_model_parallel_all_reduce)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import fused_moe
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -50,7 +50,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
 
-from .utils import make_empty_intermediate_tensors_factory, make_layers, is_pp_missing_parameter
+from .utils import (is_pp_missing_parameter,
+                    make_empty_intermediate_tensors_factory, make_layers)
 
 
 class DeepseekMLP(nn.Module):
@@ -349,8 +350,9 @@ def __init__(
                                                 quant_config=quant_config),
             prefix=f"{prefix}.layers")
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
-        self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
-            ["hidden_states", "residual"], config.hidden_size)
+        self.make_empty_intermediate_tensors = (
+            make_empty_intermediate_tensors_factory(
+                ["hidden_states", "residual"], config.hidden_size))
 
     def forward(
         self, input_ids: torch.Tensor, positions: torch.Tensor,
@@ -394,7 +396,8 @@ def __init__(
                                       quant_config=quant_config)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
-        self.make_empty_intermediate_tensors = self.model.make_empty_intermediate_tensors
+        self.make_empty_intermediate_tensors = (
+            self.model.make_empty_intermediate_tensors)
 
     def forward(
         self,
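These re-exported callables all feed the same pipeline-parallel forward pattern: the first rank embeds tokens, intermediate ranks resume from the received IntermediateTensors, and every rank except the last returns an IntermediateTensors whose keys match the factory arguments. The sketch below is inferred from what is visible in these diffs (the get_pp_group() rank checks and the kv_caches[i - self.start_layer] indexing); the layer call signature and attribute names are assumptions, not code from the PR:

from typing import List, Optional, Union

import torch

from vllm.attention import AttentionMetadata
from vllm.distributed import get_pp_group
from vllm.sequence import IntermediateTensors


class PPModelSketch:
    # Stands in for an inner model such as ArcticModel. Attributes
    # (embed_tokens, layers, norm, start_layer, end_layer) are assumed
    # to be set up in __init__ via make_layers().

    def forward(
        self,
        input_ids: torch.Tensor,
        positions: torch.Tensor,
        kv_caches: List[torch.Tensor],
        attn_metadata: AttentionMetadata,
        intermediate_tensors: Optional[IntermediateTensors] = None,
    ) -> Union[torch.Tensor, IntermediateTensors]:
        if get_pp_group().is_first_rank:
            # First stage computes embeddings from token ids.
            hidden_states = self.embed_tokens(input_ids)
            residual = None
        else:
            # Later stages resume from activations sent by the previous
            # stage; keys match those given to the factory.
            assert intermediate_tensors is not None
            hidden_states = intermediate_tensors["hidden_states"]
            residual = intermediate_tensors["residual"]

        # Each stage owns layers [start_layer, end_layer); kv_caches holds
        # only this stage's caches, hence the i - self.start_layer offset.
        for i in range(self.start_layer, self.end_layer):
            hidden_states, residual = self.layers[i](
                positions, hidden_states, kv_caches[i - self.start_layer],
                attn_metadata, residual)

        if not get_pp_group().is_last_rank:
            # Hand activations to the next stage.
            return IntermediateTensors({
                "hidden_states": hidden_states,
                "residual": residual,
            })

        hidden_states, _ = self.norm(hidden_states, residual)
        return hidden_states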