From 36bf8150cc3a048d69d9d2196128462014b9599d Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sun, 8 Sep 2024 01:45:44 +0800
Subject: [PATCH 1/3] [Model][VLM] Decouple weight loading logic for `Paligemma` (#8269)

---
 vllm/model_executor/models/paligemma.py | 112 ++++++++----------------
 vllm/model_executor/models/siglip.py    |  23 ++++-
 2 files changed, 54 insertions(+), 81 deletions(-)

diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index b6f4275fbc948..5fd39b5e35be6 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -1,3 +1,4 @@
+import itertools
 from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
                     TypedDict, Union)
 
@@ -13,7 +14,7 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.gemma import GemmaModel
+from vllm.model_executor.models.gemma import GemmaForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.utils import cached_get_tokenizer
@@ -22,14 +23,10 @@
 from .interfaces import SupportsMultiModal
 from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
                      dummy_seq_data_for_siglip, get_max_siglip_image_tokens)
-from .utils import merge_multimodal_embeddings
+from .utils import filter_weights, merge_multimodal_embeddings
 
 logger = init_logger(__name__)
 
-_KEYS_TO_MODIFY_MAPPING = {
-    "language_model.model": "language_model",
-}
-
 
 class PaliGemmaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
@@ -151,8 +148,8 @@ def __init__(self,
             projection_dim=config.vision_config.projection_dim)
 
         self.quant_config = quant_config
-        self.language_model = GemmaModel(config.text_config, cache_config,
-                                         quant_config)
+        self.language_model = GemmaForCausalLM(config.text_config,
+                                               cache_config, quant_config)
         self.unpadded_vocab_size = config.text_config.vocab_size
         logit_scale = getattr(config, "logit_scale", 1.0)
         self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
@@ -252,7 +249,8 @@ def forward(self,
             vision_embeddings = vision_embeddings * (self.config.hidden_size**
                                                      -0.5)
 
-            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+            inputs_embeds = self.language_model.model.get_input_embeddings(
+                input_ids)
             inputs_embeds = merge_multimodal_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
@@ -262,87 +260,47 @@ def forward(self,
         else:
             inputs_embeds = None
 
-        hidden_states = self.language_model(input_ids,
-                                            positions,
-                                            kv_caches,
-                                            attn_metadata,
-                                            None,
-                                            inputs_embeds=inputs_embeds)
+        hidden_states = self.language_model.model(input_ids,
+                                                  positions,
+                                                  kv_caches,
+                                                  attn_metadata,
+                                                  None,
+                                                  inputs_embeds=inputs_embeds)
 
         return hidden_states
 
-    # Copied from vllm/model_executor/models/gemma.py
     def compute_logits(
         self,
         hidden_states: torch.Tensor,
         sampling_metadata: SamplingMetadata,
     ) -> Optional[torch.Tensor]:
-        logits = self.logits_processor(self.language_model.embed_tokens,
-                                       hidden_states, sampling_metadata)
-        return logits
+        return self.language_model.compute_logits(hidden_states,
+                                                   sampling_metadata)
 
-    # Copied from vllm/model_executor/models/gemma.py
     def sample(
        self,
        logits: torch.Tensor,
        sampling_metadata: SamplingMetadata,
    ) -> Optional[SamplerOutput]:
-        next_tokens = self.sampler(logits, sampling_metadata)
-        return next_tokens
+        return self.language_model.sample(logits, sampling_metadata)
 
-    # Adapted from vllm/model_executor/models/gemma.py
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        stacked_params_mapping = [
-            # (param_name, shard_name, shard_id)
-            ("qkv_proj", "q_proj", "q"),
-            ("qkv_proj", "k_proj", "k"),
-            ("qkv_proj", "v_proj", "v"),
-            ("gate_up_proj", "gate_proj", 0),
-            ("gate_up_proj", "up_proj", 1),
-        ]
-        params_dict = dict(self.named_parameters())
-        loaded_params = set()
-        for name, loaded_weight in weights:
-            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
-                if key_to_modify in name:
-                    name = name.replace(key_to_modify, new_key)
-            use_default_weight_loading = False
-            if "vision" not in name or self.vision_tower.shard_weight:
-                for (param_name, shard_name,
-                     shard_id) in stacked_params_mapping:
-                    if shard_name not in name:
-                        continue
-                    name = name.replace(shard_name, param_name)
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
-                    param = params_dict[name]
-                    weight_loader = param.weight_loader
-                    weight_loader(param, loaded_weight, shard_id)
-                    break
-                else:
-                    # lm_head is not used in vllm as it is tied with
-                    # embed_token. To prevent errors, skip loading
-                    # lm_head.weight.
-                    if "lm_head.weight" in name:
-                        continue
-                    # Skip loading extra bias for GPTQ models.
-                    if name.endswith(".bias") and name not in params_dict:
-                        continue
-                    use_default_weight_loading = True
-            else:
-                use_default_weight_loading = True
-
-            if use_default_weight_loading:
-                param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
-                weight_loader(param, loaded_weight)
-
-            loaded_params.add(name)
-
-        unloaded_params = params_dict.keys() - loaded_params
-        if unloaded_params:
-            logger.warning(
-                "Some weights are not initialized from checkpoints: %s",
-                unloaded_params)
+        # prepare weight iterators for components
+        vit_weights, mlp_weights, llm_weights = itertools.tee(weights, 3)
+
+        # load vision tower
+        vit_weights = filter_weights(vit_weights, "vision_tower")
+        self.vision_tower.load_weights(vit_weights)
+
+        # load mlp projector
+        mlp_weights = filter_weights(mlp_weights, "multi_modal_projector")
+        mlp_params_dict = dict(self.multi_modal_projector.named_parameters())
+        for name, loaded_weight in mlp_weights:
+            param = mlp_params_dict[name]
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
+            weight_loader(param, loaded_weight)
+
+        # load llm backbone
+        llm_weights = filter_weights(llm_weights, "language_model")
+        self.language_model.load_weights(llm_weights)
diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py
index fb4c30c1a13f9..13d09e4cd4c23 100644
--- a/vllm/model_executor/models/siglip.py
+++ b/vllm/model_executor/models/siglip.py
@@ -529,6 +529,12 @@ def forward(
         )
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+        ] if self.shard_weight else []
         params_dict = dict(self.named_parameters())
         layer_count = len(self.vision_model.encoder.layers)
 
@@ -544,7 +550,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             if layer_idx >= layer_count:
                 continue
 
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+
+                param = params_dict[name.replace(weight_name, param_name)]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)

From b962ee1470a019a72a1c17eddcf3a0471658a123 Mon Sep 17 00:00:00 2001
From: sumitd2 <91451282+sumitd2@users.noreply.github.com>
Date: Sat, 7 Sep 2024 23:48:40 +0530
Subject: [PATCH 2/3] ppc64le: Dockerfile fixed, and a script for buildkite (#8026)

---
 .buildkite/run-cpu-test-ppc64le.sh | 32 ++++++++++++++++++++++++++++++
 Dockerfile.ppc64le                 | 16 ++++++++++-----
 2 files changed, 43 insertions(+), 5 deletions(-)
 create mode 100755 .buildkite/run-cpu-test-ppc64le.sh

diff --git a/.buildkite/run-cpu-test-ppc64le.sh b/.buildkite/run-cpu-test-ppc64le.sh
new file mode 100755
index 0000000000000..a01cf3fe67489
--- /dev/null
+++ b/.buildkite/run-cpu-test-ppc64le.sh
@@ -0,0 +1,32 @@
+# This script builds the CPU docker image and runs the offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.ppc64le .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image, setting --shm-size=4g for tensor parallel.
+#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
+docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test
+
+# Run basic model test
+docker exec cpu-test bash -c "
+  pip install pytest matplotlib einops transformers_stream_generator
+  pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU are not supported
+
+# online inference
+docker exec cpu-test bash -c "
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
+  python3 benchmarks/benchmark_serving.py \
+    --backend vllm \
+    --dataset-name random \
+    --model facebook/opt-125m \
+    --num-prompts 20 \
+    --endpoint /v1/completions \
+    --tokenizer facebook/opt-125m"
diff --git a/Dockerfile.ppc64le b/Dockerfile.ppc64le
index d4e4c483cada8..16780f8ab950c 100644
--- a/Dockerfile.ppc64le
+++ b/Dockerfile.ppc64le
@@ -2,21 +2,27 @@ FROM mambaorg/micromamba
 ARG MAMBA_DOCKERFILE_ACTIVATE=1
 USER root
 
-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"
+
+RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
 
 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
 # Currently these may not be available for venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes
 
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
 
 # These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
 
 RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+

From cfe712bf1aedbee4f26105737710ff80ae9d624e Mon Sep 17 00:00:00 2001
From: Joe Runde
Date: Sat, 7 Sep 2024 14:03:16 -0600
Subject: [PATCH 3/3] [CI/Build] Use python 3.12 in cuda image (#8133)

Signed-off-by: Joe Runde
---
 Dockerfile              | 8 ++++++--
 requirements-common.txt | 1 +
 tests/test_logger.py    | 6 +++---
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 2375e3f4d7387..0ec6655ed449e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
 # prepare basic build environment
 FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 ENV DEBIAN_FRONTEND=noninteractive
 
 # Install Python and other dependencies
@@ -133,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
 ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
 WORKDIR /vllm-workspace
 
 ENV DEBIAN_FRONTEND=noninteractive
@@ -179,6 +179,10 @@ FROM vllm-base AS test
 ADD . /vllm-workspace/
 
 # install development dependencies (for testing)
+# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
+# This installation must complete before the test dependencies are collected and installed.
+RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install "setuptools>=74.1.1" RUN --mount=type=cache,target=/root/.cache/pip \ python3 -m pip install -r requirements-dev.txt diff --git a/requirements-common.txt b/requirements-common.txt index e430753357ca0..49a290317f818 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -27,3 +27,4 @@ gguf == 0.9.1 importlib_metadata mistral_common >= 1.3.4 pyyaml +six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 diff --git a/tests/test_logger.py b/tests/test_logger.py index 29346cd0878b8..8f3d218416870 100644 --- a/tests/test_logger.py +++ b/tests/test_logger.py @@ -95,7 +95,7 @@ def test_logger_configuring_can_be_disabled(): config behavior, however mocks are used to ensure no changes in behavior or configuration occur.""" - with patch("logging.config.dictConfig") as dict_config_mock: + with patch("vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() dict_config_mock.assert_not_called() @@ -175,9 +175,9 @@ def test_custom_logging_config_is_parsed_and_used_when_provided(): logging_config_file.flush() with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH", logging_config_file.name), patch( - "logging.config.dictConfig") as dict_config_mock: + "vllm.logger.dictConfig") as dict_config_mock: _configure_vllm_root_logger() - assert dict_config_mock.called_with(valid_logging_config) + dict_config_mock.assert_called_with(valid_logging_config) @patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
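
The refactor in PATCH 1/3 replaces PaliGemma's monolithic load_weights with a routing pattern: tee the checkpoint iterator once per sub-module, filter each copy down to that sub-module's parameter prefix, and let each sub-module load its own shard of the checkpoint. Below is a minimal, self-contained sketch of that pattern; the ToyTower and ToyVLM classes, the local filter_weights helper, and the checkpoint entry names are illustrative stand-ins, not vLLM's actual classes or API.

import itertools
from typing import Iterable, Tuple

import torch


def filter_weights(weights: Iterable[Tuple[str, torch.Tensor]], prefix: str):
    """Yield only the checkpoint entries under `prefix`, with the prefix stripped."""
    for name, tensor in weights:
        if name.startswith(prefix + "."):
            yield name[len(prefix) + 1:], tensor


class ToyTower:
    """Stand-in for a component (e.g. a vision tower) that loads its own weights."""

    def __init__(self) -> None:
        self.loaded = []

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
        # A real component would copy tensors into its parameters here;
        # this sketch only records which names it was handed.
        for name, _ in weights:
            self.loaded.append(name)


class ToyVLM:
    """Stand-in for the composite model: it only routes weights to components."""

    def __init__(self) -> None:
        self.vision_tower = ToyTower()
        self.language_model = ToyTower()

    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> None:
        # One pass over the checkpoint stream, duplicated per component.
        vit_weights, llm_weights = itertools.tee(weights, 2)
        self.vision_tower.load_weights(filter_weights(vit_weights, "vision_tower"))
        self.language_model.load_weights(filter_weights(llm_weights, "language_model"))


# Usage: entry names mirror a PaliGemma-style checkpoint layout.
checkpoint = [
    ("vision_tower.encoder.layers.0.weight", torch.zeros(4, 4)),
    ("language_model.model.embed_tokens.weight", torch.zeros(8, 4)),
]
model = ToyVLM()
model.load_weights(checkpoint)
print(model.vision_tower.loaded)    # ['encoder.layers.0.weight']
print(model.language_model.loaded)  # ['model.embed_tokens.weight']

Using itertools.tee preserves the iterator-style weights argument, so each component still consumes a stream of (name, tensor) pairs rather than requiring the whole checkpoint to be materialized as a dict up front.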