Merge branch 'upstream' into fix-clip-layernorm
DarkLight1337 committed Sep 8, 2024
2 parents 33869b5 + cfe712b commit 5430a25
Showing 6 changed files with 88 additions and 87 deletions.
32 changes: 32 additions & 0 deletions .buildkite/run-cpu-test-ppc64le.sh
@@ -0,0 +1,32 @@
# This script builds the CPU Docker image and runs offline inference inside the container.
# It serves as a sanity check for compilation and basic model usage.
set -ex

# Try building the docker image
docker build -t cpu-test -f Dockerfile.ppc64le .

# Setup cleanup
remove_docker_container() { docker rm -f cpu-test || true; }
trap remove_docker_container EXIT
remove_docker_container

# Run the image (the commented-out variant sets --shm-size=4g for tensor parallel).
#docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --shm-size=4g --name cpu-test cpu-test
docker run -itd --entrypoint /bin/bash -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true --network host -e HF_TOKEN --name cpu-test cpu-test

# Run basic model test
docker exec cpu-test bash -c "
pip install pytest matplotlib einops transformers_stream_generator
pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B are not supported on CPU

# Run online inference test
docker exec cpu-test bash -c "
python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
--backend vllm \
--dataset-name random \
--model facebook/opt-125m \
--num-prompts 20 \
--endpoint /v1/completions \
--tokenizer facebook/opt-125m"
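
For reference, the readiness check and request flow above can also be driven from Python. The following is a minimal sketch, not part of the commit, assuming a vLLM OpenAI-compatible server is already listening on localhost:8000 with facebook/opt-125m and that the requests library is installed; the 600-second budget mirrors the timeout in the script.

import time

import requests  # assumption: available in the test environment

BASE_URL = "http://localhost:8000"

def wait_for_server(timeout_s: float = 600) -> None:
    # Poll /v1/models until the server answers, like the curl loop above.
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        try:
            if requests.get(f"{BASE_URL}/v1/models").ok:
                return
        except requests.ConnectionError:
            pass  # server not accepting connections yet
        time.sleep(1)
    raise TimeoutError("server did not become ready in time")

wait_for_server()
# One completion against the same endpoint the benchmark script exercises.
resp = requests.post(
    f"{BASE_URL}/v1/completions",
    json={"model": "facebook/opt-125m", "prompt": "Hello,", "max_tokens": 16},
)
print(resp.json())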
8 changes: 6 additions & 2 deletions Dockerfile
@@ -10,7 +10,7 @@ ARG CUDA_VERSION=12.4.1
# prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
ENV DEBIAN_FRONTEND=noninteractive

# Install Python and other dependencies
@@ -133,7 +133,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
# image with vLLM installed
FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu20.04 AS vllm-base
ARG CUDA_VERSION=12.4.1
-ARG PYTHON_VERSION=3.10
+ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive

@@ -179,6 +179,10 @@ FROM vllm-base AS test
ADD . /vllm-workspace/

# install development dependencies (for testing)
+# A newer setuptools is required for installing some test dependencies from source that do not publish python 3.12 wheels
+# This installation must complete before the test dependencies are collected and installed.
+RUN --mount=type=cache,target=/root/.cache/pip \
+python3 -m pip install "setuptools>=74.1.1"
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install -r requirements-dev.txt

16 changes: 11 additions & 5 deletions Dockerfile.ppc64le
@@ -2,21 +2,27 @@ FROM mambaorg/micromamba
ARG MAMBA_DOCKERFILE_ACTIVATE=1
USER root

-RUN apt-get update -y && apt-get install -y git wget vim numactl gcc-12 g++-12 protobuf-compiler libprotobuf-dev && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

+RUN apt-get update -y && apt-get install -y git wget vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
# Currently these may not be available via venv or pip directly
-RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 pytorch-cpu=2.1.2 torchvision-cpu=0.16.2 && micromamba clean --all --yes
+RUN micromamba install -y -n base -c https://ftp.osuosl.org/pub/open-ce/1.11.0-p10/ -c defaults python=3.10 torchvision-cpu=0.16.2 rust && micromamba clean --all --yes

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm

# These packages will be in rocketce eventually
-RUN pip install -v -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing
+RUN pip install -v cmake torch==2.3.1 uvloop==0.20.0 -r requirements-cpu.txt --prefer-binary --extra-index-url https://repo.fury.io/mgiessing

RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install

-WORKDIR /vllm-workspace
-ENTRYPOINT ["/opt/conda/bin/python3", "-m", "vllm.entrypoints.openai.api_server"]
+WORKDIR /workspace/

+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]

1 change: 1 addition & 0 deletions requirements-common.txt
@@ -27,3 +27,4 @@ gguf == 0.9.1
importlib_metadata
mistral_common >= 1.3.4
pyyaml
+six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
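
The trailing environment marker limits the pin to interpreters newer than 3.11; pip evaluates PEP 508 markers like this at install time against the running interpreter. As a quick illustration, not part of the commit, assuming the packaging library is available:

from packaging.markers import Marker  # packaging implements PEP 508 markers

marker = Marker("python_version > '3.11'")
# True on Python 3.12, False on 3.10/3.11, so the six pin only
# applies where the newer release is actually needed.
print(marker.evaluate())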
6 changes: 3 additions & 3 deletions tests/test_logger.py
@@ -95,7 +95,7 @@ def test_logger_configuring_can_be_disabled():
config behavior, however mocks are used to ensure no changes in behavior or
configuration occur."""

with patch("logging.config.dictConfig") as dict_config_mock:
with patch("vllm.logger.dictConfig") as dict_config_mock:
_configure_vllm_root_logger()
dict_config_mock.assert_not_called()

@@ -175,9 +175,9 @@ def test_custom_logging_config_is_parsed_and_used_when_provided():
logging_config_file.flush()
with patch("vllm.logger.VLLM_LOGGING_CONFIG_PATH",
logging_config_file.name), patch(
"logging.config.dictConfig") as dict_config_mock:
"vllm.logger.dictConfig") as dict_config_mock:
_configure_vllm_root_logger()
-assert dict_config_mock.called_with(valid_logging_config)
+dict_config_mock.assert_called_with(valid_logging_config)


@patch("vllm.logger.VLLM_CONFIGURE_LOGGING", 0)
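
Both changes in this file address classic unittest.mock pitfalls. First, vllm.logger imports dictConfig directly, so a patch must target "vllm.logger.dictConfig" (the name as it is looked up) rather than "logging.config.dictConfig" (the name where it is defined), or the module under test keeps calling the real function. Second, called_with is not an assertion: attribute access on a Mock returns a truthy child mock, so `assert m.called_with(...)` always passes. A small standard-library sketch, not part of the commit, illustrating the second pitfall:

from unittest import mock

m = mock.Mock()
m("expected")

# Bug: called_with is just an auto-created child mock, so this
# assertion passes no matter what arguments were used.
assert m.called_with("wrong arguments")

# Fix: assert_called_with is the real assertion; it raises
# AssertionError on a mismatch.
m.assert_called_with("expected")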
112 changes: 35 additions & 77 deletions vllm/model_executor/models/paligemma.py
@@ -1,3 +1,4 @@
+import itertools
from typing import (Iterable, List, Literal, Mapping, Optional, Tuple,
TypedDict, Union)

@@ -13,7 +14,7 @@
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.sampler import Sampler, SamplerOutput
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
-from vllm.model_executor.models.gemma import GemmaModel
+from vllm.model_executor.models.gemma import GemmaForCausalLM
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.multimodal import MULTIMODAL_REGISTRY
from vllm.multimodal.utils import cached_get_tokenizer
@@ -22,14 +23,10 @@
from .interfaces import SupportsMultiModal
from .siglip import (SiglipVisionModel, dummy_image_for_siglip,
dummy_seq_data_for_siglip, get_max_siglip_image_tokens)
-from .utils import merge_multimodal_embeddings
+from .utils import filter_weights, merge_multimodal_embeddings

logger = init_logger(__name__)

-_KEYS_TO_MODIFY_MAPPING = {
-"language_model.model": "language_model",
-}


class PaliGemmaImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
@@ -151,8 +148,8 @@ def __init__(self,
projection_dim=config.vision_config.projection_dim)

self.quant_config = quant_config
-self.language_model = GemmaModel(config.text_config, cache_config,
-quant_config)
+self.language_model = GemmaForCausalLM(config.text_config,
+cache_config, quant_config)
self.unpadded_vocab_size = config.text_config.vocab_size
logit_scale = getattr(config, "logit_scale", 1.0)
self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
@@ -252,7 +249,8 @@ def forward(self,
vision_embeddings = vision_embeddings * (self.config.hidden_size**
-0.5)

-inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+inputs_embeds = self.language_model.model.get_input_embeddings(
+input_ids)

inputs_embeds = merge_multimodal_embeddings(
input_ids, inputs_embeds, vision_embeddings,
@@ -262,87 +260,47 @@
else:
inputs_embeds = None

-hidden_states = self.language_model(input_ids,
-positions,
-kv_caches,
-attn_metadata,
-None,
-inputs_embeds=inputs_embeds)
+hidden_states = self.language_model.model(input_ids,
+positions,
+kv_caches,
+attn_metadata,
+None,
+inputs_embeds=inputs_embeds)

return hidden_states

-# Copied from vllm/model_executor/models/gemma.py
def compute_logits(
self,
hidden_states: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[torch.Tensor]:
-logits = self.logits_processor(self.language_model.embed_tokens,
-hidden_states, sampling_metadata)
-return logits
+return self.language_model.compute_logits(hidden_states,
+sampling_metadata)

-# Copied from vllm/model_executor/models/gemma.py
def sample(
self,
logits: torch.Tensor,
sampling_metadata: SamplingMetadata,
) -> Optional[SamplerOutput]:
-next_tokens = self.sampler(logits, sampling_metadata)
-return next_tokens
+return self.language_model.sample(logits, sampling_metadata)

-# Adapted from vllm/model_executor/models/gemma.py
def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-stacked_params_mapping = [
-# (param_name, shard_name, shard_id)
-("qkv_proj", "q_proj", "q"),
-("qkv_proj", "k_proj", "k"),
-("qkv_proj", "v_proj", "v"),
-("gate_up_proj", "gate_proj", 0),
-("gate_up_proj", "up_proj", 1),
-]
-params_dict = dict(self.named_parameters())
-loaded_params = set()
-for name, loaded_weight in weights:
-for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
-if key_to_modify in name:
-name = name.replace(key_to_modify, new_key)
-use_default_weight_loading = False
-if "vision" not in name or self.vision_tower.shard_weight:
-for (param_name, shard_name,
-shard_id) in stacked_params_mapping:
-if shard_name not in name:
-continue
-name = name.replace(shard_name, param_name)
-# Skip loading extra bias for GPTQ models.
-if name.endswith(".bias") and name not in params_dict:
-continue
-param = params_dict[name]
-weight_loader = param.weight_loader
-weight_loader(param, loaded_weight, shard_id)
-break
-else:
-# lm_head is not used in vllm as it is tied with
-# embed_token. To prevent errors, skip loading
-# lm_head.weight.
-if "lm_head.weight" in name:
-continue
-# Skip loading extra bias for GPTQ models.
-if name.endswith(".bias") and name not in params_dict:
-continue
-use_default_weight_loading = True
-else:
-use_default_weight_loading = True

-if use_default_weight_loading:
-param = params_dict[name]
-weight_loader = getattr(param, "weight_loader",
-default_weight_loader)
-weight_loader(param, loaded_weight)

-loaded_params.add(name)

-unloaded_params = params_dict.keys() - loaded_params
-if unloaded_params:
-logger.warning(
-"Some weights are not initialized from checkpoints: %s",
-unloaded_params)
+# prepare weight iterators for components
+vit_weights, mlp_weights, llm_weights = itertools.tee(weights, 3)

+# load vision tower
+vit_weights = filter_weights(vit_weights, "vision_tower")
+self.vision_tower.load_weights(vit_weights)

+# load mlp projector
+mlp_weights = filter_weights(mlp_weights, "multi_modal_projector")
+mlp_params_dict = dict(self.multi_modal_projector.named_parameters())
+for name, loaded_weight in mlp_weights:
+param = mlp_params_dict[name]
+weight_loader = getattr(param, "weight_loader",
+default_weight_loader)
+weight_loader(param, loaded_weight)

+# load llm backbone
+llm_weights = filter_weights(llm_weights, "language_model")
+self.language_model.load_weights(llm_weights)
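
The rewritten loader drops the hand-rolled key remapping in favor of per-component delegation: itertools.tee splits the single weights iterator into three independent iterators, filter_weights narrows each one to the names under a component's prefix, and every sub-module then reuses its own load_weights. A minimal sketch of the pattern, not part of the commit, with a hypothetical filter_weights written to match how it is consumed above (the real helper lives in vllm's model utils):

import itertools
from typing import Iterable, Iterator, Tuple

def filter_weights(weights: Iterable[Tuple[str, int]],
                   prefix: str) -> Iterator[Tuple[str, int]]:
    # Hypothetical stand-in: keep only names under `prefix` and strip it.
    for name, tensor in weights:
        if name.startswith(prefix + "."):
            yield name[len(prefix) + 1:], tensor

# Toy checkpoint stream; real code yields (name, torch.Tensor) pairs.
weights = iter([
    ("vision_tower.encoder.w", 1),
    ("multi_modal_projector.linear.w", 2),
    ("language_model.model.embed_tokens.w", 3),
])

# tee() gives each component a full, independent view of the stream.
vit_w, mlp_w, llm_w = itertools.tee(weights, 3)
print(list(filter_weights(vit_w, "vision_tower")))    # [('encoder.w', 1)]
print(list(filter_weights(llm_w, "language_model")))  # [('model.embed_tokens.w', 3)]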
