Skip to content

Commit

Permalink
[model] Support for Llava-Next-Video model (vllm-project#7559)
Browse files Browse the repository at this point in the history
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
  • Loading branch information
4 people authored Sep 11, 2024
1 parent efcf946 commit 6a512a0
Show file tree
Hide file tree
Showing 21 changed files with 1,083 additions and 18 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
Expand Down
1 change: 1 addition & 0 deletions Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ FROM ubuntu:22.04 AS cpu-test-1
RUN --mount=type=cache,target=/var/cache/apt \
apt-get update -y \
&& apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

# https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html
Expand Down
4 changes: 3 additions & 1 deletion Dockerfile.neuron
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@ FROM $BASE_IMAGE
RUN echo "Base image is $BASE_IMAGE"

# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
RUN apt-get update \
&& apt-get install python3 python3-pip -y \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1

### Mount Point ###
# When launching the container, mount the code directory to /app
Expand Down
3 changes: 2 additions & 1 deletion Dockerfile.openvino
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
FROM ubuntu:22.04 AS dev

RUN apt-get update -y && \
apt-get install -y python3-pip git
apt-get install -y python3-pip git && \
apt-get install -y ffmpeg libsm6 libxext6 libgl1
WORKDIR /workspace

# copy requirements
Expand Down
2 changes: 1 addition & 1 deletion Dockerfile.ppc64le
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ USER root

ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1

# Some packages in requirements-cpu are installed here
# IBM provides optimized packages for ppc64le processors in the open-ce project for mamba
Expand Down
3 changes: 3 additions & 0 deletions Dockerfile.tpu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
FROM $BASE_IMAGE
WORKDIR /workspace

# Install some basic utilities
RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1

# Install the TPU and Pallas dependencies.
RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
Expand Down
3 changes: 1 addition & 2 deletions Dockerfile.xpu
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y \
&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip

&& apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
Expand Down
1 change: 1 addition & 0 deletions docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ def setup(app):
"aiohttp",
"compressed_tensors",
"cpuinfo",
"cv2",
"torch",
"transformers",
"psutil",
Expand Down
14 changes: 14 additions & 0 deletions docs/source/models/supported_models.rst
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,11 @@ Multimodal Language Models
- Image\ :sup:`E+`
- :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
-
* - :code:`LlavaNextVideoForConditionalGeneration`
- LLaVA-NeXT-Video
- Video
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
-
* - :code:`MiniCPMV`
- MiniCPM-V
- Image\ :sup:`+`
Expand Down Expand Up @@ -260,6 +265,15 @@ Multimodal Language Models
For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

For :code:`LLaVA-NeXT-Video`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a development version (commit :code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
It can be installed by running the following command:


.. code-block:: bash

  pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830

----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Expand Down
70 changes: 61 additions & 9 deletions examples/offline_inference_vision_language.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,9 @@

from vllm import LLM, SamplingParams
from vllm.assets.image import ImageAsset
from vllm.assets.video import VideoAsset
from vllm.utils import FlexibleArgumentParser

# Input image and question
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
question = "What is the content of this image?"


# LLaVA-1.5
def run_llava(question):
Expand All @@ -30,7 +27,16 @@ def run_llava(question):
def run_llava_next(question):
# Build the engine and prompt for the LLaVA-NeXT example.
# Returns the tuple (llm, prompt, stop_token_ids); stop_token_ids is None.

# Instruction-style prompt with an <image> placeholder ahead of the question.
prompt = f"[INST] <image>\n{question} [/INST]"
# NOTE(review): the two assignments below look like the removed and added
# sides of a diff hunk; presumably only the max_model_len=8192 variant is
# meant to remain — confirm against the actual file.
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
# No stop token ids are set for this model.
stop_token_ids = None
return llm, prompt, stop_token_ids


# LLaVA-NeXT-Video
# Currently only supports video input
def run_llava_next_video(question):
    """Build the LLaVA-NeXT-Video engine and the prompt for ``question``.

    Returns:
        A ``(llm, prompt, stop_token_ids)`` tuple. The prompt carries a
        ``<video>`` placeholder ahead of the question, and no stop token
        ids are set for this model (``None``).
    """
    video_prompt = f"USER: <video>\n{question} ASSISTANT:"
    engine = LLM(
        model="llava-hf/LLaVA-NeXT-Video-7B-hf",
        max_model_len=8192,
    )
    return engine, video_prompt, None

Expand Down Expand Up @@ -176,6 +182,7 @@ def run_qwen_vl(question):
model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
Expand All @@ -187,11 +194,49 @@ def run_qwen_vl(question):
}


def get_multi_modal_input(args):
    """Load the sample asset and its question for the requested modality.

    Reads ``args.modality`` (and ``args.num_frames`` for video input) and
    returns a dict of the form::

        {
            "data": image or video,
            "question": question,
        }

    Raises:
        ValueError: if ``args.modality`` is neither "image" nor "video".
    """
    requested = args.modality

    if requested == "image":
        # Sample image, converted to RGB, paired with an image question.
        asset = ImageAsset("cherry_blossom")
        return {
            "data": asset.pil_image.convert("RGB"),
            "question": "What is the content of this image?",
        }

    if requested == "video":
        # Sample video sampled down to the requested number of frames.
        asset = VideoAsset(name="sample_demo_1.mp4",
                           num_frames=args.num_frames)
        return {
            "data": asset.np_ndarrays,
            "question": "Why is this video funny?",
        }

    raise ValueError(f"Modality {args.modality} is not supported.")


def main(args):
model = args.model_type
if model not in model_example_map:
raise ValueError(f"Model type {model} is not supported.")

modality = args.modality
mm_input = get_multi_modal_input(args)
data = mm_input["data"]
question = mm_input["question"]

llm, prompt, stop_token_ids = model_example_map[model](question)

# We set temperature to 0.2 so that outputs can be different
Expand All @@ -206,7 +251,7 @@ def main(args):
inputs = {
"prompt": prompt,
"multi_modal_data": {
"image": image
modality: data
},
}

Expand All @@ -215,7 +260,7 @@ def main(args):
inputs = [{
"prompt": prompt,
"multi_modal_data": {
"image": image
modality: data
},
} for _ in range(args.num_prompts)]

Expand All @@ -238,8 +283,15 @@ def main(args):
help='Huggingface "model_type".')
parser.add_argument('--num-prompts',
type=int,
default=1,
default=4,
help='Number of prompts to run.')

parser.add_argument('--modality',
type=str,
default="image",
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
default=16,
help='Number of frames to extract from the video.')
args = parser.parse_args()
main(args)
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ awscli
einops # required for MPT, qwen-vl and Mamba
httpx
librosa # required for audio test
opencv-python # required for video test
peft
requests
ray[adag]>=2.35
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,7 @@ def _read_requirements(filename: str) -> List[str]:
ext_modules=ext_modules,
extras_require={
"tensorizer": ["tensorizer>=2.9.0"],
"video": ["opencv-python"], # Required for video processing
"audio": ["librosa", "soundfile"] # Required for audio processing
},
cmdclass={"build_ext": cmake_build_ext} if len(ext_modules) > 0 else {},
Expand Down
Loading

0 comments on commit 6a512a0

Please sign in to comment.