
Commit 6a512a0

[model] Support for Llava-Next-Video model (vllm-project#7559)

Authored by TKONIY, ywang96, and DarkLight1337
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Cyrus Leung <tlleungac@connect.ust.hk>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>

1 parent efcf946 · commit 6a512a0

21 files changed, +1083 −18 lines

Dockerfile

Lines changed: 1 addition & 0 deletions
@@ -145,6 +145,7 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
     && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
     && apt-get update -y \
     && apt-get install -y ccache software-properties-common git curl sudo vim python3-pip \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && add-apt-repository ppa:deadsnakes/ppa \
     && apt-get update -y \
     && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \

Dockerfile.cpu

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ FROM ubuntu:22.04 AS cpu-test-1
 RUN --mount=type=cache,target=/var/cache/apt \
     apt-get update -y \
     && apt-get install -y curl ccache git wget vim numactl gcc-12 g++-12 python3 python3-pip libtcmalloc-minimal4 libnuma-dev \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12

 # https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/performance_tuning/tuning_guide.html

Dockerfile.neuron

Lines changed: 3 additions & 1 deletion
@@ -6,7 +6,9 @@ FROM $BASE_IMAGE
 RUN echo "Base image is $BASE_IMAGE"

 # Install some basic utilities
-RUN apt-get update && apt-get install python3 python3-pip -y
+RUN apt-get update \
+    && apt-get install python3 python3-pip -y \
+    && apt-get install -y ffmpeg libsm6 libxext6 libgl1

 ### Mount Point ###
 # When launching the container, mount the code directory to /app

Dockerfile.openvino

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@
 FROM ubuntu:22.04 AS dev

 RUN apt-get update -y && \
-    apt-get install -y python3-pip git
+    apt-get install -y python3-pip git && \
+    apt-get install -y ffmpeg libsm6 libxext6 libgl1
 WORKDIR /workspace

 # copy requirements

Dockerfile.ppc64le

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ USER root

 ENV PATH="/usr/local/cargo/bin:$PATH:/opt/conda/bin/"

-RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential
+RUN apt-get update -y && apt-get install -y git wget curl vim libnuma-dev libsndfile-dev libprotobuf-dev build-essential ffmpeg libsm6 libxext6 libgl1

 # Some packages in requirements-cpu are installed here
 # IBM provides optimized packages for ppc64le processors in the open-ce project for mamba

Dockerfile.tpu

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@ ARG BASE_IMAGE="us-central1-docker.pkg.dev/tpu-pytorch-releases/docker/xla:night
 FROM $BASE_IMAGE
 WORKDIR /workspace

+# Install some basic utilities
+RUN apt-get update && apt-get install -y ffmpeg libsm6 libxext6 libgl1
+
 # Install the TPU and Pallas dependencies.
 RUN python3 -m pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
 RUN python3 -m pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html

Dockerfile.xpu

Lines changed: 1 addition & 2 deletions
@@ -9,8 +9,7 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
     chmod 644 /usr/share/keyrings/intel-graphics.gpg

 RUN apt-get update -y \
-    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip
-
+    && apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1
 COPY ./ /workspace/vllm

 WORKDIR /workspace/vllm
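
Every Dockerfile variant gains the same four apt packages. These are the typical system-level runtime dependencies for OpenCV video decoding (consistent with the cv2 mock added to docs/source/conf.py below). A minimal smoke test, assuming opencv-python is installed in the image and a local sample_demo_1.mp4 exists (both are assumptions for illustration, not part of this commit):

import cv2  # import itself fails with a missing libGL.so.1 on headless images without libgl1

# Decode a handful of frames; ffmpeg provides the codecs VideoCapture relies on here.
cap = cv2.VideoCapture("sample_demo_1.mp4")  # hypothetical local file
frames = []
while len(frames) < 16:
    ok, frame = cap.read()
    if not ok:
        break
    frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
cap.release()
print(f"decoded {len(frames)} frames")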

docs/source/conf.py

Lines changed: 1 addition & 0 deletions
@@ -99,6 +99,7 @@ def setup(app):
     "aiohttp",
     "compressed_tensors",
     "cpuinfo",
+    "cv2",
     "torch",
     "transformers",
     "psutil",

docs/source/models/supported_models.rst

Lines changed: 14 additions & 0 deletions
@@ -227,6 +227,11 @@ Multimodal Language Models
     - Image\ :sup:`E+`
     - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
+  * - :code:`LlavaNextVideoForConditionalGeneration`
+    - LLaVA-NeXT-Video
+    - Video
+    - :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
+    -
   * - :code:`MiniCPMV`
     - MiniCPM-V
     - Image\ :sup:`+`
@@ -260,6 +265,15 @@ Multimodal Language Models
     For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now.
     For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

+    For :code:`LLaVA-NeXT-Video`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
+    This can be installed by running the following command:
+
+
+    .. code-block:: bash
+
+        pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
+
+
 ----

 If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
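
With that transformers revision installed, a minimal offline-inference sketch for the new model. This is a sketch only; the prompt template, the 8192-token context window, and the VideoAsset usage are lifted from the updated example script further down, not from this docs page:

from vllm import LLM, SamplingParams
from vllm.assets.video import VideoAsset

# Prompt template used for LLaVA-NeXT-Video in the example script.
question = "Why is this video funny?"
prompt = f"USER: <video>\n{question} ASSISTANT:"

llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)

# 16 frames sampled from vLLM's demo video asset, as numpy arrays.
video = VideoAsset(name="sample_demo_1.mp4", num_frames=16).np_ndarrays

outputs = llm.generate(
    {"prompt": prompt, "multi_modal_data": {"video": video}},
    sampling_params=SamplingParams(temperature=0.2, max_tokens=64),
)
print(outputs[0].outputs[0].text)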

examples/offline_inference_vision_language.py

Lines changed: 61 additions & 9 deletions
@@ -9,12 +9,9 @@

 from vllm import LLM, SamplingParams
 from vllm.assets.image import ImageAsset
+from vllm.assets.video import VideoAsset
 from vllm.utils import FlexibleArgumentParser

-# Input image and question
-image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
-question = "What is the content of this image?"
-

 # LLaVA-1.5
 def run_llava(question):
@@ -30,7 +27,16 @@ def run_llava(question):
 def run_llava_next(question):

     prompt = f"[INST] <image>\n{question} [/INST]"
-    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf")
+    llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
+
+
+# LlaVA-NeXT-Video
+# Currently only support for video input
+def run_llava_next_video(question):
+    prompt = f"USER: <video>\n{question} ASSISTANT:"
+    llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
     stop_token_ids = None
     return llm, prompt, stop_token_ids

@@ -176,6 +182,7 @@ def run_qwen_vl(question):
 model_example_map = {
     "llava": run_llava,
     "llava-next": run_llava_next,
+    "llava-next-video": run_llava_next_video,
     "fuyu": run_fuyu,
     "phi3_v": run_phi3v,
     "paligemma": run_paligemma,
@@ -187,11 +194,49 @@ def run_qwen_vl(question):
 }


+def get_multi_modal_input(args):
+    """
+    return {
+        "data": image or video,
+        "question": question,
+    }
+    """
+    if args.modality == "image":
+        # Input image and question
+        image = ImageAsset("cherry_blossom") \
+            .pil_image.convert("RGB")
+        img_question = "What is the content of this image?"
+
+        return {
+            "data": image,
+            "question": img_question,
+        }
+
+    if args.modality == "video":
+        # Input video and question
+        video = VideoAsset(name="sample_demo_1.mp4",
+                           num_frames=args.num_frames).np_ndarrays
+        vid_question = "Why is this video funny?"
+
+        return {
+            "data": video,
+            "question": vid_question,
+        }
+
+    msg = f"Modality {args.modality} is not supported."
+    raise ValueError(msg)
+
+
 def main(args):
     model = args.model_type
     if model not in model_example_map:
         raise ValueError(f"Model type {model} is not supported.")

+    modality = args.modality
+    mm_input = get_multi_modal_input(args)
+    data = mm_input["data"]
+    question = mm_input["question"]
+
     llm, prompt, stop_token_ids = model_example_map[model](question)

     # We set temperature to 0.2 so that outputs can be different
@@ -206,7 +251,7 @@
     inputs = {
         "prompt": prompt,
         "multi_modal_data": {
-            "image": image
+            modality: data
         },
     }

@@ -215,7 +260,7 @@
     inputs = [{
         "prompt": prompt,
         "multi_modal_data": {
-            "image": image
+            modality: data
         },
     } for _ in range(args.num_prompts)]

@@ -238,8 +283,15 @@
                         help='Huggingface "model_type".')
     parser.add_argument('--num-prompts',
                         type=int,
-                        default=1,
+                        default=4,
                         help='Number of prompts to run.')
-
+    parser.add_argument('--modality',
+                        type=str,
+                        default="image",
+                        help='Modality of the input.')
+    parser.add_argument('--num-frames',
+                        type=int,
+                        default=16,
+                        help='Number of frames to extract from the video.')
     args = parser.parse_args()
     main(args)
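
Usage note: with these additions, the video path of the example can be exercised roughly as python examples/offline_inference_vision_language.py --model-type llava-next-video --modality video --num-frames 16 (the --model-type flag is assumed here; it is defined in an earlier, unchanged part of the script, while only --num-prompts, --modality, and --num-frames appear in this hunk). The --modality value selects which branch of get_multi_modal_input() builds the data/question pair, and the same string is reused as the key under multi_modal_data when the prompts are assembled.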
