
Commit 354ca31

Merge pull request vllm-project#3 from ilya-lavrenov/docker-file
Added dockerfile with vLLM + openvino
2 parents e913d6b + 5b0db2b commit 354ca31

File tree: 5 files changed, +81 −9 lines changed

Dockerfile.openvino

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+#################### BASE BUILD IMAGE ####################
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git
+WORKDIR /workspace
+
+# build and install OpenVINO
+RUN git clone --recurse-submodules -b pytorch_module_extension https://github.com/slyalin/openvino.git
+RUN /workspace/openvino/install_build_dependencies.sh
+RUN cmake -DCPACK_GENERATOR=DEB -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_CPPLINT=OFF \
+    -DENABLE_INTEL_GPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF \
+    -DENABLE_OV_TF_FRONTEND=OFF -DENABLE_OV_ONNX_FRONTEND=OFF -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF \
+    -S /workspace/openvino -B /workspace/openvino_build
+RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt
+RUN cmake --build /workspace/openvino_build --parallel 8
+RUN cmake -P /workspace/openvino_build/cmake_install.cmake
+
+# build and install OpenVINO Contrib with PagedAttention
+RUN git clone --branch paged-attention https://github.com/ilya-lavrenov/openvino_contrib.git
+RUN cmake -DCUSTOM_OPERATIONS=paged_attention -DCMAKE_INSTALL_PREFIX=/usr \
+    -S /workspace/openvino_contrib/modules/custom_operations/ -B /workspace/paged_attention_build/
+RUN cmake --build /workspace/paged_attention_build/ --parallel 8
+RUN cmake -P /workspace/openvino_build/cmake_install.cmake
+
+# Install OpenVINO tokenizers
+RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://storage.openvinotoolkit.org/simple/wheels/nightly" python3 -m pip install openvino-tokenizers
+#################### BASE BUILD IMAGE ####################


+#################### EXTENSION BUILD IMAGE ####################
+FROM dev AS build
+
+COPY requirements-build.txt /workspace/vllm/
+COPY requirements-openvino.txt /workspace/vllm/
+
+# install build dependencies
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+# install runtime dependencies
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-openvino.txt
+
+COPY vllm/ /workspace/vllm/vllm
+COPY setup.py /workspace/vllm/
+
+RUN cmake -P /workspace/paged_attention_build/cmake_install.cmake
+RUN python3 -m pip install --no-build-isolation /workspace/vllm/
+#################### EXTENSION BUILD IMAGE ####################


+#################### OPENAI API SERVER ####################
+# openai api server alternative
+FROM build AS vllm-openai
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install accelerate
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
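
The final stage's ENTRYPOINT launches vLLM's OpenAI-compatible API server, so a container built from this Dockerfile can be queried like any OpenAI-style endpoint. A minimal sketch of such a query follows, assuming the container was started with port 8000 published (for example, docker run -p 8000:8000 with a model argument); the host, port, and model name below are illustrative, not taken from this commit.

# Query the OpenAI-compatible completions endpoint exposed by the container.
# Endpoint URL and model name are assumptions for illustration only.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",  # hypothetical model served by the container
    "prompt": "Hello, OpenVINO!",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])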

requirements-openvino.txt

Lines changed: 2 additions & 0 deletions
@@ -8,3 +8,5 @@ prometheus_client >= 0.18.0
 torch >= 2.1.2
 transformers >= 4.38.0 # Required for Gemma.
 openvino==2024.1.0
+optimum-intel[nncf,openvino]
+outlines >= 0.0.27

setup.py

Lines changed: 2 additions & 4 deletions
@@ -51,7 +51,6 @@ def _is_openvino() -> bool:
         import openvino
     except ImportError:
         openvino_available = False
-    openvino_available = os.getenv("VLLM_OPENVINO", "0") == "1"
     return openvino_available

 # Compiler flags.
@@ -124,9 +123,8 @@ def get_neuronxcc_version():


 def get_openvino_version():
-    # import openvino
-    # return openvino.__version__[:8]
-    return "2024.1.0"
+    import openvino
+    return openvino.__version__[:8]

 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.

vllm/engine/async_llm_engine.py

Lines changed: 7 additions & 1 deletion
@@ -325,7 +325,12 @@ def from_engine_args(cls,
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
-        if parallel_config.worker_use_ray or engine_args.engine_use_ray:
+        device_config = engine_configs[4]
+
+        if device_config.is_openvino:
+            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
+            executor_class = OpenVINOExecutorAsync
+        elif parallel_config.worker_use_ray or engine_args.engine_use_ray:
             initialize_ray_cluster(parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
             executor_class = RayGPUExecutorAsync
@@ -334,6 +339,7 @@ def from_engine_args(cls,
                 "Ray is required if parallel_config.world_size > 1.")
             from vllm.executor.gpu_executor import GPUExecutorAsync
             executor_class = GPUExecutorAsync
+
         # Create the async LLM engine.
         engine = cls(parallel_config.worker_use_ray,
                      engine_args.engine_use_ray,
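
With this change the async engine consults the device config before the Ray/GPU parallelism branches, so OpenVINO selection takes priority over the Ray settings. A condensed, self-contained sketch of that dispatch order (the config classes here are simplified stand-ins, not vLLM's real types):

from dataclasses import dataclass

@dataclass
class DeviceConfig:
    is_openvino: bool = False

@dataclass
class ParallelConfig:
    worker_use_ray: bool = False
    world_size: int = 1

def select_executor_name(device: DeviceConfig, parallel: ParallelConfig,
                         engine_use_ray: bool = False) -> str:
    # Device-specific executors win over the parallelism-based choice.
    if device.is_openvino:
        return "OpenVINOExecutorAsync"
    if parallel.worker_use_ray or engine_use_ray:
        return "RayGPUExecutorAsync"
    assert parallel.world_size == 1, (
        "Ray is required if parallel_config.world_size > 1.")
    return "GPUExecutorAsync"

# OpenVINO is chosen even when Ray workers are requested.
assert select_executor_name(DeviceConfig(is_openvino=True),
                            ParallelConfig(worker_use_ray=True)) == "OpenVINOExecutorAsync"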

vllm/utils.py

Lines changed: 9 additions & 4 deletions
@@ -127,15 +127,20 @@ def is_neuron() -> bool:
     return transformers_neuronx is not None

 def is_openvino() -> bool:
+    is_openvino_available = True
     try:
         import openvino
     except ImportError:
-        openvino = None
-    return openvino is not None
+        is_openvino_available = False
+    return is_openvino_available

 def is_openvino_optimum_intel() -> bool:
-    openvino_optimum_intel = True if os.getenv('VLLM_OPENVINO_OPTIMUM', "0") == "1" else False
-    return is_openvino() and openvino_optimum_intel
+    is_optimum_intel_available = is_openvino()
+    try:
+        import optimum.intel
+    except:
+        is_optimum_intel_available = False
+    return is_optimum_intel_available

 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
