
Commit 354ca31

Merge pull request vllm-project#3 from ilya-lavrenov/docker-file
Added dockerfile with vLLM + openvino
2 parents e913d6b + 5b0db2b commit 354ca31

File tree: 5 files changed, +81 −9 lines changed

Dockerfile.openvino

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+# The vLLM Dockerfile is used to construct vLLM image that can be directly used
+# to run the OpenAI compatible server.
+
+#################### BASE BUILD IMAGE ####################
+FROM ubuntu:22.04 AS dev
+
+RUN apt-get update -y && \
+    apt-get install -y python3-pip git
+WORKDIR /workspace
+
+# build and install OpenVINO
+RUN git clone --recurse-submodules -b pytorch_module_extension https://github.com/slyalin/openvino.git
+RUN /workspace/openvino/install_build_dependencies.sh
+RUN cmake -DCPACK_GENERATOR=DEB -DENABLE_PYTHON=ON -DENABLE_PYTHON_PACKAGING=ON -DENABLE_CPPLINT=OFF \
+    -DENABLE_INTEL_GPU=OFF -DENABLE_TEMPLATE=OFF -DENABLE_AUTO=OFF -DENABLE_HETERO=OFF -DENABLE_AUTO_BATCH=OFF \
+    -DENABLE_OV_TF_FRONTEND=OFF -DENABLE_OV_ONNX_FRONTEND=OFF -DENABLE_OV_TF_LITE_FRONTEND=OFF -DENABLE_OV_PADDLE_FRONTEND=OFF \
+    -S /workspace/openvino -B /workspace/openvino_build
+RUN python3 -m pip install -r /workspace/openvino/src/bindings/python/wheel/requirements-dev.txt
+RUN cmake --build /workspace/openvino_build --parallel 8
+RUN cmake -P /workspace/openvino_build/cmake_install.cmake
+
+# build and install OpenVINO Contrib with PagedAttention
+RUN git clone --branch paged-attention https://github.com/ilya-lavrenov/openvino_contrib.git
+RUN cmake -DCUSTOM_OPERATIONS=paged_attention -DCMAKE_INSTALL_PREFIX=/usr \
+    -S /workspace/openvino_contrib/modules/custom_operations/ -B /workspace/paged_attention_build/
+RUN cmake --build /workspace/paged_attention_build/ --parallel 8
+RUN cmake -P /workspace/openvino_build/cmake_install.cmake
+
+# Install OpenVINO tokenizers
+RUN PIP_PRE=1 PIP_EXTRA_INDEX_URL="https://storage.openvinotoolkit.org/simple/wheels/nightly" python3 -m pip install openvino-tokenizers
+#################### BASE BUILD IMAGE ####################


+#################### EXTENSION BUILD IMAGE ####################
+FROM dev AS build
+
+COPY requirements-build.txt /workspace/vllm/
+COPY requirements-openvino.txt /workspace/vllm/
+
+# install build dependencies
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-build.txt
+# install runtime dependencies
+RUN PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" python3 -m pip install -r /workspace/vllm/requirements-openvino.txt
+
+COPY vllm/ /workspace/vllm/vllm
+COPY setup.py /workspace/vllm/
+
+RUN cmake -P /workspace/paged_attention_build/cmake_install.cmake
+RUN python3 -m pip install --no-build-isolation /workspace/vllm/
+#################### EXTENSION BUILD IMAGE ####################


+#################### OPENAI API SERVER ####################
+# openai api server alternative
+FROM build AS vllm-openai
+# install additional dependencies for openai api server
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install accelerate
+
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
+#################### OPENAI API SERVER ####################
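
The final stage's ENTRYPOINT launches vLLM's OpenAI-compatible API server, so a container built from this Dockerfile can be queried like any OpenAI-style endpoint. A minimal sketch of such a query follows, assuming the container was started with port 8000 published (for example, docker run -p 8000:8000 with a model argument); the host, port, and model name below are illustrative, not taken from this commit.

# Query the OpenAI-compatible completions endpoint exposed by the container.
# Endpoint URL and model name are assumptions for illustration only.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",  # hypothetical model served by the container
    "prompt": "Hello, OpenVINO!",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])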

requirements-openvino.txt

Lines changed: 2 additions & 0 deletions
@@ -8,3 +8,5 @@ prometheus_client >= 0.18.0
 torch >= 2.1.2
 transformers >= 4.38.0 # Required for Gemma.
 openvino==2024.1.0
+optimum-intel[nncf,openvino]
+outlines >= 0.0.27

setup.py

Lines changed: 2 additions & 4 deletions
@@ -51,7 +51,6 @@ def _is_openvino() -> bool:
         import openvino
     except ImportError:
         openvino_available = False
-    openvino_available = os.getenv("VLLM_OPENVINO", "0") == "1"
     return openvino_available

 # Compiler flags.
@@ -124,9 +123,8 @@ def get_neuronxcc_version():


 def get_openvino_version():
-    # import openvino
-    # return openvino.__version__[:8]
-    return "2024.1.0"
+    import openvino
+    return openvino.__version__[:8]

 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.

vllm/engine/async_llm_engine.py

Lines changed: 7 additions & 1 deletion
@@ -325,7 +325,12 @@ def from_engine_args(cls,
         # Create the engine configs.
         engine_configs = engine_args.create_engine_configs()
         parallel_config = engine_configs[2]
-        if parallel_config.worker_use_ray or engine_args.engine_use_ray:
+        device_config = engine_configs[4]
+
+        if device_config.is_openvino:
+            from vllm.executor.openvino_executor import OpenVINOExecutorAsync
+            executor_class = OpenVINOExecutorAsync
+        elif parallel_config.worker_use_ray or engine_args.engine_use_ray:
             initialize_ray_cluster(parallel_config)
             from vllm.executor.ray_gpu_executor import RayGPUExecutorAsync
             executor_class = RayGPUExecutorAsync
@@ -334,6 +339,7 @@ def from_engine_args(cls,
                 "Ray is required if parallel_config.world_size > 1.")
             from vllm.executor.gpu_executor import GPUExecutorAsync
             executor_class = GPUExecutorAsync
+
         # Create the async LLM engine.
         engine = cls(parallel_config.worker_use_ray,
                      engine_args.engine_use_ray,
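
With this change the async engine consults the device config before the Ray/GPU parallelism branches, so OpenVINO selection takes priority over the Ray settings. A condensed, self-contained sketch of that dispatch order (the config classes here are simplified stand-ins, not vLLM's real types):

from dataclasses import dataclass

@dataclass
class DeviceConfig:
    is_openvino: bool = False

@dataclass
class ParallelConfig:
    worker_use_ray: bool = False
    world_size: int = 1

def select_executor_name(device: DeviceConfig, parallel: ParallelConfig,
                         engine_use_ray: bool = False) -> str:
    # Device-specific executors win over the parallelism-based choice.
    if device.is_openvino:
        return "OpenVINOExecutorAsync"
    if parallel.worker_use_ray or engine_use_ray:
        return "RayGPUExecutorAsync"
    assert parallel.world_size == 1, (
        "Ray is required if parallel_config.world_size > 1.")
    return "GPUExecutorAsync"

# OpenVINO is chosen even when Ray workers are requested.
assert select_executor_name(DeviceConfig(is_openvino=True),
                            ParallelConfig(worker_use_ray=True)) == "OpenVINOExecutorAsync"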

vllm/utils.py

Lines changed: 9 additions & 4 deletions
@@ -127,15 +127,20 @@ def is_neuron() -> bool:
     return transformers_neuronx is not None

 def is_openvino() -> bool:
+    is_openvino_available = True
     try:
         import openvino
     except ImportError:
-        openvino = None
-    return openvino is not None
+        is_openvino_available = False
+    return is_openvino_available

 def is_openvino_optimum_intel() -> bool:
-    openvino_optimum_intel = True if os.getenv('VLLM_OPENVINO_OPTIMUM', "0") == "1" else False
-    return is_openvino() and openvino_optimum_intel
+    is_optimum_intel_available = is_openvino()
+    try:
+        import optimum.intel
+    except:
+        is_optimum_intel_available = False
+    return is_optimum_intel_available

 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
