Update vllm to use latest upstream to support CPU #179

Merged: 23 commits, Apr 28, 2024

Commits (23)
8fce284 - update vllm to use upstream v0.4.0.post1 (xwu-intel, Apr 8, 2024)
7118577 - nit (xwu-intel, Apr 8, 2024)
8100b74 - adjust watch list (xwu-intel, Apr 8, 2024)
9a399ad - Add llm_on_ray package installation and set CPU key-value cache size (xwu-intel, Apr 8, 2024)
e50e8fb - Remove device=infer_conf.device and add comment explaining why (xwu-intel, Apr 8, 2024)
9ee1eb7 - Update VLLM installation script to use main commit (xwu-intel, Apr 9, 2024)
ad317d3 - Merge branch 'main' of https://github.com/intel/llm-on-ray into updat… (xwu-intel, Apr 9, 2024)
6a56e1a - Update GCC version detection in install-vllm-cpu.sh script (xwu-intel, Apr 9, 2024)
a983b2b - Update vllm-cpu installation method (xwu-intel, Apr 26, 2024)
1c653ca - Fix Docker build command and update YAML configuration files (xwu-intel, Apr 28, 2024)
a64bbf2 - Merge branch 'main' of https://github.com/intel/llm-on-ray into updat… (xwu-intel, Apr 28, 2024)
870bed7 - Add VLLM_CPU_KVCACHE_SPACE_DEFAULT constant to control the size of th… (xwu-intel, Apr 28, 2024)
df317f6 - update (xwu-intel, Apr 28, 2024)
f9c3945 - nit (xwu-intel, Apr 28, 2024)
c227f98 - Update default value of VLLM_CPU_KVCACHE_SPACE to 40GB (xwu-intel, Apr 28, 2024)
5d0adc2 - Fix indentation in workflow_inference.yml (xwu-intel, Apr 28, 2024)
e220f76 - debug (xwu-intel, Apr 28, 2024)
1a8cb7c - debug (xwu-intel, Apr 28, 2024)
9b342bf - debug (xwu-intel, Apr 28, 2024)
ce59387 - nit (xwu-intel, Apr 28, 2024)
cc5c75c - nit (xwu-intel, Apr 28, 2024)
dc13542 - debug (xwu-intel, Apr 28, 2024)
96dcdc0 - Enable non-gated and gated models access (xwu-intel, Apr 28, 2024)

2 changes: 1 addition & 1 deletion .github/workflows/workflow_finetune.yml
@@ -70,7 +70,7 @@ jobs:

- name: Build Docker Image
run: |
docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest
docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile.cpu_and_deepspeed -t finetune:latest
docker container prune -f
docker image prune -f

30 changes: 3 additions & 27 deletions .github/workflows/workflow_inference.yml
@@ -96,7 +96,7 @@ jobs:
DF_SUFFIX=".cpu_and_deepspeed"
fi
TARGET=${{steps.target.outputs.target}}
docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
docker build ./ --build-arg CACHEBUST=1 --build-arg http_proxy=${{ inputs.http_proxy }} --build-arg https_proxy=${{ inputs.https_proxy }} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest
docker container prune -f
docker image prune -f

@@ -118,32 +118,8 @@ jobs:
- name: Run Inference Test
run: |
TARGET=${{steps.target.outputs.target}}
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "starcoder"):
conf_path = "llm_on_ray/inference/models/starcoder.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "llm_on_ray/inference/models/llama-2-7b-chat-hf.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
if ("${{ matrix.model }}" == "gemma-2b"):
conf_path = "llm_on_ray/inference/models/gemma-2b.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
result['model_description']["config"]["use_auth_token"] = "${{ env.HF_ACCESS_TOKEN }}"
with open(conf_path, 'w') as output:
yaml.dump(result, output, sort_keys=False)
EOF
)
docker exec "${TARGET}" python -c "$CMD"
# Enable non-gated and gated models access
docker exec "${TARGET}" bash -c "huggingface-cli login --token ${{ env.HF_ACCESS_TOKEN }}"
if [[ ${{ matrix.model }} == "mpt-7b-ipex-llm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/ipex-llm/mpt-7b-ipex-llm.yaml --simple"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
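Context for the change above: a single huggingface-cli login inside the container replaces the per-model YAML patching that previously injected use_auth_token, so both gated and non-gated checkpoints can be pulled with one authenticated session. Below is a sketch of the equivalent programmatic call through the huggingface_hub package (the same package that ships the huggingface-cli entry point); reading the token from an HF_ACCESS_TOKEN environment variable is illustrative only.

import os

from huggingface_hub import login

# Programmatic equivalent of `huggingface-cli login --token <token>`: stores the
# token locally so later downloads of gated model checkpoints are authenticated.
login(token=os.environ["HF_ACCESS_TOKEN"])  # illustrative variable name
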
12 changes: 2 additions & 10 deletions .github/workflows/workflow_orders_on_merge.yml
@@ -5,16 +5,8 @@ on:
branches:
- main
paths:
- '.github/**'
- 'docker/**'
- 'dev/docker/**'
- 'llm_on_ray/common/**'
- 'llm_on_ray/finetune/**'
- 'llm_on_ray/inference/**'
- 'llm_on_ray/rlhf/**'
- 'tools/**'
- 'pyproject.toml'
- 'tests/**'
- '**'
- '!*.md'

jobs:
Lint:
12 changes: 2 additions & 10 deletions .github/workflows/workflow_orders_on_pr.yml
@@ -5,16 +5,8 @@ on:
branches:
- main
paths:
- '.github/**'
- 'docker/**'
- 'dev/docker/**'
- 'llm_on_ray/common/**'
- 'llm_on_ray/finetune/**'
- 'llm_on_ray/inference/**'
- 'llm_on_ray/rlhf/**'
- 'tools/**'
- 'pyproject.toml'
- 'tests/**'
- '**'
- '!*.md'

jobs:

12 changes: 6 additions & 6 deletions dev/docker/Dockerfile.vllm
@@ -28,14 +28,14 @@ COPY ./pyproject.toml .
COPY ./MANIFEST.in .
COPY ./dev/scripts/install-vllm-cpu.sh .

# create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray

RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

# Install vllm-cpu
# Activate base first for loading g++ envs ($CONDA_PREFIX/etc/conda/activate.d/*)
RUN --mount=type=cache,target=/root/.cache/pip \
source /opt/conda/bin/activate base && ./install-vllm-cpu.sh

# Install llm_on_ray
# Create llm_on_ray package directory to bypass the following 'pip install -e' command
RUN mkdir ./llm_on_ray
RUN --mount=type=cache,target=/root/.cache/pip pip install -e .[cpu] --extra-index-url https://download.pytorch.org/whl/cpu \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/cpu/us/

12 changes: 8 additions & 4 deletions dev/scripts/install-vllm-cpu.sh
@@ -4,17 +4,21 @@
[[ -n $(which g++) ]] || { echo "GNU C++ Compiler (g++) is not found!"; exit 1; }
[[ -n $(which pip) ]] || { echo "pip command is not found!"; exit 1; }

# g++ version should be >=12.3
# g++ version should be >=12.3. On Ubuntu 22.04, you can run:
# sudo apt-get update -y
# sudo apt-get install -y gcc-12 g++-12
# sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
version_greater_equal()
{
printf '%s\n%s\n' "$2" "$1" | sort --check=quiet --version-sort
}
gcc_version=$(g++ -dumpversion)
gcc_version=$(g++ --version | grep -o -E '[0-9]+\.[0-9]+\.[0-9]+' | head -n1)
echo
echo Current GNU C++ Compiler version: $gcc_version
echo
version_greater_equal "${gcc_version}" 12.3.0 || { echo "GNU C++ Compiler 12.3.0 or above is required!"; exit 1; }

# Install from source
MAX_JOBS=8 pip install -v git+https://github.com/bigPYJ1151/vllm@PR_Branch \
# Refer to https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html to install from source
# We use this one-liner to install latest vllm-cpu
MAX_JOBS=8 VLLM_TARGET_DEVICE=cpu pip install -v git+https://github.com/vllm-project/vllm.git \
--extra-index-url https://download.pytorch.org/whl/cpu
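A note on the gcc_version change above: on many distribution builds, g++ -dumpversion reports only the major version (for example just "12"), which does not compare meaningfully against 12.3.0, and that is likely why the script now extracts the full x.y.z triplet from g++ --version. The following Python sketch mirrors the same detection and comparison logic (a hypothetical helper, not part of this PR):

import re
import subprocess


def gxx_version() -> str:
    # Mirror the grep in install-vllm-cpu.sh: take the first x.y.z triplet
    # from the full `g++ --version` banner.
    banner = subprocess.run(
        ["g++", "--version"], capture_output=True, text=True, check=True
    ).stdout
    match = re.search(r"\d+\.\d+\.\d+", banner)
    if match is None:
        raise RuntimeError("could not determine g++ version")
    return match.group(0)


def version_greater_equal(found: str, required: str) -> bool:
    # Numeric, component-wise comparison, like `sort --version-sort`.
    return tuple(map(int, found.split("."))) >= tuple(map(int, required.split(".")))


if __name__ == "__main__":
    version = gxx_version()
    if not version_greater_equal(version, "12.3.0"):
        raise SystemExit("GNU C++ Compiler 12.3.0 or above is required!")
    print(f"g++ {version} OK")
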
8 changes: 7 additions & 1 deletion llm_on_ray/inference/vllm_predictor.py
@@ -15,6 +15,7 @@
#

import asyncio
import os
from typing import AsyncGenerator, List, Union
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
@@ -25,20 +25,26 @@


class VllmPredictor(Predictor):
VLLM_CPU_KVCACHE_SPACE_DEFAULT = 40

def __init__(self, infer_conf: InferenceConfig, max_num_seqs):
super().__init__(infer_conf)

model_desc = infer_conf.model_description
model_config = model_desc.config
dtype = "bfloat16" if infer_conf.vllm.precision == PRECISION_BF16 else "float32"

# Set environment variable VLLM_CPU_KVCACHE_SPACE to control the size of the CPU key-value cache.
# The default value is 40GB.
os.environ["VLLM_CPU_KVCACHE_SPACE"] = str(self.VLLM_CPU_KVCACHE_SPACE_DEFAULT)

args = AsyncEngineArgs(
model=model_desc.model_id_or_path,
trust_remote_code=model_config.trust_remote_code,
device=infer_conf.device,
dtype=dtype,
disable_log_requests=True,
swap_space=40,
max_num_seqs=max_num_seqs,
)
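
Background on the constant added above: vLLM's CPU backend sizes its key-value cache from the VLLM_CPU_KVCACHE_SPACE environment variable (value in GB), and the variable has to be set before the engine is constructed, which is why the predictor exports it before building AsyncEngineArgs. A minimal standalone sketch of the same pattern (model name, prompt, and cache size are illustrative only):

import os

# Export before the engine is built; the CPU backend reads VLLM_CPU_KVCACHE_SPACE
# (in GB) when allocating the key-value cache.
os.environ.setdefault("VLLM_CPU_KVCACHE_SPACE", "40")

from vllm import LLM, SamplingParams  # assumes a CPU build of vllm is installed

llm = LLM(model="facebook/opt-125m", dtype="bfloat16")
outputs = llm.generate(["Ray and vLLM on CPU:"], SamplingParams(max_tokens=16))
print(outputs[0].outputs[0].text)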
