Fix docker upload bugs (#352)

- Fixed a bug caused by the missing whl output and build version value
- Renamed the input argument wheel to whl to be consistent across actions and jobs
- Removed an unnecessary part of the Dockerfile that caused the image build to fail due to a permission issue

---------

Co-authored-by: dhuangnm <dhuang@MacBook-Pro-2.local>
neuralmagic · Jul 3, 2024 · f6f2554 · f6f2554 · github-actions · Jul 4, 2024
1 parent 7144d20
commit f6f2554
Show file tree

Hide file tree

Showing 5 changed files with 31 additions and 37 deletions.
diff --git a/.github/actions/nm-get-docker-tags/action.yml b/.github/actions/nm-get-docker-tags/action.yml
@@ -4,8 +4,8 @@ inputs:
   wf_category:
     description: "type of nm-vllm to install for the docker image: NIGHTLY or RELEASE"
     required: true
-  wheel:
-    description: "wheel name, if latest use the latest from nm pypi"
+  whl:
+    description: "name of nm-vllm wheel to install for the docker image"
     required: true
 outputs:
   tag:
@@ -22,16 +22,23 @@ runs:
   steps:
   - id: tags
     run: |
-      BUILD_VERSION=`echo "${{ inputs.wheel }}" | cut -d'-' -f2`
+      BUILD_VERSION=`echo "${{ inputs.whl }}" | cut -d'-' -f2`
       if [[ "${{ inputs.wf_category }}" == "RELEASE" ]]; then
-          TAG="v${build_version}"
-          EXTRA_TAG=latest
+          if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+$ ]]; then
+              TAG="v${BUILD_VERSION}"
+              EXTRA_TAG=latest
+          else
+              echo "ERROR: wheel version ${BUILD_VERSION} doesn't match RELEASE format. Check input."
+              exit 1
+          fi
       else
-          TAG=`echo "${build_version}" | cut -d'.' -f4`
-          EXTRA_TAG=nightly
-      fi
-      if [[ "${{ inputs.wheel }}" == "latest" ]]; then
-          BUILD_VERSION="latest"
+          if [[ "${BUILD_VERSION}" =~ ^[0-9]+.[0-9]+.[0-9]+.[0-9]{8}$ ]]; then
+              TAG=`echo "${BUILD_VERSION}" | cut -d'.' -f4`
+              EXTRA_TAG=nightly
+          else
+              echo "ERROR: wheel version ${BUILD_VERSION} doesn't match NIGHTLY format. Check input."
+              exit 1
+          fi
       fi
       echo "tag=${TAG}" >> $GITHUB_OUTPUT
       echo "extra_tag=${EXTRA_TAG}" >> $GITHUB_OUTPUT

diff --git a/.github/workflows/nm-build-test.yml b/.github/workflows/nm-build-test.yml
@@ -173,12 +173,12 @@ jobs:
 
     # update docker
     DOCKER:
-        needs: [BUILD, UPLOAD]
-        if: ${{ inputs.push_to_pypi }}
+        needs: [BUILD]
+        if: ${{ inputs.wf_category != 'REMOTE' }}
         uses: ./.github/workflows/publish-docker.yml
         with:
             push_to_repository: ${{ inputs.push_to_pypi }}
             gitref: ${{ inputs.gitref }}
             wf_category: ${{ inputs.wf_category }}
-            wheel: ${{ needs.BUILD.outputs.whl }}
+            whl: ${{ needs.BUILD.outputs.whl }}
         secrets: inherit
diff --git a/.github/workflows/nm-build.yml b/.github/workflows/nm-build.yml
@@ -31,6 +31,10 @@ on:
         description: "python version, e.g. 3.10.12"
         type: string
         required: true
+    outputs:
+      whl:
+        description: 'basename for generated whl'
+        value: ${{ jobs.BUILD.outputs.whl }}
 
   # makes workflow manually callable
   workflow_dispatch:

diff --git a/.github/workflows/publish-docker.yml b/.github/workflows/publish-docker.yml
@@ -15,10 +15,10 @@ on:
         description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE"
         type: string
         default: 'NIGHTLY'
-      wheel:
-        description: "nm-vllm wheel to install for the docker image: latest (default) or specific wheel name"
+      whl:
+        description: "nm-vllm wheel to install for the docker image"
         type: string
-        default: 'latest'
+        required: true
 
   workflow_dispatch:
     inputs:
@@ -34,10 +34,10 @@ on:
         description: "type of nm-vllm to install for the docker image: NIGHTLY (default) or RELEASE"
         type: string
         default: 'NIGHTLY'
-      wheel:
-        description: "nm-vllm wheel to install for the docker image: latest (default) or specific wheel name"
+      whl:
+        description: "nm-vllm wheel to install for the docker image"
         type: string
-        default: 'latest'
+        required: true
 
 jobs:
     build-docker-image:
@@ -70,7 +70,7 @@ jobs:
             uses: ./.github/actions/nm-get-docker-tags/
             with:
               wf_category: ${{ inputs.wf_category }}
-              wheel: ${{ inputs.wheel }}
+              whl: ${{ inputs.whl }}
 
           - name: Build image
             id: build

diff --git a/Dockerfile b/Dockerfile
@@ -62,20 +62,6 @@ RUN apt-get update -y && apt-get install -y ccache
 
 #################### EXTENSION Build IMAGE ####################
 
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# flash attention version
-ARG flash_attn_version=v2.5.8
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
-
-WORKDIR /usr/src/flash-attention-v2
-
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
-    --no-build-isolation --no-deps --no-cache-dir
-
-#################### FLASH_ATTENTION Build IMAGE ####################
-
 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
 FROM nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04 AS vllm-base
@@ -114,9 +100,6 @@ RUN --mount=type=bind,from=build \
         fi; \
     fi
 
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
-    --mount=type=cache,target=/root/.cache/pip \
-    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################
 
 #################### TEST IMAGE ####################
Benchmark suite	Current: `f6f2554`	Previous: `53347d5`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`187.90222024999898` ms	`189.56996849333186` ms	`0.99`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`83.68341362953504` ms	`84.82286798803653` ms	`0.99`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.958321440002084` ms	`23.707702909998716` ms	`1.05`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.288577137086927` ms	`5.927541407824298` ms	`1.06`
Benchmark suite	Current: `f6f2554`	Previous: `53347d5`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`187.9166367666623` ms	`189.56996849333186` ms	`0.99`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`84.81336620919495` ms	`84.82286798803653` ms	`1.00`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`23.566118316668206` ms	`23.707702909998716` ms	`0.99`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`5.9528972617267435` ms	`5.927541407824298` ms	`1.00`
Benchmark suite	Current: `f6f2554`	Previous: `53347d5`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`23.640777080014836` ms	`23.707702909998716` ms	`1.00`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`5.9662897229532526` ms	`5.927541407824298` ms	`1.01`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`189.8570324766691` ms	`189.56996849333186` ms	`1.00`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`83.69674489188301` ms	`84.82286798803653` ms	`0.99`
Benchmark suite	Current: `f6f2554`	Previous: `53347d5`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.5676224299973` ms	`23.707702909998716` ms	`1.04`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.1184165169899085` ms	`5.927541407824298` ms	`1.03`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`187.93780359666093` ms	`189.56996849333186` ms	`0.99`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`85.69236929201485` ms	`84.82286798803653` ms	`1.01`
Benchmark suite	Current: `f6f2554`	Previous: `53347d5`	Ratio
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`24.363126653334653` ms	`23.707702909998716` ms	`1.03`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - facebook/opt-350m\nmax-model-len - 2048\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`6.047910136160147` ms	`5.927541407824298` ms	`1.02`
`{"name": "mean_ttft_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`184.3794278033306` ms	`189.56996849333186` ms	`0.97`
`{"name": "mean_tpot_ms", "description": "VLLM Serving - Dense\nmodel - meta-llama/Meta-Llama-3-8B-Instruct\nmax-model-len - 4096\nsparsity - None\nbenchmark_serving {\n \"nr-qps-pair_\": \"300,1\",\n \"dataset\": \"sharegpt\"\n}", "gpu_description": "NVIDIA L4 x 1", "vllm_version": "0.5.1", "python_version": "3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]", "torch_version": "2.3.0+cu121"}`	`85.039470859161` ms	`84.82286798803653` ms	`1.00`