diff --git a/.github/actions/nm-build-docker/action.yml b/.github/actions/nm-build-docker/action.yml
new file mode 100644
index 0000000000000..db128c5d103d2
--- /dev/null
+++ b/.github/actions/nm-build-docker/action.yml
@@ -0,0 +1,33 @@
+name: Build docker image
+description: 'build docker image for nm-vllm'
+inputs:
+  docker_tag:
+    description: "tag to be used for the docker image"
+    type: string
+    required: true
+  build_type:
+    description: "type of nm-vllm to install for the docker image: nightly (default) or release"
+    type: string
+    default: 'nightly'
+  build_version:
+    description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+    type: string
+    default: 'latest'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # clean up
+        docker stop $(docker ps -a -q) || echo 'no container to stop'
+        docker rm $(docker ps -a -q) || echo 'no container to remove'
+        docker rmi -f $(docker images -aq) || echo 'no image to remove'
+        docker system prune --all --force
+        # build
+        docker build --tag ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }} \
+            --build-arg build_type=${{ inputs.build_type }} \
+            --build-arg build_version=${{ inputs.build_version }} \
+            --target vllm-openai . || status=$?
+        echo "status=${status}" >> $GITHUB_OUTPUT
+        echo "status=${status}"
+        exit ${status}
+      shell: bash
diff --git a/.github/actions/nm-setup-nvidia-container-toolkit/action.yml b/.github/actions/nm-setup-nvidia-container-toolkit/action.yml
new file mode 100644
index 0000000000000..2e95d43d87827
--- /dev/null
+++ b/.github/actions/nm-setup-nvidia-container-toolkit/action.yml
@@ -0,0 +1,20 @@
+name: set up nvidia-container-toolkit for docker
+description: 'sets up nvidia-container-toolkit for docker'
+runs:
+  using: composite
+  steps:
+    - run: |
+        # install nvidia-container-toolkit
+        curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
+          && curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
+            sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
+            sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
+        sudo killall apt apt-get || echo 'no apt or apt-get process to kill'
+        sudo apt-get update
+        sudo apt-get install -y nvidia-container-toolkit
+        # config and restart docker
+        sudo systemctl stop docker
+        sudo nvidia-ctk runtime configure --runtime=docker
+        sudo systemctl start docker
+      shell: bash
diff --git a/.github/workflows/publish-docker.yml b/.github/workflows/publish-docker.yml
index 7e8a08064f459..1e129ebc55996 100644
--- a/.github/workflows/publish-docker.yml
+++ b/.github/workflows/publish-docker.yml
@@ -1,26 +1,36 @@
 name: Docker Build + Publish

 on:
-  # For now, just manually trigger
-  # push:
-  #   branches:
-  #     - main
-  # pull_request:
-  #   branches:
-  #     - main
   workflow_dispatch:
+    inputs:
+      docker_tag:
+        description: "tag to be used for the docker image"
+        type: string
+        required: true
+      push_to_repository:
+        description: "whether to push out the docker image: no (default) or yes"
+        type: string
+        default: 'no'
+      gitref:
+        description: "git commit hash or branch name"
+        type: string
+        default: 'main'
+      build_type:
+        description: "type of nm-vllm to install for the docker image: nightly (default) or release"
+        type: string
+        default: 'nightly'
+      build_version:
+        description: "version of nm-vllm to install for the docker image: latest (default) or specific version e.g. 0.4.0, 0.4.0.20240531"
+        type: string
+        default: 'latest'

 jobs:
   build-docker-image:

-    runs-on: aws-avx2-192G-4-a10g-96G
-    timeout-minutes: 240
+    runs-on: aws-avx2-32G-a10g-24G
+    timeout-minutes: 60

     steps:
-
-      - name: Set up Docker Buildx
-        id: buildx
-        uses: docker/setup-buildx-action@v3

       - name: Login to Github Packages
         uses: docker/login-action@v3
@@ -35,17 +45,23 @@ jobs:
           fetch-depth: 1
           submodules: recursive

-      - name: Get version tag
-        id: extract_tag
-        run: echo "tag=$(date +%Y%m%d)" >> $GITHUB_OUTPUT
+      - name: Set up nvidia-container-toolkit
+        id: setup
+        uses: ./.github/actions/nm-setup-nvidia-container-toolkit/

-      - name: Current Version Name
-        run: echo ${{ steps.extract_tag.outputs.tag }}
+      - name: Build image
+        id: build
+        uses: ./.github/actions/nm-build-docker/
+        with:
+          docker_tag: ${{ inputs.docker_tag }}
+          build_type: ${{ inputs.build_type }}
+          build_version: ${{ inputs.build_version }}

-      - name: nm-vllm latest
+      - name: Push image
         uses: docker/build-push-action@v5
+        if: ${{ inputs.push_to_repository == 'yes' && steps.build.outputs.status == 0 }}
         with:
           context: .
           target: vllm-openai
           push: true
-          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ steps.extract_tag.outputs.tag }}
+          tags: ghcr.io/neuralmagic/nm-vllm-openai:${{ inputs.docker_tag }}
diff --git a/Dockerfile b/Dockerfile
index 29c561fdfe884..9ec091a103741 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -31,61 +31,29 @@ COPY requirements-dev.txt requirements-dev.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -r requirements-dev.txt

-# cuda arch list used by torch
-# can be useful for both `dev` and `test`
-# explicitly set the list to avoid issues with torch 2.2
-# see https://github.com/pytorch/pytorch/pull/123243
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BASE BUILD IMAGE ####################
-
 #################### WHEEL BUILD IMAGE ####################
 FROM dev AS build

-# install build dependencies
-COPY requirements-build.txt requirements-build.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-build.txt
-
 # install compiler cache to speed up compilation leveraging local or remote caching
 RUN apt-get update -y && apt-get install -y ccache

-# files and directories related to build wheels
-COPY csrc csrc
-COPY setup.py setup.py
-COPY cmake cmake
-COPY CMakeLists.txt CMakeLists.txt
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cuda.txt requirements-cuda.txt
-COPY pyproject.toml pyproject.toml
-COPY vllm vllm
-
-# max jobs used by Ninja to build extensions
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# number of threads used by nvcc
-ARG nvcc_threads=8
-ENV NVCC_THREADS=$nvcc_threads
-# make sure punica kernels are built (for LoRA)
-ENV VLLM_INSTALL_PUNICA_KERNELS=1
-
-ENV CCACHE_DIR=/root/.cache/ccache
-RUN --mount=type=cache,target=/root/.cache/ccache \
-    --mount=type=cache,target=/root/.cache/pip \
-    python3 setup.py bdist_wheel --dist-dir=dist
+#################### EXTENSION Build IMAGE ####################

-# check the size of the wheel, we cannot upload wheels larger than 100MB
-COPY .buildkite/check-wheel-size.py check-wheel-size.py
-RUN python3 check-wheel-size.py dist
+#################### FLASH_ATTENTION Build IMAGE ####################
+FROM dev as flash-attn-builder
+# flash attention version
+ARG flash_attn_version=v2.5.8
+ENV FLASH_ATTN_VERSION=${flash_attn_version}

-# the `vllm_nccl` package must be installed from source distribution
-# pip is too smart to store a wheel in the cache, and other CI jobs
-# will directly use the wheel from the cache, which is not what we want.
-# we need to remove it manually
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip cache remove vllm_nccl*
-#################### EXTENSION Build IMAGE ####################
+WORKDIR /usr/src/flash-attention-v2
+
+# Download the wheel or build it if a pre-compiled release doesn't exist
+RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
+    --no-build-isolation --no-deps --no-cache-dir
+
+#################### FLASH_ATTENTION Build IMAGE ####################

 #################### vLLM installation IMAGE ####################
 # image with vLLM installed
@@ -101,18 +69,34 @@ RUN apt-get update -y \
 # or future versions of triton.
 RUN ldconfig /usr/local/cuda-12.4/compat/

-# UPSTREAM SYNC: Install sparsity extras
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+# install nm-vllm wheel first, so that torch etc will be installed
+ARG build_type="nightly"
+ARG build_version="latest"
+ENV INSTALL_TYPE=${build_type}
+ENV INSTALL_VERSION=${build_version}
+# UPSTREAM SYNC: Install nm-vllm with sparsity extras
+# use nm pypi for now for testing
+RUN --mount=type=bind,from=build \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install nm-magic-wand-nightly --extra-index-url https://pypi.neuralmagic.com/simple
-
-# install vllm wheel first, so that torch etc will be installed
-RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+    if [ "${INSTALL_TYPE}" = "nightly" ]; then \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm-nightly[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm-nightly[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    else \
+        if [ "${INSTALL_VERSION}" = "latest" ]; then \
+            pip install nm-vllm[sparse] --extra-index-url https://pypi.neuralmagic.com/simple; \
+        else \
+            pip install nm-vllm[sparse]==${INSTALL_VERSION} --extra-index-url https://pypi.neuralmagic.com/simple; \
+        fi; \
+    fi
+
+RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
     --mount=type=cache,target=/root/.cache/pip \
-    pip install dist/*.whl --verbose
+    pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
 #################### vLLM installation IMAGE ####################
-
 #################### TEST IMAGE ####################
 # image to run unit testing suite
 # note that this uses vllm installed by `pip`
@@ -120,9 +104,8 @@ FROM vllm-base AS test

 ADD . /vllm-workspace/

-# install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/pip \
-    pip install -r requirements-dev.txt
+# check installed version
+RUN pip freeze | grep -e nm-vllm -e nm-magic-wand

 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
@@ -144,4 +127,4 @@ RUN --mount=type=cache,target=/root/.cache/pip \
 ENV VLLM_USAGE_SOURCE production-docker-image

 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
-#################### OPENAI API SERVER ####################
\ No newline at end of file
+#################### OPENAI API SERVER ####################
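
For reviewers who want to exercise this change, a minimal sketch follows. It assumes the standard GitHub CLI (`gh workflow run`) and a local Docker daemon; the `docker_tag` and `build_version` values below are hypothetical examples, not values from this PR:

# Hypothetical dispatch of the updated workflow with the new
# workflow_dispatch inputs (illustrative tag and version values).
gh workflow run publish-docker.yml \
  --ref main \
  -f docker_tag=nightly-20240531 \
  -f push_to_repository=no \
  -f gitref=main \
  -f build_type=nightly \
  -f build_version=latest

# Local equivalent of the nm-build-docker composite action's build step,
# passing the Dockerfile's new build args (illustrative tag).
docker build \
  --build-arg build_type=nightly \
  --build-arg build_version=latest \
  --target vllm-openai \
  --tag ghcr.io/neuralmagic/nm-vllm-openai:nightly-20240531 .

Because the Dockerfile now installs nm-vllm from the Neural Magic PyPI index instead of building a wheel in-tree, the local build should succeed without CUDA compilation; only the flash-attn stage compiles anything, and only when no pre-built wheel exists for the pinned version.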