12 changes: 10 additions & 2 deletions docker/Dockerfile
@@ -40,18 +40,23 @@ COPY . /opt/lmdeploy
WORKDIR /opt/lmdeploy

RUN --mount=type=cache,target=/root/.cache \
docker/build.sh && \
docker/build.sh

RUN --mount=type=cache,target=/root/.cache \
docker/prepare_wheel.sh

# Runtime image
FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS cu12.8-base
ENV CUDA_VERSION_SHORT=cu128
ARG CUDA_COMPAT_PATH=/usr/local/cuda-12.8/compat

FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS cu12-base
ENV CUDA_VERSION_SHORT=cu124
ARG CUDA_COMPAT_PATH=/usr/local/cuda-12.4/compat

FROM nvidia/cuda:11.8.0-base-ubuntu22.04 AS cu11-base
ENV CUDA_VERSION_SHORT=cu118
ARG CUDA_COMPAT_PATH=""

FROM ${CUDA_VERSION}-base AS final
ARG PYTHON_VERSION=3.10
@@ -67,4 +72,7 @@ RUN --mount=type=cache,target=/root/.cache \
ENV PATH=/opt/py3/bin:$PATH
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV LD_LIBRARY_PATH=/nccl/lib:$LD_LIBRARY_PATH
FROM ${IMAGE_TYPE}

# set path for deep_gemm
ENV LD_LIBRARY_PATH=/opt/py3/lib/python${PYTHON_VERSION}/site-packages/torch/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=${CUDA_COMPAT_PATH}:$LD_LIBRARY_PATH
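
Editor's note: the two ENV lines above prepend torch's bundled libraries and the CUDA driver-compat directory to the loader search path, because deep_gemm JIT-compiles kernels that link against libtorch at runtime. A minimal sketch to confirm the resulting LD_LIBRARY_PATH actually resolves libtorch; the library name is an assumption based on standard torch wheels, not something this diff states:

# check_ld_path.py -- hedged sanity check, not part of this PR
import ctypes
import os

# print the loader search order assembled by the Dockerfile
for entry in os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep):
    print('search path:', entry or '(empty)')

# deep_gemm's JIT-built extensions link against libtorch; fail fast if it is unresolvable
ctypes.CDLL('libtorch.so', mode=ctypes.RTLD_GLOBAL)
print('libtorch.so resolved')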
29 changes: 23 additions & 6 deletions docker/install.sh
@@ -1,43 +1,58 @@
#!/bin/bash -ex

# install system packages
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
sed -i 's|http://archive.ubuntu.com|http://azure.archive.ubuntu.com|g' /etc/apt/sources.list
apt-get update -y
apt-get install -y --no-install-recommends \
tzdata wget curl ssh sudo git-core libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
tzdata wget curl ssh sudo git-core vim libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1

if [[ ${PYTHON_VERSION} != "3.10" ]]; then
apt-get install -y --no-install-recommends software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update -y
fi

# install python, create virtual env
apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv

pushd /opt >/dev/null
python${PYTHON_VERSION} -m venv py3
popd >/dev/null

# install CUDA build tools
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-11-8
elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-12-4
apt-get install -y --no-install-recommends cuda-minimal-build-12-4 cuda-nvrtc-12-4 build-essential devscripts debhelper fakeroot pkg-config dkms
elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-12-8
apt-get install -y --no-install-recommends cuda-minimal-build-12-8 cuda-nvrtc-12-8 build-essential devscripts debhelper fakeroot pkg-config dkms
fi

apt-get clean -y
rm -rf /var/lib/apt/lists/*

# install GDRCopy
GDRCOPY_VERSION=2.5.1
mkdir -p /tmp/gdrcopy && cd /tmp \
&& wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
&& tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
&& cd gdrcopy-${GDRCOPY_VERSION}/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy

# install python packages
export PATH=/opt/py3/bin:$PATH

if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
FA_VERSION=2.7.3
TORCH_VERSION="<2.7"
else
FA_VERSION=2.8.3
TORCH_VERSION=""
# pin the torch version so build-time and runtime torch match; otherwise deep_gemm fails with an undefined-symbol error
TORCH_VERSION="==2.8.0"
fi

pip install -U pip wheel setuptools
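
The pin above is the crux of this change: the wheels produced in the build stage (DeepEP, DeepGEMM, FlashMLA) are compiled against torch 2.8.0, so the runtime image must resolve the very same torch. A quick assertion one could run inside the non-cu118 images, as a sketch; the version string is taken from the pin above:

# assert_torch_pin.py -- illustrative check, not part of install.sh
import torch

# the extension wheels were built against the pinned torch; a different runtime torch
# typically surfaces later as an 'undefined symbol' ImportError in deep_gemm
assert torch.__version__.startswith('2.8.0'), f'unexpected torch {torch.__version__}'
print('torch', torch.__version__, 'matches the build-time pin')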
@@ -46,13 +61,15 @@ if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
pip install nvidia-nvshmem-cu12
fi

pip install /wheels/*.whl torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
pip install torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
pip install /wheels/*.whl


if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]] && [[ "${PYTHON_VERSION}" != "3.9" ]]; then
pip install cuda-python dlblas
pip install cuda-python dlblas==0.0.6
fi

# install pre-compiled flash attention wheel
PLATFORM="linux_x86_64"
PY_VERSION=$(python3 - <<'PY'
import torch, sys
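
Once install.sh completes, a short check like the following could confirm that the pinned packages landed; this is a sketch, with the cu118 and Python 3.9 exceptions mirroring the guards in the script:

# verify_install.py -- hedged post-install check, not part of this PR
import importlib.metadata as md

for pkg in ('torch', 'dlblas', 'cuda-python'):
    try:
        print(pkg, md.version(pkg))
    except md.PackageNotFoundError:
        # expected on cu118 images and on Python 3.9, per the guards above
        print(pkg, 'not installed')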
23 changes: 12 additions & 11 deletions docker/prepare_wheel.sh
@@ -5,7 +5,8 @@ export PATH=/opt/py3/bin:$PATH
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
TORCH_VERSION="<2.7"
else
TORCH_VERSION=""
# pin the torch version so build-time and runtime torch match; otherwise deep_gemm fails with an undefined-symbol error
TORCH_VERSION="==2.8.0"
fi

pip install "cmake<4.0" wheel ninja setuptools packaging
@@ -21,18 +22,18 @@ fi

if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then

if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
DEEP_GEMM_VERSION=03d0be3
FLASH_MLA_VERSION=9edee0c
else
DEEP_GEMM_VERSION=79f48ee
FLASH_MLA_VERSION=c759027
fi
DEEP_EP_VERSION=9af0e0d # v1.2.1
DEEP_GEMM_VERSION=c9f8b34 # v2.1.1.post3
FLASH_MLA_VERSION=1408756 # no tagged release yet; pinned to the latest commit

DEEP_EP_VERSION=26cf250
# DeepEP
pip install nvidia-nvshmem-cu12

pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepEP.git@${DEEP_EP_VERSION}"
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"

# DeepGEMM
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepGEMM.git@${DEEP_GEMM_VERSION}"

# FlashMLA
# sm100 compilation for Flash MLA requires NVCC 12.9 or higher
FLASH_MLA_DISABLE_SM100=1 pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
fi
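
If the torch pin here and the one in install.sh ever drift apart, the failure named in the comments shows up at import time. A tiny diagnostic sketch; the error text is indicative, not a literal string from this diff:

# diagnose_deep_gemm.py -- illustrative, not part of prepare_wheel.sh
try:
    import deep_gemm  # the wheel built above against torch==2.8.0
    print('deep_gemm imported cleanly')
except ImportError as err:
    # an ABI mismatch between build-time and runtime torch typically reports
    # an undefined C++ symbol here
    print('deep_gemm import failed:', err)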
12 changes: 12 additions & 0 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -17,6 +17,13 @@

logger = get_logger('lmdeploy')

try:
    from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode
    use_deepep = True
except ImportError:
    logger.warning('DeepEP is disabled; install DeepEP and dlblas to enable it.')
    use_deepep = False


def next_power_of_2(n: int):
"""Return the smallest power of 2 greater than or equal to n."""
@@ -242,6 +249,11 @@ def prepare_inputs_for_generation(
        context: StepContext = None,
    ):
        """Prepare inputs."""

        if use_deepep:
            deepep_mode = DeepEPMode.LOW_LATENCY if context.is_decoding else DeepEPMode.NORMAL
            DeepEPBuffer.set_deepep_mode(deepep_mode)

        return self.model.prepare_inputs_for_generation(
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
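
The added branch switches DeepEP's dispatch mode per step: decode steps are latency-bound and take the low-latency path, while prefill keeps the normal high-throughput path. A self-contained sketch of that selection, using stand-in objects since DeepEPMode really comes from dlblas:

# deepep_mode_demo.py -- stand-alone illustration of the branch above
from types import SimpleNamespace

def select_deepep_mode(context, mode_enum):
    """Mirrors the selection added in prepare_inputs_for_generation."""
    return mode_enum.LOW_LATENCY if context.is_decoding else mode_enum.NORMAL

FakeMode = SimpleNamespace(LOW_LATENCY='low_latency', NORMAL='normal')
print(select_deepep_mode(SimpleNamespace(is_decoding=True), FakeMode))   # low_latency
print(select_deepep_mode(SimpleNamespace(is_decoding=False), FakeMode))  # normal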
7 changes: 7 additions & 0 deletions lmdeploy/pytorch/backends/cuda/token_dispatcher.py
@@ -1,6 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
try:
    from deep_ep import Buffer

    from lmdeploy.pytorch.envs import env_to_int

    # the default value follows the DeepEP code:
    # https://github.com/deepseek-ai/DeepEP/blob/bfded34800dfec415b71503f8205181de90b2480/deep_ep/buffer.py#L30
    deep_ep_buffer_num_sms = env_to_int(env_var='DEEPEP_BUFFER_NUM_SMS', default=20)
    Buffer.set_num_sms(deep_ep_buffer_num_sms)
    use_deepep = True
except ImportError:
    use_deepep = False
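
Since Buffer.set_num_sms() runs inside this module's import guard, the new knob only takes effect when set before the module is first imported. A hedged usage sketch; the import path is the one this diff touches:

# tune_deepep_sms.py -- usage sketch, not part of this PR
import os

# must be set before lmdeploy's CUDA backend is imported for the first time
os.environ['DEEPEP_BUFFER_NUM_SMS'] = '24'  # e.g. give DeepEP 24 SMs instead of the default 20

from lmdeploy.pytorch.backends.cuda import token_dispatcher  # noqa: E402  # picks up the override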
1 change: 1 addition & 0 deletions lmdeploy/pytorch/envs.py
@@ -126,6 +126,7 @@ def _patched_get_env(
# we do not need the value here; reading the variable registers it so that it is
# passed on to Ray workers. If Ray is launched from outside, workers may otherwise
# fail to access these environment variables.
os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
os.getenv('DEEPEP_BUFFER_NUM_SMS', None)

# deepgemm
os.getenv('DG_JIT_DEBUG', '0')
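
For context, the getenv calls in this block exist for their side effect: under the patched getenv, each lookup records the variable so it can be forwarded to Ray workers. A hypothetical sketch of that pattern; the real _patched_get_env in lmdeploy/pytorch/envs.py may differ:

# env_registry_sketch.py -- hypothetical reconstruction, not the actual implementation
import os

_seen_env_vars = {}

def _patched_get_env(name, default=None):
    value = os.environ.get(name, default)
    _seen_env_vars[name] = value  # remembered so it can be re-exported to Ray workers
    return value

_patched_get_env('DEEPEP_BUFFER_NUM_SMS', None)
print(sorted(_seen_env_vars))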