12 changes: 10 additions & 2 deletions docker/Dockerfile
@@ -40,18 +40,23 @@ COPY . /opt/lmdeploy
WORKDIR /opt/lmdeploy

RUN --mount=type=cache,target=/root/.cache \
docker/build.sh && \
docker/build.sh

RUN --mount=type=cache,target=/root/.cache \
docker/prepare_wheel.sh

# Runtime image
FROM nvidia/cuda:12.8.1-base-ubuntu22.04 AS cu12.8-base
ENV CUDA_VERSION_SHORT=cu128
ARG CUDA_COMPAT_PATH=/usr/local/cuda-12.8/compat

FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS cu12-base
ENV CUDA_VERSION_SHORT=cu124
ARG CUDA_COMPAT_PATH=/usr/local/cuda-12.4/compat

FROM nvidia/cuda:11.8.0-base-ubuntu22.04 AS cu11-base
ENV CUDA_VERSION_SHORT=cu118
ARG CUDA_COMPAT_PATH=""

FROM ${CUDA_VERSION}-base AS final
ARG PYTHON_VERSION=3.10
@@ -67,4 +72,7 @@ RUN --mount=type=cache,target=/root/.cache \
ENV PATH=/opt/py3/bin:$PATH
ENV TRITON_PTXAS_PATH=/usr/local/cuda/bin/ptxas
ENV LD_LIBRARY_PATH=/nccl/lib:$LD_LIBRARY_PATH
FROM ${IMAGE_TYPE}

# set path for deep_gemm
ENV LD_LIBRARY_PATH=/opt/py3/lib/python${PYTHON_VERSION}/site-packages/torch/lib:$LD_LIBRARY_PATH
ENV LD_LIBRARY_PATH=${CUDA_COMPAT_PATH}:$LD_LIBRARY_PATH
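
Editor's note: the two ENV lines above prepend torch's bundled libraries and the CUDA driver-compat directory to the loader search path, because deep_gemm JIT-compiles kernels that link against libtorch at runtime. A minimal sketch to confirm the resulting LD_LIBRARY_PATH actually resolves libtorch; the library name is an assumption based on standard torch wheels, not something this diff states:

# check_ld_path.py -- hedged sanity check, not part of this PR
import ctypes
import os

# print the loader search order assembled by the Dockerfile
for entry in os.environ.get('LD_LIBRARY_PATH', '').split(os.pathsep):
    print('search path:', entry or '(empty)')

# deep_gemm's JIT-built extensions link against libtorch; fail fast if it is unresolvable
ctypes.CDLL('libtorch.so', mode=ctypes.RTLD_GLOBAL)
print('libtorch.so resolved')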
29 changes: 23 additions & 6 deletions docker/install.sh
@@ -1,43 +1,58 @@
#!/bin/bash -ex

# install system packages
export DEBIAN_FRONTEND=noninteractive TZ=Etc/UTC
sed -i 's|http://archive.ubuntu.com|http://azure.archive.ubuntu.com|g' /etc/apt/sources.list
apt-get update -y
apt-get install -y --no-install-recommends \
tzdata wget curl ssh sudo git-core libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1
tzdata wget curl ssh sudo git-core vim libibverbs1 ibverbs-providers ibverbs-utils librdmacm1 libibverbs-dev rdma-core libmlx5-1

if [[ ${PYTHON_VERSION} != "3.10" ]]; then
apt-get install -y --no-install-recommends software-properties-common
add-apt-repository -y ppa:deadsnakes/ppa
apt-get update -y
fi

# install python, create virtual env
apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv

pushd /opt >/dev/null
python${PYTHON_VERSION} -m venv py3
popd >/dev/null

# install CUDA build tools
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-11-8
elif [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-12-4
apt-get install -y --no-install-recommends cuda-minimal-build-12-4 cuda-nvrtc-12-4 build-essential devscripts debhelper fakeroot pkg-config dkms
elif [[ "${CUDA_VERSION_SHORT}" = "cu128" ]]; then
apt-get install -y --no-install-recommends cuda-minimal-build-12-8
apt-get install -y --no-install-recommends cuda-minimal-build-12-8 cuda-nvrtc-12-8 build-essential devscripts debhelper fakeroot pkg-config dkms
fi

apt-get clean -y
rm -rf /var/lib/apt/lists/*

# install GDRCopy
GDRCOPY_VERSION=2.5.1
mkdir -p /tmp/gdrcopy && cd /tmp \
&& wget -q https://github.com/NVIDIA/gdrcopy/archive/refs/tags/v${GDRCOPY_VERSION}.tar.gz \
&& tar -xzf v${GDRCOPY_VERSION}.tar.gz && rm v${GDRCOPY_VERSION}.tar.gz \
&& cd gdrcopy-${GDRCOPY_VERSION}/packages \
&& CUDA=/usr/local/cuda ./build-deb-packages.sh \
&& dpkg -i gdrdrv-dkms_*.deb libgdrapi_*.deb gdrcopy-tests_*.deb gdrcopy_*.deb \
&& cd / && rm -rf /tmp/gdrcopy

# install python packages
export PATH=/opt/py3/bin:$PATH

if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
FA_VERSION=2.7.3
TORCH_VERSION="<2.7"
else
FA_VERSION=2.8.3
TORCH_VERSION=""
# pin the torch version so build-time and runtime torch match; otherwise deep_gemm fails with an undefined-symbol error
TORCH_VERSION="==2.8.0"
fi

pip install -U pip wheel setuptools
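
The pin above is the crux of this change: the wheels produced in the build stage (DeepEP, DeepGEMM, FlashMLA) are compiled against torch 2.8.0, so the runtime image must resolve the very same torch. A quick assertion one could run inside the non-cu118 images, as a sketch; the version string is taken from the pin above:

# assert_torch_pin.py -- illustrative check, not part of install.sh
import torch

# the extension wheels were built against the pinned torch; a different runtime torch
# typically surfaces later as an 'undefined symbol' ImportError in deep_gemm
assert torch.__version__.startswith('2.8.0'), f'unexpected torch {torch.__version__}'
print('torch', torch.__version__, 'matches the build-time pin')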
@@ -46,13 +61,15 @@ if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then
pip install nvidia-nvshmem-cu12
fi

pip install /wheels/*.whl torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
pip install torch${TORCH_VERSION} --extra-index-url https://download.pytorch.org/whl/${CUDA_VERSION_SHORT}
pip install /wheels/*.whl


if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]] && [[ "${PYTHON_VERSION}" != "3.9" ]]; then
pip install cuda-python dlblas
pip install cuda-python dlblas==0.0.6
fi

# install pre-compiled flash attention wheel
PLATFORM="linux_x86_64"
PY_VERSION=$(python3 - <<'PY'
import torch, sys
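
Once install.sh completes, a short check like the following could confirm that the pinned packages landed; this is a sketch, with the cu118 and Python 3.9 exceptions mirroring the guards in the script:

# verify_install.py -- hedged post-install check, not part of this PR
import importlib.metadata as md

for pkg in ('torch', 'dlblas', 'cuda-python'):
    try:
        print(pkg, md.version(pkg))
    except md.PackageNotFoundError:
        # expected on cu118 images and on Python 3.9, per the guards above
        print(pkg, 'not installed')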
23 changes: 12 additions & 11 deletions docker/prepare_wheel.sh
@@ -5,7 +5,8 @@ export PATH=/opt/py3/bin:$PATH
if [[ "${CUDA_VERSION_SHORT}" = "cu118" ]]; then
TORCH_VERSION="<2.7"
else
TORCH_VERSION=""
# pin the torch version so build-time and runtime torch match; otherwise deep_gemm fails with an undefined-symbol error
TORCH_VERSION="==2.8.0"
fi

pip install "cmake<4.0" wheel ninja setuptools packaging
@@ -21,18 +22,18 @@ fi

if [[ "${CUDA_VERSION_SHORT}" != "cu118" ]]; then

if [[ "${CUDA_VERSION_SHORT}" = "cu124" ]]; then
DEEP_GEMM_VERSION=03d0be3
FLASH_MLA_VERSION=9edee0c
else
DEEP_GEMM_VERSION=79f48ee
FLASH_MLA_VERSION=c759027
fi
DEEP_EP_VERSION=9af0e0d # v1.2.1
DEEP_GEMM_VERSION=c9f8b34 # v2.1.1.post3
FLASH_MLA_VERSION=1408756 # no tagged release yet; pinned to the latest commit

DEEP_EP_VERSION=26cf250
# DeepEP
pip install nvidia-nvshmem-cu12

pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepEP.git@${DEEP_EP_VERSION}"
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"

# DeepGEMM
pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/DeepGEMM.git@${DEEP_GEMM_VERSION}"

# FlashMLA
# sm100 compilation for Flash MLA requires NVCC 12.9 or higher
FLASH_MLA_DISABLE_SM100=1 pip wheel -v --no-build-isolation --no-deps -w /wheels "git+https://github.com/deepseek-ai/FlashMLA.git@${FLASH_MLA_VERSION}"
fi
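
If the torch pin here and the one in install.sh ever drift apart, the failure named in the comments shows up at import time. A tiny diagnostic sketch; the error text is indicative, not a literal string from this diff:

# diagnose_deep_gemm.py -- illustrative, not part of prepare_wheel.sh
try:
    import deep_gemm  # the wheel built above against torch==2.8.0
    print('deep_gemm imported cleanly')
except ImportError as err:
    # an ABI mismatch between build-time and runtime torch typically reports
    # an undefined C++ symbol here
    print('deep_gemm import failed:', err)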
12 changes: 12 additions & 0 deletions lmdeploy/pytorch/backends/cuda/graph_runner.py
@@ -17,6 +17,13 @@

logger = get_logger('lmdeploy')

try:
    from dlblas.layers.moe.token_dispatcher import DeepEPBuffer, DeepEPMode
    use_deepep = True
except ImportError:
    logger.warning('DeepEP is disabled; install DeepEP and dlblas to enable it.')
    use_deepep = False


def next_power_of_2(n: int):
"""Return the smallest power of 2 greater than or equal to n."""
@@ -242,6 +249,11 @@ def prepare_inputs_for_generation(
        context: StepContext = None,
    ):
        """Prepare inputs."""

        if use_deepep:
            deepep_mode = DeepEPMode.LOW_LATENCY if context.is_decoding else DeepEPMode.NORMAL
            DeepEPBuffer.set_deepep_mode(deepep_mode)

        return self.model.prepare_inputs_for_generation(
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
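
The added branch switches DeepEP's dispatch mode per step: decode steps are latency-bound and take the low-latency path, while prefill keeps the normal high-throughput path. A self-contained sketch of that selection, using stand-in objects since DeepEPMode really comes from dlblas:

# deepep_mode_demo.py -- stand-alone illustration of the branch above
from types import SimpleNamespace

def select_deepep_mode(context, mode_enum):
    """Mirrors the selection added in prepare_inputs_for_generation."""
    return mode_enum.LOW_LATENCY if context.is_decoding else mode_enum.NORMAL

FakeMode = SimpleNamespace(LOW_LATENCY='low_latency', NORMAL='normal')
print(select_deepep_mode(SimpleNamespace(is_decoding=True), FakeMode))   # low_latency
print(select_deepep_mode(SimpleNamespace(is_decoding=False), FakeMode))  # normal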
7 changes: 7 additions & 0 deletions lmdeploy/pytorch/backends/cuda/token_dispatcher.py
@@ -1,6 +1,13 @@
# Copyright (c) OpenMMLab. All rights reserved.
try:
    from deep_ep import Buffer

    from lmdeploy.pytorch.envs import env_to_int

    # the default value follows the DeepEP code:
    # https://github.com/deepseek-ai/DeepEP/blob/bfded34800dfec415b71503f8205181de90b2480/deep_ep/buffer.py#L30
    deep_ep_buffer_num_sms = env_to_int(env_var='DEEPEP_BUFFER_NUM_SMS', default=20)
    Buffer.set_num_sms(deep_ep_buffer_num_sms)
    use_deepep = True
except ImportError:
    use_deepep = False
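
Since Buffer.set_num_sms() runs inside this module's import guard, the new knob only takes effect when set before the module is first imported. A hedged usage sketch; the import path is the one this diff touches:

# tune_deepep_sms.py -- usage sketch, not part of this PR
import os

# must be set before lmdeploy's CUDA backend is imported for the first time
os.environ['DEEPEP_BUFFER_NUM_SMS'] = '24'  # e.g. give DeepEP 24 SMs instead of the default 20

from lmdeploy.pytorch.backends.cuda import token_dispatcher  # noqa: E402  # picks up the override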
1 change: 1 addition & 0 deletions lmdeploy/pytorch/envs.py
@@ -126,6 +126,7 @@ def _patched_get_env(
# we do not need the value here; reading the variable registers it so that it is
# passed on to Ray workers. If Ray is launched from outside, workers may otherwise
# fail to access these environment variables.
os.getenv('DEEPEP_MAX_BATCH_SIZE', None)
os.getenv('DEEPEP_BUFFER_NUM_SMS', None)

# deepgemm
os.getenv('DG_JIT_DEBUG', '0')
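
For context, the getenv calls in this block exist for their side effect: under the patched getenv, each lookup records the variable so it can be forwarded to Ray workers. A hypothetical sketch of that pattern; the real _patched_get_env in lmdeploy/pytorch/envs.py may differ:

# env_registry_sketch.py -- hypothetical reconstruction, not the actual implementation
import os

_seen_env_vars = {}

def _patched_get_env(name, default=None):
    value = os.environ.get(name, default)
    _seen_env_vars[name] = value  # remembered so it can be re-exported to Ray workers
    return value

_patched_get_env('DEEPEP_BUFFER_NUM_SMS', None)
print(sorted(_seen_env_vars))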