Skip to content

Commit 057f01d

Browse files
committed
Fixes and documentation for building with NIXL support
1 parent 724341b commit 057f01d

File tree

4 files changed

+98
-24
lines changed

4 files changed

+98
-24
lines changed

container/Dockerfile.tensorrt_llm

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -324,16 +324,34 @@ RUN pip install dist/ai_dynamo_runtime*cp312*.whl && \
324324

325325
ENV DYNAMO_HOME=/workspace
326326

327+
ARG ARCH_ALT
328+
ENV LD_LIBRARY_PATH=/usr/local/nixl/lib/${ARCH_ALT}-linux-gnu:$LD_LIBRARY_PATH
329+
327330
# Use UCX for TRTLLM KV Cache Transfer
328-
ENV TRTLLM_USE_UCX_KVCACHE=1
331+
ARG TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL
332+
RUN if [ "$TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL" = "1" ]; then \
333+
echo "TRTLLM_USE_NIXL_KVCACHE=1" >> /etc/environment; \
334+
else \
335+
echo "TRTLLM_USE_UCX_KVCACHE=1" >> /etc/environment; \
336+
fi
329337

338+
# Create a script that sets the environment variables and source it
339+
RUN echo '#!/bin/bash' > /usr/local/bin/set_trtllm_env.sh && \
340+
if [ "$TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL" = "1" ]; then \
341+
echo 'export TRTLLM_USE_NIXL_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
342+
else \
343+
echo 'export TRTLLM_USE_UCX_KVCACHE=1' >> /usr/local/bin/set_trtllm_env.sh; \
344+
fi && \
345+
chmod +x /usr/local/bin/set_trtllm_env.sh
346+
347+
# Source the script in bashrc
348+
RUN echo 'source /usr/local/bin/set_trtllm_env.sh' >> /root/.bashrc
330349

331350
# Copy launch banner
332351
RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
333352
sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
334353
echo "cat ~/.launch_screen" >> ~/.bashrc
335354

336-
337355
# FIXME: May want a modification with dynamo banner on entry
338356
ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
339357
CMD []

container/build.sh

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
9090
# variables to learn how to run a pipeline with a specific commit.
9191
DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="137fe35539ea182f1495f5021bfda97c729e50c3"
9292
TRTLLM_COMMIT=""
93+
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
9394

9495
# TensorRT-LLM PyPI index URL
9596
TENSORRTLLM_INDEX_URL="https://pypi.python.org/simple"
@@ -166,6 +167,13 @@ get_options() {
166167
fi
167168
USE_DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT=true
168169
;;
170+
--trtllm-use-nixl-kvcache-experimental)
171+
if [ -n "$2" ] && [[ "$2" != --* ]]; then
172+
echo "ERROR: --trtllm-use-nixl-kvcache-experimental does not take any argument"
173+
exit 1
174+
fi
175+
TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="1"
176+
;;
169177
--tensorrtllm-pip-wheel)
170178
if [ "$2" ]; then
171179
TENSORRTLLM_PIP_WHEEL=$2
@@ -364,6 +372,7 @@ show_help() {
364372
echo " [--build-context name=path to add build context]"
365373
echo " [--release-build perform a release build]"
366374
echo " [--make-efa Enables EFA support for NIXL]"
375+
echo " [--trtllm-use-nixl-kvcache-experimental Enables NIXL KVCACHE experimental support for TensorRT-LLM]"
367376
exit 0
368377
}
369378

@@ -492,6 +501,10 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
492501
TRTLLM_COMMIT="$DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT"
493502
fi
494503

504+
if [ -n "${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL}" ]; then
505+
BUILD_ARGS+=" --build-arg TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL=${TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL} "
506+
fi
507+
495508
# If user didn't set both wheel and commit, use default tensorrt_llm pip wheel
496509
if [ -z "$TENSORRTLLM_PIP_WHEEL" ] && [ -z "$TRTLLM_COMMIT" ]; then
497510
TENSORRTLLM_PIP_WHEEL="$DEFAULT_TENSORRTLLM_PIP_WHEEL"
@@ -507,7 +520,7 @@ if [[ $FRAMEWORK == "TENSORRTLLM" ]]; then
507520
echo "Checking for TensorRT-LLM wheel in ${TENSORRTLLM_PIP_WHEEL_DIR}"
508521
if ! check_wheel_file "${TENSORRTLLM_PIP_WHEEL_DIR}" "${ARCH}_${TRTLLM_COMMIT}"; then
509522
echo "WARN: Valid trtllm wheel file not found in ${TENSORRTLLM_PIP_WHEEL_DIR}, attempting to build from source"
510-
if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH}; then
523+
if ! env -i ${SOURCE_DIR}/build_trtllm_wheel.sh -o ${TENSORRTLLM_PIP_WHEEL_DIR} -c ${TRTLLM_COMMIT} -a ${ARCH} -n ${NIXL_COMMIT}; then
511524
error "ERROR: Failed to build TensorRT-LLM wheel"
512525
fi
513526
fi

container/build_trtllm_wheel.sh

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,17 @@
1818

1919
# This script builds the TRT-LLM base image for Dynamo with TensorRT-LLM.
2020

21-
while getopts "c:o:a:" opt; do
21+
while getopts "c:o:a:n:" opt; do
2222
case ${opt} in
2323
c) TRTLLM_COMMIT=$OPTARG ;;
2424
o) OUTPUT_DIR=$OPTARG ;;
2525
a) ARCH=$OPTARG ;;
26-
*) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch]"
26+
n) NIXL_COMMIT=$OPTARG ;;
27+
*) echo "Usage: $(basename $0) [-c commit] [-o output_dir] [-a arch] [-n nixl_commit]"
2728
echo " -c: TensorRT-LLM commit to build"
2829
echo " -o: Output directory for wheel files"
2930
echo " -a: Architecture (amd64 or arm64)"
31+
echo " -n: NIXL commit"
3032
exit 1 ;;
3133
esac
3234
done
@@ -36,6 +38,8 @@ if [ -z "$OUTPUT_DIR" ]; then
3638
OUTPUT_DIR="/tmp/trtllm_wheel"
3739
fi
3840

41+
# Store directory where script is being launched from
42+
MAIN_DIR=$(dirname "$(readlink -f "$0")")
3943

4044
(cd /tmp && \
4145
# Clone the TensorRT-LLM repository.
@@ -79,9 +83,16 @@ sed -i "s/__version__ = \"\(.*\)\"/__version__ = \"\1+dev${COMMIT_VERSION}\"/" "
7983
echo "Updated version:"
8084
grep "__version__" "$VERSION_FILE"
8185

86+
echo "Copying install_nixl.sh from $MAIN_DIR to ${PWD}/docker/common/"
87+
# Copy install_nixl.sh to docker/common/
88+
cp $MAIN_DIR/deps/tensorrt_llm/install_nixl.sh docker/common/install_nixl.sh
89+
# Update NIXL_COMMIT in install_nixl.sh to use the parameter passed to this script
90+
sed -i "s/NIXL_COMMIT=\"[^\"]*\"/NIXL_COMMIT=\"${NIXL_COMMIT}\"/" docker/common/install_nixl.sh
8291

92+
93+
# Need to build in the Triton Devel Image for NIXL support.
8394
make -C docker tritondevel_build
84-
make -C docker wheel_build DEVEL_IMAGE=tritondevel_build BUILD_WHEEL_OPTS='--extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl'
95+
make -C docker wheel_build DEVEL_IMAGE=tritondevel BUILD_WHEEL_OPTS='--extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl'
8596

8697
# Copy the wheel to the host
8798
mkdir -p $OUTPUT_DIR

examples/tensorrt_llm/README.md

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -69,15 +69,6 @@ apt-get update && apt-get -y install git git-lfs
6969
./container/build.sh --framework tensorrtllm --use-default-experimental-tensorrtllm-commit
7070
```
7171

72-
> [!NOTE]
73-
> Because of a known issue of C++11 ABI compatibility within the NGC pytorch container,
74-
> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
75-
> for more information.
76-
>
77-
> Hence, when running this script for the first time, the time taken by this script can be
78-
> quite long.
79-
80-
8172
### Run container
8273

8374
```
@@ -306,13 +297,54 @@ See [close deployment](../../docs/guides/dynamo_serve.md#close-deployment) secti
306297
To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
307298
`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
308299

309-
### Future Work
310300

311-
Remaining tasks:
312-
- [x] Add support for the disaggregated serving.
313-
- [x] Add multi-node support.
314-
- [x] Add instructions for benchmarking.
315-
- [x] Use processor from dynamo-llm framework.
316-
- [ ] Add integration test coverage.
317-
- [ ] Merge the code base with llm example to reduce the code duplication.
318-
- [ ] Enable NIXL integration with TensorRT-LLM once available. Currently, TensorRT-LLM uses UCX to transfer KV cache.
301+
### Disaggregated Serving with KV Cache Transfer using **NIXL** (EXPERIMENTAL)
302+
303+
In disaggregated serving architectures, KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer:
304+
305+
#### Default Method: UCX
306+
By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
307+
308+
#### Experimental Method: NIXL
309+
TensorRT-LLM also provides experimental support for using **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. NIXL is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
310+
311+
**Note:** NIXL support is experimental and is not suitable for production environments.
312+
313+
#### Using NIXL for KV Cache Transfer
314+
315+
To enable NIXL for KV cache transfer in disaggregated serving:
316+
317+
1. **Build the container with NIXL support:**
318+
The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support.
319+
320+
**Remove cached TensorRT-LLM wheel (only if previously built without NIXL support):**
321+
```bash
322+
rm -rf /tmp/trtllm_wheel
323+
```
324+
325+
**Build the container with NIXL support:**
326+
```bash
327+
./container/build.sh --framework tensorrtllm \
328+
--use-default-experimental-tensorrtllm-commit \
329+
--trtllm-use-nixl-kvcache-experimental
330+
```
331+
332+
**Note:** Both `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
333+
334+
2. **Run the containerized environment:**
335+
See [run container](#run-container) section to learn how to start the container image built in previous step.
336+
337+
3. **Start the disaggregated service:**
338+
See [disaggregated serving](#disaggregated-serving) to see how to start the deployment.
339+
340+
4. **Send the request:**
341+
See [client](#client) section to learn how to send the request to deployment.
342+
343+
**Important:** Ensure that ETCD and NATS services are running before starting the service.
344+
345+
The container will automatically configure the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can also be switched back to UCX for KV cache transfer by running:
346+
```bash
347+
unset TRTLLM_USE_NIXL_KVCACHE
348+
export TRTLLM_USE_UCX_KVCACHE=1
349+
```
350+

0 commit comments

Comments
 (0)