> Because of a known issue with C++11 ABI compatibility within the NGC PyTorch container,
> we rebuild TensorRT-LLM from source. See [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html)
> for more information.
>
> As a result, the first run of this script can take quite a long time.
### Run container
To benchmark your deployment with GenAI-Perf, see this utility script, configuring the
`model` name and `host` based on your deployment: [perf.sh](../../benchmarks/llm/perf.sh)
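As a rough sketch of what such an invocation can look like (the `genai-perf` flag names below are assumptions and may differ between GenAI-Perf versions; `perf.sh` in this repo is the authoritative reference), the command is assembled from your deployment's model name and host:

```bash
# Hypothetical GenAI-Perf invocation; MODEL and HOST are placeholders that you
# must replace with the values from your deployment, as perf.sh does.
MODEL="my-model"
HOST="http://localhost:8000"

# Build the command as an array so the placeholders are quoted safely.
CMD=(genai-perf profile -m "$MODEL" --url "$HOST")
echo "Would run: ${CMD[*]}"
# Execute with: "${CMD[@]}"
```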
### Disaggregated Serving with KV Cache Transfer using **NIXL** (EXPERIMENTAL)
In disaggregated serving architectures, KV cache must be transferred between prefill and decode nodes. TensorRT-LLM supports two methods for this transfer:
#### Default Method: UCX
By default, TensorRT-LLM uses UCX (Unified Communication X) for KV cache transfer between prefill and decode nodes. UCX provides high-performance communication optimized for GPU-to-GPU transfers.
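To confirm which UCX build is present inside the container, one option (assuming the standard `ucx_info` utility that ships with UCX; it may not be on `PATH` in every image) is:

```bash
# Print the UCX version if the ucx_info utility is available; otherwise note
# that UCX may still be linked into TensorRT-LLM without the CLI tools.
if command -v ucx_info >/dev/null 2>&1; then
  ucx_info -v
else
  echo "ucx_info not found; UCX may still be linked into TensorRT-LLM directly"
fi
```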
#### Experimental Method: NIXL
TensorRT-LLM also provides experimental support for **NIXL** (NVIDIA Inference Xfer Library) for KV cache transfer. NIXL is NVIDIA's high-performance communication library designed for efficient data transfer in distributed GPU environments.
**Note:** NIXL support is experimental and not suitable for production environments.
#### Using NIXL for KV Cache Transfer
To enable NIXL for KV cache transfer in disaggregated serving:
1. **Build the container with NIXL support:**

   The TensorRT-LLM wheel must be built from source with NIXL support. The `./container/build.sh` script caches previously built TensorRT-LLM wheels to reduce build time. If you have previously built a TensorRT-LLM wheel without NIXL support, you must delete the cached wheel to force a rebuild with NIXL support.

   **Remove the cached TensorRT-LLM wheel (only if it was previously built without NIXL support):**

   ```bash
   rm -rf /tmp/trtllm_wheel
   ```

   **Build the container with NIXL support:**

   ```bash
   ./container/build.sh --framework tensorrtllm \
     --use-default-experimental-tensorrtllm-commit \
     --trtllm-use-nixl-kvcache-experimental
   ```

   **Note:** Both the `--use-default-experimental-tensorrtllm-commit` and `--trtllm-use-nixl-kvcache-experimental` flags are required to enable NIXL support.
2. **Run the containerized environment:**

   See the [run container](#run-container) section to learn how to start the container image built in the previous step.
3. **Start the disaggregated service:**

   See the [disaggregated serving](#disaggregated-serving) section to learn how to start the deployment.
4. **Send the request:**

   See the [client](#client) section to learn how to send a request to the deployment.
**Important:** Ensure that ETCD and NATS services are running before starting the service.
The container automatically sets the appropriate environment variable (`TRTLLM_USE_NIXL_KVCACHE=1`) when built with the NIXL flag. The same container image can still be used with UCX for KV cache transfer.
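Inside a running container you can sanity-check which transfer backend is configured by inspecting that variable; a minimal sketch:

```bash
# Report the KV-cache transfer backend based on TRTLLM_USE_NIXL_KVCACHE,
# which the NIXL-enabled container build sets to 1 (unset/0 means UCX).
if [ "${TRTLLM_USE_NIXL_KVCACHE:-0}" = "1" ]; then
  echo "KV cache transfer backend: NIXL (experimental)"
else
  echo "KV cache transfer backend: UCX (default)"
fi
```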