From a3f151380ce9acd1919207ca52025b3689797f63 Mon Sep 17 00:00:00 2001
From: rohithkrn <rohith.nallamaddi@gmail.com>
Date: Mon, 6 Nov 2023 05:00:31 +0000
Subject: [PATCH] trtllm integ tests

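Add TensorRT-LLM (trtllm) integration tests: two workflow jobs, model specs in
prepare.py/client.py, trtllm platform handling in launch_container.sh, and an
int cast of max_new_tokens in the TRT-LLM rolling batch.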
---
 .github/workflows/llm_integration.yml         | 111 +++++++++++++++++-
 .../rolling_batch/trtllm_rolling_batch.py     |   4 +-
 tests/integration/launch_container.sh         |   6 +-
 tests/integration/llm/client.py               |  23 ++++
 tests/integration/llm/prepare.py              |  31 +++++
 5 files changed, 170 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml
index 3df5ded99..bcc7e0e88 100644
--- a/.github/workflows/llm_integration.yml
+++ b/.github/workflows/llm_integration.yml
@@ -8,7 +8,7 @@ on:
         required: false
         default: ''
       run_test:
-        description: 'Run only the tests you need [ds, hf, aot, lora-correctness]'
+        description: 'Run only the tests you need [ds, hf, aot, trtllm, lora-correctness]'
         required: false
         default: ''
   schedule:
@@ -687,10 +687,117 @@ jobs:
           name: ds-smoothquant-logs
           path: tests/integration/logs/
 
+  trt-llm-handler-test:  # TensorRT-LLM handler integration tests (llama2-13b, then falcon-7b)
+    if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test)  # run when run_test is empty or "trtllm"
+    runs-on: [ self-hosted, g5 ]
+    timeout-minutes: 120
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v3
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy huggingface_hub
+      - name: Build container name  # presumably sets DJLSERVING_DOCKER_TAG used by later steps - verify in docker_name_builder.sh
+        run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        working-directory: tests/integration
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: llama2-13b HF model with tp=4
+        working-directory: tests/integration
+        run: |
+          rm -rf models  # start from an empty model directory
+          echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env  # four GPU ids for tp=4; docker_env is presumably read by launch_container.sh - verify
+          python3 llm/prepare.py trtllm llama2-13b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
+          serve
+          python3 llm/client.py trtllm llama2-13b
+          rm -rf docker_env
+          docker rm -f $(docker ps -aq)  # force-removes ALL containers on the runner, not only the one launched above
+      - name: falcon-7b triton repo with tp=1
+        working-directory: tests/integration
+        run: |
+          rm -rf models  # start from an empty model directory
+          echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env  # single GPU id for tp=1
+          python3 llm/prepare.py trtllm falcon-7b
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
+          serve
+          python3 llm/client.py trtllm falcon-7b
+          rm -rf docker_env
+          docker rm -f $(docker ps -aq)  # force-removes ALL containers on the runner
+      - name: On fail step
+        if: ${{ failure() }}  # only runs after an earlier step in this job has failed
+        working-directory: tests/integration
+        run: |
+          docker rm -f $(docker ps -aq) || true  # ignore a failing docker rm so the log below is still printed
+          cat logs/serving.log
+      - name: Upload test logs  # NOTE(review): no `if:` here, so this step is skipped when an earlier step failed - consider `if: always()`
+        uses: actions/upload-artifact@v3
+        with:
+          name: trtllm-handler-logs
+          path: tests/integration/logs/
+
+  trt-llm-handler-quantization-test:  # TensorRT-LLM handler test with smoothquant enabled (llama2-7b)
+    if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test)  # run when run_test is empty or "trtllm"
+    runs-on: [ self-hosted, g5 ]
+    timeout-minutes: 120
+    needs: create-runners
+    steps:
+      - uses: actions/checkout@v3
+      - name: Clean env
+        run: |
+          yes | docker system prune -a --volumes
+          sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
+          echo "wait dpkg lock..."
+          while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
+      - name: Set up Python3
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10.x'
+      - name: Install pip dependencies
+        run: pip3 install requests numpy huggingface_hub
+      - name: Build container name  # presumably sets DJLSERVING_DOCKER_TAG used by later steps - verify in docker_name_builder.sh
+        run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }}
+      - name: Download models and dockers
+        working-directory: tests/integration
+        run: |
+          docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
+      - name: llama2-7b HF model with tp=4 and smoothquant  # the "trtllm-sq" platform argument below matches the *"trtllm-sq"* check in launch_container.sh
+        working-directory: tests/integration
+        run: |
+          rm -rf models  # start from an empty model directory
+          echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env  # four GPU ids for tp=4; docker_env is presumably read by launch_container.sh - verify
+          python3 llm/prepare.py trtllm llama2-7b-smoothquant
+          ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm-sq \
+          serve
+          python3 llm/client.py trtllm llama2-7b-smoothquant
+          rm -rf docker_env
+          docker rm -f $(docker ps -aq)  # force-removes ALL containers on the runner
+      - name: On fail step
+        if: ${{ failure() }}  # only runs after an earlier step in this job has failed
+        working-directory: tests/integration
+        run: |
+          docker rm -f $(docker ps -aq) || true  # ignore a failing docker rm so the log below is still printed
+          cat logs/serving.log
+      - name: Upload test logs  # NOTE(review): no `if:` here, so this step is skipped when an earlier step failed - consider `if: always()`
+        uses: actions/upload-artifact@v3
+        with:
+          name: trtllm-handler-quantization-logs
+          path: tests/integration/logs/
+
   stop-runners:
     if: always()
     runs-on: [ self-hosted, scheduler ]
-    needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test ]
+    needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test, trt-llm-handler-test,  trt-llm-handler-quantization-test]
     steps:
       - name: Stop all instances
         run: |
diff --git a/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py
index 020cdaa4f..ccbf3f1a2 100644
--- a/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py
+++ b/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py
@@ -38,8 +38,8 @@ def reset(self):
         super().reset()
 
     def translate_triton_params(self, parameters):
-        parameters["max_new_tokens"] = parameters.get("max_new_tokens", 128)
-        parameters["request_output_len"] = parameters.pop("max_new_tokens")
+        parameters["request_output_len"] = int(
+            parameters.get("max_new_tokens", 128))
         if "top_k" in parameters.keys():
             parameters["runtime_top_k"] = parameters.pop("top_k")
         if "top_p" in parameters.keys():
diff --git a/tests/integration/launch_container.sh b/tests/integration/launch_container.sh
index 5d1fc196b..c6b9ffad6 100755
--- a/tests/integration/launch_container.sh
+++ b/tests/integration/launch_container.sh
@@ -23,7 +23,7 @@ fi
 is_llm=false
 if [[ "$platform" == *"cu1"* ]]; then # if the platform has cuda capabilities
   runtime="nvidia"
-elif [[ "$platform" == *"deepspeed"* ]]; then # Runs multi-gpu
+elif [[ "$platform" == *"deepspeed"* || "$platform" == *"trtllm"* ]]; then # Runs multi-gpu
   runtime="nvidia"
   is_llm=true
   shm="12gb"
@@ -107,6 +107,10 @@ if $is_llm; then
   if [[ "$platform" == *"inf2"* ]]; then
     total=80
   fi
+  if [[ "$platform" == *"trtllm-sq"* ]]; then  # any platform string containing "trtllm-sq" (the smoothquant workflow step passes exactly that)
+    echo "extra sleep of 15 min for smoothquant calibration"
+    total=120  # NOTE(review): only `total` changes; the fixed `sleep 120` below is untouched - confirm `total` is what lengthens the wait
+  fi
   sleep 120
 fi
 
diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py
index 9add68e7a..a5d642a16 100644
--- a/tests/integration/llm/client.py
+++ b/tests/integration/llm/client.py
@@ -395,6 +395,27 @@ def get_model_name():
     },
 }
 
+trtllm_model_spec = {  # per-model specs handed to test_handler when the handler argument is "trtllm"
+    "llama2-13b": {  # keys are the model names the workflow passes to client.py
+        "max_memory_per_gpu": [22.0],
+        "batch_size": [1],
+        "seq_length": [64, 128, 256],
+        "stream_output": True,
+    },
+    "falcon-7b": {
+        "max_memory_per_gpu": [22.0],
+        "batch_size": [1],
+        "seq_length": [64, 128],  # NOTE(review): no 256 here, unlike both llama2 entries - presumably a limit of the prebuilt engine, confirm
+        "stream_output": True,
+    },
+    "llama2-7b-smoothquant": {
+        "max_memory_per_gpu": [22.0],
+        "batch_size": [1],
+        "seq_length": [64, 128, 256],
+        "stream_output": True,
+    },
+}
+
 
 def check_worker_number(desired):
     model_name = get_model_name()
@@ -807,6 +828,8 @@ def test_unmerged_lora_correctness():
         test_ds_smoothquant(args.model, ds_smoothquant_model_spec)
     elif args.handler == "lmi_dist_aiccl":
         test_handler(args.model, lmi_dist_aiccl_model_spec)
+    elif args.handler == "trtllm":
+        test_handler(args.model, trtllm_model_spec)
     else:
         raise ValueError(
             f"{args.handler} is not one of the supporting handler")
diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py
index 5b3fc62f1..a6a471dc7 100644
--- a/tests/integration/llm/prepare.py
+++ b/tests/integration/llm/prepare.py
@@ -550,6 +550,27 @@
     },
 }
 
+trtllm_handler_list = {  # per-model serving options; build_trtllm_handler_model passes the selected entry to write_model_artifacts
+    "llama2-13b": {
+        "option.model_id": "s3://djl-llm/llama-2-13b-hf/",
+        "option.tensor_parallel_degree": 4,  # the workflow step for this model exposes GPUs 0-3
+        "option.output_formatter": "jsonlines",
+    },
+    "falcon-7b": {
+        "option.model_id": "s3://djl-llm/triton/falcon-7b-tp1-bs4/",  # the workflow step calls this a "triton repo", unlike the "HF model" entries
+        "option.tensor_parallel_degree": 1,  # the workflow step for this model exposes GPU 0 only
+        "option.output_formatter": "jsonlines",
+    },
+    "llama2-7b-smoothquant": {
+        "option.model_id": "s3://djl-llm/llama-2-7b-hf/",
+        "option.tensor_parallel_degree": 4,  # the workflow step for this model exposes GPUs 0-3
+        "option.use_smoothquant": "True",  # note: the three smoothquant flags are strings, not Python booleans
+        "option.smoothquant_per_token": "True",
+        "option.smoothquant_per_channel": "True",
+        "option.output_formatter": "jsonlines",
+    },
+}
+
 
 def write_model_artifacts(properties,
                           requirements=None,
@@ -761,6 +782,15 @@ def build_lmi_dist_aiccl_model(model):
     write_model_artifacts(options)
 
 
+def build_trtllm_handler_model(model):  # write model artifacts for one entry of trtllm_handler_list
+    if model not in trtllm_handler_list:  # reject unknown model names before touching any files
+        raise ValueError(
+            f"{model} is not one of the supporting handler {list(trtllm_handler_list.keys())}"
+        )
+    options = trtllm_handler_list[model]
+    write_model_artifacts(options)  # options is the `properties` argument; requirements keeps its default of None
+
+
 supported_handler = {
     'deepspeed': build_ds_handler_model,
     'huggingface': build_hf_handler_model,
@@ -776,6 +806,7 @@ def build_lmi_dist_aiccl_model(model):
     'unmerged_lora': build_unmerged_lora_correctness_model,
     'deepspeed_smoothquant': build_ds_smoothquant_model,
     'lmi_dist_aiccl': build_lmi_dist_aiccl_model,
+    'trtllm': build_trtllm_handler_model,
 }
 
 if __name__ == '__main__':