From a3f151380ce9acd1919207ca52025b3689797f63 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 6 Nov 2023 05:00:31 +0000 Subject: [PATCH] trtllm integ tests add trtllm integration tests --- .github/workflows/llm_integration.yml | 111 +++++++++++++++++- .../rolling_batch/trtllm_rolling_batch.py | 4 +- tests/integration/launch_container.sh | 6 +- tests/integration/llm/client.py | 23 ++++ tests/integration/llm/prepare.py | 31 +++++ 5 files changed, 170 insertions(+), 5 deletions(-) diff --git a/.github/workflows/llm_integration.yml b/.github/workflows/llm_integration.yml index 3df5ded99..bcc7e0e88 100644 --- a/.github/workflows/llm_integration.yml +++ b/.github/workflows/llm_integration.yml @@ -8,7 +8,7 @@ on: required: false default: '' run_test: - description: 'Run only the tests you need [ds, hf, aot, lora-correctness]' + description: 'Run only the tests you need [ds, hf, aot, trtllm, lora-correctness]' required: false default: '' schedule: @@ -687,10 +687,117 @@ jobs: name: ds-smoothquant-logs path: tests/integration/logs/ + trt-llm-handler-test: + if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) + runs-on: [ self-hosted, g5 ] + timeout-minutes: 120 + needs: create-runners + steps: + - uses: actions/checkout@v3 + - name: Clean env + run: | + yes | docker system prune -a --volumes + sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ + echo "wait dpkg lock..." + while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done + - name: Set up Python3 + uses: actions/setup-python@v4 + with: + python-version: '3.10.x' + - name: Install pip dependencies + run: pip3 install requests numpy huggingface_hub + - name: Build container name + run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} + - name: Download models and dockers + working-directory: tests/integration + run: | + docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG + - name: llama2-13b HF model with tp=4 + working-directory: tests/integration + run: | + rm -rf models + echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env + python3 llm/prepare.py trtllm llama2-13b + ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ + serve + python3 llm/client.py trtllm llama2-13b + rm -rf docker_env + docker rm -f $(docker ps -aq) + - name: falcon-7b triton repo with tp=1 + working-directory: tests/integration + run: | + rm -rf models + echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env + python3 llm/prepare.py trtllm falcon-7b + ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \ + serve + python3 llm/client.py trtllm falcon-7b + rm -rf docker_env + docker rm -f $(docker ps -aq) + - name: On fail step + if: ${{ failure() }} + working-directory: tests/integration + run: | + docker rm -f $(docker ps -aq) || true + cat logs/serving.log + - name: Upload test logs + uses: actions/upload-artifact@v3 + with: + name: trtllm-handler-logs + path: tests/integration/logs/ + + trt-llm-handler-quantization-test: + if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test) + runs-on: [ self-hosted, g5 ] + timeout-minutes: 120 + needs: create-runners + steps: + - uses: actions/checkout@v3 + - name: Clean env + run: | + yes | docker system prune -a --volumes + sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/ + echo "wait dpkg lock..." + while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done + - name: Set up Python3 + uses: actions/setup-python@v4 + with: + python-version: '3.10.x' + - name: Install pip dependencies + run: pip3 install requests numpy huggingface_hub + - name: Build container name + run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }} + - name: Download models and dockers + working-directory: tests/integration + run: | + docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG + - name: llama2-7b HF model with tp=4 and smoothquant + working-directory: tests/integration + run: | + rm -rf models + echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env + python3 llm/prepare.py trtllm llama2-7b-smoothquant + ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm-sq \ + serve + python3 llm/client.py trtllm llama2-7b-smoothquant + rm -rf docker_env + docker rm -f $(docker ps -aq) + - name: On fail step + if: ${{ failure() }} + working-directory: tests/integration + run: | + docker rm -f $(docker ps -aq) || true + cat logs/serving.log + - name: Upload test logs + uses: actions/upload-artifact@v3 + with: + name: trtllm-handler-quantization-logs + path: tests/integration/logs/ + stop-runners: if: always() runs-on: [ self-hosted, scheduler ] - needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test ] + needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test, trt-llm-handler-test, trt-llm-handler-quantization-test] steps: - name: Stop all instances run: | diff --git a/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py b/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py index 020cdaa4f..ccbf3f1a2 100644 --- a/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py +++ b/engines/python/setup/djl_python/rolling_batch/trtllm_rolling_batch.py @@ -38,8 +38,8 @@ def reset(self): super().reset() def translate_triton_params(self, parameters): - parameters["max_new_tokens"] = parameters.get("max_new_tokens", 128) - parameters["request_output_len"] = parameters.pop("max_new_tokens") + parameters["request_output_len"] = int( + parameters.get("max_new_tokens", 128)) if "top_k" in parameters.keys(): parameters["runtime_top_k"] = parameters.pop("top_k") if "top_p" in parameters.keys(): diff --git a/tests/integration/launch_container.sh b/tests/integration/launch_container.sh index 5d1fc196b..c6b9ffad6 100755 --- a/tests/integration/launch_container.sh +++ b/tests/integration/launch_container.sh @@ -23,7 +23,7 @@ fi is_llm=false if [[ "$platform" == *"cu1"* ]]; then # if the platform has cuda capabilities runtime="nvidia" -elif [[ "$platform" == *"deepspeed"* ]]; then # Runs multi-gpu +elif [[ "$platform" == *"deepspeed"* || "$platform" == *"trtllm"* ]]; then # Runs multi-gpu runtime="nvidia" is_llm=true shm="12gb" @@ -107,6 +107,10 @@ if $is_llm; then if [[ "$platform" == *"inf2"* ]]; then total=80 fi + if [[ "$platform" == *"trtllm-sq"* ]]; then + echo "extra sleep of 15 min for smoothquant calibration" + total=120 + fi sleep 120 fi diff --git a/tests/integration/llm/client.py b/tests/integration/llm/client.py index 9add68e7a..a5d642a16 100644 --- a/tests/integration/llm/client.py +++ b/tests/integration/llm/client.py @@ -395,6 +395,27 @@ def get_model_name(): }, } +trtllm_model_spec = { + "llama2-13b": { + "max_memory_per_gpu": [22.0], + "batch_size": [1], + "seq_length": [64, 128, 256], + "stream_output": True, + }, + "falcon-7b": { + "max_memory_per_gpu": [22.0], + "batch_size": [1], + "seq_length": [64, 128], + "stream_output": True, + }, + "llama2-7b-smoothquant": { + "max_memory_per_gpu": [22.0], + "batch_size": [1], + "seq_length": [64, 128, 256], + "stream_output": True, + }, +} + def check_worker_number(desired): model_name = get_model_name() @@ -807,6 +828,8 @@ def test_unmerged_lora_correctness(): test_ds_smoothquant(args.model, ds_smoothquant_model_spec) elif args.handler == "lmi_dist_aiccl": test_handler(args.model, lmi_dist_aiccl_model_spec) + elif args.handler == "trtllm": + test_handler(args.model, trtllm_model_spec) else: raise ValueError( f"{args.handler} is not one of the supporting handler") diff --git a/tests/integration/llm/prepare.py b/tests/integration/llm/prepare.py index 5b3fc62f1..a6a471dc7 100644 --- a/tests/integration/llm/prepare.py +++ b/tests/integration/llm/prepare.py @@ -550,6 +550,27 @@ }, } +trtllm_handler_list = { + "llama2-13b": { + "option.model_id": "s3://djl-llm/llama-2-13b-hf/", + "option.tensor_parallel_degree": 4, + "option.output_formatter": "jsonlines", + }, + "falcon-7b": { + "option.model_id": "s3://djl-llm/triton/falcon-7b-tp1-bs4/", + "option.tensor_parallel_degree": 1, + "option.output_formatter": "jsonlines", + }, + "llama2-7b-smoothquant": { + "option.model_id": "s3://djl-llm/llama-2-7b-hf/", + "option.tensor_parallel_degree": 4, + "option.use_smoothquant": "True", + "option.smoothquant_per_token": "True", + "option.smoothquant_per_channel": "True", + "option.output_formatter": "jsonlines", + }, +} + def write_model_artifacts(properties, requirements=None, @@ -761,6 +782,15 @@ def build_lmi_dist_aiccl_model(model): write_model_artifacts(options) +def build_trtllm_handler_model(model): + if model not in trtllm_handler_list: + raise ValueError( + f"{model} is not one of the supporting handler {list(trtllm_handler_list.keys())}" + ) + options = trtllm_handler_list[model] + write_model_artifacts(options) + + supported_handler = { 'deepspeed': build_ds_handler_model, 'huggingface': build_hf_handler_model, @@ -776,6 +806,7 @@ def build_lmi_dist_aiccl_model(model): 'unmerged_lora': build_unmerged_lora_correctness_model, 'deepspeed_smoothquant': build_ds_smoothquant_model, 'lmi_dist_aiccl': build_lmi_dist_aiccl_model, + 'trtllm': build_trtllm_handler_model, } if __name__ == '__main__':