Skip to content

Commit

Permalink
TensorRT-LLM (trtllm) integration tests
Browse files Browse the repository at this point in the history
add trtllm integration tests
  • Loading branch information
rohithkrn authored and Qing Lan committed Nov 7, 2023
1 parent 0d22a23 commit a3f1513
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 5 deletions.
111 changes: 109 additions & 2 deletions .github/workflows/llm_integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
required: false
default: ''
run_test:
description: 'Run only the tests you need [ds, hf, aot, lora-correctness]'
description: 'Run only the tests you need [ds, hf, aot, trtllm, lora-correctness]'
required: false
default: ''
schedule:
Expand Down Expand Up @@ -687,10 +687,117 @@ jobs:
name: ds-smoothquant-logs
path: tests/integration/logs/

# Integration tests for the TensorRT-LLM handler on multi-GPU g5 hosts.
trt-llm-handler-test:
  # Run when run_test is empty (run everything) or explicitly "trtllm".
  if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test)
  runs-on: [ self-hosted, g5 ]
  timeout-minutes: 120
  needs: create-runners
  steps:
    - uses: actions/checkout@v3
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v4
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy huggingface_hub
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
    - name: llama2-13b HF model with tp=4
      working-directory: tests/integration
      run: |
        rm -rf models
        echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env
        python3 llm/prepare.py trtllm llama2-13b
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
        serve
        python3 llm/client.py trtllm llama2-13b
        rm -rf docker_env
        docker rm -f $(docker ps -aq)
    - name: falcon-7b triton repo with tp=1
      working-directory: tests/integration
      run: |
        rm -rf models
        echo -en "CUDA_VISIBLE_DEVICES=0" > docker_env
        python3 llm/prepare.py trtllm falcon-7b
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
        serve
        python3 llm/client.py trtllm falcon-7b
        rm -rf docker_env
        docker rm -f $(docker ps -aq)
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        docker rm -f $(docker ps -aq) || true
        cat logs/serving.log
    - name: Upload test logs
      uses: actions/upload-artifact@v3
      with:
        name: trtllm-handler-logs
        path: tests/integration/logs/

# SmoothQuant variant of the TensorRT-LLM handler test; uses the
# "trtllm-sq" platform tag so launch_container.sh waits longer for
# quantization calibration.
trt-llm-handler-quantization-test:
  # Run when run_test is empty (run everything) or explicitly "trtllm".
  if: contains(fromJson('["", "trtllm"]'), github.event.inputs.run_test)
  runs-on: [ self-hosted, g5 ]
  timeout-minutes: 120
  needs: create-runners
  steps:
    - uses: actions/checkout@v3
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      uses: actions/setup-python@v4
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install requests numpy huggingface_hub
    - name: Build container name
      run: ./serving/docker/scripts/docker_name_builder.sh tensorrt-llm ${{ github.event.inputs.djl-version }}
    - name: Download models and dockers
      working-directory: tests/integration
      run: |
        docker pull deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG
    - name: llama2-7b HF model with tp=4 and smoothquant
      working-directory: tests/integration
      run: |
        rm -rf models
        echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env
        python3 llm/prepare.py trtllm llama2-7b-smoothquant
        ./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm-sq \
        serve
        python3 llm/client.py trtllm llama2-7b-smoothquant
        rm -rf docker_env
        docker rm -f $(docker ps -aq)
    - name: On fail step
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        docker rm -f $(docker ps -aq) || true
        cat logs/serving.log
    - name: Upload test logs
      uses: actions/upload-artifact@v3
      with:
        name: trtllm-handler-quantization-logs
        path: tests/integration/logs/

stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test ]
needs: [ create-runners, hf-handler-test, hf-lora-correctness-test, ds-raw-test, ds-handler-test, ds-aot-raw-test, no-code-test, ds-handler-aot-test, ds-smoothquant-handler-test, trt-llm-handler-test, trt-llm-handler-quantization-test]
steps:
- name: Stop all instances
run: |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def reset(self):
super().reset()

def translate_triton_params(self, parameters):
parameters["max_new_tokens"] = parameters.get("max_new_tokens", 128)
parameters["request_output_len"] = parameters.pop("max_new_tokens")
parameters["request_output_len"] = int(
parameters.get("max_new_tokens", 128))
if "top_k" in parameters.keys():
parameters["runtime_top_k"] = parameters.pop("top_k")
if "top_p" in parameters.keys():
Expand Down
6 changes: 5 additions & 1 deletion tests/integration/launch_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ fi
is_llm=false
if [[ "$platform" == *"cu1"* ]]; then # if the platform has cuda capabilities
runtime="nvidia"
elif [[ "$platform" == *"deepspeed"* ]]; then # Runs multi-gpu
elif [[ "$platform" == *"deepspeed"* || "$platform" == *"trtllm"* ]]; then # Runs multi-gpu
runtime="nvidia"
is_llm=true
shm="12gb"
Expand Down Expand Up @@ -107,6 +107,10 @@ if $is_llm; then
if [[ "$platform" == *"inf2"* ]]; then
total=80
fi
if [[ "$platform" == *"trtllm-sq"* ]]; then
echo "extra sleep of 15 min for smoothquant calibration"
total=120
fi
sleep 120
fi

Expand Down
23 changes: 23 additions & 0 deletions tests/integration/llm/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,27 @@ def get_model_name():
},
}

def _trtllm_spec(seq_lengths):
    # Build one spec entry for the trtllm handler tests; all current
    # models share the same memory ceiling, batch size, and streaming
    # flag and differ only in the generation lengths exercised.
    return {
        "max_memory_per_gpu": [22.0],
        "batch_size": [1],
        "seq_length": seq_lengths,
        "stream_output": True,
    }


# Expected client-side test parameters for the TensorRT-LLM handler,
# keyed by test model name (consumed via test_handler).
trtllm_model_spec = {
    "llama2-13b": _trtllm_spec([64, 128, 256]),
    "falcon-7b": _trtllm_spec([64, 128]),
    "llama2-7b-smoothquant": _trtllm_spec([64, 128, 256]),
}


def check_worker_number(desired):
model_name = get_model_name()
Expand Down Expand Up @@ -807,6 +828,8 @@ def test_unmerged_lora_correctness():
test_ds_smoothquant(args.model, ds_smoothquant_model_spec)
elif args.handler == "lmi_dist_aiccl":
test_handler(args.model, lmi_dist_aiccl_model_spec)
elif args.handler == "trtllm":
test_handler(args.model, trtllm_model_spec)
else:
raise ValueError(
f"{args.handler} is not one of the supporting handler")
31 changes: 31 additions & 0 deletions tests/integration/llm/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,27 @@
},
}

# Options shared by every TensorRT-LLM test model; spliced in last so
# the serving.properties line order matches the per-model entries.
_TRTLLM_COMMON_OPTIONS = {
    "option.output_formatter": "jsonlines",
}

# serving.properties option sets for the TensorRT-LLM handler tests,
# keyed by test model name (consumed by build_trtllm_handler_model).
trtllm_handler_list = {
    "llama2-13b": {
        "option.model_id": "s3://djl-llm/llama-2-13b-hf/",
        "option.tensor_parallel_degree": 4,
        **_TRTLLM_COMMON_OPTIONS,
    },
    "falcon-7b": {
        "option.model_id": "s3://djl-llm/triton/falcon-7b-tp1-bs4/",
        "option.tensor_parallel_degree": 1,
        **_TRTLLM_COMMON_OPTIONS,
    },
    "llama2-7b-smoothquant": {
        "option.model_id": "s3://djl-llm/llama-2-7b-hf/",
        "option.tensor_parallel_degree": 4,
        # SmoothQuant flags are strings ("True"), matching what the
        # properties file expects.
        "option.use_smoothquant": "True",
        "option.smoothquant_per_token": "True",
        "option.smoothquant_per_channel": "True",
        **_TRTLLM_COMMON_OPTIONS,
    },
}


def write_model_artifacts(properties,
requirements=None,
Expand Down Expand Up @@ -761,6 +782,15 @@ def build_lmi_dist_aiccl_model(model):
write_model_artifacts(options)


def build_trtllm_handler_model(model):
    """Write model artifacts (serving.properties) for a TensorRT-LLM test model.

    ``model`` must be a key of ``trtllm_handler_list``; otherwise a
    ``ValueError`` naming the supported models is raised.
    """
    if model in trtllm_handler_list:
        write_model_artifacts(trtllm_handler_list[model])
        return
    raise ValueError(
        f"{model} is not one of the supporting handler {list(trtllm_handler_list.keys())}"
    )


supported_handler = {
'deepspeed': build_ds_handler_model,
'huggingface': build_hf_handler_model,
Expand All @@ -776,6 +806,7 @@ def build_lmi_dist_aiccl_model(model):
'unmerged_lora': build_unmerged_lora_correctness_model,
'deepspeed_smoothquant': build_ds_smoothquant_model,
'lmi_dist_aiccl': build_lmi_dist_aiccl_model,
'trtllm': build_trtllm_handler_model,
}

if __name__ == '__main__':
Expand Down

0 comments on commit a3f1513

Please sign in to comment.