[TRTLLM] Add gpt model to docs and ci (#1475)
ydm-amazon authored Jan 11, 2024
1 parent dca98df commit 86a1a6e
Showing 5 changed files with 52 additions and 0 deletions.
22 changes: 22 additions & 0 deletions .github/workflows/llm_integration.yml
@@ -767,6 +767,28 @@ jobs:
python3 llm/client.py trtllm chatglm3-6b
rm -rf docker_env
docker rm -f $(docker ps -aq)
- name: GPT2 HF model with tp=4
working-directory: tests/integration
run: |
rm -rf models
echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env
python3 llm/prepare.py trtllm gpt2
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
serve
python3 llm/client.py trtllm gpt2
rm -rf docker_env
docker rm -f $(docker ps -aq)
- name: SantaCoder HF model with tp=4
working-directory: tests/integration
run: |
rm -rf models
echo -en "CUDA_VISIBLE_DEVICES=0,1,2,3" > docker_env
python3 llm/prepare.py trtllm santacoder
./launch_container.sh deepjavalibrary/djl-serving:$DJLSERVING_DOCKER_TAG $PWD/models trtllm \
serve
python3 llm/client.py trtllm santacoder
rm -rf docker_env
docker rm -f $(docker ps -aq)
- name: On fail step
if: ${{ failure() }}
working-directory: tests/integration
@@ -125,6 +125,7 @@ If you specify MPI engine in TensorRT LLM container, the following parameters wi
| option.enable_kv_cache_reuse | No | This feature is only supported for GPT-like models on TRTLLM (as of 0.7.1) and requires compiling the model with `--use_paged_context_fmha`. It lets the model remember the most recently used input KV cache and try to reuse it in the next run, with an immediate benefit of much faster first-token latency. This is typically helpful for document understanding and chat applications that share the same input prefix. The TRTLLM backend remembers the prefix tree of the input and reuses most of it for the next generation, at the cost of extra GPU memory. | `true`, `false`. <br/> Default is `false` |
| option.baichuan_model_version | No | Parameter exclusive to Baichuan models to specify the model version. The HF Baichuan checkpoint path must be specified. For v1_13b, use either baichuan-inc/Baichuan-13B-Chat or baichuan-inc/Baichuan-13B-Base. For v2_13b, use either baichuan-inc/Baichuan2-13B-Chat or baichuan-inc/Baichuan2-13B-Base. More Baichuan models can be found under baichuan-inc. | `v1_7b`, `v1_13b`, `v2_7b`, `v2_13b`. <br/> Default is `v1_13b` |
| option.chatglm_model_version | No | Parameter exclusive to ChatGLM models to specify the exact model type. Required for ChatGLM models. | `chatglm_6b`, `chatglm2_6b`, `chatglm2_6b_32k`, `chatglm3_6b`, `chatglm3_6b_base`, `chatglm3_6b_32k`, `glm_10b`. <br/> Default is `unspecified`, which will throw an error. |
| option.gpt_model_version | No | Parameter exclusive to GPT2-family models to specify the exact model type. Required for GPT2-family models; see the example configuration after this table. | `gpt2`, `santacoder`, `starcoder`. <br/> Default is `gpt2`. |
| option.multi_block_mode | No | Splits a long KV sequence into multiple blocks (applied to generation MHA kernels). This is beneficial when `batch x num_heads` cannot fully utilize the GPU. It is **not** supported for the qwen model type. | `true`, `false`. <br/> Default is `false` |
| option.use_fused_mlp | No | Enables horizontal fusion in GatedMLP, which reduces layer input traffic and can improve performance for large Llama models (e.g. llama-2-70b). This option is only supported for the Llama model type. | `true`, `false`. <br/> Default is `false` |
| option.rotary_base | No | Rotary base parameter for the RoPE embedding. This is supported for the llama, internlm, and qwen model types. | `float` value. <br/> Default is `10000.0` |
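For reference, the snippet below is a minimal `serving.properties` sketch for one of the newly supported GPT-family models. It mirrors the `santacoder` test configuration added to `tests/integration/llm/prepare.py` in this commit; the `engine=MPI` line is an assumption based on the MPI-engine options documented above, not a verified deployment recipe.

```properties
# Hypothetical serving.properties for serving SantaCoder with the
# TensorRT-LLM (MPI) engine; values mirror the santacoder test spec below.
engine=MPI
option.model_id=bigcode/santacoder
option.tensor_parallel_degree=4
option.max_rolling_batch_size=16
option.trust_remote_code=true
option.gpt_model_version=santacoder
option.output_formatter=jsonlines
```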
1 change: 1 addition & 0 deletions serving/docs/lmi/tutorials/trtllm_aot_tutorial.md
@@ -20,6 +20,7 @@ The goal of this document is for the user to be able to:
- Mistral (since LMI V8 0.26.0)
- Mixtral (since LMI V8 0.26.0)
- Qwen (since LMI V8 0.26.0)
- GPT2/SantaCoder/StarCoder (since LMI V8 0.26.0)

For models that are not listed here, you can use [this tutorial](trtllm_manual_convert_tutorial.md) instead to prepare the model manually.

12 changes: 12 additions & 0 deletions tests/integration/llm/client.py
@@ -538,6 +538,18 @@ def get_model_name():
"seq_length": [256],
"tokenizer": "Qwen/Qwen-7B"
},
"gpt2": {
"max_memory_per_gpu": [22.0],
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "gpt2"
},
"santacoder": {
"max_memory_per_gpu": [22.0],
"batch_size": [1, 4],
"seq_length": [256],
"tokenizer": "bigcode/santacoder"
},
"llama2-70b": {
"max_memory_per_gpu": [40.0],
"batch_size": [1, 8],
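The spec entries above list a memory budget, batch sizes, sequence lengths, and a tokenizer id for each model. The snippet below is a simplified, hypothetical sketch of how such a spec could drive a smoke test against a locally launched container; it is not the actual `client.py` logic, and the endpoint URL and payload shape are assumptions.

```python
import requests

# Spec mirroring the new gpt2 entry above.
spec = {
    "max_memory_per_gpu": [22.0],
    "batch_size": [1, 4],
    "seq_length": [256],
    "tokenizer": "gpt2",
}

# Assumed DJL Serving endpoint; the real test client may use a different path.
URL = "http://127.0.0.1:8080/invocations"

for batch_size in spec["batch_size"]:
    for seq_length in spec["seq_length"]:
        payload = {
            "inputs": ["Deep learning is"] * batch_size,
            "parameters": {"max_new_tokens": seq_length},
        }
        # Send one request per (batch_size, seq_length) combination and
        # fail fast on any non-2xx response.
        response = requests.post(URL, json=payload, timeout=300)
        response.raise_for_status()
        print(batch_size, seq_length, response.status_code)
```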
16 changes: 16 additions & 0 deletions tests/integration/llm/prepare.py
@@ -781,6 +781,22 @@
"option.trust_remote_code": True,
"option.output_formatter": "jsonlines"
},
"gpt2": {
"option.model_id": "gpt2",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 16,
"option.trust_remote_code": True,
"option.max_draft_len": 20,
"option.output_formatter": "jsonlines"
},
"santacoder": {
"option.model_id": "bigcode/santacoder",
"option.tensor_parallel_degree": 4,
"option.max_rolling_batch_size": 16,
"option.trust_remote_code": True,
"option.gpt_model_version": "santacoder",
"option.output_formatter": "jsonlines"
},
"llama2-70b": {
"option.model_id": "s3://djl-llm/llama-2-70b-hf/",
"option.tensor_parallel_degree": 8,
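Each entry above is a flat dictionary of `serving.properties` keys. As a rough illustration of how such a spec maps onto a model directory, the sketch below serializes the new `gpt2` entry into a `serving.properties` file; the helper function, the `engine=MPI` line, and the output path are assumptions, not the actual `prepare.py` implementation.

```python
from pathlib import Path

# Spec copied from the new gpt2 entry above.
gpt2_spec = {
    "option.model_id": "gpt2",
    "option.tensor_parallel_degree": 4,
    "option.max_rolling_batch_size": 16,
    "option.trust_remote_code": True,
    "option.max_draft_len": 20,
    "option.output_formatter": "jsonlines",
}

def write_serving_properties(spec: dict, model_dir: str) -> Path:
    """Write a serving.properties file from a flat key/value spec (illustrative only)."""
    path = Path(model_dir)
    path.mkdir(parents=True, exist_ok=True)
    lines = ["engine=MPI"]  # assumed engine for the TensorRT-LLM container
    for key, value in spec.items():
        # Lower-case booleans so they serialize as true/false.
        if isinstance(value, bool):
            value = str(value).lower()
        lines.append(f"{key}={value}")
    out = path / "serving.properties"
    out.write_text("\n".join(lines) + "\n")
    return out

print(write_serving_properties(gpt2_spec, "models/test"))
```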
