Skip to content

Commit 4edceae

Browse files
committed
add TP4 test cases according to findings from AWS benchmarking
Signed-off-by: Tsai, Louie <louie.tsai@intel.com> Signed-off-by: louie-tsai <louie.tsai@intel.com>
1 parent 70e284c commit 4edceae

File tree

2 files changed

+246
-8
lines changed

2 files changed

+246
-8
lines changed

.buildkite/nightly-benchmarks/tests/serving-tests-cpu-snc3.json

Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,38 @@
9595
"num_prompts": 200
9696
}
9797
},
98+
{
99+
"test_name": "serving_llama8B_bf16_tp4_sharegpt",
100+
"qps_list": ["inf"],
101+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
102+
"server_environment_variables": {
103+
"VLLM_RPC_TIMEOUT": 100000,
104+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
105+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
106+
"VLLM_CPU_SGL_KERNEL": 1,
107+
"VLLM_CPU_KVCACHE_SPACE": 40
108+
},
109+
"server_parameters": {
110+
"model": "meta-llama/Llama-3.1-8B-Instruct",
111+
"tensor_parallel_size": 4,
112+
"dtype": "bfloat16",
113+
"distributed_executor_backend": "mp",
114+
"block_size": 128,
115+
"trust_remote_code": "",
116+
"disable_log_stats": "",
117+
"enforce_eager": "",
118+
"max_num_batched_tokens": 2048,
119+
"max_num_seqs": 256,
120+
"load_format": "dummy"
121+
},
122+
"client_parameters": {
123+
"model": "meta-llama/Llama-3.1-8B-Instruct",
124+
"backend": "vllm",
125+
"dataset_name": "sharegpt",
126+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
127+
"num_prompts": 200
128+
}
129+
},
98130
{
99131
"test_name": "serving_llama8B_bf16_tp2pp3_sharegpt",
100132
"qps_list": ["inf"],
@@ -233,6 +265,41 @@
233265
"num_prompts": 1000
234266
}
235267
},
268+
{
269+
"test_name": "serving_llama8B_bf16_tp4_random_128_128",
270+
"qps_list": ["inf"],
271+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
272+
"server_environment_variables": {
273+
"VLLM_RPC_TIMEOUT": 100000,
274+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
275+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
276+
"VLLM_CPU_SGL_KERNEL": 1,
277+
"VLLM_CPU_KVCACHE_SPACE": 40
278+
},
279+
"server_parameters": {
280+
"model": "meta-llama/Llama-3.1-8B-Instruct",
281+
"tensor_parallel_size": 4,
282+
"dtype": "bfloat16",
283+
"distributed_executor_backend": "mp",
284+
"block_size": 128,
285+
"trust_remote_code": "",
286+
"enable_chunked_prefill": "",
287+
"disable_log_stats": "",
288+
"enforce_eager": "",
289+
"max_num_batched_tokens": 2048,
290+
"max_num_seqs": 256,
291+
"load_format": "dummy"
292+
},
293+
"client_parameters": {
294+
"model": "meta-llama/Llama-3.1-8B-Instruct",
295+
"backend": "vllm",
296+
"dataset_name": "random",
297+
"random-input-len": 128,
298+
"random-output-len": 128,
299+
"ignore-eos": "",
300+
"num_prompts": 1000
301+
}
302+
},
236303
{
237304
"test_name": "serving_llama8B_bf16_tp2pp3_random_128_128",
238305
"qps_list": ["inf"],
@@ -365,6 +432,38 @@
365432
"num_prompts": 200
366433
}
367434
},
435+
{
436+
"test_name": "serving_llama8B_int8_tp4_sharegpt",
437+
"qps_list": ["inf"],
438+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
439+
"server_environment_variables": {
440+
"VLLM_RPC_TIMEOUT": 100000,
441+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
442+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
443+
"VLLM_CPU_SGL_KERNEL": 1,
444+
"VLLM_CPU_KVCACHE_SPACE": 40
445+
},
446+
"server_parameters": {
447+
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
448+
"tensor_parallel_size": 4,
449+
"dtype": "bfloat16",
450+
"distributed_executor_backend": "mp",
451+
"block_size": 128,
452+
"trust_remote_code": "",
453+
"disable_log_stats": "",
454+
"enforce_eager": "",
455+
"max_num_batched_tokens": 2048,
456+
"max_num_seqs": 256,
457+
"load_format": "dummy"
458+
},
459+
"client_parameters": {
460+
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
461+
"backend": "vllm",
462+
"dataset_name": "sharegpt",
463+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
464+
"num_prompts": 200
465+
}
466+
},
368467
{
369468
"test_name": "serving_llama8B_int8_tp2pp3_sharegpt",
370469
"qps_list": ["inf"],
@@ -503,6 +602,41 @@
503602
"num_prompts": 1000
504603
}
505604
},
605+
{
606+
"test_name": "serving_llama8B_int8_tp4_random_128_128",
607+
"qps_list": ["inf"],
608+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
609+
"server_environment_variables": {
610+
"VLLM_RPC_TIMEOUT": 100000,
611+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
612+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
613+
"VLLM_CPU_SGL_KERNEL": 1,
614+
"VLLM_CPU_KVCACHE_SPACE": 40
615+
},
616+
"server_parameters": {
617+
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
618+
"tensor_parallel_size": 4,
619+
"dtype": "bfloat16",
620+
"distributed_executor_backend": "mp",
621+
"block_size": 128,
622+
"trust_remote_code": "",
623+
"enable_chunked_prefill": "",
624+
"disable_log_stats": "",
625+
"enforce_eager": "",
626+
"max_num_batched_tokens": 2048,
627+
"max_num_seqs": 256,
628+
"load_format": "dummy"
629+
},
630+
"client_parameters": {
631+
"model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8",
632+
"backend": "vllm",
633+
"dataset_name": "random",
634+
"random-input-len": 128,
635+
"random-output-len": 128,
636+
"ignore-eos": "",
637+
"num_prompts": 1000
638+
}
639+
},
506640
{
507641
"test_name": "serving_llama8B_int8_tp2pp3_random_128_128",
508642
"qps_list": ["inf"],
@@ -638,6 +772,39 @@
638772
"num_prompts": 200
639773
}
640774
},
775+
{
776+
"test_name": "serving_llama8B_int4_tp4_sharegpt",
777+
"qps_list": ["inf"],
778+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
779+
"server_environment_variables": {
780+
"VLLM_RPC_TIMEOUT": 100000,
781+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
782+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
783+
"VLLM_CPU_SGL_KERNEL": 1,
784+
"VLLM_CPU_KVCACHE_SPACE": 40
785+
},
786+
"server_parameters": {
787+
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
788+
"quantization": "awq",
789+
"tensor_parallel_size": 4,
790+
"dtype": "bfloat16",
791+
"distributed_executor_backend": "mp",
792+
"block_size": 128,
793+
"trust_remote_code": "",
794+
"disable_log_stats": "",
795+
"enforce_eager": "",
796+
"max_num_batched_tokens": 2048,
797+
"max_num_seqs": 256,
798+
"load_format": "dummy"
799+
},
800+
"client_parameters": {
801+
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
802+
"backend": "vllm",
803+
"dataset_name": "sharegpt",
804+
"dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
805+
"num_prompts": 200
806+
}
807+
},
641808
{
642809
"test_name": "serving_llama8B_int4_tp2pp3_sharegpt",
643810
"qps_list": ["inf"],
@@ -780,6 +947,42 @@
780947
"num_prompts": 1000
781948
}
782949
},
950+
{
951+
"test_name": "serving_llama8B_int4_tp4_random_128_128",
952+
"qps_list": ["inf"],
953+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000],
954+
"server_environment_variables": {
955+
"VLLM_RPC_TIMEOUT": 100000,
956+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
957+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
958+
"VLLM_CPU_SGL_KERNEL": 1,
959+
"VLLM_CPU_KVCACHE_SPACE": 40
960+
},
961+
"server_parameters": {
962+
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
963+
"quantization": "awq",
964+
"tensor_parallel_size": 4,
965+
"dtype": "bfloat16",
966+
"distributed_executor_backend": "mp",
967+
"block_size": 128,
968+
"trust_remote_code": "",
969+
"enable_chunked_prefill": "",
970+
"disable_log_stats": "",
971+
"enforce_eager": "",
972+
"max_num_batched_tokens": 2048,
973+
"max_num_seqs": 256,
974+
"load_format": "dummy"
975+
},
976+
"client_parameters": {
977+
"model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4",
978+
"backend": "vllm",
979+
"dataset_name": "random",
980+
"random-input-len": 128,
981+
"random-output-len": 128,
982+
"ignore-eos": "",
983+
"num_prompts": 1000
984+
}
985+
},
783986
{
784987
"test_name": "serving_llama8B_int4_tp2pp3_random_128_128",
785988
"qps_list": ["inf"],

.buildkite/nightly-benchmarks/tests/serving-tests-cpu.json

Lines changed: 43 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@
9696
}
9797
},
9898
{
99-
"test_name": "serving_llama8B_tp4_random_1024_128",
99+
"test_name": "serving_llama8B_tp1_random_128_128",
100100
"qps_list": [1, 4, 16, "inf"],
101101
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
102102
"server_environment_variables": {
@@ -108,7 +108,42 @@
108108
},
109109
"server_parameters": {
110110
"model": "meta-llama/Llama-3.1-8B-Instruct",
111-
"tensor_parallel_size": 4,
111+
"tensor_parallel_size": 1,
112+
"dtype": "bfloat16",
113+
"distributed_executor_backend": "mp",
114+
"block_size": 128,
115+
"trust_remote_code": "",
116+
"enable_chunked_prefill": "",
117+
"disable_log_stats": "",
118+
"enforce_eager": "",
119+
"max_num_batched_tokens": 2048,
120+
"max_num_seqs": 256,
121+
"load_format": "dummy"
122+
},
123+
"client_parameters": {
124+
"model": "meta-llama/Llama-3.1-8B-Instruct",
125+
"backend": "vllm",
126+
"dataset_name": "random",
127+
"random-input-len": 128,
128+
"random-output-len": 128,
129+
"ignore-eos": "",
130+
"num_prompts": 200
131+
}
132+
},
133+
{
134+
"test_name": "serving_llama8B_tp2_random_128_128",
135+
"qps_list": [1, 4, 16, "inf"],
136+
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
137+
"server_environment_variables": {
138+
"VLLM_RPC_TIMEOUT": 100000,
139+
"VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1,
140+
"VLLM_ENGINE_ITERATION_TIMEOUT_S": 120,
141+
"VLLM_CPU_SGL_KERNEL": 1,
142+
"VLLM_CPU_KVCACHE_SPACE": 40
143+
},
144+
"server_parameters": {
145+
"model": "meta-llama/Llama-3.1-8B-Instruct",
146+
"tensor_parallel_size": 2,
112147
"dtype": "bfloat16",
113148
"distributed_executor_backend": "mp",
114149
"block_size": 128,
@@ -124,14 +159,14 @@
124159
"model": "meta-llama/Llama-3.1-8B-Instruct",
125160
"backend": "vllm",
126161
"dataset_name": "random",
127-
"random-input-len": 1024,
162+
"random-input-len": 128,
128163
"random-output-len": 128,
129164
"ignore-eos": "",
130-
"num_prompts": 100
165+
"num_prompts": 200
131166
}
132167
},
133168
{
134-
"test_name": "serving_llama8B_pp6_random_1024_128",
169+
"test_name": "serving_llama8B_tp4_random_128_128",
135170
"qps_list": [1, 4, 16, "inf"],
136171
"max_concurrency_list": [12, 16, 24, 32, 64, 128, 200],
137172
"server_environment_variables": {
@@ -143,7 +178,7 @@
143178
},
144179
"server_parameters": {
145180
"model": "meta-llama/Llama-3.1-8B-Instruct",
146-
"pipeline_parallel_size": 6,
181+
"tensor_parallel_size": 4,
147182
"dtype": "bfloat16",
148183
"distributed_executor_backend": "mp",
149184
"block_size": 128,
@@ -159,10 +194,10 @@
159194
"model": "meta-llama/Llama-3.1-8B-Instruct",
160195
"backend": "vllm",
161196
"dataset_name": "random",
162-
"random-input-len": 1024,
197+
"random-input-len": 128,
163198
"random-output-len": 128,
164199
"ignore-eos": "",
165-
"num_prompts": 100
200+
"num_prompts": 200
166201
}
167202
}
168203
]

0 commit comments

Comments
 (0)