|
95 | 95 | "num_prompts": 200 |
96 | 96 | } |
97 | 97 | }, |
| 98 | + { |
| 99 | + "test_name": "serving_llama8B_bf16_tp4_sharegpt", |
| 100 | + "qps_list": ["inf"], |
| 101 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |
| 102 | + "server_environment_variables": { |
| 103 | + "VLLM_RPC_TIMEOUT": 100000, |
| 104 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 105 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 106 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 107 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 108 | + }, |
| 109 | + "server_parameters": { |
| 110 | + "model": "meta-llama/Llama-3.1-8B-Instruct", |
| 111 | + "tensor_parallel_size": 4, |
| 112 | + "dtype": "bfloat16", |
| 113 | + "distributed_executor_backend": "mp", |
| 114 | + "block_size": 128, |
| 115 | + "trust_remote_code": "", |
| 116 | + "disable_log_stats": "", |
| 117 | + "enforce_eager": "", |
| 118 | + "max_num_batched_tokens": 2048, |
| 119 | + "max_num_seqs": 256, |
| 120 | + "load_format": "dummy" |
| 121 | + }, |
| 122 | + "client_parameters": { |
| 123 | + "model": "meta-llama/Llama-3.1-8B-Instruct", |
| 124 | + "backend": "vllm", |
| 125 | + "dataset_name": "sharegpt", |
| 126 | + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |
| 127 | + "num_prompts": 200 |
| 128 | + } |
| 129 | + }, |
98 | 130 | { |
99 | 131 | "test_name": "serving_llama8B_bf16_tp2pp3_sharegpt", |
100 | 132 | "qps_list": ["inf"], |
|
233 | 265 | "num_prompts": 1000 |
234 | 266 | } |
235 | 267 | }, |
| 268 | + { |
| 269 | + "test_name": "serving_llama8B_bf16_tp4_random_128_128", |
| 270 | + "qps_list": ["inf"], |
| 271 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |
| 272 | + "server_environment_variables": { |
| 273 | + "VLLM_RPC_TIMEOUT": 100000, |
| 274 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 275 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 276 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 277 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 278 | + }, |
| 279 | + "server_parameters": { |
| 280 | + "model": "meta-llama/Llama-3.1-8B-Instruct", |
| 281 | + "tensor_parallel_size": 4, |
| 282 | + "dtype": "bfloat16", |
| 283 | + "distributed_executor_backend": "mp", |
| 284 | + "block_size": 128, |
| 285 | + "trust_remote_code": "", |
| 286 | + "enable_chunked_prefill": "", |
| 287 | + "disable_log_stats": "", |
| 288 | + "enforce_eager": "", |
| 289 | + "max_num_batched_tokens": 2048, |
| 290 | + "max_num_seqs": 256, |
| 291 | + "load_format": "dummy" |
| 292 | + }, |
| 293 | + "client_parameters": { |
| 294 | + "model": "meta-llama/Llama-3.1-8B-Instruct", |
| 295 | + "backend": "vllm", |
| 296 | + "dataset_name": "random", |
| 297 | + "random-input-len": 128, |
| 298 | + "random-output-len": 128, |
| 299 | + "ignore-eos": "", |
| 300 | + "num_prompts": 1000 |
| 301 | + } |
| 302 | + }, |
236 | 303 | { |
237 | 304 | "test_name": "serving_llama8B_bf16_tp2pp3_random_128_128", |
238 | 305 | "qps_list": ["inf"], |
|
365 | 432 | "num_prompts": 200 |
366 | 433 | } |
367 | 434 | }, |
| 435 | + { |
| 436 | + "test_name": "serving_llama8B_int8_tp4_sharegpt", |
| 437 | + "qps_list": ["inf"], |
| 438 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |
| 439 | + "server_environment_variables": { |
| 440 | + "VLLM_RPC_TIMEOUT": 100000, |
| 441 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 442 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 443 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 444 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 445 | + }, |
| 446 | + "server_parameters": { |
| 447 | + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |
| 448 | + "tensor_parallel_size": 4, |
| 449 | + "dtype": "bfloat16", |
| 450 | + "distributed_executor_backend": "mp", |
| 451 | + "block_size": 128, |
| 452 | + "trust_remote_code": "", |
| 453 | + "disable_log_stats": "", |
| 454 | + "enforce_eager": "", |
| 455 | + "max_num_batched_tokens": 2048, |
| 456 | + "max_num_seqs": 256, |
| 457 | + "load_format": "dummy" |
| 458 | + }, |
| 459 | + "client_parameters": { |
| 460 | + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |
| 461 | + "backend": "vllm", |
| 462 | + "dataset_name": "sharegpt", |
| 463 | + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |
| 464 | + "num_prompts": 200 |
| 465 | + } |
| 466 | + }, |
368 | 467 | { |
369 | 468 | "test_name": "serving_llama8B_int8_tp2pp3_sharegpt", |
370 | 469 | "qps_list": ["inf"], |
|
503 | 602 | "num_prompts": 1000 |
504 | 603 | } |
505 | 604 | }, |
| 605 | + { |
| 606 | + "test_name": "serving_llama8B_int8_tp4_random_128_128", |
| 607 | + "qps_list": ["inf"], |
| 608 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |
| 609 | + "server_environment_variables": { |
| 610 | + "VLLM_RPC_TIMEOUT": 100000, |
| 611 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 612 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 613 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 614 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 615 | + }, |
| 616 | + "server_parameters": { |
| 617 | + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |
| 618 | + "tensor_parallel_size": 4, |
| 619 | + "dtype": "bfloat16", |
| 620 | + "distributed_executor_backend": "mp", |
| 621 | + "block_size": 128, |
| 622 | + "trust_remote_code": "", |
| 623 | + "enable_chunked_prefill": "", |
| 624 | + "disable_log_stats": "", |
| 625 | + "enforce_eager": "", |
| 626 | + "max_num_batched_tokens": 2048, |
| 627 | + "max_num_seqs": 256, |
| 628 | + "load_format": "dummy" |
| 629 | + }, |
| 630 | + "client_parameters": { |
| 631 | + "model": "RedHatAI/Meta-Llama-3.1-8B-Instruct-quantized.w8a8", |
| 632 | + "backend": "vllm", |
| 633 | + "dataset_name": "random", |
| 634 | + "random-input-len": 128, |
| 635 | + "random-output-len": 128, |
| 636 | + "ignore-eos": "", |
| 637 | + "num_prompts": 1000 |
| 638 | + } |
| 639 | + }, |
506 | 640 | { |
507 | 641 | "test_name": "serving_llama8B_int8_tp2pp3_random_128_128", |
508 | 642 | "qps_list": ["inf"], |
|
638 | 772 | "num_prompts": 200 |
639 | 773 | } |
640 | 774 | }, |
| 775 | + { |
| 776 | + "test_name": "serving_llama8B_int4_tp4_sharegpt", |
| 777 | + "qps_list": ["inf"], |
| 778 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200], |
| 779 | + "server_environment_variables": { |
| 780 | + "VLLM_RPC_TIMEOUT": 100000, |
| 781 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 782 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 783 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 784 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 785 | + }, |
| 786 | + "server_parameters": { |
| 787 | + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |
| 788 | + "quantization": "awq", |
| 789 | + "tensor_parallel_size": 4, |
| 790 | + "dtype": "bfloat16", |
| 791 | + "distributed_executor_backend": "mp", |
| 792 | + "block_size": 128, |
| 793 | + "trust_remote_code": "", |
| 794 | + "disable_log_stats": "", |
| 795 | + "enforce_eager": "", |
| 796 | + "max_num_batched_tokens": 2048, |
| 797 | + "max_num_seqs": 256, |
| 798 | + "load_format": "dummy" |
| 799 | + }, |
| 800 | + "client_parameters": { |
| 801 | + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |
| 802 | + "backend": "vllm", |
| 803 | + "dataset_name": "sharegpt", |
| 804 | + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", |
| 805 | + "num_prompts": 200 |
| 806 | + } |
| 807 | + }, |
641 | 808 | { |
642 | 809 | "test_name": "serving_llama8B_int4_tp2pp3_sharegpt", |
643 | 810 | "qps_list": ["inf"], |
|
780 | 947 | "num_prompts": 1000 |
781 | 948 | } |
782 | 949 | }, |
| 950 | + { |
| 951 | + "test_name": "serving_llama8B_int4_tp4_random_128_128", |
| 952 | + "qps_list": ["inf"], |
| 953 | + "max_concurrency_list": [12, 16, 24, 32, 64, 128, 200, 1000], |
| 954 | + "server_environment_variables": { |
| 955 | + "VLLM_RPC_TIMEOUT": 100000, |
| 956 | + "VLLM_ALLOW_LONG_MAX_MODEL_LEN": 1, |
| 957 | + "VLLM_ENGINE_ITERATION_TIMEOUT_S": 120, |
| 958 | + "VLLM_CPU_SGL_KERNEL": 1, |
| 959 | + "VLLM_CPU_KVCACHE_SPACE": 40 |
| 960 | + }, |
| 961 | + "server_parameters": { |
| 962 | + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |
| 963 | + "quantization": "awq", |
| 964 | + "tensor_parallel_size": 4, |
| 965 | + "dtype": "bfloat16", |
| 966 | + "distributed_executor_backend": "mp", |
| 967 | + "block_size": 128, |
| 968 | + "trust_remote_code": "", |
| 969 | + "enable_chunked_prefill": "", |
| 970 | + "disable_log_stats": "", |
| 971 | + "enforce_eager": "", |
| 972 | + "max_num_batched_tokens": 2048, |
| 973 | + "max_num_seqs": 256, |
| 974 | + "load_format": "dummy" |
| 975 | + }, |
| 976 | + "client_parameters": { |
| 977 | + "model": "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4", |
| 978 | + "backend": "vllm", |
| 979 | + "dataset_name": "random", |
| 980 | + "random-input-len": 128, |
| 981 | + "random-output-len": 128, |
| 982 | + "ignore-eos": "", |
| 983 | + "num_prompts": 1000 |
| 984 | + } |
| 985 | + }, |
783 | 986 | { |
784 | 987 | "test_name": "serving_llama8B_int4_tp2pp3_random_128_128", |
785 | 988 | "qps_list": ["inf"], |
|
0 commit comments