This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit: Benchmark: Update benchmark configs for Nightly (#126)
SUMMARY:
Update the benchmark configs so that the Nightly runs the following models:
- `Mistral 7b`
  * Base: teknium/OpenHermes-2.5-Mistral-7B
  * GPTQ: TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ
  * Marlin: neuralmagic/OpenHermes-2.5-Mistral-7B-marlin
  * Sparse: neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50
  * Sparse 2:4: neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4
- `Llama 7b fp16`
  * NousResearch/Llama-2-7b-chat-hf

- Update the benchmark_serving num_prompts and qps pairs.
- Minor updates to the benchmark_throughput prefill and decode cases.


TEST PLAN:
Manual testing

---------

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
varun-sundar-rabindranath and Varun Sundar Rabindranath authored Mar 15, 2024
1 parent b8c95c3 commit ac8f242
Showing 9 changed files with 316 additions and 83 deletions.
2 changes: 2 additions & 0 deletions .github/data/nm_benchmark_nightly_configs_list.txt
@@ -1,3 +1,5 @@
 neuralmagic/benchmarks/configs/benchmark_serving.json
 neuralmagic/benchmarks/configs/benchmark_throughput.json
+neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
+neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
 neuralmagic/benchmarks/configs/benchmark_remote_push.json
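For orientation, here is a minimal sketch of how a nightly runner could consume this list file, assuming one config path per line and the JSON schema of the configs below; the function and entry point are illustrative, not the repository's actual code.

import json
from pathlib import Path

# Illustrative sketch only -- the real runner lives elsewhere in this repo.
# Assumes: one config path per line; each config JSON has a top-level
# "configs" list whose entries name a script and its argument matrix.
def iter_nightly_benchmarks(list_file: str):
    for line in Path(list_file).read_text().splitlines():
        path = line.strip()
        if not path:
            continue
        for bench in json.loads(Path(path).read_text())["configs"]:
            yield bench["script_name"], bench

for script, bench in iter_nightly_benchmarks(
        ".github/data/nm_benchmark_nightly_configs_list.txt"):
    print(f"{script}: {bench['description']}")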
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
@@ -37,7 +37,7 @@ jobs:
     with:
       label: aws-avx2-192G-4-a10g-96G
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
-      timeout: 240
+      timeout: 480
      gitref: '${{ github.ref }}'
       Gi_per_thread: 4
       python: "3.10.12"
@@ -49,7 +49,7 @@
     with:
       label: aws-avx2-32G-a10g-24G
       benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
-      timeout: 240
+      timeout: 720
      gitref: '${{ github.ref }}'
       Gi_per_thread: 12
       python: "3.10.12"
16 changes: 9 additions & 7 deletions neuralmagic/benchmarks/configs/benchmark_remote_push.json
@@ -1,24 +1,26 @@
 {
     "configs": [
         {
-            "description": "Benchmark vllm engine throughput - synthetic",
+            "description": "VLLM Engine throughput - synthetic",
             "models": [
                 "NousResearch/Llama-2-7b-chat-hf"
             ],
-            "max_model_lens" : [4096],
+            "max_model_lens": [
+                4096
+            ],
             "script_name": "benchmark_throughput",
             "script_args": {
-                "input-len" : [
-                    256
-                ],
+                "input-len": [
+                    256
+                ],
                 "output-len": [
                     128
                 ],
                 "num-prompts": [
                     1000
                 ],
-                "use-all-available-gpus_" : []
+                "use-all-available-gpus_": []
             }
         }
     ]
-}
+}
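A plausible reading of these configs (an assumption, not verified against the runner): each script_args key becomes a --flag of the named script, and the cartesian product of the value lists defines the sweep. A sketch under those assumptions; the module path below is hypothetical, and keys with a trailing "_" (e.g. "use-all-available-gpus_") appear to be pseudo-flags the runner handles itself.

import itertools

# One config entry's argument matrix, taken from the JSON above.
script_args = {"input-len": [256], "output-len": [128], "num-prompts": [1000]}

def expand(model: str, max_model_len: int) -> list[list[str]]:
    """Expand an argument matrix into one CLI invocation per combination."""
    keys = list(script_args)
    commands = []
    for values in itertools.product(*script_args.values()):
        cmd = ["python", "-m", "neuralmagic.benchmarks.scripts.benchmark_throughput",
               "--model", model, "--max-model-len", str(max_model_len)]
        for key, value in zip(keys, values):
            cmd += [f"--{key}", str(value)]
        commands.append(cmd)
    return commands

for cmd in expand("NousResearch/Llama-2-7b-chat-hf", 4096):
    print(" ".join(cmd))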
68 changes: 57 additions & 11 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -1,12 +1,12 @@
 {
     "configs": [
         {
-            "description": "VLLM Serving",
+            "description": "VLLM Serving - Dense",
             "models": [
-                "facebook/opt-125m",
-                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                "mistralai/Mistral-7B-Instruct-v0.2",
-                "NousResearch/Llama-2-7b-chat-hf"
+                "teknium/OpenHermes-2.5-Mistral-7B",
+                "NousResearch/Llama-2-7b-chat-hf",
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+                "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
             ],
             "use_all_available_gpus" : "",
             "max_model_lens": [
@@ -16,13 +16,59 @@
             "script_name": "benchmark_serving",
             "script_args": {
                 "nr-qps-pair_": [
-                    "50,0.5",
-                    "100,1",
-                    "200,2",
-                    "500,5"
+                    "150,0.5",
+                    "300,1",
+                    "750,2.5",
+                    "1500,5",
+                    "3000,10"
                 ],
-                "best-of": [
-                    1
-                ],
                 "dataset": [
                     "sharegpt"
                 ]
             }
         },
+        {
+            "description": "VLLM Serving - Sparse",
+            "models": [
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+            ],
+            "use_all_available_gpus" : "",
+            "max_model_lens": [
+                4096
+            ],
+            "sparsity": ["sparse_w16a16"],
+            "script_name": "benchmark_serving",
+            "script_args": {
+                "nr-qps-pair_": [
+                    "150,0.5",
+                    "300,1",
+                    "750,2.5",
+                    "1500,5",
+                    "3000,10"
+                ],
+                "dataset": [
+                    "sharegpt"
+                ]
+            }
+        },
+        {
+            "description": "VLLM Serving - 2:4 Sparse",
+            "models": [
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
+            ],
+            "use_all_available_gpus" : "",
+            "max_model_lens": [
+                4096
+            ],
+            "sparsity": ["semi_structured_sparse_w16a16"],
+            "script_name": "benchmark_serving",
+            "script_args": {
+                "nr-qps-pair_": [
+                    "150,0.5",
+                    "300,1",
+                    "750,2.5",
+                    "1500,5",
+                    "3000,10"
+                ],
+                "dataset": [
+                    "sharegpt"
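The new nr-qps-pair_ values share a fixed ratio: every num_prompts/qps pair works out to roughly 300 seconds of request arrival (the replaced pairs were ~100 seconds each), which lines up with the timeout increases in nightly.yml above. A quick check:

# Each "num_prompts,qps" pair fixes the request-arrival window:
# expected duration ~= num_prompts / qps at the target request rate.
for pair in ["150,0.5", "300,1", "750,2.5", "1500,5", "3000,10"]:
    n, qps = pair.split(",")
    print(f"{pair}: ~{float(n) / float(qps):.0f} s of arrivals")
# All five pairs give ~300 s; the old pairs ("50,0.5" ... "500,5")
# gave ~100 s each, so each serving sweep now runs ~3x longer per QPS point.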
86 changes: 34 additions & 52 deletions neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -1,97 +1,79 @@
 {
     "configs": [
         {
-            "description": "VLLM Engine throughput (with dataset)",
+            "description": "VLLM Engine throughput - Dense (with dataset)",
             "models": [
-                "facebook/opt-125m",
-                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                "mistralai/Mistral-7B-Instruct-v0.2",
+                "teknium/OpenHermes-2.5-Mistral-7B",
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+                "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
                 "NousResearch/Llama-2-7b-chat-hf"
             ],
-            "max_model_lens" : [4096],
+            "max_model_lens": [
+                4096
+            ],
             "script_name": "benchmark_throughput",
             "script_args": {
-                "backend": [
-                    "vllm"
-                ],
                 "dataset": [
                     "sharegpt"
                 ],
                 "output-len": [
                     128
                 ],
-                "n": [
-                    1
-                ],
                 "num-prompts": [
                     1000
                 ],
-                "seed": [
-                    0
-                ],
-                "dtype": [
-                    "auto"
-                ],
-                "use-all-available-gpus_" : []
+                "use-all-available-gpus_": []
             }
         },
         {
-            "description": "VLLM Engine prefill throughput (synthetic)",
+            "description": "VLLM Engine throughput - Sparse (with dataset)",
             "models": [
-                "facebook/opt-125m",
-                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                "mistralai/Mistral-7B-Instruct-v0.2",
-                "NousResearch/Llama-2-7b-chat-hf"
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
             ],
-            "max_model_lens" : [4096],
+            "max_model_lens": [
+                4096
+            ],
             "script_name": "benchmark_throughput",
             "script_args": {
-                "input-len": [
-                    1,
-                    16,
-                    32,
-                    64,
-                    128,
-                    256,
-                    512,
-                    1024
+                "dataset": [
+                    "sharegpt"
                 ],
                 "output-len": [
-                    1
+                    128
                 ],
                 "num-prompts": [
-                    1
+                    1000
                 ],
-                "use-all-available-gpus_" : []
+                "sparsity": [
+                    "sparse_w16a16"
+                ],
+                "use-all-available-gpus_": []
             }
         },
         {
-            "description": "VLLM Engine decode throughput (synthetic)",
+            "description": "VLLM Engine throughput - 2:4 Sparse (with dataset)",
             "models": [
-                "facebook/opt-125m",
-                "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                "mistralai/Mistral-7B-Instruct-v0.2",
-                "NousResearch/Llama-2-7b-chat-hf"
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
             ],
-            "max_model_lens" : [4096],
+            "max_model_lens": [
+                4096
+            ],
             "script_name": "benchmark_throughput",
             "script_args": {
-                "input-len": [
-                    2
+                "dataset": [
+                    "sharegpt"
                 ],
                 "output-len": [
                     128
                 ],
                 "num-prompts": [
-                    1,
-                    4,
-                    8,
-                    16,
-                    32,
-                    64
+                    1000
                 ],
-                "use-all-available-gpus_" : []
+                "sparsity": [
+                    "semi_structured_sparse_w16a16"
+                ],
+                "use-all-available-gpus_": []
             }
         }
     ]
-}
+}
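The sparsity values select nm-vllm's sparse inference kernels. A minimal sketch of how such a flag would reach the engine, assuming it is forwarded to the LLM constructor; the sparsity parameter was nm-vllm-specific (upstream vLLM has no such argument), and the exact signature here is an assumption, not verified against this commit.

from vllm import LLM, SamplingParams

# Hedged sketch: how the "sparsity" script arg plausibly reaches the engine.
# Treat the sparsity kwarg as an assumption rather than a verified API.
llm = LLM(
    model="neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50",
    max_model_len=4096,
    sparsity="sparse_w16a16",  # "semi_structured_sparse_w16a16" for the 2:4 model
)
outputs = llm.generate(["A benchmark prompt"], SamplingParams(max_tokens=128))
print(outputs[0].outputs[0].text)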
94 changes: 94 additions & 0 deletions neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
@@ -0,0 +1,94 @@
+{
+    "configs": [
+        {
+            "description": "VLLM Engine decode throughput - Dense (synthetic)",
+            "models": [
+                "teknium/OpenHermes-2.5-Mistral-7B",
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+                "TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
+                "NousResearch/Llama-2-7b-chat-hf"
+            ],
+            "max_model_lens": [
+                4096
+            ],
+            "script_name": "benchmark_throughput",
+            "script_args": {
+                "input-len": [
+                    2
+                ],
+                "output-len": [
+                    128
+                ],
+                "num-prompts": [
+                    1,
+                    4,
+                    8,
+                    16,
+                    32,
+                    64
+                ],
+                "use-all-available-gpus_": []
+            }
+        },
+        {
+            "description": "VLLM Engine decode throughput - Sparse (synthetic)",
+            "models": [
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+            ],
+            "max_model_lens": [
+                4096
+            ],
+            "script_name": "benchmark_throughput",
+            "script_args": {
+                "input-len": [
+                    2
+                ],
+                "output-len": [
+                    128
+                ],
+                "num-prompts": [
+                    1,
+                    4,
+                    8,
+                    16,
+                    32,
+                    64
+                ],
+                "sparsity": [
+                    "sparse_w16a16"
+                ],
+                "use-all-available-gpus_": []
+            }
+        },
+        {
+            "description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)",
+            "models": [
+                "neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
+            ],
+            "max_model_lens": [
+                4096
+            ],
+            "script_name": "benchmark_throughput",
+            "script_args": {
+                "input-len": [
+                    2
+                ],
+                "output-len": [
+                    128
+                ],
+                "num-prompts": [
+                    1,
+                    4,
+                    8,
+                    16,
+                    32,
+                    64
+                ],
+                "sparsity": [
+                    "semi_structured_sparse_w16a16"
+                ],
+                "use-all-available-gpus_": []
+            }
+        }
+    ]
+}
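These decode configs hold input-len at 2 and sweep num-prompts from 1 to 64, so nearly all measured work is token generation; the sweep isolates decode throughput as a function of batch size. Reading the numbers back out, a sketch of the arithmetic (not the benchmark's actual reporting code, and the elapsed time is a made-up example):

# With input-len=2, prefill cost is negligible, so generated tokens per
# second approximates pure decode throughput at each batch size.
def decode_tokens_per_second(num_prompts: int, output_len: int,
                             elapsed_s: float) -> float:
    return num_prompts * output_len / elapsed_s

# e.g. 64 prompts x 128 generated tokens finishing in a hypothetical 10 s:
print(decode_tokens_per_second(64, 128, 10.0))  # -> 819.2 tok/s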