Skip to content
This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Benchmark : Update benchmark configs for Nightly #126

Merged
merged 6 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/data/nm_benchmark_nightly_configs_list.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
neuralmagic/benchmarks/configs/benchmark_serving.json
neuralmagic/benchmarks/configs/benchmark_throughput.json
neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
neuralmagic/benchmarks/configs/benchmark_remote_push.json
4 changes: 2 additions & 2 deletions .github/workflows/nightly.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
with:
label: aws-avx2-192G-4-a10g-96G
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
timeout: 240
timeout: 480
gitref: '${{ github.ref }}'
Gi_per_thread: 4
python: "3.10.12"
Expand All @@ -49,7 +49,7 @@ jobs:
with:
label: aws-avx2-32G-a10g-24G
benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt
timeout: 240
timeout: 720
gitref: '${{ github.ref }}'
Gi_per_thread: 12
python: "3.10.12"
Expand Down
16 changes: 9 additions & 7 deletions neuralmagic/benchmarks/configs/benchmark_remote_push.json
Original file line number Diff line number Diff line change
@@ -1,24 +1,26 @@
{
"configs": [
{
"description": "Benchmark vllm engine throughput - synthetic",
"description": "VLLM Engine throughput - synthetic",
"models": [
"NousResearch/Llama-2-7b-chat-hf"
],
"max_model_lens" : [4096],
"max_model_lens": [
4096
],
"script_name": "benchmark_throughput",
"script_args": {
"input-len" : [
256
],
"input-len": [
256
],
"output-len": [
128
],
"num-prompts": [
1000
],
"use-all-available-gpus_" : []
"use-all-available-gpus_": []
}
}
]
}
}
68 changes: 57 additions & 11 deletions neuralmagic/benchmarks/configs/benchmark_serving.json
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
{
"configs": [
{
"description": "VLLM Serving",
"description": "VLLM Serving - Dense",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"mistralai/Mistral-7B-Instruct-v0.2",
"NousResearch/Llama-2-7b-chat-hf"
"teknium/OpenHermes-2.5-Mistral-7B",
"NousResearch/Llama-2-7b-chat-hf",
"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
],
"use_all_available_gpus" : "",
"max_model_lens": [
Expand All @@ -16,13 +16,59 @@
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"50,0.5",
"100,1",
"200,2",
"500,5"
"150,0.5",
"300,1",
"750,2.5",
"1500,5",
"3000,10"
],
"best-of": [
1
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"300,1",
"750,2.5",
"1500,5",
"3000,10"
],
"dataset": [
"sharegpt"
]
}
},
{
"description": "VLLM Serving - 2:4 Sparse",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
],
"use_all_available_gpus" : "",
"max_model_lens": [
4096
],
"sparsity": ["semi_structured_sparse_w16a16"],
"script_name": "benchmark_serving",
"script_args": {
"nr-qps-pair_": [
"150,0.5",
"300,1",
"750,2.5",
"1500,5",
"3000,10"
],
"dataset": [
"sharegpt"
Expand Down
86 changes: 34 additions & 52 deletions neuralmagic/benchmarks/configs/benchmark_throughput.json
Original file line number Diff line number Diff line change
@@ -1,97 +1,79 @@
{
"configs": [
{
"description": "VLLM Engine throughput (with dataset)",
"description": "VLLM Engine throughput - Dense (with dataset)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"mistralai/Mistral-7B-Instruct-v0.2",
"teknium/OpenHermes-2.5-Mistral-7B",
"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
"NousResearch/Llama-2-7b-chat-hf"
],
"max_model_lens" : [4096],
"max_model_lens": [
4096
],
"script_name": "benchmark_throughput",
"script_args": {
"backend": [
"vllm"
],
"dataset": [
"sharegpt"
],
"output-len": [
128
],
"n": [
1
],
"num-prompts": [
1000
],
"seed": [
0
],
"dtype": [
"auto"
],
"use-all-available-gpus_" : []
"use-all-available-gpus_": []
}
},
{
"description": "VLLM Engine prefill throughput (synthetic)",
"description": "VLLM Engine throughput - Sparse (with dataset)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"mistralai/Mistral-7B-Instruct-v0.2",
"NousResearch/Llama-2-7b-chat-hf"
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
],
"max_model_lens": [
4096
],
"max_model_lens" : [4096],
"script_name": "benchmark_throughput",
"script_args": {
"input-len": [
1,
16,
32,
64,
128,
256,
512,
1024
"dataset": [
"sharegpt"
],
"output-len": [
1
128
],
"num-prompts": [
1
1000
],
"sparsity": [
"sparse_w16a16"
],
"use-all-available-gpus_" : []
"use-all-available-gpus_": []
}
},
{
"description": "VLLM Engine decode throughput (synthetic)",
"description": "VLLM Engine throughput - 2:4 Sparse (with dataset)",
"models": [
"facebook/opt-125m",
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"mistralai/Mistral-7B-Instruct-v0.2",
"NousResearch/Llama-2-7b-chat-hf"
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
],
"max_model_lens": [
4096
],
"max_model_lens" : [4096],
"script_name": "benchmark_throughput",
"script_args": {
"input-len": [
2
"dataset": [
"sharegpt"
],
"output-len": [
128
],
"num-prompts": [
1,
4,
8,
16,
32,
64
1000
],
"sparsity": [
"semi_structured_sparse_w16a16"
],
"use-all-available-gpus_" : []
"use-all-available-gpus_": []
}
}
]
}
}
94 changes: 94 additions & 0 deletions neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"configs": [
{
"description": "VLLM Engine decode throughput - Dense (synthetic)",
"models": [
"teknium/OpenHermes-2.5-Mistral-7B",
"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
"NousResearch/Llama-2-7b-chat-hf"
],
"max_model_lens": [
4096
],
"script_name": "benchmark_throughput",
"script_args": {
"input-len": [
2
],
"output-len": [
128
],
"num-prompts": [
1,
4,
8,
16,
32,
64
],
"use-all-available-gpus_": []
}
},
{
"description": "VLLM Engine decode throughput - Sparse (synthetic)",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
],
"max_model_lens": [
4096
],
"script_name": "benchmark_throughput",
"script_args": {
"input-len": [
2
],
"output-len": [
128
],
"num-prompts": [
1,
4,
8,
16,
32,
64
],
"sparsity": [
"sparse_w16a16"
],
"use-all-available-gpus_": []
}
},
{
"description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)",
"models": [
"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
],
"max_model_lens": [
4096
],
"script_name": "benchmark_throughput",
"script_args": {
"input-len": [
2
],
"output-len": [
128
],
"num-prompts": [
1,
4,
8,
16,
32,
64
],
"sparsity": [
"semi_structured_sparse_w16a16"
],
"use-all-available-gpus_": []
}
}
]
}
Loading
Loading