neuralmagic · varun-sundar-rabindranath · Mar 15, 2024 · Mar 14, 2024 · Mar 14, 2024 · Mar 14, 2024
diff --git a/.github/data/nm_benchmark_nightly_configs_list.txt b/.github/data/nm_benchmark_nightly_configs_list.txt
@@ -1,3 +1,5 @@
 neuralmagic/benchmarks/configs/benchmark_serving.json
 neuralmagic/benchmarks/configs/benchmark_throughput.json
+neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
+neuralmagic/benchmarks/configs/benchmark_throughput_prefill.json
 neuralmagic/benchmarks/configs/benchmark_remote_push.json
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -37,7 +37,7 @@ jobs:
         with:
             label: aws-avx2-192G-4-a10g-96G
             benchmark_config_list_file:  ./.github/data/nm_benchmark_nightly_configs_list.txt
-            timeout: 240
+            timeout: 480
             gitref: '${{ github.ref }}'
             Gi_per_thread: 4
             python: "3.10.12"
@@ -49,7 +49,7 @@ jobs:
         with:
             label: aws-avx2-32G-a10g-24G
             benchmark_config_list_file:  ./.github/data/nm_benchmark_nightly_configs_list.txt
-            timeout: 240
+            timeout: 720
             gitref: '${{ github.ref }}'
             Gi_per_thread: 12
             python: "3.10.12"

diff --git a/neuralmagic/benchmarks/configs/benchmark_remote_push.json b/neuralmagic/benchmarks/configs/benchmark_remote_push.json
@@ -1,24 +1,26 @@
 {
 	"configs": [
 		{
-			"description": "Benchmark vllm engine throughput - synthetic",
+			"description": "VLLM Engine throughput - synthetic",
 			"models": [
 				"NousResearch/Llama-2-7b-chat-hf"
 			],
-			"max_model_lens" : [4096],
+			"max_model_lens": [
+				4096
+			],
 			"script_name": "benchmark_throughput",
 			"script_args": {
-                                "input-len" : [
-                                        256
-                                ],
+				"input-len": [
+					256
+				],
 				"output-len": [
 					128
 				],
 				"num-prompts": [
 					1000
 				],
-				"use-all-available-gpus_" : []
+				"use-all-available-gpus_": []
 			}
 		}
 	]
-}
+}
diff --git a/neuralmagic/benchmarks/configs/benchmark_serving.json b/neuralmagic/benchmarks/configs/benchmark_serving.json
@@ -1,12 +1,12 @@
 {
 	"configs": [
 		{
-			"description": "VLLM Serving",
+			"description": "VLLM Serving - Dense",
 			"models": [
-				"facebook/opt-125m",
-				"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-				"mistralai/Mistral-7B-Instruct-v0.2",
-				"NousResearch/Llama-2-7b-chat-hf"
+				"teknium/OpenHermes-2.5-Mistral-7B",
+				"NousResearch/Llama-2-7b-chat-hf",
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+				"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ"
 			],
 			"use_all_available_gpus" : "",
 			"max_model_lens": [
@@ -16,13 +16,59 @@
 			"script_name": "benchmark_serving",
 			"script_args": {
 				"nr-qps-pair_": [
-					"50,0.5",
-					"100,1",
-					"200,2",
-					"500,5"
+					"150,0.5",
+					"300,1",
+					"750,2.5",
+					"1500,5",
+					"3000,10"
 				],
-				"best-of": [
-					1
+				"dataset": [
+					"sharegpt"
+				]
+			}
+		},
+		{
+			"description": "VLLM Serving - Sparse",
+			"models": [
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+			],
+			"use_all_available_gpus" : "",
+			"max_model_lens": [
+				4096
+			],
+			"sparsity": ["sparse_w16a16"],
+			"script_name": "benchmark_serving",
+			"script_args": {
+				"nr-qps-pair_": [
+					"150,0.5",
+					"300,1",
+					"750,2.5",
+					"1500,5",
+					"3000,10"
+				],
+				"dataset": [
+					"sharegpt"
+				]
+			}
+		},
+		{
+			"description": "VLLM Serving - 2:4 Sparse",
+			"models": [
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
+			],
+			"use_all_available_gpus" : "",
+			"max_model_lens": [
+				4096
+			],
+			"sparsity": ["semi_structured_sparse_w16a16"],
+			"script_name": "benchmark_serving",
+			"script_args": {
+				"nr-qps-pair_": [
+					"150,0.5",
+					"300,1",
+					"750,2.5",
+					"1500,5",
+					"3000,10"
 				],
 				"dataset": [
 					"sharegpt"

diff --git a/neuralmagic/benchmarks/configs/benchmark_throughput.json b/neuralmagic/benchmarks/configs/benchmark_throughput.json
@@ -1,97 +1,79 @@
 {
 	"configs": [
 		{
-			"description": "VLLM Engine throughput (with dataset)",
+			"description": "VLLM Engine throughput - Dense (with dataset)",
 			"models": [
-				"facebook/opt-125m",
-				"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-				"mistralai/Mistral-7B-Instruct-v0.2",
+				"teknium/OpenHermes-2.5-Mistral-7B",
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+				"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
 				"NousResearch/Llama-2-7b-chat-hf"
 			],
-			"max_model_lens" : [4096],
+			"max_model_lens": [
+				4096
+			],
 			"script_name": "benchmark_throughput",
 			"script_args": {
-				"backend": [
-					"vllm"
-				],
 				"dataset": [
 					"sharegpt"
 				],
 				"output-len": [
 					128
 				],
-				"n": [
-					1
-				],
 				"num-prompts": [
 					1000
 				],
-				"seed": [
-					0
-				],
-				"dtype": [
-					"auto"
-				],
-				"use-all-available-gpus_" : []
+				"use-all-available-gpus_": []
 			}
 		},
 		{
-			"description": "VLLM Engine prefill throughput (synthetic)",
+			"description": "VLLM Engine throughput - Sparse (with dataset)",
 			"models": [
-				"facebook/opt-125m",
-				"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-				"mistralai/Mistral-7B-Instruct-v0.2",
-				"NousResearch/Llama-2-7b-chat-hf"
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+			],
+			"max_model_lens": [
+				4096
 			],
-			"max_model_lens" : [4096],
 			"script_name": "benchmark_throughput",
 			"script_args": {
-				"input-len": [
-					1,
-					16,
-					32,
-					64,
-					128,
-					256,
-					512,
-					1024
+				"dataset": [
+					"sharegpt"
 				],
 				"output-len": [
-					1
+					128
 				],
 				"num-prompts": [
-					1
+					1000
+				],
+				"sparsity": [
+					"sparse_w16a16"
 				],
-				"use-all-available-gpus_" : []
+				"use-all-available-gpus_": []
 			}
 		},
 		{
-			"description": "VLLM Engine decode throughput (synthetic)",
+			"description": "VLLM Engine throughput - 2:4 Sparse (with dataset)",
 			"models": [
-				"facebook/opt-125m",
-				"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-				"mistralai/Mistral-7B-Instruct-v0.2",
-				"NousResearch/Llama-2-7b-chat-hf"
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
+			],
+			"max_model_lens": [
+				4096
 			],
-			"max_model_lens" : [4096],
 			"script_name": "benchmark_throughput",
 			"script_args": {
-				"input-len": [
-					2
+				"dataset": [
+					"sharegpt"
 				],
 				"output-len": [
 					128
 				],
 				"num-prompts": [
-					1,
-					4,
-					8,
-					16,
-					32,
-					64
+					1000
+				],
+				"sparsity": [
+					"semi_structured_sparse_w16a16"
 				],
-				"use-all-available-gpus_" : []
+				"use-all-available-gpus_": []
 			}
 		}
 	]
-}
+}
diff --git a/neuralmagic/benchmarks/configs/benchmark_throughput_decode.json b/neuralmagic/benchmarks/configs/benchmark_throughput_decode.json
@@ -0,0 +1,94 @@
+{
+	"configs": [
+		{
+			"description": "VLLM Engine decode throughput - Dense (synthetic)",
+			"models": [
+				"teknium/OpenHermes-2.5-Mistral-7B",
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-marlin",
+				"TheBloke/OpenHermes-2.5-Mistral-7B-GPTQ",
+				"NousResearch/Llama-2-7b-chat-hf"
+			],
+			"max_model_lens": [
+				4096
+			],
+			"script_name": "benchmark_throughput",
+			"script_args": {
+				"input-len": [
+					2
+				],
+				"output-len": [
+					128
+				],
+				"num-prompts": [
+					1,
+					4,
+					8,
+					16,
+					32,
+					64
+				],
+				"use-all-available-gpus_": []
+			}
+		},
+		{
+			"description": "VLLM Engine decode throughput - Sparse (synthetic)",
+			"models": [
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned50"
+			],
+			"max_model_lens": [
+				4096
+			],
+			"script_name": "benchmark_throughput",
+			"script_args": {
+				"input-len": [
+					2
+				],
+				"output-len": [
+					128
+				],
+				"num-prompts": [
+					1,
+					4,
+					8,
+					16,
+					32,
+					64
+				],
+				"sparsity": [
+					"sparse_w16a16"
+				],
+				"use-all-available-gpus_": []
+			}
+		},
+		{
+			"description": "VLLM Engine decode throughput - 2:4 Sparse (synthetic)",
+			"models": [
+				"neuralmagic/OpenHermes-2.5-Mistral-7B-pruned2.4"
+			],
+			"max_model_lens": [
+				4096
+			],
+			"script_name": "benchmark_throughput",
+			"script_args": {
+				"input-len": [
+					2
+				],
+				"output-len": [
+					128
+				],
+				"num-prompts": [
+					1,
+					4,
+					8,
+					16,
+					32,
+					64
+				],
+				"sparsity": [
+					"semi_structured_sparse_w16a16"
+				],
+				"use-all-available-gpus_": []
+			}
+		}
+	]
+}