This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

updated configs
robertgshaw2-neuralmagic committed Jun 9, 2024
1 parent f20641d commit c51beba
Showing 3 changed files with 0 additions and 149 deletions.
12 changes: 0 additions & 12 deletions neuralmagic/lm-eval/full-small-models.yaml
@@ -21,15 +21,3 @@
       value: 0.688
   limit: 250
   num_fewshot: 5
-
-# ./nm-run-lm-eval-gsm-hf-baseline -m kaitchup/Meta-Llama-3-8B-gptq-8bit -b 32 -d cuda -l 250 -f 5
-- model_name: "kaitchup/Meta-Llama-3-8B-gptq-8bit"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.72
-    - name: "exact_match,flexible-extract"
-      value: 0.72
-  limit: 250
-  num_fewshot: 5
12 changes: 0 additions & 12 deletions neuralmagic/lm-eval/smoke-small-models.yaml
@@ -21,15 +21,3 @@
       value: 0.688
   limit: 250
   num_fewshot: 5
-
-# ./nm-run-lm-eval-gsm-hf-baseline -m kaitchup/Meta-Llama-3-8B-gptq-8bit -b 32 -d cuda -l 250 -f 5
-- model_name: "kaitchup/Meta-Llama-3-8B-gptq-8bit"
-  tasks:
-  - name: "gsm8k"
-    metrics:
-    - name: "exact_match,strict-match"
-      value: 0.72
-    - name: "exact_match,flexible-extract"
-      value: 0.72
-  limit: 250
-  num_fewshot: 5
125 changes: 0 additions & 125 deletions tests/accuracy/lm_eval_compare_hf_vs_vllm.py

This file was deleted.
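For context, the removed YAML entries above follow the repo's lm-eval baseline format: a model name plus expected metric values for a task (GSM8K exact_match here), evaluated with limit 250 and 5-shot prompting; the comment line records the command used to produce the baseline. The sketch below is illustrative only, not the deleted test or the repo's actual runner; it assumes the file is a top-level YAML list of such entries and that measured results are available as a plain dict.

```python
# Illustrative sketch only: checks lm-eval baseline entries (in the format shown in
# the diff above) against measured results. The loader, tolerance, and `measured`
# layout are assumptions for illustration, not the repo's actual tooling.
import yaml

RELATIVE_TOLERANCE = 0.05  # assumed; the repo's real pass/fail threshold is not shown here


def check_baselines(config_path, measured):
    """measured maps model_name -> {metric_name: value}, e.g. collected from an lm-eval run."""
    with open(config_path) as f:
        entries = yaml.safe_load(f)  # assumes the file is a top-level list of entries

    failures = []
    for entry in entries:
        model = entry["model_name"]
        for task in entry["tasks"]:
            for metric in task["metrics"]:
                expected = metric["value"]
                got = measured[model][metric["name"]]
                if abs(got - expected) > RELATIVE_TOLERANCE * expected:
                    failures.append(
                        f"{model} / {task['name']} / {metric['name']}: "
                        f"got {got:.3f}, expected {expected:.3f}"
                    )
    return failures


# Example usage against the removed entry's expected values:
# check_baselines(
#     "neuralmagic/lm-eval/full-small-models.yaml",
#     {"kaitchup/Meta-Llama-3-8B-gptq-8bit": {
#         "exact_match,strict-match": 0.71,
#         "exact_match,flexible-extract": 0.73,
#     }},
# )
```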

4 comments on commit c51beba

@github-actions

bigger_is_better

Benchmark suite: VLLM Engine throughput - synthetic
  model: NousResearch/Llama-2-7b-chat-hf, max_model_len: 4096
  benchmark_throughput args: use-all-available-gpus_: "", input-len: 256, output-len: 128, num-prompts: 1000
  gpu_description: NVIDIA L4 x 1, vllm_version: 0.5.0, torch_version: 2.3.0+cu121
  python_version: 3.9.17 (main, Jun 7 2023, 12:34:12) [GCC 11.3.0]

  Metric              Current (c51beba)            Previous (9fe9187)           Ratio
  request_throughput  2.374538480563878 prompts/s  2.352493940598332 prompts/s  0.99
  token_throughput    911.8227765365291 tokens/s   903.3576731897596 tokens/s   0.99

This comment was automatically generated by workflow using github-action-benchmark.
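A brief note on reading these tables: the Ratio column appears to be previous over current, consistent with github-action-benchmark's bigger_is_better convention, so a value below 1.0 means this commit measured slightly higher throughput than 9fe9187. The arithmetic, using the numbers above:

```python
# Reproducing the Ratio column of the first benchmark comment above.
# For a bigger_is_better suite the ratio is previous / current, so < 1.0 means improvement.
prev_req, cur_req = 2.352493940598332, 2.374538480563878   # request_throughput, prompts/s
prev_tok, cur_tok = 903.3576731897596, 911.8227765365291   # token_throughput, tokens/s

print(round(prev_req / cur_req, 2))  # 0.99
print(round(prev_tok / cur_tok, 2))  # 0.99
```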

@github-actions

bigger_is_better

Benchmark suite: VLLM Engine throughput - synthetic
  model: NousResearch/Llama-2-7b-chat-hf, max_model_len: 4096
  benchmark_throughput args: use-all-available-gpus_: "", input-len: 256, output-len: 128, num-prompts: 1000
  gpu_description: NVIDIA L4 x 1, vllm_version: 0.5.0, torch_version: 2.3.0+cu121
  python_version: 3.8.17 (default, Jun 7 2023, 12:29:56) [GCC 11.3.0]

  Metric              Current (c51beba)             Previous (9fe9187)  Ratio
  request_throughput  2.4028694343174934 prompts/s
  token_throughput    922.7018627779174 tokens/s

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite: VLLM Engine throughput - synthetic
  model: NousResearch/Llama-2-7b-chat-hf, max_model_len: 4096
  benchmark_throughput args: use-all-available-gpus_: "", input-len: 256, output-len: 128, num-prompts: 1000
  gpu_description: NVIDIA L4 x 1, vllm_version: 0.5.0, torch_version: 2.3.0+cu121
  python_version: 3.11.4 (main, Jun 7 2023, 11:01:02) [GCC 11.3.0]

  Metric              Current (c51beba)             Previous (9fe9187)  Ratio
  request_throughput  2.397341995751268 prompts/s
  token_throughput    920.5793263684869 tokens/s

This comment was automatically generated by workflow using github-action-benchmark.

@github-actions

bigger_is_better

Benchmark suite: VLLM Engine throughput - synthetic
  model: NousResearch/Llama-2-7b-chat-hf, max_model_len: 4096
  benchmark_throughput args: use-all-available-gpus_: "", input-len: 256, output-len: 128, num-prompts: 1000
  gpu_description: NVIDIA L4 x 1, vllm_version: 0.5.0, torch_version: 2.3.0+cu121
  python_version: 3.10.12 (main, Jun 7 2023, 13:43:11) [GCC 11.3.0]

  Metric              Current (c51beba)             Previous (9fe9187)  Ratio
  request_throughput  2.359976792408583 prompts/s
  token_throughput    906.2310882848958 tokens/s

This comment was automatically generated by workflow using github-action-benchmark.
