This repository has been archived by the owner on Oct 11, 2024. It is now read-only.

Commit

switch to GCP based build VM
andy-neuma committed Apr 22, 2024
1 parent e8e00d2 · commit f7a0cda
Showing 2 changed files with 8 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-test.yml

@@ -69,7 +69,7 @@ jobs:
   BUILD:
     uses: ./.github/workflows/build.yml
     with:
-      build_label: aws-avx512-192G-4-T4-64G
+      build_label: ${{ inputs.build_label }}
       timeout: ${{ inputs.timeout }}
       gitref: ${{ inputs.gitref }}
       Gi_per_thread: ${{ inputs.Gi_per_thread }}
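build.yml itself is not touched by this commit, so the shape of its inputs is an assumption; but for `${{ inputs.build_label }}` to resolve, the reusable workflow has to declare a matching `workflow_call` input and use it to select the runner. A minimal sketch of what that declaration might look like (input names other than those visible in the diff are hypothetical):

```yaml
# .github/workflows/build.yml -- sketch only; this file is not shown in the commit.
name: build

on:
  workflow_call:
    inputs:
      build_label:     # runner label, e.g. gcp-build-static (previously aws-avx512-192G-4-T4-64G)
        type: string
        required: true
      timeout:         # job timeout in minutes
        type: number
        required: true
      gitref:
        type: string
        required: true
      Gi_per_thread:   # GiB of RAM budgeted per compile thread
        type: number
        required: true

jobs:
  BUILD:
    runs-on: ${{ inputs.build_label }}
    timeout-minutes: ${{ inputs.timeout }}
    steps:
      - uses: actions/checkout@v4
        with:
          ref: ${{ inputs.gitref }}
```

Parameterizing `build_label` this way is what lets the caller (remote-push.yml below) pick the GCP runner without further changes to the build workflow.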
14 changes: 7 additions & 7 deletions .github/workflows/remote-push.yml

@@ -18,11 +18,11 @@ jobs:
         python: [3.10.12]
     uses: ./.github/workflows/build-test.yml
     with:
-      build_label: aws-avx512-192G-4-T4-64G
-      timeout: 360
+      build_label: gcp-build-static
+      timeout: 240
       gitref: '${{ github.ref }}'
-      Gi_per_thread: 4
-      nvcc_threads: 8
+      Gi_per_thread: 1
+      nvcc_threads: 4
       python: ${{ matrix.python }}
       test_skip_list: neuralmagic/tests/skip-for-remote-push.txt
     secrets: inherit
@@ -35,8 +35,8 @@ jobs:
       benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt
       timeout: 180
       gitref: '${{ github.ref }}'
-      Gi_per_thread: 12
-      nvcc_threads: 1
-      python: "3.10.12"
+      Gi_per_thread: 1
+      nvcc_threads: 4
+      python: 3.10.12
       push_benchmark_results_to_gh_pages: "false"
     secrets: inherit
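Both jobs retune `Gi_per_thread` and `nvcc_threads` alongside the runner switch, presumably to fit the GCP VM's CPU and RAM. How the two knobs are consumed is not shown in this commit; a common pattern for CUDA extension builds is to derive a `MAX_JOBS` cap from available memory and forward `nvcc_threads` to `nvcc --threads`. A sketch under that assumption (the real wiring lives in build.yml, which this diff does not touch):

```yaml
# Sketch of a build step consuming the two inputs; names and logic are
# assumptions, not taken from this repository's build.yml.
- name: build wheel
  env:
    NVCC_THREADS: ${{ inputs.nvcc_threads }}    # per-file nvcc parallelism (nvcc --threads)
    GI_PER_THREAD: ${{ inputs.Gi_per_thread }}  # GiB of RAM budgeted per compile job
  run: |
    # Cap parallel compile jobs so that jobs * GI_PER_THREAD fits in available RAM.
    free_gi=$(awk '/MemAvailable/ { printf "%d", $2 / 1048576 }' /proc/meminfo)
    export MAX_JOBS=$(( free_gi / GI_PER_THREAD ))
    if (( MAX_JOBS < 1 )); then MAX_JOBS=1; fi
    echo "building with MAX_JOBS=${MAX_JOBS}, NVCC_THREADS=${NVCC_THREADS}"
    pip wheel . --no-deps --wheel-dir dist/
```

Under this reading, dropping `Gi_per_thread` from 4 to 1 allows more concurrent compile jobs on the same amount of RAM, while halving `nvcc_threads` keeps total thread count in check.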

1 comment on commit f7a0cda

@github-actions

bigger_is_better

Both rows share the same configuration: VLLM Engine throughput, synthetic workload; model NousResearch/Llama-2-7b-chat-hf; max_model_len 4096; benchmark_throughput args: use-all-available-gpus, input-len 256, output-len 128, num-prompts 1000; GPU: NVIDIA A10G x 1; vllm_version 0.2.0; python_version 3.10.12 (main, Mar 7 2024, 18:39:53) [GCC 9.4.0]; torch_version 2.2.1+cu121.

Benchmark            Current: f7a0cda     Previous: e8e00d2    Ratio
request_throughput   3.8031 prompts/s     3.8023 prompts/s     1.00
token_throughput     1460.38 tokens/s     1460.08 tokens/s     1.00

This comment was automatically generated by a workflow using github-action-benchmark.
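github-action-benchmark's `customBiggerIsBetter` tool matches the "bigger_is_better" suite name above and consumes JSON entries of the `{name, unit, value}` shape shown in the raw rows. A plausible publishing step (the action and its inputs are real, but this particular wiring is an assumption, since the benchmark workflow is not part of this commit):

```yaml
# Sketch of the step that could produce the Current/Previous/Ratio comment above.
- name: post benchmark comparison
  uses: benchmark-action/github-action-benchmark@v1
  with:
    name: bigger_is_better
    tool: customBiggerIsBetter          # expects a JSON array of {name, unit, value}
    output-file-path: benchmark-results.json
    github-token: ${{ secrets.GITHUB_TOKEN }}
    comment-always: true                # post the comparison comment on every run
    auto-push: false                    # matches push_benchmark_results_to_gh_pages: "false"
```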
