From 1ccd388c1a1a49d1e6a4e70ef35b83c806cab065 Mon Sep 17 00:00:00 2001 From: "Kevin H. Luu" Date: Wed, 19 Jun 2024 16:30:03 -0700 Subject: [PATCH] [ci] Limit num gpus if specified for A100 (#5694) Signed-off-by: kevin --- .buildkite/test-pipeline.yaml | 1 + .buildkite/test-template-aws.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index b1602dd9496ba..95cd5b1989ee2 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -184,6 +184,7 @@ steps: - label: Distributed Tests (A100) gpu: a100 + num_gpus: 4 commands: # NOTE: don't test llama model here, it seems hf implementation is buggy # see https://github.com/vllm-project/vllm/pull/5689 for details diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2 index 08146bf4454cc..fb34b787e0cbd 100644 --- a/.buildkite/test-template-aws.j2 +++ b/.buildkite/test-template-aws.j2 @@ -75,7 +75,7 @@ steps: - "'cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}'" resources: limits: - nvidia.com/gpu: 8 + nvidia.com/gpu: {{ step.num_gpus or 1 }} volumeMounts: - name: devshm mountPath: /dev/shm