Commit 190900b

Author: 白永斌 (committed)
Merge branch 'loader' of https://github.com/dsxsteven/vllm_splitPR into loader
* 'loader' of https://github.com/dsxsteven/vllm_splitPR: (778 commits)
  - [torchao] Add support for ModuleFqnToConfig using regex (vllm-project#26001)
  - Add: Support for multiple hidden layers in Eagle3 (vllm-project#26164)
  - Enable `RMSNorm` substitution for Transformers backend (vllm-project#26353)
  - [Model] Gemma3: Fix GGUF loading and quantization (vllm-project#26189)
  - Bump Flashinfer to v0.4.0 (vllm-project#26326)
  - Update Dockerfile and install runai-model-streamer[gcs] package (vllm-project#26464)
  - [Core] Relax the LoRA max rank (vllm-project#26461)
  - [CI/Build] Fix model nightly tests (vllm-project#26466)
  - [Hybrid]: Decouple Kernel Block Size from KV Page Size (vllm-project#24486)
  - [Core][KVConnector] Propagate all tokens on resumed preemptions (vllm-project#24926)
  - [MM][Doc] Add documentation for configurable mm profiling (vllm-project#26200)
  - [Hardware][AMD] Enable FlexAttention backend on ROCm (vllm-project#26439)
  - [Bugfix] Incorrect another MM data format in vllm bench throughput (vllm-project#26462)
  - [Bugfix] Catch and log invalid token ids in detokenizer #2 (vllm-project#26445)
  - [Minor] Change warning->warning_once in preprocess (vllm-project#26455)
  - [Bugfix] Set the minimum python version for gpt-oss (vllm-project#26392)
  - [Misc] Redact ray runtime env before logging (vllm-project#26302)
  - Separate MLAAttention class from Attention (vllm-project#25103)
  - [Attention] Register FLASHMLA_SPARSE (vllm-project#26441)
  - [Kernels] Modular kernel refactor (vllm-project#24812)
  - ...
2 parents: dfac092 + 2aaff99

File tree

1,979 files changed: +185459 additions, -157813 deletions


.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -368,7 +368,7 @@ def parse_client_command(cmd: str) -> dict[str, Any]:
     # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
     # we want to turn it into "8xGPUTYPE"
     df["GPU"] = df["GPU"].apply(
-        lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        lambda x: f"{len(x.splitlines())}x{x.splitlines()[0]}"
     )

     # get markdown tables
```
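The hunk above swaps `x.split('\n')` for `x.splitlines()`. Besides avoiding a backslash inside an f-string expression (a syntax error before Python 3.12), `splitlines()` drops the empty string that `split('\n')` leaves after a trailing newline, so the GPU count comes out right. A standalone sketch (the sample string is hypothetical, not from the benchmark data):

```python
# Why splitlines() is safer than split('\n') for counting lines:
# a trailing newline yields an extra empty element with split('\n').
gpu_field = "A100\nA100\n"  # hypothetical nvidia-smi-style output

split_parts = gpu_field.split("\n")       # ['A100', 'A100', ''] -- off-by-one count
splitline_parts = gpu_field.splitlines()  # ['A100', 'A100']

# The "<count>x<gpu type>" formatting used in the diff:
label = f"{len(splitline_parts)}x{splitline_parts[0]}"
print(label)  # 2xA100
```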

.buildkite/nightly-benchmarks/scripts/launch-server.sh

Lines changed: 2 additions & 6 deletions
```diff
@@ -181,18 +181,14 @@ launch_vllm_server() {
   if echo "$common_params" | jq -e 'has("fp8")' >/dev/null; then
     echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
     model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   else
     echo "Key 'fp8' does not exist in common params."
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="vllm serve $model \
       -tp $tp \
-      --model $model \
       --port $port \
       $server_args"
   fi
```
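Both branches above now build the same `vllm serve` command and differ only in where the model name comes from, keyed on `jq -e 'has("fp8")'`. A rough Python mirror of that decision, assuming the non-fp8 model lives under a plain `model` key (the function name and sample JSON are illustrative, not from the script):

```python
import json

def build_server_command(common_params_json: str, tp: int, port: int) -> str:
    """Pick the model by whether 'fp8' is present, like jq -e 'has("fp8")'."""
    params = json.loads(common_params_json)
    if "fp8" in params:
        model = params["neuralmagic_quantized_model"]
    else:
        model = params["model"]  # assumed key for the non-fp8 case
    return f"vllm serve {model} -tp {tp} --port {port}"

cmd = build_server_command(
    '{"fp8": true, "neuralmagic_quantized_model": "nm/llama-fp8"}', tp=8, port=8000
)
print(cmd)  # vllm serve nm/llama-fp8 -tp 8 --port 8000
```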

.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh

Lines changed: 1 addition & 7 deletions
```diff
@@ -365,8 +365,7 @@ run_serving_tests() {
       continue
     fi

-    server_command="$server_envs python3 \
-      -m vllm.entrypoints.openai.api_server \
+    server_command="$server_envs vllm serve \
       $server_args"

     # run the server
@@ -455,11 +454,6 @@ main() {
   fi
   check_hf_token

-  # Set to v1 to run v1 benchmark
-  if [[ "${ENGINE_VERSION:-v0}" == "v1" ]]; then
-    export VLLM_USE_V1=1
-  fi
-
   # dependencies
   (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
   (which jq) || (apt-get update && apt-get -y install jq)
```

.buildkite/pyproject.toml

Lines changed: 0 additions & 46 deletions
This file was deleted.

.buildkite/release-pipeline.yaml

Lines changed: 12 additions & 6 deletions
```diff
@@ -76,7 +76,7 @@ steps:
       queue: arm64_cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
+      - "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 --build-arg GIT_REPO_CHECK=1 --build-arg CUDA_VERSION=12.9.1 --build-arg FLASHINFER_AOT_COMPILE=true --build-arg torch_cuda_arch_list='8.7 9.0 10.0+PTX 12.0' --build-arg INSTALL_KV_CONNECTORS=true --tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m) --target vllm-openai --progress plain -f docker/Dockerfile ."
       - "docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-$(uname -m)"

   # Add job to create multi-arch manifest
@@ -150,11 +150,16 @@ steps:
       queue: cpu_queue_postmerge
     commands:
       - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
-      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly"
-      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
-      - "docker push vllm/vllm-openai:nightly"
-      - "docker push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64"
+      - "docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-x86_64 vllm/vllm-openai:nightly-x86_64"
+      - "docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:$BUILDKITE_COMMIT-aarch64 vllm/vllm-openai:nightly-aarch64"
+      - "docker push vllm/vllm-openai:nightly-x86_64"
+      - "docker push vllm/vllm-openai:nightly-aarch64"
+      - "docker manifest create vllm/vllm-openai:nightly vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest create vllm/vllm-openai:nightly-$BUILDKITE_COMMIT vllm/vllm-openai:nightly-x86_64 vllm/vllm-openai:nightly-aarch64 --amend"
+      - "docker manifest push vllm/vllm-openai:nightly"
+      - "docker manifest push vllm/vllm-openai:nightly-$BUILDKITE_COMMIT"
       # Clean up old nightly builds (keep only last 14)
       - "bash .buildkite/scripts/cleanup-nightly-builds.sh"
     plugins:
@@ -163,3 +168,4 @@ steps:
           password-env: DOCKERHUB_TOKEN
     env:
       DOCKER_BUILDKIT: "1"
+      DOCKERHUB_USERNAME: "vllmbot"
```
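The nightly publish step now pushes one image per architecture (tagged `nightly-x86_64` and `nightly-aarch64`) and then stitches them into a single multi-arch tag with `docker manifest create` / `docker manifest push`. A small Python sketch of the tag bookkeeping; it only builds the command strings and runs nothing, and the helper name is illustrative:

```python
# Sketch of the multi-arch nightly publish flow: per-arch tags are pushed,
# then combined under one manifest tag per target.
def publish_commands(commit: str, arches=("x86_64", "aarch64")) -> list[str]:
    repo = "vllm/vllm-openai"
    arch_tags = [f"{repo}:nightly-{arch}" for arch in arches]
    cmds = [f"docker push {tag}" for tag in arch_tags]
    for target in (f"{repo}:nightly", f"{repo}:nightly-{commit}"):
        cmds.append(f"docker manifest create {target} {' '.join(arch_tags)} --amend")
        cmds.append(f"docker manifest push {target}")
    return cmds

for cmd in publish_commands("190900b"):
    print(cmd)
```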

.buildkite/scripts/cleanup-nightly-builds.sh

Lines changed: 26 additions & 3 deletions
```diff
@@ -8,20 +8,41 @@ set -ex
 # DockerHub API endpoint for vllm/vllm-openai repository
 REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"

-# Get DockerHub token from environment
+# Get DockerHub credentials from environment
 if [ -z "$DOCKERHUB_TOKEN" ]; then
   echo "Error: DOCKERHUB_TOKEN environment variable is not set"
   exit 1
 fi

+if [ -z "$DOCKERHUB_USERNAME" ]; then
+  echo "Error: DOCKERHUB_USERNAME environment variable is not set"
+  exit 1
+fi
+
+# Get DockerHub bearer token
+echo "Getting DockerHub bearer token..."
+set +x
+BEARER_TOKEN=$(curl -s -X POST \
+  -H "Content-Type: application/json" \
+  -d "{\"username\": \"$DOCKERHUB_USERNAME\", \"password\": \"$DOCKERHUB_TOKEN\"}" \
+  "https://hub.docker.com/v2/users/login" | jq -r '.token')
+set -x
+
+if [ -z "$BEARER_TOKEN" ] || [ "$BEARER_TOKEN" = "null" ]; then
+  echo "Error: Failed to get DockerHub bearer token"
+  exit 1
+fi
+
 # Function to get all tags from DockerHub
 get_all_tags() {
   local page=1
   local all_tags=""

   while true; do
-    local response=$(curl -s -H "Authorization: Bearer $DOCKERHUB_TOKEN" \
+    set +x
+    local response=$(curl -s -H "Authorization: Bearer $BEARER_TOKEN" \
       "$REPO_API_URL?page=$page&page_size=100")
+    set -x

     # Get both last_updated timestamp and tag name, separated by |
     local tags=$(echo "$response" | jq -r '.results[] | select(.name | startswith("nightly-")) | "\(.last_updated)|\(.name)"')
@@ -43,7 +64,9 @@ delete_tag() {
   echo "Deleting tag: $tag_name"

   local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
-  local response=$(curl -s -X DELETE -H "Authorization: Bearer $DOCKERHUB_TOKEN" "$delete_url")
+  set +x
+  local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
+  set -x

   if echo "$response" | jq -e '.detail' > /dev/null 2>&1; then
     echo "Warning: Failed to delete tag $tag_name: $(echo "$response" | jq -r '.detail')"
```
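In this script, `get_all_tags` emits `last_updated|tag_name` pairs, and the release pipeline's comment says the cleanup keeps only the last 14 nightly builds. A small Python sketch of that selection step (the helper and sample data are hypothetical; the real logic lives in the shell script):

```python
# Given "timestamp|tag" pairs as produced by get_all_tags, keep the 14 newest
# nightly tags and return the rest as deletion candidates.
KEEP = 14

def tags_to_delete(pairs: list[str], keep: int = KEEP) -> list[str]:
    # ISO-8601 timestamps sort correctly as plain strings; newest first.
    ordered = sorted(pairs, reverse=True)
    return [line.split("|", 1)[1] for line in ordered[keep:]]

sample = [f"2025-10-{day:02d}T00:00:00Z|nightly-{day:02d}" for day in range(1, 21)]
print(tags_to_delete(sample))  # the 6 oldest: nightly-06 down to nightly-01
```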

.buildkite/scripts/hardware_ci/run-amd-test.sh

Lines changed: 0 additions & 10 deletions
```diff
@@ -86,10 +86,6 @@ if [[ $commands == *"pytest -v -s models/test_registry.py"* ]]; then
   commands=${commands//"pytest -v -s models/test_registry.py"/"pytest -v -s models/test_registry.py -k 'not BambaForCausalLM and not GritLM and not Mamba2ForCausalLM and not Zamba2ForCausalLM'"}
 fi

-if [[ $commands == *"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"* ]]; then
-  commands=${commands//"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2'"/"VLLM_USE_V1=0 pytest -v -s models/test_initialization.py -k 'not llama4 and not plamo2 and not BambaForCausalLM and not Gemma2ForCausalLM and not Grok1ModelForCausalLM and not Zamba2ForCausalLM and not Gemma2Model and not GritLM'"}
-fi
-
 if [[ $commands == *"pytest -v -s compile/test_basic_correctness.py"* ]]; then
   commands=${commands//"pytest -v -s compile/test_basic_correctness.py"/"VLLM_USE_TRITON_FLASH_ATTN=0 pytest -v -s compile/test_basic_correctness.py"}
 fi
@@ -167,12 +163,6 @@ if [[ $commands == *" entrypoints/llm "* ]]; then
   --ignore=entrypoints/llm/test_prompt_validation.py "}
 fi

-#Obsolete currently
-##ignore certain Entrypoints/llm tests
-#if [[ $commands == *" && pytest -v -s entrypoints/llm/test_guided_generate.py"* ]]; then
-#  commands=${commands//" && pytest -v -s entrypoints/llm/test_guided_generate.py"/" "}
-#fi
-
 # --ignore=entrypoints/openai/test_encoder_decoder.py \
 # --ignore=entrypoints/openai/test_embedding.py \
 # --ignore=entrypoints/openai/test_oot_registration.py
```

.buildkite/scripts/hardware_ci/run-cpu-test.sh

Lines changed: 2 additions & 5 deletions
```diff
@@ -58,11 +58,8 @@ function cpu_tests() {
   # pytest -x -v -s tests/kernels/attention/test_cache.py -m cpu_model
   # pytest -x -v -s tests/kernels/attention/test_mla_decode_cpu.py -m cpu_model

-  # Note: disable Bart until supports V1
-  pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
-  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model \
-    --ignore=tests/models/language/generation/test_bart.py
+  pytest -x -v -s tests/models/language/generation -m cpu_model
+  VLLM_CPU_SGL_KERNEL=1 pytest -x -v -s tests/models/language/generation -m cpu_model

   pytest -x -v -s tests/models/language/pooling -m cpu_model
   pytest -x -v -s tests/models/multimodal/generation \
```
