diff --git a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
index eec2a51e2f8fd..5c069b38b2d7d 100644
--- a/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/benchmark-pipeline.yaml
@@ -9,7 +9,9 @@ steps:
           - image: badouralix/curl-jq
             command:
             - sh .buildkite/nightly-benchmarks/scripts/wait-for-image.sh
+
   - wait
+
   - label: "A100"
     agents:
       queue: A100
@@ -41,6 +43,27 @@ steps:
             - name: devshm
               emptyDir:
                 medium: Memory
+
+  - label: "H200"
+    agents:
+      queue: H200
+    plugins:
+    - docker#v5.12.0:
+        image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+        command:
+        - bash
+        - .buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+        mount-buildkite-agent: true
+        propagate-environment: true
+        ipc: host
+        gpus: 4,5,6,7
+        volumes:
+        - /data/benchmark-hf-cache:/root/.cache/huggingface
+        environment:
+        - VLLM_USAGE_SOURCE
+        - HF_TOKEN
+
+
   # - label: "H100"
   #   agents:
   #     queue: H100
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 7cf05610b9953..d640563252a0c 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -157,6 +157,11 @@ def results_to_json(latency, throughput, serving):
                                        throughput_results,
                                        serving_results)
 
+    # Sort all dataframes by their respective "Test name" columns
+    for df in [latency_results, serving_results, throughput_results]:
+        if not df.empty:
+            df.sort_values(by="Test name", inplace=True)
+
     # get markdown tables
     latency_md_table = tabulate(latency_results,
                                 headers='keys',
diff --git a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
index d397b05cdff23..0d16a83781ab2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-performance-benchmarks.sh
@@ -6,6 +6,7 @@
 # Do not set -e, as the mixtral 8x22B model tends to crash occasionally
 # and we still want to see other benchmarking results even when mixtral crashes.
 
+set -x
 set -o pipefail
 
 check_gpus() {
@@ -85,11 +86,7 @@ kill_gpu_processes() {
   ps -aux
   lsof -t -i:8000 | xargs -r kill -9
-  pkill -f pt_main_thread
-  # this line doesn't work now
-  # ps aux | grep python | grep openai | awk '{print $2}' | xargs -r kill -9
-  pkill -f python3
-  pkill -f /usr/bin/python3
+  pgrep python3 | xargs -r kill -9
 
   # wait until GPU memory usage smaller than 1GB
@@ -289,7 +286,7 @@ run_serving_tests() {
     # run the server
     echo "Running test case $test_name"
     echo "Server command: $server_command"
-    eval "$server_command" &
+    bash -c "$server_command" &
     server_pid=$!
 
     # wait until the server is alive
@@ -322,7 +319,7 @@ run_serving_tests() {
       echo "Running test case $test_name with qps $qps"
       echo "Client command: $client_command"
-      eval "$client_command"
+      bash -c "$client_command"
 
       # record the benchmarking commands
       jq_output=$(jq -n \