Benchmarks: Micro benchmark - Add simultaneous all-to-host / host-to-all bandwidth testcases to nvbandwidth (#736)

yukirora · web-flow · commit 93e9d262a040 · 2025-09-30T04:31:32.000Z
**Description**
Add simultaneous all-to-host / host-to-all bandwidth testcases to
nvbandwidth.

**Major Revision**
- nvbandwidth.patch: Add simultaneous all-to-host / host-to-all
bandwidth testcases to nvbandwidth
- upgrade the nvbandwidth submodule to v0.8
- apply the patch as part of the Makefile build
diff --git a/third_party/Makefile b/third_party/Makefile
@@ -299,5 +299,5 @@ endif
 
 # Build nvbandwidth.
+# The patch is applied only when a reverse check shows it is not applied yet,
+# so re-running this target on an already patched tree does not fail.
 nvbandwidth: sb_micro_path
-	cd ./nvbandwidth && cmake . && make && cd ..
+	cd ./nvbandwidth && (git apply --reverse --check ../nvbandwidth.patch 2>/dev/null || git apply ../nvbandwidth.patch) && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
diff --git a/third_party/nvbandwidth b/third_party/nvbandwidth
@@ -1 +1 @@
-Subproject commit 445d8aef742e8a48a69779a939996f9e8863df9d
+Subproject commit fb851de841a0b1fb261cbc3a6fe626f17a19ba0f
diff --git a/third_party/nvbandwidth.patch b/third_party/nvbandwidth.patch
@@ -0,0 +1,46 @@
+diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp
+index 61a228f..488372a 100644
+--- a/nvbandwidth.cpp
++++ b/nvbandwidth.cpp
+@@ -29,6 +29,7 @@
+ #include "kernels.cuh"
+ #include "output.h"
+ #include "testcase.h"
++#include "testcases_patched.h"
+ #include "version.h"
+ #include "inline_common.h"
+
+@@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() {
+         new DeviceToDeviceBidirWriteCE(),
+         new AllToHostCE(),
+         new AllToHostBidirCE(),
++        new AllToHostBatchCE(),
+         new HostToAllCE(),
+         new HostToAllBidirCE(),
++        new HostToAllBatchCE(),
+         new AllToOneWriteCE(),
+         new AllToOneReadCE(),
+         new OneToAllWriteCE(),
+@@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() {
+         new DeviceToDeviceBidirWriteSM(),
+         new AllToHostSM(),
+         new AllToHostBidirSM(),
++        new AllToHostBatchSM(),
+         new HostToAllSM(),
+         new HostToAllBidirSM(),
++        new HostToAllBatchSM(),
+         new AllToOneWriteSM(),
+         new AllToOneReadSM(),
+         new OneToAllWriteSM(),
+diff --git a/testcase.h b/testcase.h
+index c276850..f26e7d8 100644
+--- a/testcase.h
++++ b/testcase.h
+@@ -39,6 +39,7 @@ class Testcase {
+     void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead);
+     void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+     void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
++    void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+     void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency);
+
+  public:
diff --git a/third_party/nvbandwidth_testcases_patched.h b/third_party/nvbandwidth_testcases_patched.h
@@ -0,0 +1,125 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <memory>
+#include <vector>
+
+#include "common.h"
+#include "inline_common.h"
+#include "output.h"
+#include "testcase.h"
+
+// Testcase "all_to_host_batch_memcpy_ce": every device copies to the host at the
+// same time using cuMemcpyAsync, as stated in the description text below.
+// The two strings handed to Testcase are presumably the testcase name and its
+// help text - confirm against Testcase's constructor.
+class AllToHostBatchCE : public Testcase {
+  public:
+    AllToHostBatchCE()
+        : Testcase("all_to_host_batch_memcpy_ce",
+                   "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n"
+                   "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+                   "\tIndividual device bandwidths are measured and reported separately.") {}
+    virtual ~AllToHostBatchCE() = default;
+
+    // Defined further down in this header.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "host_to_all_batch_memcpy_ce": the host copies to every device at the
+// same time using cuMemcpyAsync, as stated in the description text below.
+// The two strings handed to Testcase are presumably the testcase name and its
+// help text - confirm against Testcase's constructor.
+class HostToAllBatchCE : public Testcase {
+  public:
+    HostToAllBatchCE()
+        : Testcase("host_to_all_batch_memcpy_ce",
+                   "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n"
+                   "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+                   "\tIndividual device bandwidths are measured and reported separately.") {}
+    virtual ~HostToAllBatchCE() = default;
+
+    // Defined further down in this header.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "all_to_host_batch_memcpy_sm": every device copies to the host at the
+// same time using copy kernels, as stated in the description text below.
+// The two strings handed to Testcase are presumably the testcase name and its
+// help text - confirm against Testcase's constructor.
+class AllToHostBatchSM : public Testcase {
+  public:
+    AllToHostBatchSM()
+        : Testcase("all_to_host_batch_memcpy_sm",
+                   "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n"
+                   "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+                   "\tIndividual device bandwidths are measured and reported separately.") {}
+    virtual ~AllToHostBatchSM() = default;
+
+    // Defined further down in this header.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "host_to_all_batch_memcpy_sm": the host copies to every device at the
+// same time using copy kernels, as stated in the description text below.
+// The two strings handed to Testcase are presumably the testcase name and its
+// help text - confirm against Testcase's constructor.
+class HostToAllBatchSM : public Testcase {
+  public:
+    HostToAllBatchSM()
+        : Testcase("host_to_all_batch_memcpy_sm",
+                   "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n"
+                   "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+                   "\tIndividual device bandwidths are measured and reported separately.") {}
+    virtual ~HostToAllBatchSM() = default;
+
+    // Defined further down in this header.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Allocates one source/destination buffer pair per device, hands all pairs to
+// memcpyInstance.doMemcpyVector() in a single call, and stores the returned
+// per-device values in row 0 of bandwidthValues.
+//
+//   size            - size passed to every HostBuffer/DeviceBuffer that is created.
+//   memcpyInstance  - operation whose doMemcpyVector() produces the bandwidth values.
+//   bandwidthValues - result matrix; entry (0, deviceId) is written for each device.
+//   sourceIsHost    - true: HostBuffer is the source and DeviceBuffer the destination;
+//                     false: DeviceBuffer is the source and HostBuffer the destination.
+//
+// NOTE: this is a non-inline definition inside a header, so the header must be
+// included from exactly one translation unit.
+void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance,
+                                  PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) {
+    using OwnedBuffer = std::unique_ptr<const MemcpyBuffer>;
+
+    // The owners release the buffers on every exit path, including an exception
+    // thrown by a later allocation or by the copy itself. The raw-pointer vectors
+    // are the non-owning views passed to doMemcpyVector().
+    std::vector<OwnedBuffer> ownedSrcBuffers;
+    std::vector<OwnedBuffer> ownedDstBuffers;
+    std::vector<const MemcpyBuffer *> allSrcBuffers;
+    std::vector<const MemcpyBuffer *> allDstBuffers;
+
+    // Create buffers for all devices with the same size. For each device the
+    // source buffer is constructed before the destination buffer.
+    for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
+        if (sourceIsHost) {
+            ownedSrcBuffers.push_back(OwnedBuffer(new HostBuffer(size, deviceId)));
+            ownedDstBuffers.push_back(OwnedBuffer(new DeviceBuffer(size, deviceId)));
+        } else {
+            ownedSrcBuffers.push_back(OwnedBuffer(new DeviceBuffer(size, deviceId)));
+            ownedDstBuffers.push_back(OwnedBuffer(new HostBuffer(size, deviceId)));
+        }
+        allSrcBuffers.push_back(ownedSrcBuffers.back().get());
+        allDstBuffers.push_back(ownedDstBuffers.back().get());
+    }
+
+    // Perform memcpy for all devices in a single run and get individual bandwidths.
+    // Assumes doMemcpyVector() returns one entry per buffer pair, in the order the
+    // pairs were passed - TODO confirm against MemcpyOperation.
+    std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers);
+
+    // Store individual bandwidth for each device
+    for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
+        bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId];
+    }
+
+    // Release explicitly, sources first and then destinations, matching the
+    // order of the previous manual cleanup.
+    ownedSrcBuffers.clear();
+    ownedDstBuffers.clear();
+}
+
+// Runs the all-to-host batch test with the CE initiator.
+// size is forwarded to allHostHelperBatch(); loopCount is forwarded to MemcpyOperation.
+void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) {
+    // Result matrix with a single row and one column per device.
+    PeerValueMatrix<double> perDeviceBandwidth(1, deviceCount, key);
+    // The initiator is allocated here; presumably MemcpyOperation takes ownership - confirm.
+    MemcpyOperation batchCopy(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    const bool hostIsSource = false;  // devices are the source, the host is the destination
+    allHostHelperBatch(size, batchCopy, perDeviceBandwidth, hostIsSource);
+
+    output->addTestcaseResults(perDeviceBandwidth, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the host-to-all batch test with the CE initiator.
+// size is forwarded to allHostHelperBatch(); loopCount is forwarded to MemcpyOperation.
+void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) {
+    // Result matrix with a single row and one column per device.
+    PeerValueMatrix<double> perDeviceBandwidth(1, deviceCount, key);
+    // NOTE(review): the source buffers in this test are HostBuffers while the context
+    // preference is PREFER_SRC_CONTEXT - confirm this is the intended context choice
+    // for host-sourced copies.
+    // The initiator is allocated here; presumably MemcpyOperation takes ownership - confirm.
+    MemcpyOperation batchCopy(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    const bool hostIsSource = true;  // the host is the source, devices are the destination
+    allHostHelperBatch(size, batchCopy, perDeviceBandwidth, hostIsSource);
+
+    output->addTestcaseResults(perDeviceBandwidth, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the all-to-host batch test with the SM initiator.
+// size is forwarded to allHostHelperBatch(); loopCount is forwarded to MemcpyOperation.
+void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) {
+    // Result matrix with a single row and one column per device.
+    PeerValueMatrix<double> perDeviceBandwidth(1, deviceCount, key);
+    // The initiator is allocated here; presumably MemcpyOperation takes ownership - confirm.
+    MemcpyOperation batchCopy(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    const bool hostIsSource = false;  // devices are the source, the host is the destination
+    allHostHelperBatch(size, batchCopy, perDeviceBandwidth, hostIsSource);
+
+    output->addTestcaseResults(perDeviceBandwidth, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the host-to-all batch test with the SM initiator.
+// size is forwarded to allHostHelperBatch(); loopCount is forwarded to MemcpyOperation.
+void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) {
+    // Result matrix with a single row and one column per device.
+    PeerValueMatrix<double> perDeviceBandwidth(1, deviceCount, key);
+    // NOTE(review): the source buffers in this test are HostBuffers while the context
+    // preference is PREFER_SRC_CONTEXT - confirm this is the intended context choice
+    // for host-sourced copies.
+    // The initiator is allocated here; presumably MemcpyOperation takes ownership - confirm.
+    MemcpyOperation batchCopy(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    const bool hostIsSource = true;  // the host is the source, devices are the destination
+    allHostHelperBatch(size, batchCopy, perDeviceBandwidth, hostIsSource);
+
+    output->addTestcaseResults(perDeviceBandwidth, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)");
+}