microsoft · yukirora · Sep 30, 2025 · Sep 8, 2025 · Sep 8, 2025 · Sep 19, 2025
@@ -299,5 +299,5 @@ endif
 
 # Build nvbandwidth.
+# The patch step is skipped when the patch is already present in the checkout
+# (detected by a successful reverse dry-run), so this target can be re-run.
 nvbandwidth: sb_micro_path
-	cd ./nvbandwidth && cmake . && make && cd ..
+	cd ./nvbandwidth && (git apply --reverse --check ../nvbandwidth.patch 2>/dev/null || git apply ../nvbandwidth.patch) && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
 	cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
@@ -0,0 +1,46 @@
+diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp
+index 61a228f..488372a 100644
+--- a/nvbandwidth.cpp
++++ b/nvbandwidth.cpp
+@@ -29,6 +29,7 @@
+ #include "kernels.cuh"
+ #include "output.h"
+ #include "testcase.h"
++#include "testcases_patched.h"
+ #include "version.h"
+ #include "inline_common.h"
+
+@@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() {
+         new DeviceToDeviceBidirWriteCE(),
+         new AllToHostCE(),
+         new AllToHostBidirCE(),
++        new AllToHostBatchCE(),
+         new HostToAllCE(),
+         new HostToAllBidirCE(),
++        new HostToAllBatchCE(),
+         new AllToOneWriteCE(),
+         new AllToOneReadCE(),
+         new OneToAllWriteCE(),
+@@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() {
+         new DeviceToDeviceBidirWriteSM(),
+         new AllToHostSM(),
+         new AllToHostBidirSM(),
++        new AllToHostBatchSM(),
+         new HostToAllSM(),
+         new HostToAllBidirSM(),
++        new HostToAllBatchSM(),
+         new AllToOneWriteSM(),
+         new AllToOneReadSM(),
+         new OneToAllWriteSM(),
+diff --git a/testcase.h b/testcase.h
+index c276850..f26e7d8 100644
+--- a/testcase.h
++++ b/testcase.h
+@@ -39,6 +39,7 @@ class Testcase {
+     void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead);
+     void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+     void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
++    void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+     void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency);
+
+  public:
@@ -0,0 +1,125 @@
+// Copyright(c) Microsoft Corporation.
+// Licensed under the MIT License.
+
+#include <memory>
+
+#include "common.h"
+#include "inline_common.h"
+#include "output.h"
+#include "testcase.h"
+
+// Testcase "all_to_host_batch_memcpy_ce": CE memcpy (cuMemcpyAsync) from all
+// devices to the host, issued as one batch. Only the key and help text are set
+// here; run() is defined out of class further down in this header.
+class AllToHostBatchCE : public Testcase {
+  public:
+    AllToHostBatchCE()
+        : Testcase(
+              "all_to_host_batch_memcpy_ce",
+              "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n"
+              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+              "\tIndividual device bandwidths are measured and reported separately.") {}
+
+    virtual ~AllToHostBatchCE() = default;
+
+    // Executes the testcase for the given buffer size and loop count.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "host_to_all_batch_memcpy_ce": CE memcpy (cuMemcpyAsync) from the
+// host to all devices, issued as one batch. Only the key and help text are set
+// here; run() is defined out of class further down in this header.
+class HostToAllBatchCE : public Testcase {
+  public:
+    HostToAllBatchCE()
+        : Testcase(
+              "host_to_all_batch_memcpy_ce",
+              "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n"
+              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+              "\tIndividual device bandwidths are measured and reported separately.") {}
+
+    virtual ~HostToAllBatchCE() = default;
+
+    // Executes the testcase for the given buffer size and loop count.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "all_to_host_batch_memcpy_sm": copy-kernel (SM) transfers from all
+// devices to the host, issued as one batch. Only the key and help text are set
+// here; run() is defined out of class further down in this header.
+class AllToHostBatchSM : public Testcase {
+  public:
+    AllToHostBatchSM()
+        : Testcase(
+              "all_to_host_batch_memcpy_sm",
+              "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n"
+              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+              "\tIndividual device bandwidths are measured and reported separately.") {}
+
+    virtual ~AllToHostBatchSM() = default;
+
+    // Executes the testcase for the given buffer size and loop count.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Testcase "host_to_all_batch_memcpy_sm": copy-kernel (SM) transfers from the
+// host to all devices, issued as one batch. Only the key and help text are set
+// here; run() is defined out of class further down in this header.
+class HostToAllBatchSM : public Testcase {
+  public:
+    HostToAllBatchSM()
+        : Testcase(
+              "host_to_all_batch_memcpy_sm",
+              "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n"
+              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
+              "\tIndividual device bandwidths are measured and reported separately.") {}
+
+    virtual ~HostToAllBatchSM() = default;
+
+    // Executes the testcase for the given buffer size and loop count.
+    void run(unsigned long long size, unsigned long long loopCount);
+};
+
+// Builds one host buffer and one device buffer per device, hands all of them to
+// a single doMemcpyVector() call, and stores each returned bandwidth in row 0 of
+// bandwidthValues at the column of the corresponding device.
+//
+//   size            - size passed to every HostBuffer / DeviceBuffer created here
+//   memcpyInstance  - operation whose doMemcpyVector() performs the copies
+//   bandwidthValues - result matrix; entry (0, deviceId) is written per device
+//   sourceIsHost    - true: host buffers are the sources and device buffers the
+//                     destinations; false: the reverse
+//
+// Marked inline because the definition lives in a header: without it, including
+// this header from a second translation unit would produce duplicate symbols.
+inline void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance,
+                                         PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) {
+    // Owners keep the buffers alive and free them on every exit path, including
+    // an exception thrown while allocating or copying.
+    std::vector<std::unique_ptr<const MemcpyBuffer>> srcOwners;
+    std::vector<std::unique_ptr<const MemcpyBuffer>> dstOwners;
+
+    // Non-owning views in the form doMemcpyVector() takes.
+    std::vector<const MemcpyBuffer *> allSrcBuffers;
+    std::vector<const MemcpyBuffer *> allDstBuffers;
+
+    // Create buffers for all devices with the same size (source first, then
+    // destination, for each device).
+    for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
+        if (sourceIsHost) {
+            srcOwners.push_back(std::unique_ptr<const MemcpyBuffer>(new HostBuffer(size, deviceId)));
+            dstOwners.push_back(std::unique_ptr<const MemcpyBuffer>(new DeviceBuffer(size, deviceId)));
+        } else {
+            srcOwners.push_back(std::unique_ptr<const MemcpyBuffer>(new DeviceBuffer(size, deviceId)));
+            dstOwners.push_back(std::unique_ptr<const MemcpyBuffer>(new HostBuffer(size, deviceId)));
+        }
+        allSrcBuffers.push_back(srcOwners.back().get());
+        allDstBuffers.push_back(dstOwners.back().get());
+    }
+
+    // Perform memcpy for all devices in a single run and get individual bandwidths
+    std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers);
+
+    // Store individual bandwidth for each device. at() throws std::out_of_range
+    // if fewer results than devices came back, instead of reading past the end.
+    for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
+        bandwidthValues.value(0, deviceId) = deviceBandwidths.at(deviceId);
+    }
+
+    // Release all source buffers, then all destination buffers.
+    srcOwners.clear();
+    dstOwners.clear();
+}
+
+// Runs the batched device-to-host test with a MemcpyInitiatorCE-backed
+// operation and reports the per-device results through `output`.
+//
+//   size      - buffer size forwarded to allHostHelperBatch()
+//   loopCount - forwarded to the MemcpyOperation constructor
+//
+// Marked inline because the definition lives in a header: without it, including
+// this header from a second translation unit would produce duplicate symbols.
+inline void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) {
+    // One row, one column per device; filled in by allHostHelperBatch().
+    PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
+    MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    // sourceIsHost = false: device buffers are the copy sources.
+    allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
+
+    output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the batched host-to-device test with a MemcpyInitiatorCE-backed
+// operation and reports the per-device results through `output`.
+//
+//   size      - buffer size forwarded to allHostHelperBatch()
+//   loopCount - forwarded to the MemcpyOperation constructor
+//
+// Marked inline because the definition lives in a header: without it, including
+// this header from a second translation unit would produce duplicate symbols.
+inline void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) {
+    // One row, one column per device; filled in by allHostHelperBatch().
+    PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
+    MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    // sourceIsHost = true: host buffers are the copy sources.
+    allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
+
+    output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the batched device-to-host test with a MemcpyInitiatorSM-backed
+// operation and reports the per-device results through `output`.
+//
+//   size      - buffer size forwarded to allHostHelperBatch()
+//   loopCount - forwarded to the MemcpyOperation constructor
+//
+// Marked inline because the definition lives in a header: without it, including
+// this header from a second translation unit would produce duplicate symbols.
+inline void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) {
+    // One row, one column per device; filled in by allHostHelperBatch().
+    PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
+    MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    // sourceIsHost = false: device buffers are the copy sources.
+    allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
+
+    output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)");
+}
+
+// Runs the batched host-to-device test with a MemcpyInitiatorSM-backed
+// operation and reports the per-device results through `output`.
+//
+//   size      - buffer size forwarded to allHostHelperBatch()
+//   loopCount - forwarded to the MemcpyOperation constructor
+//
+// Marked inline because the definition lives in a header: without it, including
+// this header from a second translation unit would produce duplicate symbols.
+inline void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) {
+    // One row, one column per device; filled in by allHostHelperBatch().
+    PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
+    MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
+
+    // sourceIsHost = true: host buffers are the copy sources.
+    allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
+
+    output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)");
+}
+12 −9		CHANGELOG.md
+20 −5		CMakeLists.txt
+67 −14		README.md
+68 −19		common.h
+28 −11		error_handling.h
+44 −10		inline_common.h
+22 −3		json_output.cpp
+12 −8		json_output.h
+303 −29		kernels.cu
+12 −5		kernels.cuh
+240 −146		memcpy.cpp
+87 −41		memcpy.h
+327 −0		multinode_memcpy.cpp
+148 −0		multinode_memcpy.h
+383 −0		multinode_testcases.cpp
+74 −12		nvbandwidth.cpp
+104 −18		output.cpp
+11 −9		output.h
+48 −16		testcase.cpp
+222 −55		testcase.h
+14 −1		testcases_ce.cpp
+72 −30		testcases_sm.cpp
+4 −4		version.h