Skip to content

Commit 93e9d26

Browse files
authored
Benchmarks: Micro benchmark - Add simultaneous all-to-host / host-to-all bandwidth test cases to nvbandwidth (#736)
**Description** Add simultaneous all-to-host / host-to-all bandwidth test cases to nvbandwidth. **Major Revision** - nvbandwidth.patch: add simultaneous all-to-host / host-to-all bandwidth test cases to nvbandwidth - upgrade the nvbandwidth submodule to v0.8 - apply the patch as part of the Makefile build
1 parent 76066b6 commit 93e9d26

File tree

4 files changed

+173
-2
lines changed

4 files changed

+173
-2
lines changed

third_party/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -299,5 +299,5 @@ endif
299299

300300
# Build nvbandwidth.
301301
nvbandwidth: sb_micro_path
302-
cd ./nvbandwidth && cmake . && make && cd ..
302+
# Apply the patch only when it is not already applied: `git apply --reverse --check` succeeds exactly when the tree already contains the patch, so re-running this target no longer fails on a second `git apply`.
cd ./nvbandwidth && (git apply --reverse --check ../nvbandwidth.patch 2>/dev/null || git apply ../nvbandwidth.patch) && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
303303
cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin

third_party/nvbandwidth.patch

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp
2+
index 61a228f..488372a 100644
3+
--- a/nvbandwidth.cpp
4+
+++ b/nvbandwidth.cpp
5+
@@ -29,6 +29,7 @@
6+
#include "kernels.cuh"
7+
#include "output.h"
8+
#include "testcase.h"
9+
+#include "testcases_patched.h"
10+
#include "version.h"
11+
#include "inline_common.h"
12+
13+
@@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() {
14+
new DeviceToDeviceBidirWriteCE(),
15+
new AllToHostCE(),
16+
new AllToHostBidirCE(),
17+
+ new AllToHostBatchCE(),
18+
new HostToAllCE(),
19+
new HostToAllBidirCE(),
20+
+ new HostToAllBatchCE(),
21+
new AllToOneWriteCE(),
22+
new AllToOneReadCE(),
23+
new OneToAllWriteCE(),
24+
@@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() {
25+
new DeviceToDeviceBidirWriteSM(),
26+
new AllToHostSM(),
27+
new AllToHostBidirSM(),
28+
+ new AllToHostBatchSM(),
29+
new HostToAllSM(),
30+
new HostToAllBidirSM(),
31+
+ new HostToAllBatchSM(),
32+
new AllToOneWriteSM(),
33+
new AllToOneReadSM(),
34+
new OneToAllWriteSM(),
35+
diff --git a/testcase.h b/testcase.h
36+
index c276850..f26e7d8 100644
37+
--- a/testcase.h
38+
+++ b/testcase.h
39+
@@ -39,6 +39,7 @@ class Testcase {
40+
void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead);
41+
void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
42+
void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
43+
+ void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
44+
void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency);
45+
46+
public:
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
// Copyright(c) Microsoft Corporation.
2+
// Licensed under the MIT License.
3+
4+
#include "common.h"
5+
#include "inline_common.h"
6+
#include "output.h"
7+
#include "testcase.h"
8+
9+
// All to Host Batch CE memcpy using cuMemcpyAsync
10+
class AllToHostBatchCE : public Testcase {
11+
public:
12+
AllToHostBatchCE()
13+
: Testcase("all_to_host_batch_memcpy_ce",
14+
"\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n"
15+
"\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
16+
"\tIndividual device bandwidths are measured and reported separately.") {}
17+
virtual ~AllToHostBatchCE() {}
18+
void run(unsigned long long size, unsigned long long loopCount);
19+
};
20+
21+
// Host to All Batch CE memcpy using cuMemcpyAsync
22+
class HostToAllBatchCE : public Testcase {
23+
public:
24+
HostToAllBatchCE()
25+
: Testcase("host_to_all_batch_memcpy_ce",
26+
"\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n"
27+
"\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
28+
"\tIndividual device bandwidths are measured and reported separately.") {}
29+
virtual ~HostToAllBatchCE() {}
30+
void run(unsigned long long size, unsigned long long loopCount);
31+
};
32+
33+
// All to Host Batch SM memcpy using a copy kernel
34+
class AllToHostBatchSM : public Testcase {
35+
public:
36+
AllToHostBatchSM()
37+
: Testcase("all_to_host_batch_memcpy_sm",
38+
"\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n"
39+
"\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
40+
"\tIndividual device bandwidths are measured and reported separately.") {}
41+
virtual ~AllToHostBatchSM() {}
42+
void run(unsigned long long size, unsigned long long loopCount);
43+
};
44+
45+
// Host to All Batch SM memcpy using a copy kernel
46+
class HostToAllBatchSM : public Testcase {
47+
public:
48+
HostToAllBatchSM()
49+
: Testcase("host_to_all_batch_memcpy_sm",
50+
"\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n"
51+
"\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
52+
"\tIndividual device bandwidths are measured and reported separately.") {}
53+
virtual ~HostToAllBatchSM() {}
54+
void run(unsigned long long size, unsigned long long loopCount);
55+
};
56+
57+
void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance,
58+
PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) {
59+
std::vector<const MemcpyBuffer *> allSrcBuffers;
60+
std::vector<const MemcpyBuffer *> allDstBuffers;
61+
62+
// Create buffers for all devices with the same size
63+
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
64+
if (sourceIsHost) {
65+
allSrcBuffers.push_back(new HostBuffer(size, deviceId));
66+
allDstBuffers.push_back(new DeviceBuffer(size, deviceId));
67+
} else {
68+
allSrcBuffers.push_back(new DeviceBuffer(size, deviceId));
69+
allDstBuffers.push_back(new HostBuffer(size, deviceId));
70+
}
71+
}
72+
73+
// Perform memcpy for all devices in a single run and get individual bandwidths
74+
std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers);
75+
76+
// Store individual bandwidth for each device
77+
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
78+
bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId];
79+
}
80+
81+
// Clean up all buffers
82+
for (auto node : allSrcBuffers) {
83+
delete node;
84+
}
85+
86+
for (auto node : allDstBuffers) {
87+
delete node;
88+
}
89+
}
90+
91+
void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) {
92+
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
93+
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
94+
95+
allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
96+
97+
output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)");
98+
}
99+
100+
void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) {
101+
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
102+
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
103+
104+
allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
105+
106+
output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)");
107+
}
108+
109+
void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) {
110+
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
111+
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
112+
113+
allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);
114+
115+
output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)");
116+
}
117+
118+
void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) {
119+
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
120+
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);
121+
122+
allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);
123+
124+
output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)");
125+
}

0 commit comments

Comments
 (0)