Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion third_party/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -299,5 +299,5 @@ endif

# Build nvbandwidth.
# The recipe works inside ./nvbandwidth: it applies ../nvbandwidth.patch with `git apply`,
# copies ../nvbandwidth_testcases_patched.h in as testcases_patched.h, runs cmake and make,
# and finally copies the resulting binary into the micro-benchmark bin directory.
# NOTE(review): `git apply` normally refuses a patch that is already applied, so re-running
# this target on an already-patched checkout may fail -- confirm, and consider guarding the
# step (for example with `git apply --reverse --check`).
nvbandwidth: sb_micro_path
cd ./nvbandwidth && cmake . && make && cd ..
cd ./nvbandwidth && git apply ../nvbandwidth.patch && cp ../nvbandwidth_testcases_patched.h ./testcases_patched.h && cmake . && make && cd ..
cp -v ./nvbandwidth/nvbandwidth $(SB_MICRO_PATH)/bin
2 changes: 1 addition & 1 deletion third_party/nvbandwidth
46 changes: 46 additions & 0 deletions third_party/nvbandwidth.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp
index 61a228f..488372a 100644
--- a/nvbandwidth.cpp
+++ b/nvbandwidth.cpp
@@ -29,6 +29,7 @@
#include "kernels.cuh"
#include "output.h"
#include "testcase.h"
+#include "testcases_patched.h"
#include "version.h"
#include "inline_common.h"

@@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() {
new DeviceToDeviceBidirWriteCE(),
new AllToHostCE(),
new AllToHostBidirCE(),
+ new AllToHostBatchCE(),
new HostToAllCE(),
new HostToAllBidirCE(),
+ new HostToAllBatchCE(),
new AllToOneWriteCE(),
new AllToOneReadCE(),
new OneToAllWriteCE(),
@@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() {
new DeviceToDeviceBidirWriteSM(),
new AllToHostSM(),
new AllToHostBidirSM(),
+ new AllToHostBatchSM(),
new HostToAllSM(),
new HostToAllBidirSM(),
+ new HostToAllBatchSM(),
new AllToOneWriteSM(),
new AllToOneReadSM(),
new OneToAllWriteSM(),
diff --git a/testcase.h b/testcase.h
index c276850..f26e7d8 100644
--- a/testcase.h
+++ b/testcase.h
@@ -39,6 +39,7 @@ class Testcase {
void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead);
void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
+ void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost);
void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency);

public:
125 changes: 125 additions & 0 deletions third_party/nvbandwidth_testcases_patched.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright(c) Microsoft Corporation.
// Licensed under the MIT License.

#include "common.h"
#include "inline_common.h"
#include "output.h"
#include "testcase.h"

// Batch "all devices -> host" testcase for the CE (cuMemcpyAsync) copy path.
//
// The constructor only forwards the testcase name and its multi-line help
// text to the Testcase base class; the measurement itself lives in run(),
// which is defined out of line further down in this header.
class AllToHostBatchCE : public Testcase {
 public:
    AllToHostBatchCE()
        : Testcase(
              "all_to_host_batch_memcpy_ce",
              "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~AllToHostBatchCE() = default;

    // size and loopCount are forwarded to the buffers / MemcpyOperation by the definition below.
    void run(unsigned long long size, unsigned long long loopCount);
};

// Batch "host -> all devices" testcase for the CE (cuMemcpyAsync) copy path.
//
// The constructor only forwards the testcase name and its multi-line help
// text to the Testcase base class; the measurement itself lives in run(),
// which is defined out of line further down in this header.
class HostToAllBatchCE : public Testcase {
 public:
    HostToAllBatchCE()
        : Testcase(
              "host_to_all_batch_memcpy_ce",
              "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~HostToAllBatchCE() = default;

    // size and loopCount are forwarded to the buffers / MemcpyOperation by the definition below.
    void run(unsigned long long size, unsigned long long loopCount);
};

// Batch "all devices -> host" testcase for the SM (copy kernel) path.
//
// The constructor only forwards the testcase name and its multi-line help
// text to the Testcase base class; the measurement itself lives in run(),
// which is defined out of line further down in this header.
class AllToHostBatchSM : public Testcase {
 public:
    AllToHostBatchSM()
        : Testcase(
              "all_to_host_batch_memcpy_sm",
              "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~AllToHostBatchSM() = default;

    // size and loopCount are forwarded to the buffers / MemcpyOperation by the definition below.
    void run(unsigned long long size, unsigned long long loopCount);
};

// Batch "host -> all devices" testcase for the SM (copy kernel) path.
//
// The constructor only forwards the testcase name and its multi-line help
// text to the Testcase base class; the measurement itself lives in run(),
// which is defined out of line further down in this header.
class HostToAllBatchSM : public Testcase {
 public:
    HostToAllBatchSM()
        : Testcase(
              "host_to_all_batch_memcpy_sm",
              "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n"
              "\tAll devices perform memcpy operations concurrently with the same buffer size.\n"
              "\tIndividual device bandwidths are measured and reported separately.") {}

    virtual ~HostToAllBatchSM() = default;

    // size and loopCount are forwarded to the buffers / MemcpyOperation by the definition below.
    void run(unsigned long long size, unsigned long long loopCount);
};

void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance,
PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) {
std::vector<const MemcpyBuffer *> allSrcBuffers;
std::vector<const MemcpyBuffer *> allDstBuffers;

// Create buffers for all devices with the same size
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
if (sourceIsHost) {
allSrcBuffers.push_back(new HostBuffer(size, deviceId));
allDstBuffers.push_back(new DeviceBuffer(size, deviceId));
} else {
allSrcBuffers.push_back(new DeviceBuffer(size, deviceId));
allDstBuffers.push_back(new HostBuffer(size, deviceId));
}
}

// Perform memcpy for all devices in a single run and get individual bandwidths
std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers);

// Store individual bandwidth for each device
for (int deviceId = 0; deviceId < deviceCount; deviceId++) {
bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId];
}

// Clean up all buffers
for (auto node : allSrcBuffers) {
delete node;
}

for (auto node : allDstBuffers) {
delete node;
}
}

void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);

allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);

output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)");
}

void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);

allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);

output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)");
}

void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);

allHostHelperBatch(size, memcpyInstance, bandwidthValues, false);

output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)");
}

void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) {
PeerValueMatrix<double> bandwidthValues(1, deviceCount, key);
MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW);

allHostHelperBatch(size, memcpyInstance, bandwidthValues, true);

output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)");
}
Loading