-
Notifications
You must be signed in to change notification settings - Fork 80
Benchmarks: Micro benchmark - Add simultaneous all-to-host / host-to-all bandwidth testcases to nvbandwidth #736
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits
Select commit
Hold shift + click to select a range
409d1f6
upgrade nvbandwidth and add alltohostbatch test case
yukirora 5eb0935
add patch in makefile
yukirora 5a2c553
clean patch and add a new testcases_new.h
yukirora 823c4e8
clang format and rename testcases_new.h to nvbandwidth_testcases_new.…
yukirora eac3aff
rename the file into testcases_patched.h
yukirora bfc2d7b
Merge branch 'main' into yutji/nvbandwidth
yukirora File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Submodule nvbandwidth
updated
23 files
| +12 −9 | CHANGELOG.md | |
| +20 −5 | CMakeLists.txt | |
| +67 −14 | README.md | |
| +68 −19 | common.h | |
| +28 −11 | error_handling.h | |
| +44 −10 | inline_common.h | |
| +22 −3 | json_output.cpp | |
| +12 −8 | json_output.h | |
| +303 −29 | kernels.cu | |
| +12 −5 | kernels.cuh | |
| +240 −146 | memcpy.cpp | |
| +87 −41 | memcpy.h | |
| +327 −0 | multinode_memcpy.cpp | |
| +148 −0 | multinode_memcpy.h | |
| +383 −0 | multinode_testcases.cpp | |
| +74 −12 | nvbandwidth.cpp | |
| +104 −18 | output.cpp | |
| +11 −9 | output.h | |
| +48 −16 | testcase.cpp | |
| +222 −55 | testcase.h | |
| +14 −1 | testcases_ce.cpp | |
| +72 −30 | testcases_sm.cpp | |
| +4 −4 | version.h |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,46 @@ | ||
| diff --git a/nvbandwidth.cpp b/nvbandwidth.cpp | ||
| index 61a228f..488372a 100644 | ||
| --- a/nvbandwidth.cpp | ||
| +++ b/nvbandwidth.cpp | ||
| @@ -29,6 +29,7 @@ | ||
| #include "kernels.cuh" | ||
| #include "output.h" | ||
| #include "testcase.h" | ||
| +#include "testcases_patched.h" | ||
| #include "version.h" | ||
| #include "inline_common.h" | ||
|
|
||
| @@ -73,8 +74,10 @@ std::vector<Testcase*> createTestcases() { | ||
| new DeviceToDeviceBidirWriteCE(), | ||
| new AllToHostCE(), | ||
| new AllToHostBidirCE(), | ||
| + new AllToHostBatchCE(), | ||
| new HostToAllCE(), | ||
| new HostToAllBidirCE(), | ||
| + new HostToAllBatchCE(), | ||
| new AllToOneWriteCE(), | ||
| new AllToOneReadCE(), | ||
| new OneToAllWriteCE(), | ||
| @@ -89,8 +92,10 @@ std::vector<Testcase*> createTestcases() { | ||
| new DeviceToDeviceBidirWriteSM(), | ||
| new AllToHostSM(), | ||
| new AllToHostBidirSM(), | ||
| + new AllToHostBatchSM(), | ||
| new HostToAllSM(), | ||
| new HostToAllBidirSM(), | ||
| + new HostToAllBatchSM(), | ||
| new AllToOneWriteSM(), | ||
| new AllToOneReadSM(), | ||
| new OneToAllWriteSM(), | ||
| diff --git a/testcase.h b/testcase.h | ||
| index c276850..f26e7d8 100644 | ||
| --- a/testcase.h | ||
| +++ b/testcase.h | ||
| @@ -39,6 +39,7 @@ class Testcase { | ||
| void oneToAllHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool isRead); | ||
| void allHostHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost); | ||
| void allHostBidirHelper(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost); | ||
| + void allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost); | ||
| void latencyHelper(const MemcpyBuffer &dataBuffer, bool measureDeviceToDeviceLatency); | ||
|
|
||
| public: |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,125 @@ | ||
| // Copyright(c) Microsoft Corporation. | ||
guoshzhao marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // Licensed under the MIT License. | ||
|
|
||
| #include "common.h" | ||
| #include "inline_common.h" | ||
| #include "output.h" | ||
| #include "testcase.h" | ||
|
|
||
| // All to Host Batch CE memcpy using cuMemcpyAsync | ||
| class AllToHostBatchCE : public Testcase { | ||
| public: | ||
| AllToHostBatchCE() | ||
| : Testcase("all_to_host_batch_memcpy_ce", | ||
| "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n" | ||
| "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" | ||
| "\tIndividual device bandwidths are measured and reported separately.") {} | ||
| virtual ~AllToHostBatchCE() {} | ||
| void run(unsigned long long size, unsigned long long loopCount); | ||
| }; | ||
|
|
||
| // Host to All Batch CE memcpy using cuMemcpyAsync | ||
| class HostToAllBatchCE : public Testcase { | ||
| public: | ||
| HostToAllBatchCE() | ||
| : Testcase("host_to_all_batch_memcpy_ce", | ||
| "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n" | ||
| "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" | ||
| "\tIndividual device bandwidths are measured and reported separately.") {} | ||
| virtual ~HostToAllBatchCE() {} | ||
| void run(unsigned long long size, unsigned long long loopCount); | ||
| }; | ||
|
|
||
| // All to Host Batch SM memcpy using a copy kernel | ||
| class AllToHostBatchSM : public Testcase { | ||
| public: | ||
| AllToHostBatchSM() | ||
| : Testcase("all_to_host_batch_memcpy_sm", | ||
| "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n" | ||
| "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" | ||
| "\tIndividual device bandwidths are measured and reported separately.") {} | ||
| virtual ~AllToHostBatchSM() {} | ||
| void run(unsigned long long size, unsigned long long loopCount); | ||
| }; | ||
|
|
||
| // Host to All Batch SM memcpy using a copy kernel | ||
| class HostToAllBatchSM : public Testcase { | ||
| public: | ||
| HostToAllBatchSM() | ||
| : Testcase("host_to_all_batch_memcpy_sm", | ||
| "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n" | ||
| "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" | ||
| "\tIndividual device bandwidths are measured and reported separately.") {} | ||
| virtual ~HostToAllBatchSM() {} | ||
| void run(unsigned long long size, unsigned long long loopCount); | ||
| }; | ||
|
|
||
| void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, | ||
| PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { | ||
| std::vector<const MemcpyBuffer *> allSrcBuffers; | ||
| std::vector<const MemcpyBuffer *> allDstBuffers; | ||
|
|
||
| // Create buffers for all devices with the same size | ||
| for (int deviceId = 0; deviceId < deviceCount; deviceId++) { | ||
| if (sourceIsHost) { | ||
| allSrcBuffers.push_back(new HostBuffer(size, deviceId)); | ||
| allDstBuffers.push_back(new DeviceBuffer(size, deviceId)); | ||
| } else { | ||
| allSrcBuffers.push_back(new DeviceBuffer(size, deviceId)); | ||
| allDstBuffers.push_back(new HostBuffer(size, deviceId)); | ||
| } | ||
| } | ||
|
|
||
| // Perform memcpy for all devices in a single run and get individual bandwidths | ||
| std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers); | ||
|
|
||
| // Store individual bandwidth for each device | ||
| for (int deviceId = 0; deviceId < deviceCount; deviceId++) { | ||
| bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId]; | ||
| } | ||
|
|
||
| // Clean up all buffers | ||
| for (auto node : allSrcBuffers) { | ||
| delete node; | ||
| } | ||
|
|
||
| for (auto node : allDstBuffers) { | ||
| delete node; | ||
| } | ||
| } | ||
|
|
||
| void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) { | ||
| PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); | ||
| MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); | ||
|
|
||
| allHostHelperBatch(size, memcpyInstance, bandwidthValues, false); | ||
|
|
||
| output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)"); | ||
| } | ||
|
|
||
| void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) { | ||
| PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); | ||
| MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); | ||
|
|
||
| allHostHelperBatch(size, memcpyInstance, bandwidthValues, true); | ||
|
|
||
| output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)"); | ||
| } | ||
|
|
||
| void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) { | ||
| PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); | ||
| MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); | ||
|
|
||
| allHostHelperBatch(size, memcpyInstance, bandwidthValues, false); | ||
|
|
||
| output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)"); | ||
| } | ||
|
|
||
| void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) { | ||
| PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); | ||
| MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); | ||
|
|
||
| allHostHelperBatch(size, memcpyInstance, bandwidthValues, true); | ||
|
|
||
| output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)"); | ||
| } | ||
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.