|
| 1 | +// Copyright(c) Microsoft Corporation. |
| 2 | +// Licensed under the MIT License. |
| 3 | + |
#include <memory>

#include "common.h"
#include "inline_common.h"
#include "output.h"
#include "testcase.h"
| 8 | + |
| 9 | +// All to Host Batch CE memcpy using cuMemcpyAsync |
| 10 | +class AllToHostBatchCE : public Testcase { |
| 11 | + public: |
| 12 | + AllToHostBatchCE() |
| 13 | + : Testcase("all_to_host_batch_memcpy_ce", |
| 14 | + "\tMeasures bandwidth of cuMemcpyAsync from all devices to host simultaneously.\n" |
| 15 | + "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" |
| 16 | + "\tIndividual device bandwidths are measured and reported separately.") {} |
| 17 | + virtual ~AllToHostBatchCE() {} |
| 18 | + void run(unsigned long long size, unsigned long long loopCount); |
| 19 | +}; |
| 20 | + |
| 21 | +// Host to All Batch CE memcpy using cuMemcpyAsync |
| 22 | +class HostToAllBatchCE : public Testcase { |
| 23 | + public: |
| 24 | + HostToAllBatchCE() |
| 25 | + : Testcase("host_to_all_batch_memcpy_ce", |
| 26 | + "\tMeasures bandwidth of cuMemcpyAsync from host to all devices simultaneously.\n" |
| 27 | + "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" |
| 28 | + "\tIndividual device bandwidths are measured and reported separately.") {} |
| 29 | + virtual ~HostToAllBatchCE() {} |
| 30 | + void run(unsigned long long size, unsigned long long loopCount); |
| 31 | +}; |
| 32 | + |
| 33 | +// All to Host Batch SM memcpy using a copy kernel |
| 34 | +class AllToHostBatchSM : public Testcase { |
| 35 | + public: |
| 36 | + AllToHostBatchSM() |
| 37 | + : Testcase("all_to_host_batch_memcpy_sm", |
| 38 | + "\tMeasures bandwidth of copy kernels from all devices to host simultaneously.\n" |
| 39 | + "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" |
| 40 | + "\tIndividual device bandwidths are measured and reported separately.") {} |
| 41 | + virtual ~AllToHostBatchSM() {} |
| 42 | + void run(unsigned long long size, unsigned long long loopCount); |
| 43 | +}; |
| 44 | + |
| 45 | +// Host to All Batch SM memcpy using a copy kernel |
| 46 | +class HostToAllBatchSM : public Testcase { |
| 47 | + public: |
| 48 | + HostToAllBatchSM() |
| 49 | + : Testcase("host_to_all_batch_memcpy_sm", |
| 50 | + "\tMeasures bandwidth of copy kernels from host to all devices simultaneously.\n" |
| 51 | + "\tAll devices perform memcpy operations concurrently with the same buffer size.\n" |
| 52 | + "\tIndividual device bandwidths are measured and reported separately.") {} |
| 53 | + virtual ~HostToAllBatchSM() {} |
| 54 | + void run(unsigned long long size, unsigned long long loopCount); |
| 55 | +}; |
| 56 | + |
| 57 | +void Testcase::allHostHelperBatch(unsigned long long size, MemcpyOperation &memcpyInstance, |
| 58 | + PeerValueMatrix<double> &bandwidthValues, bool sourceIsHost) { |
| 59 | + std::vector<const MemcpyBuffer *> allSrcBuffers; |
| 60 | + std::vector<const MemcpyBuffer *> allDstBuffers; |
| 61 | + |
| 62 | + // Create buffers for all devices with the same size |
| 63 | + for (int deviceId = 0; deviceId < deviceCount; deviceId++) { |
| 64 | + if (sourceIsHost) { |
| 65 | + allSrcBuffers.push_back(new HostBuffer(size, deviceId)); |
| 66 | + allDstBuffers.push_back(new DeviceBuffer(size, deviceId)); |
| 67 | + } else { |
| 68 | + allSrcBuffers.push_back(new DeviceBuffer(size, deviceId)); |
| 69 | + allDstBuffers.push_back(new HostBuffer(size, deviceId)); |
| 70 | + } |
| 71 | + } |
| 72 | + |
| 73 | + // Perform memcpy for all devices in a single run and get individual bandwidths |
| 74 | + std::vector<double> deviceBandwidths = memcpyInstance.doMemcpyVector(allSrcBuffers, allDstBuffers); |
| 75 | + |
| 76 | + // Store individual bandwidth for each device |
| 77 | + for (int deviceId = 0; deviceId < deviceCount; deviceId++) { |
| 78 | + bandwidthValues.value(0, deviceId) = deviceBandwidths[deviceId]; |
| 79 | + } |
| 80 | + |
| 81 | + // Clean up all buffers |
| 82 | + for (auto node : allSrcBuffers) { |
| 83 | + delete node; |
| 84 | + } |
| 85 | + |
| 86 | + for (auto node : allDstBuffers) { |
| 87 | + delete node; |
| 88 | + } |
| 89 | +} |
| 90 | + |
| 91 | +void AllToHostBatchCE::run(unsigned long long size, unsigned long long loopCount) { |
| 92 | + PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); |
| 93 | + MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); |
| 94 | + |
| 95 | + allHostHelperBatch(size, memcpyInstance, bandwidthValues, false); |
| 96 | + |
| 97 | + output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) <- GPU(column) batch bandwidth (GB/s)"); |
| 98 | +} |
| 99 | + |
| 100 | +void HostToAllBatchCE::run(unsigned long long size, unsigned long long loopCount) { |
| 101 | + PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); |
| 102 | + MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorCE(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); |
| 103 | + |
| 104 | + allHostHelperBatch(size, memcpyInstance, bandwidthValues, true); |
| 105 | + |
| 106 | + output->addTestcaseResults(bandwidthValues, "memcpy CE CPU(row) -> GPU(column) batch bandwidth (GB/s)"); |
| 107 | +} |
| 108 | + |
| 109 | +void AllToHostBatchSM::run(unsigned long long size, unsigned long long loopCount) { |
| 110 | + PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); |
| 111 | + MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); |
| 112 | + |
| 113 | + allHostHelperBatch(size, memcpyInstance, bandwidthValues, false); |
| 114 | + |
| 115 | + output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) <- GPU(column) batch bandwidth (GB/s)"); |
| 116 | +} |
| 117 | + |
| 118 | +void HostToAllBatchSM::run(unsigned long long size, unsigned long long loopCount) { |
| 119 | + PeerValueMatrix<double> bandwidthValues(1, deviceCount, key); |
| 120 | + MemcpyOperation memcpyInstance(loopCount, new MemcpyInitiatorSM(), PREFER_SRC_CONTEXT, MemcpyOperation::VECTOR_BW); |
| 121 | + |
| 122 | + allHostHelperBatch(size, memcpyInstance, bandwidthValues, true); |
| 123 | + |
| 124 | + output->addTestcaseResults(bandwidthValues, "memcpy SM CPU(row) -> GPU(column) batch bandwidth (GB/s)"); |
| 125 | +} |
0 commit comments