Closed
Description
Can someone please comment on whether reduction variables are supported only with the nd_range
launch configuration and not with the plain range
launch configuration?
#include <CL/sycl.hpp>

#include <cassert>
#include <iostream>
#include <limits>
#include <numeric>
#include <vector>
// Repro: SYCL 2020 reductions launched with a plain range<1> (not nd_range)
// via parallel_for. Sums and takes the max of the values 0..1023.
int main() {
cl::sycl::queue myQueue(cl::sycl::gpu_selector{});
constexpr size_t kN = 1024;  // element count, generalized from the repeated literal
// Host input: 0, 1, ..., kN-1.
std::vector<int> valuesVec(kN);
std::iota(std::begin(valuesVec), std::end(valuesVec), 0);
int* valuesBuf = cl::sycl::malloc_device<int>(kN, myQueue);
// BUGFIX: wait for the copy — the default queue is out-of-order, so without
// an explicit wait (or a dependency) the kernel may read valuesBuf too early.
myQueue.memcpy(valuesBuf, valuesVec.data(), kN * sizeof(int)).wait();
// Single-element USM allocations to receive the reduction results.
int sumResult_host = 0, maxResult_host = 0;
int* sumResult = cl::sycl::malloc_device<int>(1, myQueue);
int* maxResult = cl::sycl::malloc_device<int>(1, myQueue);
// BUGFIX: initialize the reduction outputs. Without the
// initialize_to_identity property, SYCL 2020 reductions combine into the
// existing value, which here would be uninitialized device memory.
const int sumInit = 0;
const int maxInit = std::numeric_limits<int>::min();
myQueue.memcpy(sumResult, &sumInit, sizeof(int)).wait();
myQueue.memcpy(maxResult, &maxInit, sizeof(int)).wait();
myQueue.submit([&](cl::sycl::handler& cgh) {
// Create temporary objects describing variables with reduction semantics.
auto sumReduction = cl::sycl::ONEAPI::reduction(sumResult, cl::sycl::ONEAPI::plus<>());
auto maxReduction = cl::sycl::ONEAPI::reduction(maxResult, cl::sycl::ONEAPI::maximum<>());
// parallel_for performs two reduction operations
// For each reduction variable, the implementation:
// - Creates a corresponding reducer
// - Passes a reference to the reducer to the lambda as a parameter
cgh.parallel_for(cl::sycl::range<1>{kN},
sumReduction, maxReduction,
[=](cl::sycl::item<1> idx, auto& sum, auto& max) {
sum += valuesBuf[idx];
max.combine(valuesBuf[idx]);
});
}).wait();  // BUGFIX: ensure the kernel finishes before reading the results
myQueue.memcpy(&sumResult_host, sumResult, sizeof(int)).wait();
myQueue.memcpy(&maxResult_host, maxResult, sizeof(int)).wait();
std::cout << "value of Result_Host: " << sumResult_host << ", " << maxResult_host << std::endl;
// Expected: max = 1023, sum = 1023*1024/2 = 523776.
assert(maxResult_host == 1023 && sumResult_host == 523776);
// BUGFIX: release the USM allocations — they are not RAII-managed.
cl::sycl::free(valuesBuf, myQueue);
cl::sycl::free(sumResult, myQueue);
cl::sycl::free(maxResult, myQueue);
}
Given that nd_range
supports reduction variables in parallel_for
launches, equivalent support for the plain range
launch configuration would be beneficial. Link to SYCL specs: https://www.khronos.org/registry/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:reduction