Description
Describe the bug
sycl::reduction for float data type is not working on AMD (MI100) GPU
Summing int array works but summing float array fails.
To Reproduce
I used this code to test it. Block above dashed line does int array summing while that below dashed line does float array summing.
Sum of int array should be 28 while sum of float array should be 28.0
#include <CL/sycl.hpp>
int main()
{
const int N = 8;
const int WG_SIZE = 64; //128
const int NUM_WGS = (N + WG_SIZE - 1) / WG_SIZE;
sycl::queue q {sycl::gpu_selector{}};
{
auto signal = sycl::malloc_shared<int>( N, q);
auto sum = sycl::malloc_shared<int>( 1, q);
for (unsigned int i = 0; i < N; ++i) {
signal[i] = i;
}
sum[0] = 0;
q.parallel_for(
sycl::nd_range<1>{NUM_WGS * WG_SIZE, WG_SIZE},
sycl::reduction(sum, std::plus<int>()),
[=](sycl::nd_item<1> item, auto& sum) {
int i = item.get_global_id(0);
if (i >= N) return;
sum += signal[i];
}
);
q.wait();
std::cout << "sum of int array: " << sum[0] << std::endl;
}
//----------------------------------------------------------------------------
{
auto signal = sycl::malloc_shared<float>(N, q);
auto sum = sycl::malloc_shared<float>(1, q);
for (unsigned int i = 0; i < N; ++i) {
signal[i] = (float)i;
}
sum[0] = 0.0;
q.parallel_for(
sycl::nd_range<1>{NUM_WGS * WG_SIZE, WG_SIZE},
sycl::reduction(sum, sycl::ext::oneapi::plus<float>()),
[=](sycl::nd_item<1> item, auto& sum) {
int i = item.get_global_id(0);
if (i >= N) return;
sum += signal[i];
}
);
q.wait();
std::cout << "sum of float array: " << sum[0] << std::endl;
}
return 0;
}
Compile command and output:
test_reduction/src $ clang++ test4.cpp -O3 -fsycl -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=gfx908
warning: linking module '/usr/DPA/tools/syclos_amd/20220406/sycl_workspace/llvm/build/lib/clang/15.0.0/../../clc/remangled-l64-signed_char.libspirv-amdgcn--amdhsa.bc': Linking two modules of different target triples: '/usr/DPA/tools/syclos_amd/20220406/sycl_workspace/llvm/build/lib/clang/15.0.0/../../clc/remangled-l64-signed_char.libspirv-amdgcn--amdhsa.bc' is 'amdgcn-unknown-amdhsa' whereas 'test4.cpp' is 'amdgcn-amd-amdhsa'
[-Wlinker-warnings]
1 warning generated.
Run command and output:
test_reduction/src $ SYCL_DEVICE_FILTER=hip:gpu ./a.out
sum of int array: 28
terminate called after throwing an instance of 'cl::sycl::runtime_error'
what(): Native API failed. Native API returns: -30 (CL_INVALID_VALUE) -30 (CL_INVALID_VALUE)
Aborted (core dumped)
Environment (please complete the following information):
- OS: Linux Ubuntu 20.04.4
- Target device and vendor: AMD MI100
- DPC++ version:
test_reduction/src $ clang++ --version
clang version 15.0.0 (https://github.com/intel/llvm 433a073)
Target: x86_64-unknown-linux-gnu
Thread model: posix
InstalledDir: /usr/DPA/tools/syclos_amd/20220406/sycl_workspace/llvm/build/bin