Skip to content

Commit f5a062d

Browse files
authored
[SYCL][ESIMD] Add function to get reference to underlying data (#8725)
This is required for inline assembly. Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
1 parent 3a60c85 commit f5a062d

File tree

7 files changed

+522
-0
lines changed

7 files changed

+522
-0
lines changed

sycl/doc/extensions/experimental/sycl_ext_intel_esimd/sycl_ext_intel_esimd.md

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -815,6 +815,37 @@ simd<float, 8> __regcall SCALE(simd<float, 8> v);
815815
The parameter and the return type in the ABI form will be `<8 x float>`.
816816
<br>
817817

818+
### Inline assembly
819+
Inline assembly is supported with ESIMD classes `simd`, `simd_mask` and `simd_view`. `simd_view` only supports read operations.
820+
In order the access the raw underlying vector required for inline assembly, the `data` function can be used for read-only access and
821+
the `data_ref` function can be used for write access. The `data_ref` function only exists for `simd` and `simd_mask`, and should only be used in inline assembly.
822+
If the `simd` or `simd_mask` object is a private global variable, the `commit` function must be called after any write in inline assembly.
823+
824+
Example of inline GEN assembly:
825+
```cpp
826+
simd<float, 16> va;
827+
simd<float, 16> vb;
828+
simd<float, 16> vc;
829+
830+
__asm__("add (M1, 16) %0 %1 %2"
831+
: "=rw"(vc.data_ref())
832+
: "rw"(va.data()), "rw"(vb.data()));
833+
```
834+
835+
Example of inline GEN assembly writing to a private global variable:
836+
```cpp
837+
ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;
838+
839+
void calledFromKernel() {
840+
simd<float, 16> va;
841+
simd<float, 16> vb;
842+
__asm__("add (M1, 16) %0 %1 %2"
843+
: "=rw"(vc.data_ref())
844+
: "rw"(va.data()), "rw"(vb.data()));
845+
vc.commit();
846+
}
847+
```
848+
818849
## Examples
819850
### Vector addition (USM)
820851
```cpp

sycl/include/sycl/ext/intel/esimd/detail/simd_obj_impl.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,15 @@ class simd_obj_impl {
354354
#endif
355355
}
356356

357+
/// @return A reference to the value of the
358+
/// underlying raw vector. Intended for use
359+
/// with l-value contexts in inline assembly.
360+
raw_vector_type &data_ref() { return M_data; }
361+
362+
/// Commit the current stored underlying raw vector to memory.
363+
/// This is required when using inline assembly with private global variables.
364+
void commit() { __esimd_vstore<RawTy, N>(&M_data, M_data); }
365+
357366
/// @return Newly constructed (from the underlying data) object of the Derived
358367
/// type.
359368
Derived read() const { return Derived{data()}; }
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
//==---------------- asm_glb.cpp - DPC++ ESIMD on-device test -------------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
// REQUIRES: gpu
9+
// UNSUPPORTED: cuda || hip
10+
// RUN: %clangxx -fsycl %s -o %t.out
11+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
12+
13+
#include "../esimd_test_utils.hpp"
14+
15+
#include <iostream>
16+
#include <sycl/ext/intel/esimd.hpp>
17+
#include <sycl/sycl.hpp>
18+
19+
using namespace sycl;
20+
using namespace sycl::ext::intel::esimd;
21+
22+
ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> va;
23+
ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> vc;
24+
25+
int main(void) {
26+
constexpr unsigned Size = 1024 * 128;
27+
constexpr unsigned VL = 16;
28+
29+
float *A = new float[Size];
30+
float *B = new float[Size];
31+
float *C = new float[Size];
32+
33+
for (unsigned i = 0; i < Size; ++i) {
34+
A[i] = B[i] = i;
35+
C[i] = 0.0f;
36+
}
37+
38+
try {
39+
buffer<float, 1> bufa(A, range<1>(Size));
40+
buffer<float, 1> bufb(B, range<1>(Size));
41+
buffer<float, 1> bufc(C, range<1>(Size));
42+
43+
// We need that many workgroups
44+
range<1> GlobalRange{Size / VL};
45+
46+
// We need that many threads in each group
47+
range<1> LocalRange{1};
48+
49+
queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
50+
51+
auto dev = q.get_device();
52+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
53+
54+
auto e = q.submit([&](handler &cgh) {
55+
auto PA = bufa.get_access<access::mode::read>(cgh);
56+
auto PB = bufb.get_access<access::mode::read>(cgh);
57+
auto PC = bufc.get_access<access::mode::write>(cgh);
58+
cgh.parallel_for<class Test>(
59+
GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
60+
using namespace sycl::ext::intel::esimd;
61+
unsigned int offset = i * VL * sizeof(float);
62+
va.copy_from(PA, offset);
63+
simd<float, VL> vb;
64+
vb.copy_from(PB, offset);
65+
#ifdef __SYCL_DEVICE_ONLY__
66+
__asm__("add (M1, 16) %0 %1 %2"
67+
: "=rw"(vc.data_ref())
68+
: "rw"(va.data()), "rw"(vb.data()));
69+
vc.commit();
70+
#else
71+
vc = va+vb;
72+
#endif
73+
vc.copy_to(PC, offset);
74+
});
75+
});
76+
e.wait();
77+
} catch (sycl::exception const &e) {
78+
std::cout << "SYCL exception caught: " << e.what() << '\n';
79+
80+
delete[] A;
81+
delete[] B;
82+
delete[] C;
83+
return 1;
84+
}
85+
86+
int err_cnt = 0;
87+
88+
for (unsigned i = 0; i < Size; ++i) {
89+
if (A[i] + B[i] != C[i]) {
90+
if (++err_cnt < 10) {
91+
std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
92+
<< " + " << B[i] << "\n";
93+
}
94+
}
95+
}
96+
if (err_cnt > 0) {
97+
std::cout << " pass rate: "
98+
<< ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
99+
<< (Size - err_cnt) << "/" << Size << ")\n";
100+
}
101+
102+
delete[] A;
103+
delete[] B;
104+
delete[] C;
105+
106+
std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
107+
return err_cnt > 0 ? 1 : 0;
108+
}
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
//==---------------- asm_simd_mask.cpp - DPC++ ESIMD on-device test
2+
//-------------==//
3+
//
4+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5+
// See https://llvm.org/LICENSE.txt for license information.
6+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7+
//
8+
//===----------------------------------------------------------------------===//
9+
// REQUIRES: gpu
10+
// UNSUPPORTED: cuda || hip
11+
// RUN: %clangxx -fsycl %s -o %t.out
12+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
13+
14+
#include "../esimd_test_utils.hpp"
15+
16+
#include <iostream>
17+
#include <sycl/ext/intel/esimd.hpp>
18+
#include <sycl/sycl.hpp>
19+
20+
using namespace sycl;
21+
22+
int main(void) {
23+
constexpr unsigned Size = 1024 * 128;
24+
constexpr unsigned VL = 8;
25+
26+
float *A = new float[Size];
27+
float *B = new float[Size];
28+
float *C = new float[Size];
29+
30+
for (unsigned i = 0; i < Size; ++i) {
31+
A[i] = B[i] = i;
32+
C[i] = 0.0f;
33+
}
34+
35+
try {
36+
buffer<float, 1> bufa(A, range<1>(Size));
37+
buffer<float, 1> bufb(B, range<1>(Size));
38+
buffer<float, 1> bufc(C, range<1>(Size));
39+
40+
// We need that many workgroups
41+
range<1> GlobalRange{Size / VL};
42+
43+
// We need that many threads in each group
44+
range<1> LocalRange{1};
45+
46+
queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
47+
48+
auto dev = q.get_device();
49+
std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
50+
51+
auto e = q.submit([&](handler &cgh) {
52+
auto PA = bufa.get_access<access::mode::read>(cgh);
53+
auto PB = bufb.get_access<access::mode::read>(cgh);
54+
auto PC = bufc.get_access<access::mode::write>(cgh);
55+
cgh.parallel_for<class Test>(
56+
GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
57+
using namespace sycl::ext::intel::esimd;
58+
unsigned int offset = i * VL * sizeof(float);
59+
simd<float, VL> va;
60+
va.copy_from(PA, offset);
61+
simd<float, VL> vb;
62+
vb.copy_from(PB, offset);
63+
simd<float, VL> vc;
64+
#ifdef __SYCL_DEVICE_ONLY__
65+
simd_mask<VL> m;
66+
__asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref()));
67+
__asm__("{\n"
68+
".decl P1 v_type=P num_elts=8\n"
69+
"mov (M1, 8) %0 0x1:ud\n"
70+
"setp (M1, 8) P1 %3\n"
71+
"(P1) add (M1, 8) %0 %1 %2\n"
72+
"}"
73+
: "=rw"(vc.data_ref())
74+
: "rw"(va.data()), "rw"(vb.data()), "rw"(m.data()));
75+
#else
76+
simd_mask<VL> m({1,0,1,0,1,0,1,0});
77+
vc = va+vb;
78+
vc.merge(1, !m);
79+
#endif
80+
vc.copy_to(PC, offset);
81+
});
82+
});
83+
e.wait();
84+
} catch (sycl::exception const &e) {
85+
std::cout << "SYCL exception caught: " << e.what() << '\n';
86+
87+
delete[] A;
88+
delete[] B;
89+
delete[] C;
90+
return 1;
91+
}
92+
93+
int err_cnt = 0;
94+
95+
for (unsigned i = 0; i < Size; ++i) {
96+
if ((i % 2 == 0) && (A[i] + B[i] != C[i])) {
97+
if (++err_cnt < 10) {
98+
std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
99+
<< " + " << B[i] << "\n";
100+
}
101+
} else if ((i % 2 == 1) && (C[i] != 1)) {
102+
if (++err_cnt < 10) {
103+
std::cout << "failed at index " << i << ", " << C[i] << " != 1\n";
104+
}
105+
}
106+
}
107+
if (err_cnt > 0) {
108+
std::cout << " pass rate: "
109+
<< ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
110+
<< (Size - err_cnt) << "/" << Size << ")\n";
111+
}
112+
113+
delete[] A;
114+
delete[] B;
115+
delete[] C;
116+
117+
std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
118+
return err_cnt > 0 ? 1 : 0;
119+
}

0 commit comments

Comments
 (0)