forked from intel/llvm
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[SYCL][Fusion] Test caching of fused kernels (intel/llvm-test-suite#1551
) Check that JIT compilation for kernel fusion is or is not repeated, depending on whether a newly submitted sequence of kernels is equivalent to a previous sequence. Next to the sequence of kernels, other invocation information, e.g., the user-specified internalization properties play a role to assess equivalence with a previous compilation. Different scenarios are tested by the test added in this PR. Implementation: intel#8051 Signed-off-by: Lukas Sommer <lukas.sommer@codeplay.com>
- Loading branch information
1 parent
b10f20f
commit 0fccf29
Showing
1 changed file
with
142 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out | ||
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\ | ||
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" | ||
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\ | ||
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION" | ||
// UNSUPPORTED: cuda || hip | ||
// REQUIRES: fusion | ||
|
||
// Test caching for JIT fused kernels. Also test for debug messages being | ||
// printed when SYCL_RT_WARNING_LEVEL=1. | ||
|
||
#include <iostream> | ||
#include <sycl/sycl.hpp> | ||
|
||
using namespace sycl; | ||
|
||
constexpr size_t dataSize = 512; | ||
|
||
enum class Internalization { None, Local, Private }; | ||
|
||
void performFusion(queue &q, Internalization internalize, range<1> globalSize, | ||
int beta, int gamma, bool insertBarriers = false) { | ||
int alpha = 1; | ||
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize]; | ||
|
||
for (size_t i = 0; i < dataSize; ++i) { | ||
in1[i] = i * 2; | ||
in2[i] = i * 3; | ||
in3[i] = i * 4; | ||
tmp[i] = -1; | ||
out[i] = -1; | ||
} | ||
{ | ||
buffer<int> bIn1{in1, globalSize}; | ||
buffer<int> bIn2{in2, globalSize}; | ||
buffer<int> bIn3{in3, globalSize}; | ||
buffer<int> bTmp{tmp, globalSize}; | ||
buffer<int> bOut{out, globalSize}; | ||
|
||
ext::codeplay::experimental::fusion_wrapper fw{q}; | ||
fw.start_fusion(); | ||
|
||
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode"); | ||
|
||
q.submit([&](handler &cgh) { | ||
auto accIn1 = bIn1.get_access(cgh); | ||
auto accIn2 = bIn2.get_access(cgh); | ||
property_list properties{}; | ||
if (internalize == Internalization::Private) { | ||
properties = { | ||
sycl::ext::codeplay::experimental::property::promote_private{}}; | ||
} else if (internalize == Internalization::Local) { | ||
properties = { | ||
sycl::ext::codeplay::experimental::property::promote_local{}}; | ||
} | ||
accessor<int> accTmp = bTmp.get_access(cgh, properties); | ||
cgh.parallel_for<class KernelOne>(globalSize, [=](id<1> i) { | ||
accTmp[i] = accIn1[i] + accIn2[i] * alpha; | ||
}); | ||
}); | ||
|
||
q.submit([&](handler &cgh) { | ||
property_list properties{}; | ||
if (internalize == Internalization::Private) { | ||
properties = { | ||
sycl::ext::codeplay::experimental::property::promote_private{}}; | ||
} else if (internalize == Internalization::Local) { | ||
properties = { | ||
sycl::ext::codeplay::experimental::property::promote_local{}}; | ||
} | ||
accessor<int> accTmp = bTmp.get_access(cgh, properties); | ||
auto accIn3 = bIn3.get_access(cgh); | ||
auto accOut = bOut.get_access(cgh); | ||
cgh.parallel_for<class KernelTwo>(globalSize, [=](id<1> i) { | ||
accOut[i] = accTmp[i] * accIn3[i] * beta * gamma; | ||
}); | ||
}); | ||
|
||
if (insertBarriers) { | ||
fw.complete_fusion(); | ||
} else { | ||
fw.complete_fusion( | ||
{ext::codeplay::experimental::property::no_barriers{}}); | ||
} | ||
|
||
assert(!fw.is_in_fusion_mode() && | ||
"Queue should not be in fusion mode anymore"); | ||
} | ||
|
||
// Check the results | ||
size_t numErrors = 0; | ||
size_t numInternalized = 0; | ||
for (size_t i = 0; i < dataSize; ++i) { | ||
if (i < globalSize.size() && out[i] != (20 * i * i * beta * gamma)) { | ||
++numErrors; | ||
} | ||
if (tmp[i] == -1) { | ||
++numInternalized; | ||
} | ||
} | ||
if (numErrors) { | ||
std::cout << "COMPUTATION ERROR\n"; | ||
} | ||
if ((internalize == Internalization::None) && numInternalized) { | ||
std::cout << "WRONG INTERNALIZATION\n"; | ||
} | ||
} | ||
|
||
int main() { | ||
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}}; | ||
|
||
// Initial invocation | ||
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1); | ||
// CHECK: JIT DEBUG: Compiling new kernel, no suitable cached kernel found | ||
|
||
// Identical invocation, should lead to JIT cache hit. | ||
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1); | ||
// CHECK-NEXT: JIT DEBUG: Re-using cached JIT kernel | ||
// CHECK-NEXT: INFO: Re-using existing device binary for fused kernel | ||
|
||
// Invocation with a different beta. Because beta was identical to alpha so | ||
// far, this should lead to a cache miss. | ||
performFusion(q, Internalization::Private, range<1>{dataSize}, 2, 1); | ||
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found | ||
|
||
// Invocation with barrier insertion should lead to a cache miss. | ||
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 1, | ||
/* insertBarriers */ true); | ||
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found | ||
|
||
// Invocation with different internalization target should lead to a cache | ||
// miss. | ||
performFusion(q, Internalization::None, range<1>{dataSize}, 1, 1); | ||
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found | ||
|
||
// Invocation with a different gamma should lead to a cache miss because gamma | ||
// participates in constant propagation. | ||
performFusion(q, Internalization::Private, range<1>{dataSize}, 1, 2); | ||
// CHECK-NEXT: JIT DEBUG: Compiling new kernel, no suitable cached kernel found | ||
|
||
return 0; | ||
} |