Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions clang/lib/Sema/SemaSYCL.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2763,14 +2763,18 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
}

static VarDecl *createKernelObjClone(ASTContext &Ctx, DeclContext *DC,
const CXXRecordDecl *KernelObj) {
const CXXRecordDecl *KernelObj,
FunctionDecl *KernelCallerFunc) {
TypeSourceInfo *TSInfo =
KernelObj->isLambda() ? KernelObj->getLambdaTypeInfo() : nullptr;
auto Type = QualType(KernelObj->getTypeForDecl(), 0);
Type->getAsRecordDecl()->setAnonymousStructOrUnion(true);
VarDecl *VD = VarDecl::Create(
Ctx, DC, KernelObj->getLocation(), KernelObj->getLocation(),
KernelObj->getIdentifier(), QualType(KernelObj->getTypeForDecl(), 0),
TSInfo, SC_None);

KernelObj->getIdentifier(), Type, TSInfo, SC_None);
if (getKernelInvocationKind(KernelCallerFunc) == InvokeParallelForWorkGroup)
VD->addAttr(
SYCLScopeAttr::CreateImplicit(Ctx, SYCLScopeAttr::Level::WorkGroup));
return VD;
}

Expand Down Expand Up @@ -2846,7 +2850,8 @@ class SyclKernelBodyCreator : public SyclKernelFieldHandler {
FunctionDecl *KernelCallerFunc)
: SyclKernelFieldHandler(S), DeclCreator(DC),
KernelObjClone(createKernelObjClone(S.getASTContext(),
DC.getKernelDecl(), KernelObj)),
DC.getKernelDecl(), KernelObj,
KernelCallerFunc)),
VarEntity(InitializedEntity::InitializeVariable(KernelObjClone)),
KernelObj(KernelObj), KernelCallerFunc(KernelCallerFunc),
KernelCallerSrcLoc(KernelCallerFunc->getLocation()) {
Expand Down
1 change: 1 addition & 0 deletions clang/test/SemaSYCL/kernel-handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ int main() {
// NONATIVESUPPORT-NEXT: InitListExpr
// NONATIVESUPPORT-NEXT: ImplicitCastExpr {{.*}} 'int' <LValueToRValue>
// NONATIVESUPPORT-NEXT: DeclRefExpr {{.*}} 'int' lvalue ParmVar {{.*}} '_arg_a' 'int'
// NONATIVESUPPORT-NEXT: SYCLScopeAttr {{.*}} Implicit WorkGroup

// Check declaration and initialization of kernel handler local clone using default constructor
// NONATIVESUPPORT-NEXT: DeclStmt
Expand Down
73 changes: 30 additions & 43 deletions llvm/lib/SYCLLowerIR/LowerWGScope.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,19 @@
// (1) - materialization of a PFWI object
// (2) - "fixup" of the private variable address.
//
// TODO: add support for the case when there are other functions between
// parallel_for_work_group and parallel_for_work_item in the call stack.
// For example:
//
// void foo(sycl::group<1> group, ...) {
// group.parallel_for_work_item(range<1>(), [&](h_item<1> i) { ... });
// }
// ...
// cgh.parallel_for_work_group<class kernel>(
// range<1>(...), range<1>(...), [=](group<1> g) {
// foo(g, ...);
// });
//
// TODO The approach employed by this pass generates lots of barriers and data
// copying between private and local memory, which might not be efficient. There
// are optimization opportunities listed below. Also other approaches can be
Expand Down Expand Up @@ -385,16 +398,8 @@ static void copyBetweenPrivateAndShadow(Value *L, GlobalVariable *Shadow,
LocAlign = MaybeAlign(AI->getAlignment());
} else {
auto Arg = cast<Argument>(L);
if (Arg->hasByValAttr()) {
T = Arg->getParamByValType();
LocAlign = MaybeAlign(Arg->getParamAlignment());
} else {
Type *Ty = Arg->getType();
Module &M = *Shadow->getParent();
LocAlign = M.getDataLayout().getValueOrABITypeAlignment(
MaybeAlign(Arg->getParamAlignment()), Ty);
T = Arg->getType()->getPointerElementType();
}
T = Arg->getParamByValType();
LocAlign = MaybeAlign(Arg->getParamAlignment());
}

assert(T && "Unexpected type");
Expand Down Expand Up @@ -698,16 +703,7 @@ static void fixupPrivateMemoryPFWILambdaCaptures(CallInst *PFWICall) {
// Go through "byval" parameters which are passed as AS(0) pointers and:
// (1) create local shadows for them, (2) initialize them from the leader's
// copy, and (3) materialize the value in the local variable before use.
//
// Do the same for the 'this' pointer, which points to the PFWG lambda object
// allocated in the caller. The caller is a kernel function generated by the
// SYCL frontend. The kernel function allocates the PFWG lambda object and
// initializes captured objects (like accessors) using arguments of the kernel.
// After initialization the kernel calls the PFWG function (which is the
// operator() of the PFWG object). The PFWG object captures all objects by
// value, and all uses of these values (except initialization from kernel
// arguments) can only be in the scope of the PFWG function, which is why
// copying the PFWG object back is not needed.
static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
static void shareByValParams(Function &F, const Triple &TT) {
// Skip alloca instructions and split. Alloca instructions must be at the
// beginning of the function; otherwise they are considered dynamic, which
// can cause problems with inlining.
Expand All @@ -726,29 +722,20 @@ static void sharePFWGPrivateObjects(Function &F, const Triple &TT) {
Instruction &At = LeaderBB->back();

for (auto &Arg : F.args()) {
Type *T;
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());
if (!Arg.hasByValAttr())
Copy link
Contributor

@kbobrovs kbobrovs Jun 9, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: we skip "this" because it is allocated in the proper AS by the FE, correct? Comment would be helpful for the reader.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. I just reverted the changes from #1455 and tried to re-implement it by fixing address space in clang instead.
Do you want me to comment that this points to the object in local address space, so we don't need a shadow copy for that argument?

continue;

assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));

// Create the shared copy - "shadow" - for current arg
GlobalVariable *Shadow = nullptr;
if (Arg.hasByValAttr()) {
assert(Arg.getType()->getPointerAddressSpace() ==
asUInt(spirv::AddrSpace::Private));
T = Arg.getParamByValType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
}
// Process 'this' pointer which points to PFWG lambda object
else if (Arg.getArgNo() == 0) {
PointerType *PtrT = dyn_cast<PointerType>(Arg.getType());
assert(PtrT && "Expected this pointer as the first argument");
T = PtrT->getPointerElementType();
Shadow = spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");
}
Type *T = Arg.getParamByValType();
GlobalVariable *Shadow =
spirv::createWGLocalVariable(*F.getParent(), T, "ArgShadow");

if (!Shadow)
continue;
LLVMContext &Ctx = At.getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(&LeaderBB->front());

copyBetweenPrivateAndShadow(&Arg, Shadow, Builder,
true /*private->shadow*/);
Expand All @@ -766,6 +753,7 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
FunctionAnalysisManager &FAM) {
if (!F.getMetadata(WG_SCOPE_MD))
return PreservedAnalyses::all();
LLVM_DEBUG(llvm::dbgs() << "Function name: " << F.getName() << "\n");
const auto &TT = llvm::Triple(F.getParent()->getTargetTriple());
// Ranges of "side effect" instructions
SmallVector<InstrRange, 16> Ranges;
Expand Down Expand Up @@ -866,9 +854,8 @@ PreservedAnalyses SYCLLowerWGScopePass::run(Function &F,
for (auto *PFWICall : PFWICalls)
fixupPrivateMemoryPFWILambdaCaptures(PFWICall);

// Finally, create shadows for and replace usages of byval pointer params and
// PFWG lambda object ('this' pointer).
sharePFWGPrivateObjects(F, TT);
// Finally, create shadows for and replace usages of byval pointer params.
shareByValParams(F, TT);

#ifndef NDEBUG
if (HaveChanges && Debug > 0)
Expand Down
87 changes: 39 additions & 48 deletions llvm/test/SYCLLowerIR/addrspacecast_handling.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,73 +9,64 @@
%struct.bar = type { i64 }
%struct.spam = type { i64, i64, i64, i64, i32 }

; CHECK: @[[SHADOW4:.*]] = internal unnamed_addr addrspace(3) global %struct.ham addrspace(4)*
; CHECK: @[[SHADOW3:.*]] = internal unnamed_addr addrspace(3) global %struct.spam
; CHECK: @[[SHADOW2:.*]] = internal unnamed_addr addrspace(3) global %struct.ham
; CHECK: @[[SHADOW1:.*]] = internal unnamed_addr addrspace(3) global %struct.bar

define linkonce_odr dso_local spir_func void @foo(%struct.ham addrspace(4)* dereferenceable_or_null(56) %arg, %struct.bar* byval(%struct.bar) align 8 %arg1) !work_group_scope !0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT: bb:
; CHECK-NEXT: [[TMP:%.*]] = alloca [[STRUCT_HAM:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0:#.*]]
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP0]], 0
; CHECK-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_HAM:%.*]] addrspace(4)*, align 8
; CHECK-NEXT: [[TMP1:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0:[0-9]+]]
; CHECK-NEXT: [[CMPZ3:%.*]] = icmp eq i64 [[TMP1]], 0
; CHECK-NEXT: br i1 [[CMPZ3]], label [[LEADER:%.*]], label [[MERGE:%.*]]
; CHECK: leader:
; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.bar* [[ARG1:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 bitcast (%struct.bar addrspace(3)* @[[SHADOW1]] to i8 addrspace(3)*), i8* align 8 [[TMP1]], i64 8, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[STRUCT_HAM]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.ham addrspace(3)* @[[SHADOW2]] to i8 addrspace(3)*), i8 addrspace(4)* align 8 [[TMP2]], i64 24, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.bar* [[ARG1:%.*]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 8 bitcast ([[STRUCT_BAR:%.*]] addrspace(3)* @ArgShadow to i8 addrspace(3)*), i8* align 8 [[TMP2]], i64 8, i1 false)
; CHECK-NEXT: br label [[MERGE]]
; CHECK: merge:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.bar* [[ARG1]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 8 bitcast (%struct.bar addrspace(3)* @[[SHADOW1]] to i8 addrspace(3)*), i64 8, i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = bitcast [[STRUCT_HAM]] addrspace(4)* [[ARG]] to i8 addrspace(4)*
; CHECK-NEXT: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* align 8 [[TMP4]], i8 addrspace(3)* align 16 bitcast (%struct.ham addrspace(3)* @[[SHADOW2]] to i8 addrspace(3)*), i64 24, i1 false)
; CHECK-NEXT: [[TMP2:%.*]] = addrspacecast [[STRUCT_HAM]] addrspace(4)** [[TMP]] to [[STRUCT_HAM]] addrspace(4)* addrspace(4)*
; CHECK-NEXT: [[TMP3:%.*]] = alloca [[STRUCT_SPAM:%.*]], align 8
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast %struct.spam* [[TMP3]] to [[STRUCT_SPAM]] addrspace(4)*
; CHECK-NEXT: [[TMP5:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP5]], 0
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP3]], i8 addrspace(3)* align 8 bitcast ([[STRUCT_BAR]] addrspace(3)* @ArgShadow to i8 addrspace(3)*), i64 8, i1 false)
; CHECK-NEXT: [[TMP4:%.*]] = addrspacecast [[STRUCT_HAM]] addrspace(4)** [[TMP0]] to [[STRUCT_HAM]] addrspace(4)* addrspace(4)*
; CHECK-NEXT: [[TMP5:%.*]] = alloca [[STRUCT_SPAM:%.*]], align 8
; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast %struct.spam* [[TMP5]] to [[STRUCT_SPAM]] addrspace(4)*
; CHECK-NEXT: [[TMP7:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[CMPZ:%.*]] = icmp eq i64 [[TMP7]], 0
; CHECK-NEXT: br i1 [[CMPZ]], label [[WG_LEADER:%.*]], label [[WG_CF:%.*]]
; CHECK: wg_leader:
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[ARG]], [[STRUCT_HAM]] addrspace(4)* addrspace(4)* [[TMP2]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[ARG:%.*]], [[STRUCT_HAM]] addrspace(4)* addrspace(4)* [[TMP4]], align 8
; CHECK-NEXT: br label [[WG_CF]]
; CHECK: wg_cf:
; CHECK-NEXT: [[TMP6:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP6]], 0
; CHECK-NEXT: [[TMP8:%.*]] = load i64, i64 addrspace(1)* @__spirv_BuiltInLocalInvocationIndex, align 4
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[CMPZ2:%.*]] = icmp eq i64 [[TMP8]], 0
; CHECK-NEXT: br i1 [[CMPZ2]], label [[TESTMAT:%.*]], label [[LEADERMAT:%.*]]
; CHECK: TestMat:
; CHECK-NEXT: [[TMP7:%.*]] = bitcast %struct.spam* [[TMP3]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW3]] to i8 addrspace(3)*), i8* align 8 [[TMP7]], i64 36, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD]], [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @[[SHADOW4]], align 8
; CHECK-NEXT: [[TMP9:%.*]] = bitcast %struct.spam* [[TMP5]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 16 bitcast ([[STRUCT_SPAM]] addrspace(3)* @WGCopy.1 to i8 addrspace(3)*), i8* align 8 [[TMP9]], i64 36, i1 false)
; CHECK-NEXT: [[MAT_LD:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)** [[TMP0]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD]], [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @WGCopy, align 8
; CHECK-NEXT: br label [[LEADERMAT]]
; CHECK: LeaderMat:
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @[[SHADOW4]], align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD1]], [[STRUCT_HAM]] addrspace(4)** [[TMP]], align 8
; CHECK-NEXT: [[TMP8:%.*]] = bitcast %struct.spam* [[TMP3]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP8]], i8 addrspace(3)* align 16 bitcast (%struct.spam addrspace(3)* @[[SHADOW3]] to i8 addrspace(3)*), i64 36, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) [[ATTR0]]
; CHECK-NEXT: [[TMP5:%.*]] = addrspacecast %struct.bar* [[ARG1]] to [[STRUCT_BAR:%.*]] addrspace(4)*
; CHECK-NEXT: [[TMP6:%.*]] = addrspacecast [[STRUCT_SPAM]] addrspace(4)* [[TMP4]] to %struct.spam*
; CHECK-NEXT: call spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) [[TMP5]], %struct.spam* byval(%struct.spam) align 8 [[TMP6]])
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[MAT_LD1:%.*]] = load [[STRUCT_HAM]] addrspace(4)*, [[STRUCT_HAM]] addrspace(4)* addrspace(3)* @WGCopy, align 8
; CHECK-NEXT: store [[STRUCT_HAM]] addrspace(4)* [[MAT_LD1]], [[STRUCT_HAM]] addrspace(4)** [[TMP0]], align 8
; CHECK-NEXT: [[TMP10:%.*]] = bitcast %struct.spam* [[TMP5]] to i8*
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 8 [[TMP10]], i8 addrspace(3)* align 16 bitcast ([[STRUCT_SPAM]] addrspace(3)* @WGCopy.1 to i8 addrspace(3)*), i64 36, i1 false)
; CHECK-NEXT: call void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 272) #[[ATTR0]]
; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast %struct.bar* [[ARG1]] to [[STRUCT_BAR]] addrspace(4)*
; CHECK-NEXT: [[TMP12:%.*]] = addrspacecast [[STRUCT_SPAM]] addrspace(4)* [[TMP6]] to %struct.spam*
; CHECK-NEXT: call spir_func void @widget([[STRUCT_BAR]] addrspace(4)* dereferenceable_or_null(32) [[TMP11]], %struct.spam* byval([[STRUCT_SPAM]]) align 8 [[TMP12]])
; CHECK-NEXT: ret void
;
bb:
%tmp = alloca %struct.ham addrspace(4)*, align 8
%tmp2 = addrspacecast %struct.ham addrspace(4)** %tmp to %struct.ham addrspace(4)* addrspace(4)*
%tmp3 = alloca %struct.spam, align 8
%tmp4 = addrspacecast %struct.spam* %tmp3 to %struct.spam addrspace(4)*
store %struct.ham addrspace(4)* %arg, %struct.ham addrspace(4)* addrspace(4)* %tmp2, align 8
%tmp5 = addrspacecast %struct.bar* %arg1 to %struct.bar addrspace(4)*
%tmp6 = addrspacecast %struct.spam addrspace(4)* %tmp4 to %struct.spam*
call spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) %tmp5, %struct.spam* byval(%struct.spam) align 8 %tmp6)
%0 = alloca %struct.ham addrspace(4)*, align 8
%1 = addrspacecast %struct.ham addrspace(4)** %0 to %struct.ham addrspace(4)* addrspace(4)*
%2 = alloca %struct.spam, align 8
%3 = addrspacecast %struct.spam* %2 to %struct.spam addrspace(4)*
store %struct.ham addrspace(4)* %arg, %struct.ham addrspace(4)* addrspace(4)* %1, align 8
%4 = addrspacecast %struct.bar* %arg1 to %struct.bar addrspace(4)*
%5 = addrspacecast %struct.spam addrspace(4)* %3 to %struct.spam*
call spir_func void @widget(%struct.bar addrspace(4)* dereferenceable_or_null(32) %4, %struct.spam* byval(%struct.spam) align 8 %5)
ret void
}

Expand Down
Loading