Skip to content

Commit e9c7291

Browse files
committed
[OpenMP] Codegen aggregate for outlined function captures
Parallel regions are outlined as functions with capture variables explicitly generated as distinct parameters in the function's argument list. That complicates the fork_call interface in the OpenMP runtime: (1) the fork_call is variadic since there is a variable number of arguments to forward to the outlined function, (2) wrapping/unwrapping arguments happens in the OpenMP runtime, which is sub-optimal, has been a source of ABI bugs, and has a hardcoded limit (16) in the number of arguments, (3) forwarded arguments must cast to pointer types, which complicates debugging. This patch avoids those issues by aggregating captured arguments in a struct to pass to the fork_call. Reviewed By: jdoerfert Differential Revision: https://reviews.llvm.org/D102107
1 parent 92430b4 commit e9c7291

File tree

200 files changed

+322884
-288116
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

200 files changed

+322884
-288116
lines changed

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1280,7 +1280,7 @@ static llvm::Function *emitParallelOrTeamsOutlinedFunction(
12801280
CGOpenMPOutlinedRegionInfo CGInfo(*CS, ThreadIDVar, CodeGen, InnermostKind,
12811281
HasCancel, OutlinedHelperName);
12821282
CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
1283-
return CGF.GenerateOpenMPCapturedStmtFunction(*CS, D.getBeginLoc());
1283+
return CGF.GenerateOpenMPCapturedStmtFunctionAggregate(*CS, D.getBeginLoc());
12841284
}
12851285

12861286
llvm::Function *CGOpenMPRuntime::emitParallelOutlinedFunction(

clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp

Lines changed: 68 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1513,21 +1513,56 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15131513
// TODO: Is that needed?
15141514
CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
15151515

1516+
// Store addresses of global arguments to pass to the parallel call.
15161517
Address CapturedVarsAddrs = CGF.CreateDefaultAlignTempAlloca(
15171518
llvm::ArrayType::get(CGM.VoidPtrTy, CapturedVars.size()),
15181519
"captured_vars_addrs");
1519-
// There's something to share.
1520+
1521+
// Store globalized values to push, pop through the global stack.
1522+
SmallVector<llvm::Value *, 4> GlobalValues;
15201523
if (!CapturedVars.empty()) {
1521-
// Prepare for parallel region. Indicate the outlined function.
15221524
ASTContext &Ctx = CGF.getContext();
15231525
unsigned Idx = 0;
15241526
for (llvm::Value *V : CapturedVars) {
15251527
Address Dst = Bld.CreateConstArrayGEP(CapturedVarsAddrs, Idx);
15261528
llvm::Value *PtrV;
15271529
if (V->getType()->isIntegerTy())
15281530
PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1529-
else
1530-
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
1531+
else {
1532+
assert(V->getType()->isPointerTy() &&
1533+
"Expected Pointer Type to globalize.");
1534+
// Globalize and store pointer.
1535+
llvm::Type *PtrElemTy = V->getType()->getPointerElementType();
1536+
auto &DL = CGM.getDataLayout();
1537+
unsigned GlobalSize = DL.getTypeAllocSize(PtrElemTy);
1538+
1539+
/*
1540+
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
1541+
llvm::Instruction *VoidPtr =
1542+
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1543+
CGM.getModule(),
1544+
OMPRTL___kmpc_alloc_shared), AllocArgs, VD->getName());
1545+
*/
1546+
// Use shared memory to store globalized pointer values, for now this
1547+
// should be the outlined args aggregate struct.
1548+
llvm::Value *GlobalSizeArg[] = {
1549+
llvm::ConstantInt::get(CGM.SizeTy, GlobalSize)};
1550+
llvm::Value *GlobalValue = CGF.EmitRuntimeCall(
1551+
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
1552+
OMPRTL___kmpc_alloc_shared),
1553+
GlobalSizeArg);
1554+
GlobalValues.push_back(GlobalValue);
1555+
1556+
llvm::Value *CapturedVarVal = Bld.CreateAlignedLoad(
1557+
PtrElemTy, V, DL.getABITypeAlign(PtrElemTy));
1558+
llvm::Value *GlobalValueCast =
1559+
Bld.CreatePointerBitCastOrAddrSpaceCast(
1560+
GlobalValue, PtrElemTy->getPointerTo());
1561+
Bld.CreateDefaultAlignedStore(CapturedVarVal, GlobalValueCast);
1562+
1563+
PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(GlobalValue,
1564+
CGF.VoidPtrTy);
1565+
}
15311566
CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
15321567
Ctx.getPointerType(Ctx.VoidPtrTy));
15331568
++Idx;
@@ -1540,8 +1575,9 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15401575
/* isSigned */ false);
15411576
else
15421577
IfCondVal = llvm::ConstantInt::get(CGF.Int32Ty, 1);
1543-
15441578
assert(IfCondVal && "Expected a value");
1579+
1580+
// Create the parallel call.
15451581
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
15461582
llvm::Value *Args[] = {
15471583
RTLoc,
@@ -1557,6 +1593,13 @@ void CGOpenMPRuntimeGPU::emitParallelCall(CodeGenFunction &CGF,
15571593
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
15581594
CGM.getModule(), OMPRTL___kmpc_parallel_51),
15591595
Args);
1596+
1597+
// Pop any globalized values from the global stack.
1598+
for (auto *V : GlobalValues) {
1599+
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
1600+
CGM.getModule(), OMPRTL___kmpc_free_shared),
1601+
V);
1602+
}
15601603
};
15611604

15621605
RegionCodeGenTy RCG(ParallelGen);
@@ -3465,7 +3508,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
34653508
D.getBeginLoc(), D.getBeginLoc());
34663509

34673510
const auto *RD = CS.getCapturedRecordDecl();
3468-
auto CurField = RD->field_begin();
34693511

34703512
Address ZeroAddr = CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty,
34713513
/*Name=*/".zero.addr");
@@ -3477,7 +3519,6 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
34773519
Args.emplace_back(ZeroAddr.getPointer());
34783520

34793521
CGBuilderTy &Bld = CGF.Builder;
3480-
auto CI = CS.capture_begin();
34813522

34823523
// Use global memory for data sharing.
34833524
// Handle passing of global args to workers.
@@ -3524,23 +3565,27 @@ llvm::Function *CGOpenMPRuntimeGPU::createParallelDataSharingWrapper(
35243565
++Idx;
35253566
}
35263567
if (CS.capture_size() > 0) {
3568+
auto CI = CS.capture_begin();
3569+
// Load the outlined arg aggregate struct.
35273570
ASTContext &CGFContext = CGF.getContext();
3528-
for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
3529-
QualType ElemTy = CurField->getType();
3530-
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx);
3531-
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3532-
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
3533-
llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
3534-
/*Volatile=*/false,
3535-
CGFContext.getPointerType(ElemTy),
3536-
CI->getLocation());
3537-
if (CI->capturesVariableByCopy() &&
3538-
!CI->getCapturedVar()->getType()->isAnyPointerType()) {
3539-
Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
3540-
CI->getLocation());
3541-
}
3542-
Args.emplace_back(Arg);
3543-
}
3571+
QualType RecordPointerTy =
3572+
CGFContext.getPointerType(CGFContext.getRecordType(RD));
3573+
Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx);
3574+
Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
3575+
Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(RecordPointerTy)));
3576+
llvm::Value *Arg = CGF.EmitLoadOfScalar(
3577+
TypedAddress,
3578+
/*Volatile=*/false, CGFContext.getPointerType(RecordPointerTy),
3579+
CI->getLocation());
3580+
Args.emplace_back(Arg);
3581+
} else {
3582+
// If there are no captured arguments, use nullptr.
3583+
ASTContext &CGFContext = CGF.getContext();
3584+
QualType RecordPointerTy =
3585+
CGFContext.getPointerType(CGFContext.getRecordType(RD));
3586+
llvm::Value *Arg =
3587+
llvm::Constant::getNullValue(CGF.ConvertTypeForMem(RecordPointerTy));
3588+
Args.emplace_back(Arg);
35443589
}
35453590

35463591
emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 154 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,32 @@ llvm::Value *CodeGenFunction::getTypeSize(QualType Ty) {
320320
return CGM.getSize(SizeInChars);
321321
}
322322

323+
void CodeGenFunction::GenerateOpenMPCapturedVarsAggregate(
324+
const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
325+
const RecordDecl *RD = S.getCapturedRecordDecl();
326+
QualType RecordTy = getContext().getRecordType(RD);
327+
// Create the aggregate argument struct for the outlined function.
328+
LValue AggLV = MakeAddrLValue(
329+
CreateMemTemp(RecordTy, "omp.outlined.arg.agg."), RecordTy);
330+
331+
// Initialize the aggregate with captured values.
332+
auto CurField = RD->field_begin();
333+
for (CapturedStmt::const_capture_init_iterator I = S.capture_init_begin(),
334+
E = S.capture_init_end();
335+
I != E; ++I, ++CurField) {
336+
LValue LV = EmitLValueForFieldInitialization(AggLV, *CurField);
337+
// Initialize for VLA.
338+
if (CurField->hasCapturedVLAType()) {
339+
EmitLambdaVLACapture(CurField->getCapturedVLAType(), LV);
340+
} else
341+
// Initialize for capturesThis, capturesVariableByCopy,
342+
// capturesVariable
343+
EmitInitializerForField(*CurField, LV, *I);
344+
}
345+
346+
CapturedVars.push_back(AggLV.getPointer(*this));
347+
}
348+
323349
void CodeGenFunction::GenerateOpenMPCapturedVars(
324350
const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
325351
const RecordDecl *RD = S.getCapturedRecordDecl();
@@ -420,6 +446,101 @@ struct FunctionOptions {
420446
};
421447
} // namespace
422448

449+
static llvm::Function *emitOutlinedFunctionPrologueAggregate(
450+
CodeGenFunction &CGF, FunctionArgList &Args,
451+
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>>
452+
&LocalAddrs,
453+
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>>
454+
&VLASizes,
455+
llvm::Value *&CXXThisValue, const CapturedStmt &CS, SourceLocation Loc,
456+
StringRef FunctionName) {
457+
const CapturedDecl *CD = CS.getCapturedDecl();
458+
const RecordDecl *RD = CS.getCapturedRecordDecl();
459+
assert(CD->hasBody() && "missing CapturedDecl body");
460+
461+
CXXThisValue = nullptr;
462+
// Build the argument list.
463+
CodeGenModule &CGM = CGF.CGM;
464+
ASTContext &Ctx = CGM.getContext();
465+
Args.append(CD->param_begin(), CD->param_end());
466+
467+
// Create the function declaration.
468+
const CGFunctionInfo &FuncInfo =
469+
CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, Args);
470+
llvm::FunctionType *FuncLLVMTy = CGM.getTypes().GetFunctionType(FuncInfo);
471+
472+
auto *F =
473+
llvm::Function::Create(FuncLLVMTy, llvm::GlobalValue::InternalLinkage,
474+
FunctionName, &CGM.getModule());
475+
CGM.SetInternalFunctionAttributes(CD, F, FuncInfo);
476+
if (CD->isNothrow())
477+
F->setDoesNotThrow();
478+
F->setDoesNotRecurse();
479+
480+
// Generate the function.
481+
CGF.StartFunction(CD, Ctx.VoidTy, F, FuncInfo, Args, Loc, Loc);
482+
Address ContextAddr = CGF.GetAddrOfLocalVar(CD->getContextParam());
483+
llvm::Value *ContextV = CGF.Builder.CreateLoad(ContextAddr);
484+
LValue ContextLV = CGF.MakeNaturalAlignAddrLValue(
485+
ContextV, CGM.getContext().getTagDeclType(RD));
486+
auto I = CS.captures().begin();
487+
for (const FieldDecl *FD : RD->fields()) {
488+
LValue FieldLV = CGF.EmitLValueForFieldInitialization(ContextLV, FD);
489+
// Do not map arguments if we emit function with non-original types.
490+
Address LocalAddr = FieldLV.getAddress(CGF);
491+
// If we are capturing a pointer by copy we don't need to do anything, just
492+
// use the value that we get from the arguments.
493+
if (I->capturesVariableByCopy() && FD->getType()->isAnyPointerType()) {
494+
const VarDecl *CurVD = I->getCapturedVar();
495+
LocalAddrs.insert({FD, {CurVD, LocalAddr}});
496+
++I;
497+
continue;
498+
}
499+
500+
LValue ArgLVal =
501+
CGF.MakeAddrLValue(LocalAddr, FD->getType(), AlignmentSource::Decl);
502+
if (FD->hasCapturedVLAType()) {
503+
llvm::Value *ExprArg = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
504+
const VariableArrayType *VAT = FD->getCapturedVLAType();
505+
VLASizes.try_emplace(FD, VAT->getSizeExpr(), ExprArg);
506+
} else if (I->capturesVariable()) {
507+
const VarDecl *Var = I->getCapturedVar();
508+
QualType VarTy = Var->getType();
509+
Address ArgAddr = ArgLVal.getAddress(CGF);
510+
if (ArgLVal.getType()->isLValueReferenceType()) {
511+
ArgAddr = CGF.EmitLoadOfReference(ArgLVal);
512+
} else if (!VarTy->isVariablyModifiedType() || !VarTy->isPointerType()) {
513+
assert(ArgLVal.getType()->isPointerType());
514+
ArgAddr = CGF.EmitLoadOfPointer(
515+
ArgAddr, ArgLVal.getType()->castAs<PointerType>());
516+
}
517+
LocalAddrs.insert(
518+
{FD, {Var, Address(ArgAddr.getPointer(), Ctx.getDeclAlign(Var))}});
519+
} else if (I->capturesVariableByCopy()) {
520+
assert(!FD->getType()->isAnyPointerType() &&
521+
"Not expecting a captured pointer.");
522+
const VarDecl *Var = I->getCapturedVar();
523+
Address CopyAddr = CGF.CreateMemTemp(FD->getType(), Ctx.getDeclAlign(FD),
524+
Var->getName());
525+
LValue CopyLVal =
526+
CGF.MakeAddrLValue(CopyAddr, FD->getType(), AlignmentSource::Decl);
527+
528+
RValue ArgRVal = CGF.EmitLoadOfLValue(ArgLVal, I->getLocation());
529+
CGF.EmitStoreThroughLValue(ArgRVal, CopyLVal);
530+
531+
LocalAddrs.insert({FD, {Var, CopyAddr}});
532+
} else {
533+
// If 'this' is captured, load it into CXXThisValue.
534+
assert(I->capturesThis());
535+
CXXThisValue = CGF.EmitLoadOfScalar(ArgLVal, I->getLocation());
536+
LocalAddrs.insert({FD, {nullptr, ArgLVal.getAddress(CGF)}});
537+
}
538+
++I;
539+
}
540+
541+
return F;
542+
}
543+
423544
static llvm::Function *emitOutlinedFunctionPrologue(
424545
CodeGenFunction &CGF, FunctionArgList &Args,
425546
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>>
@@ -595,6 +716,37 @@ static llvm::Function *emitOutlinedFunctionPrologue(
595716
return F;
596717
}
597718

719+
llvm::Function *CodeGenFunction::GenerateOpenMPCapturedStmtFunctionAggregate(
720+
const CapturedStmt &S, SourceLocation Loc) {
721+
assert(
722+
CapturedStmtInfo &&
723+
"CapturedStmtInfo should be set when generating the captured function");
724+
const CapturedDecl *CD = S.getCapturedDecl();
725+
// Build the argument list.
726+
FunctionArgList Args;
727+
llvm::MapVector<const Decl *, std::pair<const VarDecl *, Address>> LocalAddrs;
728+
llvm::DenseMap<const Decl *, std::pair<const Expr *, llvm::Value *>> VLASizes;
729+
StringRef FunctionName = CapturedStmtInfo->getHelperName();
730+
llvm::Function *F = emitOutlinedFunctionPrologueAggregate(
731+
*this, Args, LocalAddrs, VLASizes, CXXThisValue, S, Loc, FunctionName);
732+
CodeGenFunction::OMPPrivateScope LocalScope(*this);
733+
for (const auto &LocalAddrPair : LocalAddrs) {
734+
if (LocalAddrPair.second.first) {
735+
LocalScope.addPrivate(LocalAddrPair.second.first, [&LocalAddrPair]() {
736+
return LocalAddrPair.second.second;
737+
});
738+
}
739+
}
740+
(void)LocalScope.Privatize();
741+
for (const auto &VLASizePair : VLASizes)
742+
VLASizeMap[VLASizePair.second.first] = VLASizePair.second.second;
743+
PGO.assignRegionCounters(GlobalDecl(CD), F);
744+
CapturedStmtInfo->EmitBody(*this, CD->getBody());
745+
(void)LocalScope.ForceCleanup();
746+
FinishFunction(CD->getBodyRBrace());
747+
return F;
748+
}
749+
598750
llvm::Function *
599751
CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
600752
SourceLocation Loc) {
@@ -1582,7 +1734,7 @@ static void emitCommonOMPParallelDirective(
15821734
// The following lambda takes care of appending the lower and upper bound
15831735
// parameters when necessary
15841736
CodeGenBoundParameters(CGF, S, CapturedVars);
1585-
CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
1737+
CGF.GenerateOpenMPCapturedVarsAggregate(*CS, CapturedVars);
15861738
CGF.CGM.getOpenMPRuntime().emitParallelCall(CGF, S.getBeginLoc(), OutlinedFn,
15871739
CapturedVars, IfCond);
15881740
}
@@ -6050,7 +6202,7 @@ static void emitCommonOMPTeamsDirective(CodeGenFunction &CGF,
60506202

60516203
OMPTeamsScope Scope(CGF, S);
60526204
llvm::SmallVector<llvm::Value *, 16> CapturedVars;
6053-
CGF.GenerateOpenMPCapturedVars(*CS, CapturedVars);
6205+
CGF.GenerateOpenMPCapturedVarsAggregate(*CS, CapturedVars);
60546206
CGF.CGM.getOpenMPRuntime().emitTeamsCall(CGF, S, S.getBeginLoc(), OutlinedFn,
60556207
CapturedVars);
60566208
}

clang/lib/CodeGen/CodeGenFunction.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3283,8 +3283,13 @@ class CodeGenFunction : public CodeGenTypeCache {
32833283
llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
32843284
llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
32853285
Address GenerateCapturedStmtArgument(const CapturedStmt &S);
3286+
llvm::Function *
3287+
GenerateOpenMPCapturedStmtFunctionAggregate(const CapturedStmt &S,
3288+
SourceLocation Loc);
32863289
llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
32873290
SourceLocation Loc);
3291+
void GenerateOpenMPCapturedVarsAggregate(
3292+
const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars);
32883293
void GenerateOpenMPCapturedVars(const CapturedStmt &S,
32893294
SmallVectorImpl<llvm::Value *> &CapturedVars);
32903295
void emitOMPSimpleStore(LValue LVal, RValue RVal, QualType RValTy,

clang/test/CodeGenCXX/observe-noexcept.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ void ffcomplex (int a) {
99

1010
// CHECK: call { double, double } @__muldc3(double %{{.+}}, double %{{.+}}, double %{{.+}}, double %{{.+}})
1111
dc *= dc;
12-
// CHECK: call {{.+}} @__kmpc_fork_call({{.+}} [[REGNAME1:@.*]] to void (i32*, i32*, ...)*), { double, double }* %{{.+}})
12+
// CHECK: call {{.+}} @__kmpc_fork_call({{.+}} [[REGNAME1:@.*]] to void (i32*, i32*, ...)*), %struct.anon* %{{.+}})
1313
#pragma omp parallel
1414
{
1515
dc *= dc;
@@ -32,7 +32,7 @@ void foo(int a, int b) {
3232

3333
void (*fptr)(void) noexcept = fnoexcp;
3434

35-
// CHECK: call {{.+}} @__kmpc_fork_call({{.+}} [[REGNAME2:@.*]] to void (i32*, i32*, ...)*), void ()** %{{.+}})
35+
// CHECK: call {{.+}} @__kmpc_fork_call({{.+}} [[REGNAME2:@.*]] to void (i32*, i32*, ...)*), %struct.anon.0* %{{.+}})
3636
#pragma omp parallel
3737
{
3838
fptr();

0 commit comments

Comments
 (0)