[OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder #80343
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen

Author: Akash Banerjee (TIFitis)

Changes: This patch migrates CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions.

Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>

Patch is 267.26 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/80343.diff

8 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db..7cddf73306f2d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final
};
} // anonymous namespace
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
-}
-
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
@@ -1429,1132 +1404,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
TBAAAccessInfo());
}
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
-
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
-
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
-
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
- IntTy);
- ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), IntTy);
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
- ElemPtr =
- Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEnd.getPointer(),
- Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
- CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
-}
-
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
-
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- QualType PrivatePtrType = C.getPointerType(Private->getType());
- llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
- break;
- }
- }
-
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = SrcElementAddr.withElementType(
- CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr =
- DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
-
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.getPointer(), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false,
- C.VoidPtrTy);
- }
-
- ++Idx;
- }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-/// shared smem[warp_size];
-/// For all data entries D in reduce_data:
-/// sync
-/// If (I am the first lane in each warp)
-/// Copy my local D to smem[warp_id]
-/// sync
-/// if (I am the first warp)
-/// Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
-
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
-
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr(ElemPtrPtr, CopyType, Align);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferM...
[truncated]
@@ -2953,7 +2953,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 4
I'm not sure if this change (and a few more similar cases in other places) is correct. My intuition says that the alignment was incorrect and this now fixes it, hopefully someone can comment.
Similar to above, this affects only accesses to a static global. Even if the new alignment is lower than optimal, we should be able to reconstruct it from the allocation. This is fine, IMHO.
@@ -36,14 +36,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
From what I've heard from @jsjodin the TBAA info was pending to be removed. There is no good way to have OMPIRBuilder generate the TBAA so it has been muted for reductions codegen, let me know if this change is acceptable.
Otherwise, we would have to look into callbacks or other methods to retain this information.
Browsing through these, it seems we only lose TBAA for accesses of allocas. This is fine.
I think the patch should be split up into at least two pieces. The first would be the migration work and changes to Clang; the second would be adding reduction support for Flang.
@jsjodin This PR only has the migration work. Flang/MLIR is left unchanged other than the few necessary changes. Once this patch is merged, I'll put up a PR for having MLIR use these additions.
break;
}
case EvaluationKindTy::Complex: {
  // FIXME(Jan): Complex type
Are there no clang tests that hit this case? Remove comment and put an assertion.
If there are no tests that check this, we need to add tests and fix the implementation.
Yes, there are currently no tests for the Complex type, only for scalar and aggregate.
I don't see any other handling of the complex type in the OMPIRBuilder, so I'm working out what needs to be done here. In the meantime, I've added assertion failures for hitting complex types.
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Value *TripCount = TripCountOrig;
// FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
// not be the right way to fix it, but this works for now.
Will this give the correct trip count in Clang? I know Flang's trip count value is +1 compared to what Clang was generating so that's why this was added (temporarily). It would be best if we could agree on the same trip count and this adjustment could go away. Maybe that would have to be a separate patch?
This section of the code is never executed from Clang, so I'm not sure.
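For readers following the thread, here is a minimal sketch of the kind of adjustment the FIXME describes, assuming the GPU path is detectable through the OpenMPIRBuilderConfig; the actual code in the patch may be structured differently:

// Hedged sketch only: drop the extra iteration on the GPU path.
// TripCountOrig is the frontend-supplied count; per the FIXME above, it
// arrives one larger than expected when targeting the GPU.
Value *TripCount = TripCountOrig;
if (OMPBuilder->Config.isGPU())
  TripCount = Builder.CreateSub(
      TripCountOrig, ConstantInt::get(TripCountOrig->getType(), 1),
      "gpu.trip.count.adjusted");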
There is some code related to Flang/MLIR that shouldn't be in the patch, e.g. ReductionInfoManager is not used anywhere. I am currently working on a patch that refactors this code a bit and changes how it is used in the MLIR lowering.
Thanks for pointing this out, I've removed it from this PR.
So, just to double check. On the Clang side, all you did is move the codegen and adapt it to work in the OMPIRBuilder, right? Expected outcome is the same, modulo the two small changes.
Yup, the generated code is identical, except for the align issue I've highlighted and some cast instructions being moved around. Also no TBAA info.
Force-pushed from 513b278 to 5f0b817.
With the latest changes, the buildbots are all clean now. Let me know if there are any further changes required for this patch to go through.
case EvaluationKindTy::Complex: {
  assert(false && "Complex data type not handled");
  break;
}
@jdoerfert Currently there are no test cases for the Complex kind. I was trying to come up with test cases that might fall under this case but I wasn't able to.
Please let me know if this case is infeasible, and we can leave it as an assertion failure.
Here's my attempt at creating a test case, but it still falls under the aggregate kind:
#include <complex> // needed for std::complex

int foo() {
  int i;
  int j;
  std::complex<int> sum[10][10];
  std::complex<int> res;
#pragma omp target teams loop reduction(+:sum)
  for (i = 0; i < 10; i++)
    for (j = 0; j < 10; j++)
      res += sum[i][j];
  return 0;
}
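For comparison, a variant built on C's _Complex type might reach the complex evaluation kind, since Clang treats _Complex values as scalar complex (TEK_Complex) rather than as class aggregates like std::complex. This is an untested sketch, not a case verified against the patch:

// Untested sketch: _Complex is a scalar complex type in Clang, unlike
// std::complex, which is a class type and therefore an aggregate.
float _Complex complex_sum() {
  float _Complex sum = 0.0f;
#pragma omp target teams distribute parallel for reduction(+:sum)
  for (int i = 0; i < 100; i++)
    sum += i; // the int converts to float _Complex
  return sum;
}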
Ping for review :)
This patch removes the complex reduction variables codegen. There are currently no tests for this, and from playing around with some complex reduction variable test cases the code seems unreachable. The PR llvm#80343 proposes to migrate reductions codegen to the OMPIRBuilder and it isn't currently well equipped to handle complex variables. This patch would enable PR llvm#80343 to go forward.
@jsjodin @jdoerfert Hi, this patch has been sitting for a while now. Would be great to get it reviewed and accepted. Thanks.
This patch is huge, and just skimming over it shows various places that could be split off, and others that should not make it to the review stage (commented out code).
Please update.
@@ -145,6 +145,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
}
#endif

Function *GLOBAL_ReductionFunc = nullptr;
We should (almost) never have globals flying around. What is going on here?
We don't need this for now; I've removed it.
@@ -99,14 +100,20 @@ class OpenMPIRBuilderConfig {
/// expanded.
std::optional<bool> IsGPU;

// Flag for specifying if offloading is mandatory.
/// Flag for specifying if LLVMUsed information should be emitted.
std::optional<bool> EmitLLVMUsed;
This is a generic-sounding flag that guards a very, very specific thing.
Is there ever a use case where the flag is not set (and it is checked)?
I've changed the name to EmitLLVMUsedMetaInfo, please let me know if you have a better name.
I don't think we have such a use case. I've updated the current code to check that the flag is set before reading it.
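A minimal sketch of what such a guard can look like, keeping the std::optional<bool> shape shown above (the accessor name here is hypothetical; only the renamed flag comes from the discussion):

/// Flag for specifying if LLVMUsed information should be emitted.
std::optional<bool> EmitLLVMUsedMetaInfo;

// Hypothetical accessor: value_or(false) treats "flag not set" the same
// as "flag set to false", so unset configs never emit llvm.used info.
bool shouldEmitLLVMUsed() const { return EmitLLVMUsedMetaInfo.value_or(false); }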
InsertPointTy createBarrier(const LocationDescription &Loc,
                            omp::Directive Kind, bool ForceSimpleCall = false,
                            bool CheckCancelFlag = true,
                            Value *ThreadID = nullptr);
What has this to do with reductions? Either make it a pre-commit or just don't introduce this concept of "existing ThreadID values" again. Most (>90%) of the runtime functions that take a thread ID don't use it anyway. It's leftover from the old runtime.
Most of the clang reduction tests reuse the ThreadID, and this change allows us to pass an existing ThreadID if available.
If you'd like, I could remove this change and update the tests instead; please let me know which you prefer.
The ThreadID doesn't seem to be passed in the original code. If there was reuse before perhaps some optimization is not happening? Probably worth looking into the cause of this instead of adding the parameter.
This entire threadId in API stuff is only a leftover from the old (broken) runtime design.
We don't use the thread Id in barriers, and we don't need to pass it in.
If that means you need to change the tests, so be it. Do it before this commit, and feel free to modify the device RTL API to remove the argument completely.
Thanks for clearing this up. I've updated the PR with the changes. I couldn't do it as a pre-commit as it relies on changes from this PR to use the OpenMPIRBuilder createBarrier func and the changes aren't trivially portable. However, it has only required updating 5 clang tests and the differences are minimal.
Thanks for the hard work reviewing this patch so far :)
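For reference, dropping the argument leaves createBarrier with roughly this shape (assuming the remaining parameters are unchanged from the snippet quoted above):

// Sketch of the post-change signature: the unused ThreadID parameter is gone.
InsertPointTy createBarrier(const LocationDescription &Loc,
                            omp::Directive Kind, bool ForceSimpleCall = false,
                            bool CheckCancelFlag = true);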
} else {
  // LHS = Builder.CreateLoad(LHS);
  // LHS = Builder.CreateLoad(LHS);
  // Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS));
What is happening here?
Code should be added here when we add MLIR/Flang codegen support. This is unreachable for now, so I've replaced the commented code with an assertion failure.
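Assuming it mirrors the Complex-kind guard quoted earlier in the thread, the replacement is presumably along these lines:

} else {
  // Only reachable once MLIR/Flang reduction lowering starts using this
  // path; keep it guarded until that support lands.
  assert(false && "Unhandled reduction codegen path");
}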
Force-pushed from 5534943 to 4046186.
@jdoerfert Hi, thanks for taking the time out to review the patch. I've addressed the comments you made and also fixed the build issues. It now builds cleanly again and passes all checks. I'm not sure, however, how to split the patch, as we definitely need all the pieces to pass the CI tests. I hope it isn't too much of a burden to review the patch. Thanks.
Not super familiar with the work here, but could we maybe split it as one patch for the OpenMPIRBuilder changes with tests in OpenMPIRBuilderTest.cpp, and then a second patch for the clang changes that use those OpenMPIRBuilder changes? Would the CI tests cause issues?
Only the functions required for the OpenMP reductions directive are being moved in this patch, and the OpenMPIRBuilder changes are only there to facilitate this move. No new functionality is being added. If we were to move only a subset of the functions, then we would have to add extra code to remove the dependency on Clang CGF etc., which is unnecessary.
I see. Thanks for the reply.
Force-pushed from 92f2432 to f32ebe3.
Ping for review 😄
Ping for review.
@jdoerfert Kind reminder for review. Thanks.
Force-pushed from b91b0e8 to 15fd9a1.
Ping for review.
This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions. Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>
This looks good to me now. You can wait a few days before merging to let @jdoerfert look it over again.
Force-pushed from 15fd9a1 to 2351eb2.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/134/builds/699 Here is the relevant piece of the build log for the reference:
The above is an intermittent failure on the tsan buildbot (https://lab.llvm.org/buildbot/#/builders/134) and can be ignored. The next patch processed shows the status is green.
This patch causes the following command to produce malformed debug info that fails the module verifier:
> clang ../offload/test/offloading/bug51781.c -fopenmp -O1 --offload-arch=sm_89 -DADD_REDUCTION --offload-device-only -gline-tables-only
!dbg attachment points at wrong subprogram for function
!19 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
ptr @__omp_offloading_10302_af88b66_main_l44
%16 = load i32, ptr %14, align 4, !dbg !50, !tbaa !35
!50 = !DILocation(line: 44, column: 58, scope: !32)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!dbg attachment points at wrong subprogram for function
!19 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
ptr @__omp_offloading_10302_af88b66_main_l44
%14 = load i32, ptr %12, align 4, !dbg !50, !tbaa !35
!50 = !DILocation(line: 44, column: 58, scope: !32)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10) This test directly uses a reduction, and if I revert this patch it no longer breaks. I'm fairly confident that somewhere in this code we did not copy debug information correctly. Any clue where that might be? |
…lvm#80343) This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions. Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>
This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions.
Co-authored-by: Jan Leyonberg jan.leyonberg@amd.com