[OpenMP] Migrate GPU Reductions CodeGen from Clang to OMPIRBuilder #80343
@llvm/pr-subscribers-clang @llvm/pr-subscribers-clang-codegen

Author: Akash Banerjee (TIFitis)

Changes: This patch migrates CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions.

Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>

Patch is 267.26 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/80343.diff

8 Files Affected:
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
index 299ee1460b3db..7cddf73306f2d 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp
@@ -501,31 +501,6 @@ class CheckVarsEscapingDeclContext final
};
} // anonymous namespace
-/// Get the id of the warp in the block.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAShr(RT.getGPUThreadID(CGF), LaneIDBits, "nvptx_warp_id");
-}
-
-/// Get the id of the current lane in the Warp.
-/// We assume that the warp size is 32, which is always the case
-/// on the NVPTX device, to generate more efficient code.
-static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
- CGBuilderTy &Bld = CGF.Builder;
- unsigned LaneIDBits =
- llvm::Log2_32(CGF.getTarget().getGridValue().GV_Warp_Size);
- assert(LaneIDBits < 32 && "Invalid LaneIDBits size in NVPTX device.");
- unsigned LaneIDMask = ~0u >> (32u - LaneIDBits);
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- return Bld.CreateAnd(RT.getGPUThreadID(CGF), Bld.getInt32(LaneIDMask),
- "nvptx_lane_id");
-}
-
CGOpenMPRuntimeGPU::ExecutionMode
CGOpenMPRuntimeGPU::getExecutionMode() const {
return CurrentExecutionMode;
@@ -1429,1132 +1404,6 @@ static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
TBAAAccessInfo());
}
-/// This function creates calls to one of two shuffle functions to copy
-/// variables between lanes in a warp.
-static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
- llvm::Value *Elem,
- QualType ElemType,
- llvm::Value *Offset,
- SourceLocation Loc) {
- CodeGenModule &CGM = CGF.CGM;
- CGBuilderTy &Bld = CGF.Builder;
- CGOpenMPRuntimeGPU &RT =
- *(static_cast<CGOpenMPRuntimeGPU *>(&CGM.getOpenMPRuntime()));
- llvm::OpenMPIRBuilder &OMPBuilder = RT.getOMPBuilder();
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- assert(Size.getQuantity() <= 8 &&
- "Unsupported bitwidth in shuffle instruction.");
-
- RuntimeFunction ShuffleFn = Size.getQuantity() <= 4
- ? OMPRTL___kmpc_shuffle_int32
- : OMPRTL___kmpc_shuffle_int64;
-
- // Cast all types to 32- or 64-bit values before calling shuffle routines.
- QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
- Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
- llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
- llvm::Value *WarpSize =
- Bld.CreateIntCast(RT.getGPUWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
-
- llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
- OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(), ShuffleFn),
- {ElemCast, Offset, WarpSize});
-
- return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
-}
-
-static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
- Address DestAddr, QualType ElemType,
- llvm::Value *Offset, SourceLocation Loc) {
- CGBuilderTy &Bld = CGF.Builder;
-
- CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
- // Create the loop over the big sized data.
- // ptr = (void*)Elem;
- // ptrEnd = (void*) Elem + 1;
- // Step = 8;
- // while (ptr + Step < ptrEnd)
- // shuffle((int64_t)*ptr);
- // Step = 4;
- // while (ptr + Step < ptrEnd)
- // shuffle((int32_t)*ptr);
- // ...
- Address ElemPtr = DestAddr;
- Address Ptr = SrcAddr;
- Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
- Bld.CreateConstGEP(SrcAddr, 1), CGF.VoidPtrTy, CGF.Int8Ty);
- for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
- if (Size < CharUnits::fromQuantity(IntSize))
- continue;
- QualType IntType = CGF.getContext().getIntTypeForBitwidth(
- CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
- /*Signed=*/1);
- llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
- Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo(),
- IntTy);
- ElemPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
- ElemPtr, IntTy->getPointerTo(), IntTy);
- if (Size.getQuantity() / IntSize > 1) {
- llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
- llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
- llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
- CGF.EmitBlock(PreCondBB);
- llvm::PHINode *PhiSrc =
- Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
- PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
- llvm::PHINode *PhiDest =
- Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
- PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
- Ptr = Address(PhiSrc, Ptr.getElementType(), Ptr.getAlignment());
- ElemPtr =
- Address(PhiDest, ElemPtr.getElementType(), ElemPtr.getAlignment());
- llvm::Value *PtrDiff = Bld.CreatePtrDiff(
- CGF.Int8Ty, PtrEnd.getPointer(),
- Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr.getPointer(),
- CGF.VoidPtrTy));
- Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
- ThenBB, ExitBB);
- CGF.EmitBlock(ThenBB);
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Address LocalPtr = Bld.CreateConstGEP(Ptr, 1);
- Address LocalElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- PhiSrc->addIncoming(LocalPtr.getPointer(), ThenBB);
- PhiDest->addIncoming(LocalElemPtr.getPointer(), ThenBB);
- CGF.EmitBranch(PreCondBB);
- CGF.EmitBlock(ExitBB);
- } else {
- llvm::Value *Res = createRuntimeShuffleFunction(
- CGF,
- CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo()),
- IntType, Offset, Loc);
- CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType,
- LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- Ptr = Bld.CreateConstGEP(Ptr, 1);
- ElemPtr = Bld.CreateConstGEP(ElemPtr, 1);
- }
- Size = Size % IntSize;
- }
-}
-
-namespace {
-enum CopyAction : unsigned {
- // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
- // the warp using shuffle instructions.
- RemoteLaneToThread,
- // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
- ThreadCopy,
-};
-} // namespace
-
-struct CopyOptionsTy {
- llvm::Value *RemoteLaneOffset;
- llvm::Value *ScratchpadIndex;
- llvm::Value *ScratchpadWidth;
-};
-
-/// Emit instructions to copy a Reduce list, which contains partially
-/// aggregated values, in the specified direction.
-static void emitReductionListCopy(
- CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
- ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
- CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
-
- CodeGenModule &CGM = CGF.CGM;
- ASTContext &C = CGM.getContext();
- CGBuilderTy &Bld = CGF.Builder;
-
- llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
-
- // Iterates, element-by-element, through the source Reduce list and
- // make a copy.
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- Address SrcElementAddr = Address::invalid();
- Address DestElementAddr = Address::invalid();
- Address DestElementPtrAddr = Address::invalid();
- // Should we shuffle in an element from a remote lane?
- bool ShuffleInElement = false;
- // Set to true to update the pointer in the dest Reduce list to a
- // newly created element.
- bool UpdateDestListPtr = false;
- QualType PrivatePtrType = C.getPointerType(Private->getType());
- llvm::Type *PrivateLlvmPtrType = CGF.ConvertType(PrivatePtrType);
-
- switch (Action) {
- case RemoteLaneToThread: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Create a temporary to store the element in the destination
- // Reduce list.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr =
- CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
- ShuffleInElement = true;
- UpdateDestListPtr = true;
- break;
- }
- case ThreadCopy: {
- // Step 1.1: Get the address for the src element in the Reduce list.
- Address SrcElementPtrAddr = Bld.CreateConstArrayGEP(SrcBase, Idx);
- SrcElementAddr = CGF.EmitLoadOfPointer(
- SrcElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
-
- // Step 1.2: Get the address for dest element. The destination
- // element has already been created on the thread's stack.
- DestElementPtrAddr = Bld.CreateConstArrayGEP(DestBase, Idx);
- DestElementAddr = CGF.EmitLoadOfPointer(
- DestElementPtrAddr.withElementType(PrivateLlvmPtrType),
- PrivatePtrType->castAs<PointerType>());
- break;
- }
- }
-
- // Regardless of src and dest of copy, we emit the load of src
- // element as this is required in all directions
- SrcElementAddr = SrcElementAddr.withElementType(
- CGF.ConvertTypeForMem(Private->getType()));
- DestElementAddr =
- DestElementAddr.withElementType(SrcElementAddr.getElementType());
-
- // Now that all active lanes have read the element in the
- // Reduce list, shuffle over the value from the remote lane.
- if (ShuffleInElement) {
- shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
- RemoteLaneOffset, Private->getExprLoc());
- } else {
- switch (CGF.getEvaluationKind(Private->getType())) {
- case TEK_Scalar: {
- llvm::Value *Elem = CGF.EmitLoadOfScalar(
- SrcElementAddr, /*Volatile=*/false, Private->getType(),
- Private->getExprLoc(), LValueBaseInfo(AlignmentSource::Type),
- TBAAAccessInfo());
- // Store the source element value to the dest element address.
- CGF.EmitStoreOfScalar(
- Elem, DestElementAddr, /*Volatile=*/false, Private->getType(),
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo());
- break;
- }
- case TEK_Complex: {
- CodeGenFunction::ComplexPairTy Elem = CGF.EmitLoadOfComplex(
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getExprLoc());
- CGF.EmitStoreOfComplex(
- Elem, CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- /*isInit=*/false);
- break;
- }
- case TEK_Aggregate:
- CGF.EmitAggregateCopy(
- CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
- CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
- Private->getType(), AggValueSlot::DoesNotOverlap);
- break;
- }
- }
-
- // Step 3.1: Modify reference in dest Reduce list as needed.
- // Modifying the reference in Reduce list to point to the newly
- // created element. The element is live in the current function
- // scope and that of functions it invokes (i.e., reduce_function).
- // RemoteReduceData[i] = (void*)&RemoteElem
- if (UpdateDestListPtr) {
- CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
- DestElementAddr.getPointer(), CGF.VoidPtrTy),
- DestElementPtrAddr, /*Volatile=*/false,
- C.VoidPtrTy);
- }
-
- ++Idx;
- }
-}
-
-/// This function emits a helper that gathers Reduce lists from the first
-/// lane of every active warp to lanes in the first warp.
-///
-/// void inter_warp_copy_func(void* reduce_data, num_warps)
-/// shared smem[warp_size];
-/// For all data entries D in reduce_data:
-/// sync
-/// If (I am the first lane in each warp)
-/// Copy my local D to smem[warp_id]
-/// sync
-/// if (I am the first warp)
-/// Copy smem[thread_id] to my local D
-static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
- ArrayRef<const Expr *> Privates,
- QualType ReductionArrayTy,
- SourceLocation Loc) {
- ASTContext &C = CGM.getContext();
- llvm::Module &M = CGM.getModule();
-
- // ReduceList: thread local Reduce list.
- // At the stage of the computation when this function is called, partially
- // aggregated values reside in the first lane of every active warp.
- ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.VoidPtrTy, ImplicitParamKind::Other);
- // NumWarps: number of warps active in the parallel region. This could
- // be smaller than 32 (max warps in a CTA) for partial block reduction.
- ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
- C.getIntTypeForBitwidth(32, /* Signed */ true),
- ImplicitParamKind::Other);
- FunctionArgList Args;
- Args.push_back(&ReduceListArg);
- Args.push_back(&NumWarpsArg);
-
- const CGFunctionInfo &CGFI =
- CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
- auto *Fn = llvm::Function::Create(CGM.getTypes().GetFunctionType(CGFI),
- llvm::GlobalValue::InternalLinkage,
- "_omp_reduction_inter_warp_copy_func", &M);
- CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
- Fn->setDoesNotRecurse();
- CodeGenFunction CGF(CGM);
- CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
-
- CGBuilderTy &Bld = CGF.Builder;
-
- // This array is used as a medium to transfer, one reduce element at a time,
- // the data from the first lane of every warp to lanes in the first warp
- // in order to perform the final step of a reduction in a parallel region
- // (reduction across warps). The array is placed in NVPTX __shared__ memory
- // for reduced latency, as well as to have a distinct copy for concurrently
- // executing target regions. The array is declared with common linkage so
- // as to be shared across compilation units.
- StringRef TransferMediumName =
- "__openmp_nvptx_data_transfer_temporary_storage";
- llvm::GlobalVariable *TransferMedium =
- M.getGlobalVariable(TransferMediumName);
- unsigned WarpSize = CGF.getTarget().getGridValue().GV_Warp_Size;
- if (!TransferMedium) {
- auto *Ty = llvm::ArrayType::get(CGM.Int32Ty, WarpSize);
- unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
- TransferMedium = new llvm::GlobalVariable(
- M, Ty, /*isConstant=*/false, llvm::GlobalVariable::WeakAnyLinkage,
- llvm::UndefValue::get(Ty), TransferMediumName,
- /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
- SharedAddressSpace);
- CGM.addCompilerUsedGlobal(TransferMedium);
- }
-
- auto &RT = static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
- // Get the CUDA thread id of the current OpenMP thread on the GPU.
- llvm::Value *ThreadID = RT.getGPUThreadID(CGF);
- // nvptx_lane_id = nvptx_id % warpsize
- llvm::Value *LaneID = getNVPTXLaneID(CGF);
- // nvptx_warp_id = nvptx_id / warpsize
- llvm::Value *WarpID = getNVPTXWarpID(CGF);
-
- Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
- llvm::Type *ElemTy = CGF.ConvertTypeForMem(ReductionArrayTy);
- Address LocalReduceList(
- Bld.CreatePointerBitCastOrAddrSpaceCast(
- CGF.EmitLoadOfScalar(
- AddrReduceListArg, /*Volatile=*/false, C.VoidPtrTy, Loc,
- LValueBaseInfo(AlignmentSource::Type), TBAAAccessInfo()),
- ElemTy->getPointerTo()),
- ElemTy, CGF.getPointerAlign());
-
- unsigned Idx = 0;
- for (const Expr *Private : Privates) {
- //
- // Warp master copies reduce element to transfer medium in __shared__
- // memory.
- //
- unsigned RealTySize =
- C.getTypeSizeInChars(Private->getType())
- .alignTo(C.getTypeAlignInChars(Private->getType()))
- .getQuantity();
- for (unsigned TySize = 4; TySize > 0 && RealTySize > 0; TySize /=2) {
- unsigned NumIters = RealTySize / TySize;
- if (NumIters == 0)
- continue;
- QualType CType = C.getIntTypeForBitwidth(
- C.toBits(CharUnits::fromQuantity(TySize)), /*Signed=*/1);
- llvm::Type *CopyType = CGF.ConvertTypeForMem(CType);
- CharUnits Align = CharUnits::fromQuantity(TySize);
- llvm::Value *Cnt = nullptr;
- Address CntAddr = Address::invalid();
- llvm::BasicBlock *PrecondBB = nullptr;
- llvm::BasicBlock *ExitBB = nullptr;
- if (NumIters > 1) {
- CntAddr = CGF.CreateMemTemp(C.IntTy, ".cnt.addr");
- CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.IntTy), CntAddr,
- /*Volatile=*/false, C.IntTy);
- PrecondBB = CGF.createBasicBlock("precond");
- ExitBB = CGF.createBasicBlock("exit");
- llvm::BasicBlock *BodyBB = CGF.createBasicBlock("body");
- // There is no need to emit line number for unconditional branch.
- (void)ApplyDebugLocation::CreateEmpty(CGF);
- CGF.EmitBlock(PrecondBB);
- Cnt = CGF.EmitLoadOfScalar(CntAddr, /*Volatile=*/false, C.IntTy, Loc);
- llvm::Value *Cmp =
- Bld.CreateICmpULT(Cnt, llvm::ConstantInt::get(CGM.IntTy, NumIters));
- Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
- CGF.EmitBlock(BodyBB);
- }
- // kmpc_barrier.
- CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
- /*EmitChecks=*/false,
- /*ForceSimpleCall=*/true);
- llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
- llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
- llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
-
- // if (lane_id == 0)
- llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
- Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
- CGF.EmitBlock(ThenBB);
-
- // Reduce element = LocalReduceList[i]
- Address ElemPtrPtrAddr = Bld.CreateConstArrayGEP(LocalReduceList, Idx);
- llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
- ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
- // elemptr = ((CopyType*)(elemptrptr)) + I
- Address ElemPtr(ElemPtrPtr, CopyType, Align);
- if (NumIters > 1)
- ElemPtr = Bld.CreateGEP(ElemPtr, Cnt);
-
- // Get pointer to location in transfer medium.
- // MediumPtr = &medium[warp_id]
- llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
- TransferMedium->getValueType(), TransferM...
[truncated]
@@ -2953,7 +2953,7 @@ int bar(int n){
// CHECK3-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], ptr [[TMP4]], i32 [[TMP5]]
// CHECK3-NEXT: [[C:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0]], ptr [[TMP8]], i32 0, i32 0
// CHECK3-NEXT: [[TMP9:%.*]] = load i8, ptr [[TMP7]], align 1
// CHECK3-NEXT: store i8 [[TMP9]], ptr [[C]], align 4
I'm not sure if this change (and a few more similar cases in other places) is correct. My intuition says that the alignment was incorrect and this now fixes it, hopefully someone can comment.
Similar to above, this affects only accesses to a static global. Even if the new alignment is lower than optimal, we should be able to reconstruct it from the allocation. This is fine, IMHO.
@@ -36,14 +36,14 @@ void test() {
// CHECK1-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8
// CHECK1-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[DOTTHREADID_TEMP_:%.*]] = alloca i32, align 4
// CHECK1-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR]], align 8, !tbaa [[TBAA10:![0-9]+]]
From what I've heard from @jsjodin the TBAA info was pending to be removed. There is no good way to have OMPIRBuilder generate the TBAA so it has been muted for reductions codegen, let me know if this change is acceptable.
Otherwise, we would have to look into callbacks or other methods to retain this information.
Browsing through these, it seems we only lose TBAA for accesses of allocas. This is fine.
I think the patch should be split up into at least two pieces. The first would be the migration work and changes to Clang; the second would be adding reduction support for Flang.
@jsjodin This PR only has the migration work. Flang/MLIR is left unchanged other than the few necessary changes. Once this patch is merged, I'll put up a PR for having MLIR use these additions.
break;
}
case EvaluationKindTy::Complex: {
  // FIXME(Jan): Complex type
Are there no clang tests that hit this case? Remove comment and put an assertion.
If there are no tests that check this, we need to add tests and fix the implementation.
Yes, there are currently no tests for the Complex type, only for scalar and aggregate.
I don't see any other handling of the complex type in the OMPIRBuilder, so I'm working out what needs to be done here. In the meantime, I've added assertion failures for hitting complex types.
Module &M = OMPBuilder->M;
IRBuilder<> &Builder = OMPBuilder->Builder;
Value *TripCount = TripCountOrig;
// FIXME(JAN): The trip count is 1 larger than it should be for GPU, this may
// not be the right way to fix it, but this works for now.
Will this give the correct trip count in Clang? I know Flang's trip count value is +1 compared to what Clang was generating so that's why this was added (temporarily). It would be best if we could agree on the same trip count and this adjustment could go away. Maybe that would have to be a separate patch?
This section of the code is never executed from Clang, so I'm not sure.
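For readers following the thread, here is a minimal sketch of the kind of adjustment the FIXME describes, assuming the GPU path is detectable through the OpenMPIRBuilderConfig; the actual code in the patch may be structured differently:

// Hedged sketch only: drop the extra iteration on the GPU path.
// TripCountOrig is the frontend-supplied count; per the FIXME above, it
// arrives one larger than expected when targeting the GPU.
Value *TripCount = TripCountOrig;
if (OMPBuilder->Config.isGPU())
  TripCount = Builder.CreateSub(
      TripCountOrig, ConstantInt::get(TripCountOrig->getType(), 1),
      "gpu.trip.count.adjusted");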
There is some code related to Flang/MLIR that shouldn't be in the patch, e.g. ReductionInfoManager is not used anywhere. I am currently working on a patch that refactors this code a bit and changes how it is used in the MLIR lowering.
Thanks for pointing this out, I've removed it from this PR.
So, just to double check. On the Clang side, all you did is move the codegen and adapt it to work in the OMPIRBuilder, right? Expected outcome is the same, modulo the two small changes.
Yup, the generated code is identical, except for the align issue I've highlighted and some cast instructions being moved around. Also no TBAA info.
Force-pushed from 513b278 to 5f0b817.
With the latest changes, the buildbots are all clean now. Let me know if there are any further changes required for this patch to go through.
case EvaluationKindTy::Complex: {
  assert(false && "Complex data type not handled");
  break;
}
@jdoerfert Currently there are no test cases for the Complex kind. I was trying to come up with test cases that might fall under this case but I wasn't able to.
Please let me know if this case is infeasible, and we can leave it as an assertion failure.
Here's my attempt at creating a test case, but it still falls under the aggregate kind:
#include <complex> // needed for std::complex

int foo() {
  int i;
  int j;
  std::complex<int> sum[10][10];
  std::complex<int> res;
#pragma omp target teams loop reduction(+:sum)
  for (i = 0; i < 10; i++)
    for (j = 0; j < 10; j++)
      res += sum[i][j];
  return 0;
}
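For comparison, a variant built on C's _Complex type might reach the complex evaluation kind, since Clang treats _Complex values as scalar complex (TEK_Complex) rather than as class aggregates like std::complex. This is an untested sketch, not a case verified against the patch:

// Untested sketch: _Complex is a scalar complex type in Clang, unlike
// std::complex, which is a class type and therefore an aggregate.
float _Complex complex_sum() {
  float _Complex sum = 0.0f;
#pragma omp target teams distribute parallel for reduction(+:sum)
  for (int i = 0; i < 100; i++)
    sum += i; // the int converts to float _Complex
  return sum;
}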
Ping for review :)
This patch removes the complex reduction variables codegen. There are currently no tests for this, and from playing around with some complex reduction variable test cases the code seems unreachable. The PR llvm#80343 proposes to migrate reductions codegen to the OMPIRBuilder and it isn't currently well equipped to handle complex variables. This patch would enable PR llvm#80343 to go forward.
@jsjodin @jdoerfert Hi, this patch has been sitting for a while now. Would be great to get it reviewed and accepted. Thanks.
This patch is huge, and just skimming over it shows various places that could be split off, and others that should not make it to the review stage (commented out code).
Please update.
@@ -145,6 +145,8 @@ static bool isValidWorkshareLoopScheduleType(OMPScheduleType SchedType) {
}
#endif

Function *GLOBAL_ReductionFunc = nullptr;
We should (almost) never have globals flying around. What is going on here?
We don't need this for now; I've removed it.
@@ -99,14 +100,20 @@ class OpenMPIRBuilderConfig {
/// expanded.
std::optional<bool> IsGPU;

// Flag for specifying if offloading is mandatory.
/// Flag for specifying if LLVMUsed information should be emitted.
std::optional<bool> EmitLLVMUsed;
This is a generic-sounding flag that guards a very, very specific thing.
Is there ever a use case where the flag is not set (and it is checked)?
I've changed the name to EmitLLVMUsedMetaInfo, please let me know if you have a better name.
I don't think we have such a use case. I've updated the current code to check that the flag is set before reading it.
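A minimal sketch of what such a guard can look like, keeping the std::optional<bool> shape shown above (the accessor name here is hypothetical; only the renamed flag comes from the discussion):

/// Flag for specifying if LLVMUsed information should be emitted.
std::optional<bool> EmitLLVMUsedMetaInfo;

// Hypothetical accessor: value_or(false) treats "flag not set" the same
// as "flag set to false", so unset configs never emit llvm.used info.
bool shouldEmitLLVMUsed() const { return EmitLLVMUsedMetaInfo.value_or(false); }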
InsertPointTy createBarrier(const LocationDescription &Loc,
                            omp::Directive Kind, bool ForceSimpleCall = false,
                            bool CheckCancelFlag = true,
                            Value *ThreadID = nullptr);
What has this to do with reductions? Either make it a pre-commit or just don't introduce this concept of "existing ThreadID values" again. Most (>90%) of the runtime functions that take a thread ID don't use it anyway. It's leftover from the old runtime.
Most of the clang reduction tests reuse the ThreadID, and this change allows us to pass an existing ThreadID if available.
If you'd like, I could remove this change and update the tests instead; please let me know which you prefer.
The ThreadID doesn't seem to be passed in the original code. If there was reuse before perhaps some optimization is not happening? Probably worth looking into the cause of this instead of adding the parameter.
This entire threadId in API stuff is only a leftover from the old (broken) runtime design.
We don't use the thread Id in barriers, and we don't need to pass it in.
If that means you need to change the tests, so be it. Do it before this commit, and feel free to modify the device RTL API to remove the argument completely.
Thanks for clearing this up. I've updated the PR with the changes. I couldn't do it as a pre-commit as it relies on changes from this PR to use the OpenMPIRBuilder createBarrier func and the changes aren't trivially portable. However, it has only required updating 5 clang tests and the differences are minimal.
Thanks for the hard work reviewing this patch so far :)
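For reference, dropping the argument leaves createBarrier with roughly this shape (assuming the remaining parameters are unchanged from the snippet quoted above):

// Sketch of the post-change signature: the unused ThreadID parameter is gone.
InsertPointTy createBarrier(const LocationDescription &Loc,
                            omp::Directive Kind, bool ForceSimpleCall = false,
                            bool CheckCancelFlag = true);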
} else {
  // LHS = Builder.CreateLoad(LHS);
  // LHS = Builder.CreateLoad(LHS);
  // Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS));
What is happening here?
Code should be added here when we add MLIR/Flang codegen support. This is unreachable for now, so I've replaced the commented code with an assertion failure.
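Assuming it mirrors the Complex-kind guard quoted earlier in the thread, the replacement is presumably along these lines:

} else {
  // Only reachable once MLIR/Flang reduction lowering starts using this
  // path; keep it guarded until that support lands.
  assert(false && "Unhandled reduction codegen path");
}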
Force-pushed from 5534943 to 4046186.
@jdoerfert Hi, thanks for taking the time out to review the patch. I've addressed the comments you made and also fixed the build issues. It now builds cleanly again and passes all checks. I'm not sure, however, how to split the patch, as we definitely need all the pieces to pass the CI tests. I hope it isn't too much of a burden to review the patch. Thanks.
Not super familiar with the work here, but could we maybe split it as one patch for the OpenMPIRBuilder changes with tests in OpenMPIRBuilderTest.cpp, and then a second patch for the clang changes that use those OpenMPIRBuilder changes? Would the CI tests cause issues?
Only the functions required for the OpenMP reductions directive are being moved in this patch, and the OpenMPIRBuilder changes are only there to facilitate this move. No new functionality is being added. If we were to move only a subset of the functions, then we would have to add extra code to remove the dependency on Clang CGF etc., which is unnecessary.
I see. Thanks for the reply.
Force-pushed from 92f2432 to f32ebe3.
Ping for review 😄
Ping for review.
@jdoerfert Kind reminder for review. Thanks.
Force-pushed from b91b0e8 to 15fd9a1.
Ping for review.
This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions. Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>
This looks good to me now. You can wait a few days before merging to let @jdoerfert look it over again.
Force-pushed from 15fd9a1 to 2351eb2.
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/134/builds/699 Here is the relevant piece of the build log for the reference:
The above is an intermittent failure on the tsan buildbot (https://lab.llvm.org/buildbot/#/builders/134) and can be ignored. The next patch processed shows the status is green.
This patch causes the following command to produce malformed debug info that fails the module verifier:
> clang ../offload/test/offloading/bug51781.c -fopenmp -O1 --offload-arch=sm_89 -DADD_REDUCTION --offload-device-only -gline-tables-only
!dbg attachment points at wrong subprogram for function
!19 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
ptr @__omp_offloading_10302_af88b66_main_l44
%16 = load i32, ptr %14, align 4, !dbg !50, !tbaa !35
!50 = !DILocation(line: 44, column: 58, scope: !32)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!dbg attachment points at wrong subprogram for function
!19 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
ptr @__omp_offloading_10302_af88b66_main_l44
%14 = load i32, ptr %12, align 4, !dbg !50, !tbaa !35
!50 = !DILocation(line: 44, column: 58, scope: !32)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10)
!32 = distinct !DISubprogram(name: "__omp_offloading_10302_af88b66_main_l44_omp_outlined", scope: !11, file: !11, line: 44, type: !20, scopeLine: 44, flags: DIFlagArtificial | DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !10) This test directly uses a reduction, and if I revert this patch it no longer breaks. I'm fairly confident that somewhere in this code we did not copy debug information correctly. Any clue where that might be? |
…lvm#80343) This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions. Co-authored-by: Jan Leyonberg <jan.leyonberg@amd.com>
This patch migrates the CGOpenMPRuntimeGPU::emitReduction and related functions to the OpenMPIRBuilder. In future patches, the MLIR OpenMP translation will make use of these functions.
Co-authored-by: Jan Leyonberg jan.leyonberg@amd.com