[mlir][OpenMP] Allow composite SIMD REDUCTION and IF

tblah · tblah · commit b38b2ecffd21 · 2025-07-08T17:00:05.000Z
Reduction support: llvm#146671 If Support is fixed in this PR The problem for the IF clause in composite constructs was that wsloop and simd both operate on the same CanonicalLoopInfo structure: with the SIMD processed first, followed by the wsloop. Previously the IF clause generated code like if (cond) { while (...) { simd_loop_body; } } else { while (...) { nonsimd_loop_body; } } The problem with this is that this invalidates the CanonicalLoopInfo structure to be processed by the wsloop later. To avoid this, in this patch I preserve the original loop, moving the IF clause inside of the loop: while (...) { if (cond) { simd_loop_body; } else { non_simd_loop_body; } } On simple examples I tried LLVM was able to hoist the if condition outside of the loop at -O3. The disadvantage of this is that we cannot add the llvm.loop.vectorize.enable attribute on either the SIMD or non-SIMD loops because they both share a loop back edge. There's no way of solving this without keeping the old design of having two different loops: which cannot be represented using only one CanonicalLoopInfo structure. I don't think the presence or absence of this attribute makes much difference. In my testing it is the llvm.loop.parallel_access metadata which makes the difference to vectorization. LLVM will vectorize if legal whether or not this attribute is there in the TRUE branch. In the FALSE branch this means the loop might be vectorized even when the condition is false: but I think this is still standards compliant: OpenMP 6.0 says that when the if clause is false that should be treated like the SIMDLEN clause is one. The SIMDLEN clause is defined as a "hint". For the same reason, SIMDLEN and SAFELEN clauses are silently ignored when SIMD IF is used. I think it is better to implement SIMD IF and ignore SIMDLEN and SAFELEN and some vectorization encouragement metadata when combined with IF than to ignore IF because IF could have correctness consequences whereas the rest are optimiztion hints. For example, the user might use the IF clause to disable SIMD programatically when it is known not safe to vectorize the loop. In this case it is not at all safe to add the parallel access or SAFELEN metadata.
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -5373,8 +5373,27 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
                                       const Twine &NamePrefix) {
   Function *F = CanonicalLoop->getFunction();
 
+  // We can't do
+  // if (cond) {
+  //   simd_loop;
+  // } else {
+  //   non_simd_loop;
+  // }
+  // because then the CanonicalLoopInfo would only point to one of the loops:
+  // leading to other constructs operating on the same loop to malfunction.
+  // Instead generate
+  // while (...) {
+  //   if (cond) {
+  //     simd_body;
+  //   } else {
+  //     not_simd_body;
+  //   }
+  // }
+  // At least for simple loops, LLVM seems able to hoist the if out of the loop
+  // body at -O3
+
   // Define where if branch should be inserted
-  Instruction *SplitBefore = CanonicalLoop->getPreheader()->getTerminator();
+  auto SplitBeforeIt = CanonicalLoop->getBody()->getFirstNonPHIIt();
 
   // TODO: We should not rely on pass manager. Currently we use pass manager
   // only for getting llvm::Loop which corresponds to given CanonicalLoopInfo
@@ -5391,37 +5410,51 @@ void OpenMPIRBuilder::createIfVersion(CanonicalLoopInfo *CanonicalLoop,
   Loop *L = LI.getLoopFor(CanonicalLoop->getHeader());
 
   // Create additional blocks for the if statement
-  BasicBlock *Head = SplitBefore->getParent();
-  Instruction *HeadOldTerm = Head->getTerminator();
-  llvm::LLVMContext &C = Head->getContext();
+  BasicBlock *Cond = SplitBeforeIt->getParent();
+  Instruction *CondOldTerm = Cond->getTerminator();
+  llvm::LLVMContext &C = Cond->getContext();
   llvm::BasicBlock *ThenBlock = llvm::BasicBlock::Create(
-      C, NamePrefix + ".if.then", Head->getParent(), Head->getNextNode());
+      C, NamePrefix + ".if.then", Cond->getParent(), Cond->getNextNode());
   llvm::BasicBlock *ElseBlock = llvm::BasicBlock::Create(
-      C, NamePrefix + ".if.else", Head->getParent(), CanonicalLoop->getExit());
+      C, NamePrefix + ".if.else", Cond->getParent(), CanonicalLoop->getExit());
 
   // Create if condition branch.
-  Builder.SetInsertPoint(HeadOldTerm);
+  Builder.SetInsertPoint(CondOldTerm);
   Instruction *BrInstr =
       Builder.CreateCondBr(IfCond, ThenBlock, /*ifFalse*/ ElseBlock);
   InsertPointTy IP{BrInstr->getParent(), ++BrInstr->getIterator()};
-  // Then block contains branch to omp loop which needs to be vectorized
+  // Then block contains branch to omp loop body which needs to be vectorized
   spliceBB(IP, ThenBlock, false, Builder.getCurrentDebugLocation());
-  ThenBlock->replaceSuccessorsPhiUsesWith(Head, ThenBlock);
+  ThenBlock->replaceSuccessorsPhiUsesWith(Cond, ThenBlock);
 
   Builder.SetInsertPoint(ElseBlock);
 
   // Clone loop for the else branch
   SmallVector<BasicBlock *, 8> NewBlocks;
 
-  VMap[CanonicalLoop->getPreheader()] = ElseBlock;
+  // Cond is the block that has the if clause condition
+  // LoopCond is omp_loop.cond
+  // LoopHeader is omp_loop.header
+  BasicBlock *LoopCond = Cond->getUniquePredecessor();
+  BasicBlock *LoopHeader = LoopCond->getUniquePredecessor();
+  assert(LoopCond && LoopHeader && "Invalid loop structure");
   for (BasicBlock *Block : L->getBlocks()) {
+    if (Block == L->getLoopPreheader() || Block == L->getLoopLatch() ||
+        Block == LoopHeader || Block == LoopCond || Block == Cond) {
+      continue;
+    }
     BasicBlock *NewBB = CloneBasicBlock(Block, VMap, "", F);
     NewBB->moveBefore(CanonicalLoop->getExit());
     VMap[Block] = NewBB;
     NewBlocks.push_back(NewBB);
   }
   remapInstructionsInBlocks(NewBlocks, VMap);
   Builder.CreateBr(NewBlocks.front());
+
+  // The loop latch must have only one predecessor. Currently it is branched to
+  // from both the 'then' and 'else' branches.
+  L->getLoopLatch()->splitBasicBlock(
+      L->getLoopLatch()->begin(), NamePrefix + ".pre_latch", /*Before=*/true);
 }
 
 unsigned
@@ -5478,19 +5511,6 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
   if (IfCond) {
     ValueToValueMapTy VMap;
     createIfVersion(CanonicalLoop, IfCond, VMap, "simd");
-    // Add metadata to the cloned loop which disables vectorization
-    Value *MappedLatch = VMap.lookup(CanonicalLoop->getLatch());
-    assert(MappedLatch &&
-           "Cannot find value which corresponds to original loop latch");
-    assert(isa<BasicBlock>(MappedLatch) &&
-           "Cannot cast mapped latch block value to BasicBlock");
-    BasicBlock *NewLatchBlock = dyn_cast<BasicBlock>(MappedLatch);
-    ConstantAsMetadata *BoolConst =
-        ConstantAsMetadata::get(ConstantInt::getFalse(Type::getInt1Ty(Ctx)));
-    addBasicBlockMetadata(
-        NewLatchBlock,
-        {MDNode::get(Ctx, {MDString::get(Ctx, "llvm.loop.vectorize.enable"),
-                           BoolConst})});
   }
 
   SmallSet<BasicBlock *, 8> Reachable;
@@ -5524,6 +5544,14 @@ void OpenMPIRBuilder::applySimd(CanonicalLoopInfo *CanonicalLoop,
         Ctx, {MDString::get(Ctx, "llvm.loop.parallel_accesses"), AccessGroup}));
   }
 
+  // FIXME: the IF clause shares a loop backedge for the SIMD and non-SIMD
+  // versions so we can't add the loop attributes in that case.
+  if (IfCond) {
+    // we can still add llvm.loop.parallel_access
+    addLoopMetadata(CanonicalLoop, LoopMDList);
+    return;
+  }
+
   // Use the above access group metadata to create loop level
   // metadata, which should be distinct for each loop.
   ConstantAsMetadata *BoolConst =
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -702,30 +702,6 @@ static void forwardArgs(LLVM::ModuleTranslation &moduleTranslation,
     moduleTranslation.mapValue(arg, moduleTranslation.lookupValue(var));
 }
 
-/// Helper function to map block arguments defined by ignored loop wrappers to
-/// LLVM values and prevent any uses of those from triggering null pointer
-/// dereferences.
-///
-/// This must be called after block arguments of parent wrappers have already
-/// been mapped to LLVM IR values.
-static LogicalResult
-convertIgnoredWrapper(omp::LoopWrapperInterface opInst,
-                      LLVM::ModuleTranslation &moduleTranslation) {
-  // Map block arguments directly to the LLVM value associated to the
-  // corresponding operand. This is semantically equivalent to this wrapper not
-  // being present.
-  return llvm::TypeSwitch<Operation *, LogicalResult>(opInst)
-      .Case([&](omp::SimdOp op) {
-        forwardArgs(moduleTranslation,
-                    cast<omp::BlockArgOpenMPOpInterface>(*op));
-        op.emitWarning() << "simd information on composite construct discarded";
-        return success();
-      })
-      .Default([&](Operation *op) {
-        return op->emitError() << "cannot ignore wrapper";
-      });
-}
-
 /// Converts an OpenMP 'masked' operation into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpMasked(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -2852,17 +2828,6 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
   auto simdOp = cast<omp::SimdOp>(opInst);
 
-  // Ignore simd in composite constructs with unsupported clauses
-  // TODO: Replace this once simd + clause combinations are properly supported
-  if (simdOp.isComposite() &&
-      (simdOp.getReductionByref().has_value() || simdOp.getIfExpr())) {
-    if (failed(convertIgnoredWrapper(simdOp, moduleTranslation)))
-      return failure();
-
-    return inlineConvertOmpRegions(simdOp.getRegion(), "omp.simd.region",
-                                   builder, moduleTranslation);
-  }
-
   if (failed(checkImplementationStatus(opInst)))
     return failure();
 
diff --git a/mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir b/mlir/test/Target/LLVMIR/openmp-composite-simd-if.mlir
@@ -0,0 +1,93 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+llvm.func @_QPfoo(%arg0: !llvm.ptr {fir.bindc_name = "array", llvm.nocapture}, %arg1: !llvm.ptr {fir.bindc_name = "t", llvm.nocapture}) {
+  %0 = llvm.mlir.constant(0 : i64) : i32
+  %1 = llvm.mlir.constant(1 : i32) : i32
+  %2 = llvm.mlir.constant(10 : i64) : i64
+  %3 = llvm.mlir.constant(1 : i64) : i64
+  %4 = llvm.alloca %3 x i32 {bindc_name = "i", pinned} : (i64) -> !llvm.ptr
+  %5 = llvm.load %arg1 : !llvm.ptr -> i32
+  %6 = llvm.icmp "ne" %5, %0 : i32
+  %7 = llvm.trunc %2 : i64 to i32
+  omp.wsloop {
+    omp.simd if(%6) {
+      omp.loop_nest (%arg2) : i32 = (%1) to (%7) inclusive step (%1) {
+        llvm.store %arg2, %4 : i32, !llvm.ptr
+        %8 = llvm.load %4 : !llvm.ptr -> i32
+        %9 = llvm.sext %8 : i32 to i64
+        %10 = llvm.getelementptr %arg0[%9] : (!llvm.ptr, i64) -> !llvm.ptr, i32
+        llvm.store %8, %10 : i32, !llvm.ptr
+        omp.yield
+      }
+    } {omp.composite}
+  } {omp.composite}
+  llvm.return
+}
+
+// CHECK-LABEL: @_QPfoo
+// ...
+// CHECK:       omp_loop.preheader:                               ; preds =
+// CHECK:         store i32 0, ptr %[[LB_ADDR:.*]], align 4
+// CHECK:         store i32 9, ptr %[[UB_ADDR:.*]], align 4
+// CHECK:         store i32 1, ptr %[[STEP_ADDR:.*]], align 4
+// CHECK:         %[[VAL_15:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         call void @__kmpc_for_static_init_4u(ptr @1, i32 %[[VAL_15]], i32 34, ptr %{{.*}}, ptr %[[LB_ADDR]], ptr %[[UB_ADDR]], ptr %[[STEP_ADDR]], i32 1, i32 0)
+// CHECK:         %[[LB:.*]] = load i32, ptr %[[LB_ADDR]], align 4
+// CHECK:         %[[UB:.*]] = load i32, ptr %[[UB_ADDR]], align 4
+// CHECK:         %[[VAL_18:.*]] = sub i32 %[[UB]], %[[LB]]
+// CHECK:         %[[COUNT:.*]] = add i32 %[[VAL_18]], 1
+// CHECK:         br label %[[OMP_LOOP_HEADER:.*]]
+// CHECK:       omp_loop.header:                                  ; preds = %[[OMP_LOOP_INC:.*]], %[[OMP_LOOP_PREHEADER:.*]]
+// CHECK:         %[[IV:.*]] = phi i32 [ 0, %[[OMP_LOOP_PREHEADER]] ], [ %[[NEW_IV:.*]], %[[OMP_LOOP_INC]] ]
+// CHECK:         br label %[[OMP_LOOP_COND:.*]]
+// CHECK:       omp_loop.cond:                                    ; preds = %[[OMP_LOOP_HEADER]]
+// CHECK:         %[[VAL_25:.*]] = icmp ult i32 %[[IV]], %[[COUNT]]
+// CHECK:         br i1 %[[VAL_25]], label %[[OMP_LOOP_BODY:.*]], label %[[OMP_LOOP_EXIT:.*]]
+// CHECK:       omp_loop.body:                                    ; preds = %[[OMP_LOOP_COND]]
+// CHECK:         %[[VAL_28:.*]] = add i32 %[[IV]], %[[LB]]
+// CHECK:         %[[VAL_29:.*]] = mul i32 %[[VAL_28]], 1
+// CHECK:         %[[VAL_30:.*]] = add i32 %[[VAL_29]], 1
+// This is the IF clause:
+// CHECK:         br i1 %{{.*}}, label %[[SIMD_IF_THEN:.*]], label %[[SIMD_IF_ELSE:.*]]
+
+// CHECK:       simd.if.then:                                     ; preds = %[[OMP_LOOP_BODY]]
+// CHECK:         br label %[[VAL_33:.*]]
+// CHECK:       omp.loop_nest.region:                             ; preds = %[[SIMD_IF_THEN]]
+// This version contains !llvm.access.group metadata for SIMD
+// CHECK:         store i32 %[[VAL_30]], ptr %{{.*}}, align 4, !llvm.access.group !1
+// CHECK:         %[[VAL_34:.*]] = load i32, ptr %{{.*}}, align 4, !llvm.access.group !1
+// CHECK:         %[[VAL_35:.*]] = sext i32 %[[VAL_34]] to i64
+// CHECK:         %[[VAL_36:.*]] = getelementptr i32, ptr %[[VAL_37:.*]], i64 %[[VAL_35]]
+// CHECK:         store i32 %[[VAL_34]], ptr %[[VAL_36]], align 4, !llvm.access.group !1
+// CHECK:         br label %[[OMP_REGION_CONT3:.*]]
+// CHECK:       omp.region.cont3:                                 ; preds = %[[VAL_33]]
+// CHECK:         br label %[[SIMD_PRE_LATCH:.*]]
+
+// CHECK:       simd.pre_latch:                                   ; preds = %[[OMP_REGION_CONT3]], %[[OMP_REGION_CONT35:.*]]
+// CHECK:         br label %[[OMP_LOOP_INC]]
+// CHECK:       omp_loop.inc:                                     ; preds = %[[SIMD_PRE_LATCH]]
+// CHECK:         %[[NEW_IV]] = add nuw i32 %[[IV]], 1
+// CHECK:         br label %[[OMP_LOOP_HEADER]], !llvm.loop !2
+
+// CHECK:       simd.if.else:                                     ; preds = %[[OMP_LOOP_BODY]]
+// CHECK:         br label %[[VAL_41:.*]]
+// CHECK:       omp.loop_nest.region4:                            ; preds = %[[SIMD_IF_ELSE]]
+// No llvm.access.group metadata for else clause
+// CHECK:         store i32 %[[VAL_30]], ptr %{{.*}}, align 4
+// CHECK:         %[[VAL_42:.*]] = load i32, ptr %{{.*}}, align 4
+// CHECK:         %[[VAL_43:.*]] = sext i32 %[[VAL_42]] to i64
+// CHECK:         %[[VAL_44:.*]] = getelementptr i32, ptr %[[VAL_37]], i64 %[[VAL_43]]
+// CHECK:         store i32 %[[VAL_42]], ptr %[[VAL_44]], align 4
+// CHECK:         br label %[[OMP_REGION_CONT35]]
+// CHECK:       omp.region.cont35:                                ; preds = %[[VAL_41]]
+// CHECK:         br label %[[SIMD_PRE_LATCH]]
+
+// CHECK:       omp_loop.exit:                                    ; preds = %[[OMP_LOOP_COND]]
+// CHECK:         call void @__kmpc_for_static_fini(ptr @1, i32 %[[VAL_15]])
+// CHECK:         %[[VAL_45:.*]] = call i32 @__kmpc_global_thread_num(ptr @1)
+// CHECK:         call void @__kmpc_barrier(ptr @2, i32 %[[VAL_45]])
+
+// CHECK: !1 = distinct !{}
+// CHECK: !2 = distinct !{!2, !3}
+// CHECK: !3 = !{!"llvm.loop.parallel_accesses", !1}
+// CHECK-NOT: llvm.loop.vectorize
diff --git a/mlir/test/Target/LLVMIR/openmp-llvm.mlir b/mlir/test/Target/LLVMIR/openmp-llvm.mlir
@@ -820,8 +820,6 @@ llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fi
 }
 // Be sure that llvm.loop.vectorize.enable metadata appears twice
 // CHECK: llvm.loop.parallel_accesses
-// CHECK-NEXT: llvm.loop.vectorize.enable
-// CHECK: llvm.loop.vectorize.enable
 
 // -----
 
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction.mlir b/mlir/test/Target/LLVMIR/openmp-reduction.mlir
@@ -637,9 +637,12 @@ llvm.func @wsloop_simd_reduction(%lb : i64, %ub : i64, %step : i64) {
 // Outlined function.
 // CHECK: define internal void @[[OUTLINED]]
 
-// Private reduction variable and its initialization.
+// reduction variable in wsloop
 // CHECK: %[[PRIVATE:.+]] = alloca float
+// reduction variable in simd
+// CHECK: %[[PRIVATE2:.+]] = alloca float
 // CHECK: store float 0.000000e+00, ptr %[[PRIVATE]]
+// CHECK: store float 0.000000e+00, ptr %[[PRIVATE2]]
 
 // Call to the reduction function.
 // CHECK: call i32 @__kmpc_reduce
@@ -659,9 +662,9 @@ llvm.func @wsloop_simd_reduction(%lb : i64, %ub : i64, %step : i64) {
 
 // Update of the private variable using the reduction region
 // (the body block currently comes after all the other blocks).
-// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE]]
+// CHECK: %[[PARTIAL:.+]] = load float, ptr %[[PRIVATE2]]
 // CHECK: %[[UPDATED:.+]] = fadd float 2.000000e+00, %[[PARTIAL]]
-// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE]]
+// CHECK: store float %[[UPDATED]], ptr %[[PRIVATE2]]
 
 // Reduction function.
 // CHECK: define internal void @[[REDFUNC]]
diff --git a/mlir/test/Target/LLVMIR/openmp-todo.mlir b/mlir/test/Target/LLVMIR/openmp-todo.mlir
@@ -489,43 +489,3 @@ llvm.func @wsloop_order(%lb : i32, %ub : i32, %step : i32) {
   }
   llvm.return
 }
-
-// -----
-
-llvm.func @do_simd_if(%1 : !llvm.ptr, %5 : i32, %4 : i32, %6 : i1) {
-  omp.wsloop {
-    // expected-warning@below {{simd information on composite construct discarded}}
-    omp.simd if(%6) {
-      omp.loop_nest (%arg0) : i32 = (%5) to (%4) inclusive step (%5) {
-        llvm.store %arg0, %1 : i32, !llvm.ptr
-        omp.yield
-      }
-    } {omp.composite}
-  } {omp.composite}
-  llvm.return
-}
-
-// -----
-
-omp.declare_reduction @add_reduction_i32 : i32 init {
-^bb0(%arg0: i32):
-  %0 = llvm.mlir.constant(0 : i32) : i32
-  omp.yield(%0 : i32)
-} combiner {
-^bb0(%arg0: i32, %arg1: i32):
-  %0 = llvm.add %arg0, %arg1 : i32
-  omp.yield(%0 : i32)
-}
-llvm.func @do_simd_reduction(%1 : !llvm.ptr, %3 : !llvm.ptr, %6 : i32, %7 : i32) {
-  omp.wsloop reduction(@add_reduction_i32 %3 -> %arg0 : !llvm.ptr) {
-    // expected-warning@below {{simd information on composite construct discarded}}
-    omp.simd reduction(@add_reduction_i32 %arg0 -> %arg1 : !llvm.ptr) {
-      omp.loop_nest (%arg2) : i32 = (%7) to (%6) inclusive step (%7) {
-        llvm.store %arg2, %1 : i32, !llvm.ptr
-        %12 = llvm.load %arg1 : !llvm.ptr -> i32
-        omp.yield
-      }
-    } {omp.composite}
-  } {omp.composite}
-  llvm.return
-}

Original file line number	Diff line number	Diff line change
`@@ -820,8 +820,6 @@ llvm.func @simd_if(%arg0: !llvm.ptr {fir.bindc_name = "n"}, %arg1: !llvm.ptr {fi`
`820`	`820`	`}`
`821`	`821`	`// Be sure that llvm.loop.vectorize.enable metadata appears twice`
`822`	`822`	`// CHECK: llvm.loop.parallel_accesses`
`823`		`-// CHECK-NEXT: llvm.loop.vectorize.enable`
`824`		`-// CHECK: llvm.loop.vectorize.enable`
`825`	`823`
`826`	`824`	`// -----`
`827`	`825`