
[WIP][LV][VPlan] For reverse iterative continuous load and store access, use unit load/store #130032


Open

LiqinWeng wants to merge 1 commit into main from vprevere-convert-to-unitmemory
Conversation

LiqinWeng
Contributor

No description provided.

@LiqinWeng LiqinWeng requested review from fhahn, ayalz and SamTebbs33 and removed request for ayalz March 6, 2025 09:39
@llvmbot
Member

llvmbot commented Mar 6, 2025

@llvm/pr-subscribers-vectorizers

Author: LiqinWeng (LiqinWeng)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/130032.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/LoopVectorize.cpp (+8-1)
  • (modified) llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp (+42-40)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c447fa4843591..0b155c95f0221 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7487,6 +7487,13 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
       // comparing against the legacy cost isn't desirable.
       if (isa<VPPartialReductionRecipe>(&R))
         return true;
+
+      // The VPlan-based cost model may calculate the cost of strided load/store
+      // which can't be modeled in the legacy cost model.
+      if (isa<VPWidenLoadEVLRecipe, VPWidenStoreEVLRecipe>(&R))
+        if (cast<VPWidenMemoryRecipe>(&R)->isReverse())
+          return true;
+
       if (Instruction *UI = GetInstructionForCost(&R))
         SeenInstrs.insert(UI);
     }
@@ -8344,7 +8351,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
     auto *GEP = dyn_cast<GetElementPtrInst>(
         Ptr->getUnderlyingValue()->stripPointerCasts());
     VPSingleDefRecipe *VectorPtr;
-    if (Reverse) {
+    if (Reverse && !CM.foldTailWithEVL()) {
       // When folding the tail, we may compute an address that we don't in the
       // original scalar loop and it may not be inbounds. Drop Inbounds in that
       // case.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index e9f50e88867b2..2fd2cbffec1c9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2640,17 +2640,6 @@ void VPWidenLoadRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
-/// Use all-true mask for reverse rather than actual mask, as it avoids a
-/// dependence w/o affecting the result.
-static Instruction *createReverseEVL(IRBuilderBase &Builder, Value *Operand,
-                                     Value *EVL, const Twine &Name) {
-  VectorType *ValTy = cast<VectorType>(Operand->getType());
-  Value *AllTrueMask =
-      Builder.CreateVectorSplat(ValTy->getElementCount(), Builder.getTrue());
-  return Builder.CreateIntrinsic(ValTy, Intrinsic::experimental_vp_reverse,
-                                 {Operand, AllTrueMask, EVL}, nullptr, Name);
-}
-
 void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   auto *LI = cast<LoadInst>(&Ingredient);
 
@@ -2665,19 +2654,26 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   Value *EVL = State.get(getEVL(), VPLane(0));
   Value *Addr = State.get(getAddr(), !CreateGather);
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
+  else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
 
   if (CreateGather) {
     NewLI =
         Builder.CreateIntrinsic(DataTy, Intrinsic::vp_gather, {Addr, Mask, EVL},
                                 nullptr, "wide.masked.gather");
   } else {
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      // if (EltTy->getScalarSizeInBits() !=
+      // EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //   static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+      auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+    }
     VectorBuilder VBuilder(Builder);
     VBuilder.setEVL(EVL).setMask(Mask);
     NewLI = cast<CallInst>(VBuilder.createVectorInstruction(
@@ -2686,10 +2682,7 @@ void VPWidenLoadEVLRecipe::execute(VPTransformState &State) {
   NewLI->addParamAttr(
       0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
   State.addMetadata(NewLI, LI);
-  Instruction *Res = NewLI;
-  if (isReverse())
-    Res = createReverseEVL(Builder, Res, EVL, "vp.reverse");
-  State.set(this, Res);
+  State.set(this, NewLI);
 }
 
 InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
@@ -2707,14 +2700,13 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
-    return Cost;
+    return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+                                         AS, Ctx.CostKind);
 
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
+                                        getAddr()->getUnderlyingValue(), false,
+                                        Alignment, Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2775,7 +2767,8 @@ void VPWidenStoreRecipe::print(raw_ostream &O, const Twine &Indent,
 
 void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   auto *SI = cast<StoreInst>(&Ingredient);
-
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
   VPValue *StoredValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
@@ -2786,22 +2779,32 @@ void VPWidenStoreEVLRecipe::execute(VPTransformState &State) {
   CallInst *NewSI = nullptr;
   Value *StoredVal = State.get(StoredValue);
   Value *EVL = State.get(getEVL(), VPLane(0));
-  if (isReverse())
-    StoredVal = createReverseEVL(Builder, StoredVal, EVL, "vp.reverse");
   Value *Mask = nullptr;
-  if (VPValue *VPMask = getMask()) {
+  if (VPValue *VPMask = getMask())
     Mask = State.get(VPMask);
-    if (isReverse())
-      Mask = createReverseEVL(Builder, Mask, EVL, "vp.reverse.mask");
-  } else {
+  else
     Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
-  }
+
   Value *Addr = State.get(getAddr(), !CreateScatter);
   if (CreateScatter) {
     NewSI = Builder.CreateIntrinsic(Type::getVoidTy(EVL->getContext()),
                                     Intrinsic::vp_scatter,
                                     {StoredVal, Addr, Mask, EVL});
   } else {
+    if (isReverse()) {
+      auto *EltTy = DataTy->getElementType();
+      // FIXME: we may need not deal with the size, the InstCombine will deal
+      // with the Offset Type if (EltTy->getScalarSizeInBits() !=
+      // EVL->getType()->getScalarSizeInBits())
+      //   EVL = ConstantInt::getSigned(EVL->getType(),
+      //   static_cast<int64_t>(EltTy->getScalarSizeInBits()) / 8);
+      auto *GEP = dyn_cast<GetElementPtrInst>(Addr->stripPointerCasts());
+      // Value *Offset =
+      // Builder.CreateSub(State.Builder.getIntN(EVL->getType()->getScalarSizeInBits(),
+      // 1), EVL);
+      Value *Offset = Builder.CreateSub(State.Builder.getInt32(1), EVL);
+      Addr = Builder.CreateGEP(EltTy, Addr, Offset, "", GEP->isInBounds());
+    }
     VectorBuilder VBuilder(Builder);
     VBuilder.setEVL(EVL).setMask(Mask);
     NewSI = cast<CallInst>(VBuilder.createVectorInstruction(
@@ -2828,14 +2831,13 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF,
       getLoadStoreAlignment(const_cast<Instruction *>(&Ingredient));
   unsigned AS =
       getLoadStoreAddressSpace(const_cast<Instruction *>(&Ingredient));
-  InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost(
-      Ingredient.getOpcode(), Ty, Alignment, AS, Ctx.CostKind);
   if (!Reverse)
-    return Cost;
+    return Ctx.TTI.getMaskedMemoryOpCost(Ingredient.getOpcode(), Ty, Alignment,
+                                         AS, Ctx.CostKind);
 
-  return Cost + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse,
-                                       cast<VectorType>(Ty), {}, Ctx.CostKind,
-                                       0);
+  return Ctx.TTI.getStridedMemoryOpCost(Ingredient.getOpcode(), Ty,
+                                        getAddr()->getUnderlyingValue(), false,
+                                        Alignment, Ctx.CostKind);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
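Taken together, on the non-gather/non-scatter path the patch drops both llvm.experimental.vp.reverse calls (for the mask and for the loaded or stored value) and instead rebases the address by (1 - EVL) so a unit-stride vp.load or vp.store is emitted; the cost model switches accordingly from a masked memory op plus an SK_Reverse shuffle to TTI::getStridedMemoryOpCost. A rough sketch of the IR shapes the old and new code paths produce for a reverse contiguous i32 load at VF = vscale x 4 (value names and types here are illustrative, not taken from the PR):

  ; Old code path: load from the end pointer, reversing the mask and the result.
  %rev.mask = call <vscale x 4 x i1> @llvm.experimental.vp.reverse.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %wide.load = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %end.ptr, <vscale x 4 x i1> %rev.mask, i32 %evl)
  %res = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> %wide.load, <vscale x 4 x i1> splat (i1 true), i32 %evl)

  ; New code path (this WIP patch): rebase the base pointer by (1 - EVL) and
  ; issue a plain unit-stride vp.load with the unreversed mask.
  %offset = sub i32 1, %evl
  %base = getelementptr i32, ptr %addr, i32 %offset
  %res = call <vscale x 4 x i32> @llvm.vp.load.nxv4i32.p0(ptr %base, <vscale x 4 x i1> %mask, i32 %evl)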

@llvmbot
Member

llvmbot commented Mar 6, 2025

@llvm/pr-subscribers-llvm-transforms


@LiqinWeng LiqinWeng changed the title [LV][VPlan] For reverse iterative continuous load and store access, use unit load/store [WIP][LV][VPlan] For reverse iterative continuous load and store access, use unit load/store Mar 6, 2025
@LiqinWeng LiqinWeng requested review from lukel97 and wangpc-pp and removed request for fhahn, SamTebbs33 and lukel97 March 6, 2025 09:46
@LiqinWeng LiqinWeng force-pushed the vprevere-convert-to-unitmemory branch from 7b66981 to 344c2e1 Compare March 6, 2025 10:29