[IA][RISCV] Add support for vp.load/vp.store with shufflevector #135445
@llvm/pr-subscribers-llvm-ir @llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

Teach InterleavedAccessPass to recognize vp.load + shufflevector and shufflevector + vp.store, though this patch only adds the RISC-V support to actually lower this pattern. The vp.load/vp.store in this pattern are required to have a constant mask and a constant EVL. I really don't want to add yet another TLI hook on top of the six we already have, which is why I reuse the existing lowerInterleavedLoad/lowerInterleavedStore hooks and widen their load/store parameter from LoadInst*/StoreInst* to Instruction*. I'm also planning to consolidate and generalize InterleavedAccessPass to support any combination of load/vp.load/masked.load with shufflevector/vector.deinterleave (and their store operation counterparts). An illustrative IR sketch of the targeted pattern follows the diff excerpt below.

Patch is 60.54 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/135445.diff

11 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4f2f202f94841..5407bf8b2ba13 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3157,11 +3157,11 @@ class TargetLoweringBase {
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
- /// \p LI is the vector load instruction.
+ /// \p LoadOp is a vector load or vp.load instruction.
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedLoad(LoadInst *LI,
+ virtual bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
@@ -3171,10 +3171,11 @@ class TargetLoweringBase {
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
- /// \p SI is the vector store instruction.
+ /// \p StoreOp is a vector store or vp.store instruction.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ virtual bool lowerInterleavedStore(Instruction *StoreOp,
+ ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 9e47510e9cd1a..83bde96cc725a 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -45,6 +45,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -100,11 +101,11 @@ class InterleavedAccessImpl {
unsigned MaxFactor = 0u;
/// Transform an interleaved load into target specific intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform an interleaved store into target specific intrinsics.
- bool lowerInterleavedStore(StoreInst *SI,
+ bool lowerInterleavedStore(Instruction *StoreOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +132,7 @@ class InterleavedAccessImpl {
/// made.
bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
- LoadInst *LI);
+ Instruction *LI);
};
class InterleavedAccess : public FunctionPass {
@@ -250,10 +251,23 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
}
bool InterleavedAccessImpl::lowerInterleavedLoad(
- LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+ Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ if (isa<ScalableVectorType>(LoadOp->getType()))
return false;
+ if (auto *LI = dyn_cast<LoadInst>(LoadOp)) {
+ if (!LI->isSimple())
+ return false;
+ } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPLoad->getArgOperand(1)) ||
+ !isa<ConstantInt>(VPLoad->getArgOperand(2)))
+ return false;
+ } else {
+ llvm_unreachable("unsupported load operation");
+ }
+
// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions or binary operators, we save
// them to later check if they can be modified to extract from one of the
@@ -265,7 +279,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// binop are the same load.
SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
- for (auto *User : LI->users()) {
+ for (auto *User : LoadOp->users()) {
auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
@@ -294,13 +308,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
unsigned Factor, Index;
unsigned NumLoadElements =
- cast<FixedVectorType>(LI->getType())->getNumElements();
+ cast<FixedVectorType>(LoadOp->getType())->getNumElements();
auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
NumLoadElements))
return false;
+ // If this is a vp.load, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumLoadElements);
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ auto *Mask = cast<ConstantVector>(VPLoad->getArgOperand(1));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumLoadElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumLoadElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
@@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LoadOp)
Indices.push_back(Index);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LoadOp)
Indices.push_back(Index);
}
@@ -339,25 +371,61 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
return false;
bool BinOpShuffleChanged =
- replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
+ replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);
+
+ // Check if we extract only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (MaskedIndices.test(unsigned(Idx)))
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through "
+ << "shufflevector\n");
+ return false;
+ }
+ }
+ // Check if we extract only the elements within evl.
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (unsigned(Idx) >= EVL)
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(
+ dbgs() << "IA: trying to extract an element out of EVL range\n");
+ return false;
+ }
+ }
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
- if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
+ if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) {
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
}
DeadInsts.insert_range(Shuffles);
- DeadInsts.insert(LI);
+ DeadInsts.insert(LoadOp);
return true;
}
bool InterleavedAccessImpl::replaceBinOpShuffles(
ArrayRef<ShuffleVectorInst *> BinOpShuffles,
- SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *LoadOp) {
for (auto *SVI : BinOpShuffles) {
BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
Type *BIOp0Ty = BI->getOperand(0)->getType();
@@ -380,9 +448,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
<< "\n With : " << *NewSVI1 << "\n And : "
<< *NewSVI2 << "\n And : " << *NewBI << "\n");
RecursivelyDeleteTriviallyDeadInstructions(SVI);
- if (NewSVI1->getOperand(0) == LI)
+ if (NewSVI1->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI1);
- if (NewSVI2->getOperand(0) == LI)
+ if (NewSVI2->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI2);
}
@@ -454,27 +522,79 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
}
bool InterleavedAccessImpl::lowerInterleavedStore(
- StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!SI->isSimple())
- return false;
+ Instruction *StoreOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ Value *StoredValue;
+ if (auto *SI = dyn_cast<StoreInst>(StoreOp)) {
+ if (!SI->isSimple())
+ return false;
+ StoredValue = SI->getValueOperand();
+ } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPStore->getArgOperand(2)) ||
+ !isa<ConstantInt>(VPStore->getArgOperand(3)))
+ return false;
+ StoredValue = VPStore->getArgOperand(0);
+ } else {
+ llvm_unreachable("unsupported store operation");
+ }
- auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
+ unsigned NumStoredElements =
+ cast<FixedVectorType>(SVI->getType())->getNumElements();
+ // If this is a vp.store, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumStoredElements);
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ auto *Mask = cast<ConstantVector>(VPStore->getArgOperand(2));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumStoredElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumStoredElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+ // Check if we store only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(SVI->getShuffleMask(), [&](int Idx) {
+ return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n");
+ return false;
+ }
+ }
+ // Check if we store only the elements within evl.
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
+ if (any_of(SVI->getShuffleMask(),
+ [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");
// Try to create target specific intrinsics to replace the store and shuffle.
- if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+ if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor))
return false;
// Already have a new target specific interleaved store. Erase the old store.
- DeadInsts.insert(SI);
+ DeadInsts.insert(StoreOp);
DeadInsts.insert(SVI);
return true;
}
@@ -766,12 +886,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
SmallSetVector<Instruction *, 32> DeadInsts;
bool Changed = false;
+ using namespace PatternMatch;
for (auto &I : instructions(F)) {
- if (auto *LI = dyn_cast<LoadInst>(&I))
- Changed |= lowerInterleavedLoad(LI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Load(m_Value()),
+ m_Intrinsic<Intrinsic::vp_load>())))
+ Changed |= lowerInterleavedLoad(&I, DeadInsts);
- if (auto *SI = dyn_cast<StoreInst>(&I))
- Changed |= lowerInterleavedStore(SI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+ m_Intrinsic<Intrinsic::vp_store>())))
+ Changed |= lowerInterleavedStore(&I, DeadInsts);
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e366d7cb54490..d74cc3161684d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17184,6 +17184,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17359,13 +17363,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..34446abb1474c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2290ac2728c6d..64d12a0eb1d9b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21616,6 +21616,10 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
@@ -21750,12 +21754,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..635a6cd226936 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -673,11 +673,11 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..9558783963500 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23376,19 +23376,36 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<Sh...
[truncated]
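To make the truncated diff easier to follow, here is a minimal IR sketch of the pattern the pass now recognizes and hands to the RISC-V hooks. This is an illustration only: the function and value names (@sketch, %p, %q, %a, %b), the element count, and the factor-2 masks are assumptions, not taken from the patch or its tests.

define void @sketch(ptr %p, ptr %q, <4 x i32> %a, <4 x i32> %b) {
  ; Load side: a vp.load with a constant all-true mask and a constant EVL,
  ; feeding factor-2 de-interleaving shufflevectors. With this patch, the RISC-V
  ; backend is expected to lower this to a fixed-length segmented load (the
  ; vlseg intrinsics referenced via FixedVlsegIntrIds in RISCVISelLowering.cpp).
  %v = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
  %even = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %odd = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; (%even and %odd would feed further uses in real code.)

  ; Store side: a re-interleaving shufflevector whose single use is a vp.store,
  ; again with a constant all-true mask and a constant EVL.
  %ilv = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  call void @llvm.vp.store.v8i32.p0(<8 x i32> %ilv, ptr %q, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i32 8)
  ret void
}

declare <8 x i32> @llvm.vp.load.v8i32.p0(ptr, <8 x i1>, i32)
declare void @llvm.vp.store.v8i32.p0(<8 x i32>, ptr, <8 x i1>, i32)

Per the checks added in InterleavedAccessPass.cpp above, the transform bails out if any shuffle index reads a lane that the constant mask disables or that lies at or beyond the constant EVL, which is why the sketch uses an all-true mask and an EVL equal to the vector length.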
|
@llvm/pr-subscribers-backend-x86 Author: Min-Yih Hsu (mshockwave) ChangesTeach InterleavedAccessPass to recognize vp.load + shufflevector and shufflevector + vp.store. Though this patch only adds RISC-V support to actually lower this pattern. The vp.load/vp.store in this pattern require constant mask and EVL. I really don't want to add yet another new TLI hook on top of the six we already have, and that's why I reuse the existing Also, that I'm planning to consolidate and generalize InterleavedAccessPass to support any combinations of load/vp.load/masked.load with shufflevector/vector.deinterleave (and their store operation counterparts). Patch is 60.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135445.diff 11 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4f2f202f94841..5407bf8b2ba13 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3157,11 +3157,11 @@ class TargetLoweringBase {
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
- /// \p LI is the vector load instruction.
+ /// \p LoadOp is a vector load or vp.load instruction.
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedLoad(LoadInst *LI,
+ virtual bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
@@ -3171,10 +3171,11 @@ class TargetLoweringBase {
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
- /// \p SI is the vector store instruction.
+ /// \p StoreOp is a vector store or vp.store instruction.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ virtual bool lowerInterleavedStore(Instruction *StoreOp,
+ ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 9e47510e9cd1a..83bde96cc725a 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -45,6 +45,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -100,11 +101,11 @@ class InterleavedAccessImpl {
unsigned MaxFactor = 0u;
/// Transform an interleaved load into target specific intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform an interleaved store into target specific intrinsics.
- bool lowerInterleavedStore(StoreInst *SI,
+ bool lowerInterleavedStore(Instruction *StoreOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +132,7 @@ class InterleavedAccessImpl {
/// made.
bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
- LoadInst *LI);
+ Instruction *LI);
};
class InterleavedAccess : public FunctionPass {
@@ -250,10 +251,23 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
}
bool InterleavedAccessImpl::lowerInterleavedLoad(
- LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+ Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ if (isa<ScalableVectorType>(LoadOp->getType()))
return false;
+ if (auto *LI = dyn_cast<LoadInst>(LoadOp)) {
+ if (!LI->isSimple())
+ return false;
+ } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPLoad->getArgOperand(1)) ||
+ !isa<ConstantInt>(VPLoad->getArgOperand(2)))
+ return false;
+ } else {
+ llvm_unreachable("unsupported load operation");
+ }
+
// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions or binary operators, we save
// them to later check if they can be modified to extract from one of the
@@ -265,7 +279,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// binop are the same load.
SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
- for (auto *User : LI->users()) {
+ for (auto *User : LoadOp->users()) {
auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
@@ -294,13 +308,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
unsigned Factor, Index;
unsigned NumLoadElements =
- cast<FixedVectorType>(LI->getType())->getNumElements();
+ cast<FixedVectorType>(LoadOp->getType())->getNumElements();
auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
NumLoadElements))
return false;
+ // If this is a vp.load, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumLoadElements);
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ auto *Mask = cast<ConstantVector>(VPLoad->getArgOperand(1));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumLoadElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumLoadElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
@@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LoadOp)
Indices.push_back(Index);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LoadOp)
Indices.push_back(Index);
}
@@ -339,25 +371,61 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
return false;
bool BinOpShuffleChanged =
- replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
+ replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);
+
+ // Check if we extract only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (MaskedIndices.test(unsigned(Idx)))
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through "
+ << "shufflevector\n");
+ return false;
+ }
+ }
+ // Check if we extract only the elements within evl.
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (unsigned(Idx) >= EVL)
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(
+ dbgs() << "IA: trying to extract an element out of EVL range\n");
+ return false;
+ }
+ }
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
- if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
+ if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) {
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
}
DeadInsts.insert_range(Shuffles);
- DeadInsts.insert(LI);
+ DeadInsts.insert(LoadOp);
return true;
}
bool InterleavedAccessImpl::replaceBinOpShuffles(
ArrayRef<ShuffleVectorInst *> BinOpShuffles,
- SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *LoadOp) {
for (auto *SVI : BinOpShuffles) {
BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
Type *BIOp0Ty = BI->getOperand(0)->getType();
@@ -380,9 +448,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
<< "\n With : " << *NewSVI1 << "\n And : "
<< *NewSVI2 << "\n And : " << *NewBI << "\n");
RecursivelyDeleteTriviallyDeadInstructions(SVI);
- if (NewSVI1->getOperand(0) == LI)
+ if (NewSVI1->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI1);
- if (NewSVI2->getOperand(0) == LI)
+ if (NewSVI2->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI2);
}
@@ -454,27 +522,79 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
}
bool InterleavedAccessImpl::lowerInterleavedStore(
- StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!SI->isSimple())
- return false;
+ Instruction *StoreOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ Value *StoredValue;
+ if (auto *SI = dyn_cast<StoreInst>(StoreOp)) {
+ if (!SI->isSimple())
+ return false;
+ StoredValue = SI->getValueOperand();
+ } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPStore->getArgOperand(2)) ||
+ !isa<ConstantInt>(VPStore->getArgOperand(3)))
+ return false;
+ StoredValue = VPStore->getArgOperand(0);
+ } else {
+ llvm_unreachable("unsupported store operation");
+ }
- auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
+ unsigned NumStoredElements =
+ cast<FixedVectorType>(SVI->getType())->getNumElements();
+ // If this is a vp.store, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumStoredElements);
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ auto *Mask = cast<ConstantVector>(VPStore->getArgOperand(2));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumStoredElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumStoredElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+ // Check if we store only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(SVI->getShuffleMask(), [&](int Idx) {
+ return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n");
+ return false;
+ }
+ }
+ // Check if we store only the elements within evl.
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
+ if (any_of(SVI->getShuffleMask(),
+ [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");
// Try to create target specific intrinsics to replace the store and shuffle.
- if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+ if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor))
return false;
// Already have a new target specific interleaved store. Erase the old store.
- DeadInsts.insert(SI);
+ DeadInsts.insert(StoreOp);
DeadInsts.insert(SVI);
return true;
}
@@ -766,12 +886,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
SmallSetVector<Instruction *, 32> DeadInsts;
bool Changed = false;
+ using namespace PatternMatch;
for (auto &I : instructions(F)) {
- if (auto *LI = dyn_cast<LoadInst>(&I))
- Changed |= lowerInterleavedLoad(LI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Load(m_Value()),
+ m_Intrinsic<Intrinsic::vp_load>())))
+ Changed |= lowerInterleavedLoad(&I, DeadInsts);
- if (auto *SI = dyn_cast<StoreInst>(&I))
- Changed |= lowerInterleavedStore(SI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+ m_Intrinsic<Intrinsic::vp_store>())))
+ Changed |= lowerInterleavedStore(&I, DeadInsts);
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e366d7cb54490..d74cc3161684d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17184,6 +17184,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17359,13 +17363,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..34446abb1474c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2290ac2728c6d..64d12a0eb1d9b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21616,6 +21616,10 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
@@ -21750,12 +21754,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..635a6cd226936 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -673,11 +673,11 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..9558783963500 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23376,19 +23376,36 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<Sh...
[truncated]
|
@llvm/pr-subscribers-backend-aarch64 Author: Min-Yih Hsu (mshockwave) ChangesTeach InterleavedAccessPass to recognize vp.load + shufflevector and shufflevector + vp.store. Though this patch only adds RISC-V support to actually lower this pattern. The vp.load/vp.store in this pattern require constant mask and EVL. I really don't want to add yet another new TLI hook on top of the six we already have, and that's why I reuse the existing Also, that I'm planning to consolidate and generalize InterleavedAccessPass to support any combinations of load/vp.load/masked.load with shufflevector/vector.deinterleave (and their store operation counterparts). Patch is 60.54 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/135445.diff 11 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4f2f202f94841..5407bf8b2ba13 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3157,11 +3157,11 @@ class TargetLoweringBase {
/// Lower an interleaved load to target specific intrinsics. Return
/// true on success.
///
- /// \p LI is the vector load instruction.
+ /// \p LoadOp is a vector load or vp.load instruction.
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedLoad(LoadInst *LI,
+ virtual bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
@@ -3171,10 +3171,11 @@ class TargetLoweringBase {
/// Lower an interleaved store to target specific intrinsics. Return
/// true on success.
///
- /// \p SI is the vector store instruction.
+ /// \p StoreOp is a vector store or vp.store instruction.
/// \p SVI is the shufflevector to RE-interleave the stored vector.
/// \p Factor is the interleave factor.
- virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ virtual bool lowerInterleavedStore(Instruction *StoreOp,
+ ShuffleVectorInst *SVI,
unsigned Factor) const {
return false;
}
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 9e47510e9cd1a..83bde96cc725a 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -45,6 +45,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
@@ -100,11 +101,11 @@ class InterleavedAccessImpl {
unsigned MaxFactor = 0u;
/// Transform an interleaved load into target specific intrinsics.
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform an interleaved store into target specific intrinsics.
- bool lowerInterleavedStore(StoreInst *SI,
+ bool lowerInterleavedStore(Instruction *StoreOp,
SmallSetVector<Instruction *, 32> &DeadInsts);
/// Transform a load and a deinterleave intrinsic into target specific
@@ -131,7 +132,7 @@ class InterleavedAccessImpl {
/// made.
bool replaceBinOpShuffles(ArrayRef<ShuffleVectorInst *> BinOpShuffles,
SmallVectorImpl<ShuffleVectorInst *> &Shuffles,
- LoadInst *LI);
+ Instruction *LI);
};
class InterleavedAccess : public FunctionPass {
@@ -250,10 +251,23 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
}
bool InterleavedAccessImpl::lowerInterleavedLoad(
- LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
+ Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ if (isa<ScalableVectorType>(LoadOp->getType()))
return false;
+ if (auto *LI = dyn_cast<LoadInst>(LoadOp)) {
+ if (!LI->isSimple())
+ return false;
+ } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPLoad->getArgOperand(1)) ||
+ !isa<ConstantInt>(VPLoad->getArgOperand(2)))
+ return false;
+ } else {
+ llvm_unreachable("unsupported load operation");
+ }
+
// Check if all users of this load are shufflevectors. If we encounter any
// users that are extractelement instructions or binary operators, we save
// them to later check if they can be modified to extract from one of the
@@ -265,7 +279,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
// binop are the same load.
SmallSetVector<ShuffleVectorInst *, 4> BinOpShuffles;
- for (auto *User : LI->users()) {
+ for (auto *User : LoadOp->users()) {
auto *Extract = dyn_cast<ExtractElementInst>(User);
if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
Extracts.push_back(Extract);
@@ -294,13 +308,31 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
unsigned Factor, Index;
unsigned NumLoadElements =
- cast<FixedVectorType>(LI->getType())->getNumElements();
+ cast<FixedVectorType>(LoadOp->getType())->getNumElements();
auto *FirstSVI = Shuffles.size() > 0 ? Shuffles[0] : BinOpShuffles[0];
// Check if the first shufflevector is DE-interleave shuffle.
if (!isDeInterleaveMask(FirstSVI->getShuffleMask(), Factor, Index, MaxFactor,
NumLoadElements))
return false;
+ // If this is a vp.load, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumLoadElements);
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ auto *Mask = cast<ConstantVector>(VPLoad->getArgOperand(1));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumLoadElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumLoadElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Holds the corresponding index for each DE-interleave shuffle.
SmallVector<unsigned, 4> Indices;
@@ -327,9 +359,9 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
assert(Shuffle->getShuffleMask().size() <= NumLoadElements);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(0) == LoadOp)
Indices.push_back(Index);
- if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LI)
+ if (cast<Instruction>(Shuffle->getOperand(0))->getOperand(1) == LoadOp)
Indices.push_back(Index);
}
@@ -339,25 +371,61 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
return false;
bool BinOpShuffleChanged =
- replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LI);
+ replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);
+
+ // Check if we extract only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (MaskedIndices.test(unsigned(Idx)))
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through "
+ << "shufflevector\n");
+ return false;
+ }
+ }
+ // Check if we extract only the elements within evl.
+ if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
+ if (any_of(Shuffles, [&](const auto *Shuffle) {
+ ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
+ for (int Idx : ShuffleMask) {
+ if (Idx < 0)
+ continue;
+ if (unsigned(Idx) >= EVL)
+ return true;
+ }
+ return false;
+ })) {
+ LLVM_DEBUG(
+ dbgs() << "IA: trying to extract an element out of EVL range\n");
+ return false;
+ }
+ }
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");
// Try to create target specific intrinsics to replace the load and shuffles.
- if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor)) {
+ if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) {
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;
}
DeadInsts.insert_range(Shuffles);
- DeadInsts.insert(LI);
+ DeadInsts.insert(LoadOp);
return true;
}
bool InterleavedAccessImpl::replaceBinOpShuffles(
ArrayRef<ShuffleVectorInst *> BinOpShuffles,
- SmallVectorImpl<ShuffleVectorInst *> &Shuffles, LoadInst *LI) {
+ SmallVectorImpl<ShuffleVectorInst *> &Shuffles, Instruction *LoadOp) {
for (auto *SVI : BinOpShuffles) {
BinaryOperator *BI = cast<BinaryOperator>(SVI->getOperand(0));
Type *BIOp0Ty = BI->getOperand(0)->getType();
@@ -380,9 +448,9 @@ bool InterleavedAccessImpl::replaceBinOpShuffles(
<< "\n With : " << *NewSVI1 << "\n And : "
<< *NewSVI2 << "\n And : " << *NewBI << "\n");
RecursivelyDeleteTriviallyDeadInstructions(SVI);
- if (NewSVI1->getOperand(0) == LI)
+ if (NewSVI1->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI1);
- if (NewSVI2->getOperand(0) == LI)
+ if (NewSVI2->getOperand(0) == LoadOp)
Shuffles.push_back(NewSVI2);
}
@@ -454,27 +522,79 @@ bool InterleavedAccessImpl::tryReplaceExtracts(
}
bool InterleavedAccessImpl::lowerInterleavedStore(
- StoreInst *SI, SmallSetVector<Instruction *, 32> &DeadInsts) {
- if (!SI->isSimple())
- return false;
+ Instruction *StoreOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
+ Value *StoredValue;
+ if (auto *SI = dyn_cast<StoreInst>(StoreOp)) {
+ if (!SI->isSimple())
+ return false;
+ StoredValue = SI->getValueOperand();
+ } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
+ // Require a constant mask and evl.
+ if (!isa<ConstantVector>(VPStore->getArgOperand(2)) ||
+ !isa<ConstantInt>(VPStore->getArgOperand(3)))
+ return false;
+ StoredValue = VPStore->getArgOperand(0);
+ } else {
+ llvm_unreachable("unsupported store operation");
+ }
- auto *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ auto *SVI = dyn_cast<ShuffleVectorInst>(StoredValue);
if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
+ unsigned NumStoredElements =
+ cast<FixedVectorType>(SVI->getType())->getNumElements();
+ // If this is a vp.store, record its mask (NOT shuffle mask).
+ BitVector MaskedIndices(NumStoredElements);
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ auto *Mask = cast<ConstantVector>(VPStore->getArgOperand(2));
+ assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
+ NumStoredElements);
+ if (auto *Splat = Mask->getSplatValue()) {
+ // All-zeros mask, bail out early.
+ if (Splat->isZeroValue())
+ return false;
+ } else {
+ for (unsigned i = 0U; i < NumStoredElements; ++i) {
+ if (Mask->getAggregateElement(i)->isZeroValue())
+ MaskedIndices.set(i);
+ }
+ }
+ }
+
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
return false;
- LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+ // Check if we store only the unmasked elements.
+ if (MaskedIndices.any()) {
+ if (any_of(SVI->getShuffleMask(), [&](int Idx) {
+ return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
+ })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n");
+ return false;
+ }
+ }
+ // Check if we store only the elements within evl.
+ if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+ uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
+ if (any_of(SVI->getShuffleMask(),
+ [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
+ LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");
// Try to create target specific intrinsics to replace the store and shuffle.
- if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+ if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor))
return false;
// Already have a new target specific interleaved store. Erase the old store.
- DeadInsts.insert(SI);
+ DeadInsts.insert(StoreOp);
DeadInsts.insert(SVI);
return true;
}
@@ -766,12 +886,15 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
SmallSetVector<Instruction *, 32> DeadInsts;
bool Changed = false;
+ using namespace PatternMatch;
for (auto &I : instructions(F)) {
- if (auto *LI = dyn_cast<LoadInst>(&I))
- Changed |= lowerInterleavedLoad(LI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Load(m_Value()),
+ m_Intrinsic<Intrinsic::vp_load>())))
+ Changed |= lowerInterleavedLoad(&I, DeadInsts);
- if (auto *SI = dyn_cast<StoreInst>(&I))
- Changed |= lowerInterleavedStore(SI, DeadInsts);
+ if (match(&I, m_CombineOr(m_Store(m_Value(), m_Value()),
+ m_Intrinsic<Intrinsic::vp_store>())))
+ Changed |= lowerInterleavedStore(&I, DeadInsts);
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
// At present, we only have intrinsics to represent (de)interleaving
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e366d7cb54490..d74cc3161684d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17176,7 +17176,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -17184,6 +17184,10 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
const DataLayout &DL = LI->getDataLayout();
VectorType *VTy = Shuffles[0]->getType();
@@ -17359,13 +17363,17 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 0d51ef2be8631..34446abb1474c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -716,11 +716,11 @@ class AArch64TargetLowering : public TargetLowering {
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool lowerDeinterleaveIntrinsicToLoad(
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index 2290ac2728c6d..64d12a0eb1d9b 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21608,7 +21608,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+ Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
@@ -21616,6 +21616,10 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
+ auto *LI = dyn_cast<LoadInst>(LoadOp);
+ if (!LI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
@@ -21750,12 +21754,16 @@ bool ARMTargetLowering::lowerInterleavedLoad(
/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
-bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
+bool ARMTargetLowering::lowerInterleavedStore(Instruction *StoreOp,
ShuffleVectorInst *SVI,
unsigned Factor) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
+ auto *SI = dyn_cast<StoreInst>(StoreOp);
+ if (!SI)
+ return false;
+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 9fad056edd3f1..635a6cd226936 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -673,11 +673,11 @@ class VectorType;
unsigned getMaxSupportedInterleaveFactor() const override;
- bool lowerInterleavedLoad(LoadInst *LI,
+ bool lowerInterleavedLoad(Instruction *LoadOp,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
- bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
+ bool lowerInterleavedStore(Instruction *StoreOp, ShuffleVectorInst *SVI,
unsigned Factor) const override;
bool shouldInsertFencesForAtomic(const Instruction *I) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index f7d192756fd56..9558783963500 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -23376,19 +23376,36 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
/// %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
- LoadInst *LI, ArrayRef<Sh...
[truncated]
@llvm/pr-subscribers-backend-arm
for (unsigned i = 0U; i < NumLoadElements; ++i) {
  if (Mask->getAggregateElement(i)->isZeroValue())
    MaskedIndices.set(i);
} |
for (unsigned I : seq<unsigned>(NumLoadElements)) {
Why? Is that in the coding standards?
No, of course not, just a recommendation + var name should start with upper case letter.
for (int Idx : ShuffleMask) {
  if (Idx < 0)
    continue;
  if (MaskedIndices.test(unsigned(Idx)))
if (MaskedIndices.test(Idx))
for (int Idx : ShuffleMask) {
  if (Idx < 0)
    continue;
  if (unsigned(Idx) >= EVL)
if (Idx >= EVL)
}
// Check if we extract only the elements within evl.
if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
  uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
int64_t EVL =
if (Splat->isZeroValue())
  return false;
} else {
  for (unsigned i = 0U; i < NumStoredElements; ++i) {
for (unsigned I : seq<unsigned>(NumStoredElements)) {
// Check if we store only the unmasked elements.
if (MaskedIndices.any()) {
  if (any_of(SVI->getShuffleMask(), [&](int Idx) {
        return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
return Idx >= 0 && MaskedIndices.test(Idx);
}
// Check if we store only the elements within evl.
if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
  uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
int64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
  uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
  if (any_of(SVI->getShuffleMask(),
             [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
[&](int Idx) { return Idx >= 0 && Idx >= EVL; })) {
I've been looking into some potential correctness issues of this patch raised by @topperc previously. I thought it might be a good idea to write down some of the discussion here. First, the motivation of this patch is that SLP is planning to use vp.load. Currently it will generate something like

%interleaved.vec = tail call <9 x i32> @llvm.vp.load.v9i32.p0(ptr %ptr, <9 x i1> <i1 1, i1 1, i1 0, i1 1, i1 1, i1 0, i1 1, i1 1, i1 0>, i32 9)
%v0 = shufflevector <9 x i32> %interleaved.vec, <9 x i32> poison, <3 x i32> <i32 0, i32 3, i32 6>
%v1 = shufflevector <9 x i32> %interleaved.vec, <9 x i32> poison, <3 x i32> <i32 1, i32 4, i32 7>

That is, we only extract the first two segments; all elements in the last segment are masked off in the load. My current lowering would turn this into a segmented load with an all-ones mask, followed by extractions of only the first two segments. The problem here is that if we (segmented) load values with an all-ones mask on all 9 elements, we might trigger a page fault on an address that would have been masked off. Most likely it would be a trailing address, like element 8 in the example above, which was masked off in the original vp.load. A similar situation also happens with this:

%interleaved.vec = tail call <9 x i32> @llvm.vp.load.v9i32.p0(ptr %ptr, <9 x i1> <i1 0, i1 0, i1 0, i1 1, i1 1, i1 1, i1 0, i1 0, i1 0>, i32 9)
%v0 = shufflevector <9 x i32> %interleaved.vec, <9 x i32> poison, <3 x i32> <i32 0, i32 3, i32 6>
%v1 = shufflevector <9 x i32> %interleaved.vec, <9 x i32> poison, <3 x i32> <i32 1, i32 4, i32 7>
%v2 = shufflevector <9 x i32> %interleaved.vec, <9 x i32> poison, <3 x i32> <i32 2, i32 5, i32 8>

where each individual segment has a mask of <i1 0, i1 1, i1 0>. So I think the conclusion is that we should propagate the mask. I already have a rough fix for that in my local tree. That being said, I think propagating the mask only fixes the second example, because in the first example the first two segments have all-ones masks, so we would still end up with a segmented load that touches the masked-off elements of the last segment.
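To make that constraint concrete, here is a small standalone C++ model of the check just described (an illustrative sketch, not the actual LLVM helper; the name getSegmentMask and the plain-vector types are made up for this example): a wide vp.load mask can only be folded into the lane mask of a segmented access when every group of Factor consecutive elements is homogeneous, since one lane-mask bit enables or disables a whole group at once.

#include <cstdio>
#include <optional>
#include <vector>

// Hypothetical helper: fold a wide per-element mask into one bit per group of
// Factor consecutive elements; fail if any group mixes on/off elements.
static std::optional<std::vector<bool>>
getSegmentMask(const std::vector<bool> &WideMask, unsigned Factor) {
  if (Factor == 0 || WideMask.size() % Factor != 0)
    return std::nullopt;
  std::vector<bool> LaneMask;
  for (size_t Lane = 0; Lane < WideMask.size() / Factor; ++Lane) {
    bool First = WideMask[Lane * Factor];
    for (unsigned F = 1; F < Factor; ++F)
      if (WideMask[Lane * Factor + F] != First)
        return std::nullopt; // mixed group: the mask cannot be propagated
    LaneMask.push_back(First);
  }
  return LaneMask;
}

int main() {
  // First example above: <1,1,0, 1,1,0, 1,1,0> -- every group is mixed, so
  // propagating the mask alone cannot keep the access off masked-off memory.
  std::vector<bool> M1 = {1, 1, 0, 1, 1, 0, 1, 1, 0};
  // Second example: <0,0,0, 1,1,1, 0,0,0> -- folds to the lane mask <0,1,0>.
  std::vector<bool> M2 = {0, 0, 0, 1, 1, 1, 0, 0, 0};
  std::printf("M1 foldable: %d\n", getSegmentMask(M1, 3).has_value()); // 0
  std::printf("M2 foldable: %d\n", getSegmentMask(M2, 3).has_value()); // 1
}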
You can test this locally with the following command:

git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/include/llvm/CodeGen/TargetLowering.h llvm/lib/CodeGen/InterleavedAccessPass.cpp llvm/lib/Target/RISCV/RISCVISelLowering.cpp llvm/lib/Target/RISCV/RISCVISelLowering.h llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll llvm/test/CodeGen/RISCV/rvv/vp-vector-interleaved-access.ll

The following files introduce new uses of undef:
Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. In tests, avoid using undef and having tests that trigger undefined behavior. For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.
I made some big changes on this patch:
ConstantInt::getTrue(Mask->getContext()));
} else {
  for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) {
    Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor);
(Idx / Factor) * Factor -> alignDown(Idx, Factor)
Fixed.
if (auto *Splat = Mask->getSplatValue()) {
  // All-zeros mask.
  if (Splat->isZeroValue())
    return false;
Why do we reject an all 0s mask? Hopefully it gets removed by InstCombine earlier, but if not it should still be valid to make a segment load/store with a 0 mask and it would avoid the interleave/deinterleave lowering in SelectionDAG.
I thought it would get removed anyway by one of the IR passes in the codegen pipeline. But with a quick check that doesn't seem to be the case. I'll let this case through in my next update.
It's done.
// Sometimes the number of Shuffles might be less than Factor, we have to
// fill the gaps with null. Also, lowerDeinterleavedVPLoad
// expects them to be sorted.
SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
  ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
Is this an issue that only affects vp loads? Does the non-vp lowerInterleavedLoad need to worry about this too?
Yes, because TLI::lowerDeinterleavedVPLoad uses the number of shuffle / leaf values as the factor; that's why we have to pad ShuffleValues with null until it has a size of Factor. On the contrary, TLI::lowerInterleavedLoad has Factor explicitly passed into it. It also has the Indices argument to specify the order of shufflevector instructions.
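As a toy illustration of the padding just described (plain strings stand in for the shufflevector instructions; all names and values here are invented for the example): with Factor = 3 but only fields 0 and 2 actually extracted, the leaf array handed to the VP hook still needs Factor slots, ordered by field index, with the unused field left empty.

#include <cstdio>
#include <string>
#include <vector>

int main() {
  unsigned Factor = 3;
  // Shuffles[I] de-interleaves field Indices[I]; they can arrive in any
  // order, and not every field necessarily has a user.
  std::vector<std::string> Shuffles = {"%v2", "%v0"};
  std::vector<unsigned> Indices = {2, 0};
  std::vector<std::string> ShuffleValues(Factor); // "" means "no user"
  for (size_t I = 0; I < Shuffles.size(); ++I)
    ShuffleValues[Indices[I]] = Shuffles[I];
  for (unsigned F = 0; F < Factor; ++F)
    std::printf("field %u -> %s\n", F,
                ShuffleValues[F].empty() ? "<none>" : ShuffleValues[F].c_str());
}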
Ah I see, thanks for explaining it. Would it be simpler to add a Factor argument to lowerDeinterleavedVPLoad to bring it in line with lowerInterleavedLoad? Non-blocking, just a suggestion.
EDIT: Actually I think this is probably best left to a separate PR where we clean up the TTI hooks, like you originally suggested.
@@ -3210,15 +3210,15 @@ class TargetLoweringBase {
    return false;
  }

  /// Lower an interleaved load to target specific intrinsics. Return
  /// Lower a deinterleaved load to target specific intrinsics. Return
I think this should still be "interleaved load" to match the terminology used in the header description at the top of InterleavedAccessPass.cpp and the loop vectorizer.
Agree, it's fixed now.
/// true on success.
///
/// \p Load is a vp.load instruction.
/// \p Mask is a mask value
/// \p DeinterleaveRes is a list of deinterleaved results.
virtual bool
lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
                                    ArrayRef<Value *> DeinterleaveRes) const {
lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
I think this should be lowerInterleavedVPLoad to match lowerInterleavedLoad.
Done
bool InterleavedAccessImpl::lowerInterleavedLoad(
    LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
    Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
Can this just be Load? I'm not sure the Op is providing any value.
Fixed.
LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
// Check if the de-interleaved vp.store masks are the same.
if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
empty block?
This is likely caused by the git merge (which started to worry me that there are other places like this). I'll remove it in the next update
It's fixed now
IRBuilder<> Builder(VPStore);
// We need to effectively de-interleave the shufflemask
// because lowerInterleavedVPStore expected individual de-interleaved
expected -> expects?
Fixed.
if (auto *Splat = ConstMask->getSplatValue()) {
  // All-ones or all-zeros mask.
  return ConstantVector::getSplat(LeafValueEC, Splat);
} else if (LeafValueEC.isFixed()) {
No else after return.
Fixed.
// use the same mask. This is done by checking if every group with Factor
// number of elements in the interleaved mask has homogeneous values.
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
  Constant *Ref = ConstMask->getAggregateElement(alignDown(Idx, Factor));
You could avoid the alignDown by using LeafMask. Something like:
Constant *C = ConstMask->getAggregateElement(Idx);
if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
return nullptr;
LeafMask[Idx / Factor] = C;
Good point. It's fixed now.
LGTM, I've left some nits and just some thoughts on possible follow ups, thanks for pushing this through
unsigned Factor;
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
  return false;
Not for this PR, but just noting that the VP and non-VP paths in this function don't really share much code at all. It's probably justified if you want to split these out into two separate functions. lowerInterleavedLoad might need more refactoring though
if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
  unsigned LaneMaskLen = NumStoredElements / Factor;
  Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor,
Value *LaneMask = getMask(VPStore->getArgOperand(2), Factor,
Value *LaneMask = getMask(VPStore->getMaskParam(), Factor,
Fixed.
  ret void
}

define void @vpstore_factor6(ptr %ptr, <2 x i16> %v0, <2 x i16> %v1, <2 x i16> %v2, <2 x i16> %v3, <2 x i16> %v4, <2 x i16> %v5) {
Do we have tests for factor 7 and 8?
good catch, they're added now.
NewShuffles.push_back(Builder.CreateShuffleVector(
    SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
Just noting it would be nice to not have to create the shuffles in case TLI->lowerInterleavedVPStore bails and we end up leaving them around. But I don't have any good suggestions for now
Yeah I believe a similar thing happened in InterleaveAccessImpl::lowerInterleavedLoad: we might create new shufflevectors when canonicalizing the extractions, even if we bail out later in TLI::lowerInterleavedLoad.
} else if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
  assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
  // Require a constant mask.
  if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
if (!isa<ConstantVector>(VPLoad->getMaskParam()))
Fixed.
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
  Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor,
Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor,
Value *LaneMask = getMask(VPLoad->getMaskParam(), Factor,
Fixed.
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
if (auto *VPLoad = dyn_cast<VPIntrinsic>(Load)) {
  Value *LaneMask = getMask(VPLoad->getArgOperand(1), Factor,
                            cast<VectorType>(Shuffles[0]->getType()));
cast<VectorType>(Shuffles[0]->getType()));
VecTy);
Fixed. Though it should be cast<VectorType>(VecTy)
} else if (auto *VPStore = dyn_cast<VPIntrinsic>(Store)) {
  assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
  // Require a constant mask.
  if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
if (!isa<ConstantVector>(VPStore->getMaskParam()))
Fixed.
SDValue Mask =
    IsMasked ? Op.getOperand(3) : getAllOnesMask(ContainerVT, VL, DL, DAG);
MVT MaskVT = Mask.getSimpleValueType();
if (MaskVT.isFixedLengthVector()) { |
Just checking, if we remove the unmasked intrinsics in the follow up PR we won't need this check for MaskVT.isFixedLengthVector(), because we won't have getAllOnesMask anymore?
oh that's a good point -- we always need to convert the mask to a scalable vector (after we remove the unmasked intrinsics). I just found that I didn't remove the check in the follow-up PR, I'll do that shortly.
if (!isLegalInterleavedAccessType(
        VTy, Factor, Alignment,
        Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
  return false;

IRBuilder<> Builder(Load);

Value *WideEVL = Load->getArgOperand(2);
// Conservatively check if EVL is a multiple of factor, otherwise some
// (trailing) elements might be lost after the transformation.
This check seems quite important, should it eventually be moved into InterleavedAccessPass?
Agree, we probably should.
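For reference, the EVL constraint in the snippet quoted above boils down to something like the following simplified standalone model (assumed semantics, not the actual RISC-V lowering code; getSegmentEVL is an invented name): the wide EVL counts interleaved elements, so it only maps onto a per-segment EVL when it is an exact multiple of the factor, otherwise the trailing partial group would be silently dropped.

#include <cstdio>
#include <optional>

// Hypothetical helper mirroring the conservative check discussed above.
static std::optional<unsigned> getSegmentEVL(unsigned WideEVL, unsigned Factor) {
  if (Factor == 0 || WideEVL % Factor != 0)
    return std::nullopt; // bail out, as the lowering conservatively does
  return WideEVL / Factor;
}

int main() {
  std::printf("EVL 12, factor 4 -> %u\n", getSegmentEVL(12, 4).value_or(0));  // 3
  std::printf("EVL 10, factor 4 ok? %d\n", getSegmentEVL(10, 4).has_value()); // 0
}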
LLVM Buildbot has detected a new failure on one of its builders. Full details are available at: https://lab.llvm.org/buildbot/#/builders/18/builds/15644
Here is the relevant piece of the build log for reference:
…#135445) Teach InterleavedAccessPass to recognize vp.load + shufflevector and shufflevector + vp.store. Though this patch only adds RISC-V support to actually lower this pattern. The vp.load/vp.store in this pattern require constant mask.
Teach InterleavedAccessPass to recognize vp.load + shufflevector and shufflevector + vp.store. Though this patch only adds RISC-V support to actually lower this pattern. The vp.load/vp.store in this pattern require constant mask.
I really don't want to add yet another new TLI hook on top of the six we already have, and that's why I reuse the existing TLI::lower(De)interleavedIntrinsicToVPLoad/VPStore, which are further renamed to TLI::lower(De)interleavedVPLoad/VPStore in this patch. Also, I'm planning to consolidate and generalize InterleavedAccessPass to support any combinations of load/vp.load/masked.load with shufflevector/vector.deinterleave (and their store operation counterparts).
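To close with a concrete picture of what "recognizing" one half of this pattern means, here is a rough standalone sketch (simplified from the pass's de-interleave mask check; the helper name and the fixed Factor parameter are invented for the example): a shufflevector de-interleaves field Index of a factor-Factor access when its mask reads lanes Index, Index + Factor, Index + 2 * Factor, and so on.

#include <cstdio>
#include <vector>

// Hypothetical, simplified check: returns true and sets Index if Mask picks
// every Factor-th lane starting at some Index < Factor. Negative entries are
// treated as "don't care", matching poison lanes in a shuffle mask.
static bool isDeInterleaveMaskOfFactor(const std::vector<int> &Mask,
                                       unsigned Factor, unsigned &Index) {
  for (Index = 0; Index < Factor; ++Index) {
    bool Matches = true;
    for (size_t I = 0; I < Mask.size(); ++I)
      if (Mask[I] >= 0 &&
          static_cast<unsigned>(Mask[I]) != Index + I * Factor) {
        Matches = false;
        break;
      }
    if (Matches)
      return true;
  }
  return false;
}

int main() {
  unsigned Index = 0;
  std::vector<int> Mask = {1, 4, 7}; // second field of a factor-3 access
  if (isDeInterleaveMaskOfFactor(Mask, 3, Index))
    std::printf("de-interleaves field %u of a factor-3 access\n", Index); // 1
}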