-
Notifications
You must be signed in to change notification settings - Fork 15k
[LV] Support scalable interleave groups for factors 3,5,6 and 7 #141865
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
d7c9f20
90a5528
c65b866
c28663f
b901b2d
203d204
2d98d9a
d6f7e85
a591238
4f40562
4bb7405
181bdc3
abb0bf7
1b1cc2d
b50af17
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4575,6 +4575,13 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( | |
| if (VecTy->isScalableTy() && !ST->hasSVE()) | ||
| return InstructionCost::getInvalid(); | ||
|
|
||
| // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we | ||
| // only have lowering for power-of-2 factors. | ||
| // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in | ||
| // InterleavedAccessPass for ld3/st3 | ||
| if (VecTy->isScalableTy() && !isPowerOf2_32(Factor)) | ||
| return InstructionCost::getInvalid(); | ||
|
Comment on lines
+4590
to
+4591
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'm confused about why factors 5 and 7 seem to be working fine? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I guess you meant why factors of 5 and 7 used to work fine? I think that's because the vectorizer has never asked for interleaving non-power-of-two factors before (until now, ofc) There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think the TODO comment above confused me: why is factor 3 a TODO, even though the test seems to be fine? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think there is a factor 3 test for AArch64 in this patch. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The loop vectorizer never called |
||
|
|
||
| // Vectorization for masked interleaved accesses is only enabled for scalable | ||
| // VF. | ||
| if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3217,6 +3217,62 @@ static Value *createBitOrPointerCast(IRBuilderBase &Builder, Value *V, | |
| return Builder.CreateBitOrPointerCast(CastVal, DstVTy); | ||
| } | ||
|
|
||
| static Intrinsic::ID getInterleaveIntrinsicID(unsigned Factor) { | ||
|
||
| switch (Factor) { | ||
| case 2: | ||
| return Intrinsic::vector_interleave2; | ||
| break; | ||
lukel97 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| case 3: | ||
| return Intrinsic::vector_interleave3; | ||
| break; | ||
| case 4: | ||
| return Intrinsic::vector_interleave4; | ||
| break; | ||
| case 5: | ||
| return Intrinsic::vector_interleave5; | ||
| break; | ||
| case 6: | ||
| return Intrinsic::vector_interleave6; | ||
| break; | ||
| case 7: | ||
| return Intrinsic::vector_interleave7; | ||
| break; | ||
| case 8: | ||
| return Intrinsic::vector_interleave8; | ||
| break; | ||
| default: | ||
| llvm_unreachable("Unexpected factor"); | ||
| } | ||
| } | ||
|
|
||
| static Intrinsic::ID getDeinterleaveIntrinsicID(unsigned Factor) { | ||
| switch (Factor) { | ||
| case 2: | ||
| return Intrinsic::vector_deinterleave2; | ||
| break; | ||
| case 3: | ||
| return Intrinsic::vector_deinterleave3; | ||
| break; | ||
| case 4: | ||
| return Intrinsic::vector_deinterleave4; | ||
| break; | ||
| case 5: | ||
| return Intrinsic::vector_deinterleave5; | ||
| break; | ||
| case 6: | ||
| return Intrinsic::vector_deinterleave6; | ||
| break; | ||
| case 7: | ||
| return Intrinsic::vector_deinterleave7; | ||
| break; | ||
| case 8: | ||
| return Intrinsic::vector_deinterleave8; | ||
| break; | ||
| default: | ||
| llvm_unreachable("Unexpected factor"); | ||
| } | ||
| } | ||
|
|
||
| /// Return a vector containing interleaved elements from multiple | ||
| /// smaller input vectors. | ||
| static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, | ||
|
|
@@ -3233,6 +3289,14 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals, | |
| // Scalable vectors cannot use arbitrary shufflevectors (only splats), so | ||
| // must use intrinsics to interleave. | ||
| if (VecTy->isScalableTy()) { | ||
| if (Factor <= 8) { | ||
| VectorType *InterleaveTy = VectorType::get( | ||
| VecTy->getElementType(), | ||
| VecTy->getElementCount().multiplyCoefficientBy(Factor)); | ||
| return Builder.CreateIntrinsic(InterleaveTy, | ||
| getInterleaveIntrinsicID(Factor), Vals, | ||
| /*FMFSource=*/nullptr, Name); | ||
| } | ||
| assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for " | ||
| "scalable vectors, must be power of 2"); | ||
| SmallVector<Value *> InterleavingValues(Vals); | ||
|
|
@@ -3333,7 +3397,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { | |
| &InterleaveFactor](Value *MaskForGaps) -> Value * { | ||
| if (State.VF.isScalable()) { | ||
| assert(!MaskForGaps && "Interleaved groups with gaps are not supported."); | ||
| assert(isPowerOf2_32(InterleaveFactor) && | ||
| assert((InterleaveFactor <= 8 || isPowerOf2_32(InterleaveFactor)) && | ||
| "Unsupported deinterleave factor for scalable vectors"); | ||
| auto *ResBlockInMask = State.get(BlockInMask); | ||
| SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask); | ||
|
|
@@ -3377,34 +3441,45 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { | |
| ArrayRef<VPValue *> VPDefs = definedValues(); | ||
| const DataLayout &DL = State.CFG.PrevBB->getDataLayout(); | ||
| if (VecTy->isScalableTy()) { | ||
| assert(isPowerOf2_32(InterleaveFactor) && | ||
| "Unsupported deinterleave factor for scalable vectors"); | ||
|
|
||
| // Scalable vectors cannot use arbitrary shufflevectors (only splats), | ||
| // so must use intrinsics to deinterleave. | ||
| SmallVector<Value *> DeinterleavedValues(InterleaveFactor); | ||
| DeinterleavedValues[0] = NewLoad; | ||
| // For the case of InterleaveFactor > 2, we will have to do recursive | ||
| // deinterleaving, because the current available deinterleave intrinsic | ||
| // supports only Factor of 2, otherwise it will bailout after first | ||
| // iteration. | ||
| // When deinterleaving, the number of values will double until we | ||
| // have "InterleaveFactor". | ||
| for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; | ||
| NumVectors *= 2) { | ||
|
|
||
| if (InterleaveFactor <= 8) { | ||
|
||
| Value *Deinterleave = State.Builder.CreateIntrinsic( | ||
| getDeinterleaveIntrinsicID(InterleaveFactor), NewLoad->getType(), | ||
| NewLoad, | ||
| /*FMFSource=*/nullptr, "strided.vec"); | ||
| for (unsigned I = 0; I < InterleaveFactor; I++) | ||
| DeinterleavedValues[I] = | ||
| State.Builder.CreateExtractValue(Deinterleave, I); | ||
| } else { | ||
| assert(isPowerOf2_32(InterleaveFactor) && | ||
| "Unsupported deinterleave factor for scalable vectors"); | ||
| DeinterleavedValues[0] = NewLoad; | ||
| // For the case of InterleaveFactor > 2, we will have to do recursive | ||
lukel97 marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| // deinterleaving, because the current available deinterleave intrinsic | ||
| // supports only Factor of 2, otherwise it will bailout after first | ||
| // iteration. | ||
| // When deinterleaving, the number of values will double until we | ||
| // have "InterleaveFactor". | ||
| // Deinterleave the elements within the vector | ||
| SmallVector<Value *> TempDeinterleavedValues(NumVectors); | ||
| for (unsigned I = 0; I < NumVectors; ++I) { | ||
| auto *DiTy = DeinterleavedValues[I]->getType(); | ||
| TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( | ||
| Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], | ||
| /*FMFSource=*/nullptr, "strided.vec"); | ||
| SmallVector<Value *> TempDeinterleavedValues(InterleaveFactor); | ||
| for (unsigned NumVectors = 1; NumVectors < InterleaveFactor; | ||
| NumVectors *= 2) { | ||
| for (unsigned I = 0; I < NumVectors; ++I) { | ||
| auto *DiTy = DeinterleavedValues[I]->getType(); | ||
| TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic( | ||
| Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I], | ||
| /*FMFSource=*/nullptr, "strided.vec"); | ||
| } | ||
| // Extract the deinterleaved values: | ||
| for (unsigned I = 0; I < 2; ++I) | ||
| for (unsigned J = 0; J < NumVectors; ++J) | ||
| DeinterleavedValues[NumVectors * I + J] = | ||
| State.Builder.CreateExtractValue(TempDeinterleavedValues[J], | ||
| I); | ||
| } | ||
| // Extract the deinterleaved values: | ||
| for (unsigned I = 0; I < 2; ++I) | ||
| for (unsigned J = 0; J < NumVectors; ++J) | ||
| DeinterleavedValues[NumVectors * I + J] = | ||
| State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I); | ||
| } | ||
|
|
||
| #ifndef NDEBUG | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.