-
Notifications
You must be signed in to change notification settings - Fork 14.4k
[LV] Extend FindFirstIV to unsigned case #146386
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Extend FindFirstIV vectorization to the unsigned case by introducing and handling FindFirstIVUMin.
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Ramkumar Ramachandra (artagnon) ChangesExtend FindFirstIV vectorization to the unsigned case by introducing and handling FindFirstIVUMin. Patch is 23.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146386.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 3b92cbff28de4..b985292ccee40 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -57,6 +57,9 @@ enum class RecurKind {
FindFirstIVSMin, /// FindFirst reduction with select(icmp(),x,y) where one of
///< (x,y) is a decreasing loop induction, and both x and y
///< are integer type, producing a SMin reduction.
+ FindFirstIVUMin, /// FindFirst reduction with select(icmp(),x,y) where one of
+ ///< (x,y) is a decreasing loop induction, and both x and y
+ ///< are integer type, producing a UMin reduction.
FindLastIVSMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a SMax reduction.
@@ -265,7 +268,8 @@ class RecurrenceDescriptor {
/// Returns true if the recurrence kind is of the form
/// select(cmp(),x,y) where one of (x,y) is decreasing loop induction.
static bool isFindFirstIVRecurrenceKind(RecurKind Kind) {
- return Kind == RecurKind::FindFirstIVSMin;
+ return Kind == RecurKind::FindFirstIVSMin ||
+ Kind == RecurKind::FindFirstIVUMin;
}
/// Returns true if the recurrence kind is of the form
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b275b1064cef2..ef9f5fc0d1afd 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -51,6 +51,7 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::UMin:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
return true;
@@ -741,10 +742,9 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
: APInt::getMinValue(NumBits);
ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
} else {
- assert(IsSigned && "Only FindFirstIV with SMax is supported currently");
- ValidRange =
- ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits),
- APInt::getSignedMaxValue(NumBits) - 1);
+ APInt Sentinel = IsSigned ? APInt::getSignedMaxValue(NumBits)
+ : APInt::getMaxValue(NumBits);
+ ValidRange = ConstantRange::getNonEmpty(Sentinel, Sentinel - 1);
}
LLVM_DEBUG(dbgs() << "LV: "
@@ -770,6 +770,8 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
if (CheckRange(true))
return RecurKind::FindFirstIVSMin;
+ if (CheckRange(false))
+ return RecurKind::FindFirstIVUMin;
return std::nullopt;
};
@@ -1183,6 +1185,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
return Instruction::Mul;
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::Or:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0941bf61953f1..2ee7e8815c72c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23181,6 +23181,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
@@ -23317,6 +23318,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
@@ -23418,6 +23420,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 472b5700bd358..7ff22def3a409 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -743,14 +743,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *ReducedPartRdx = State.get(getOperand(3));
RecurKind MinMaxKind;
bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
- if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
- } else {
- assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) &&
- "Kind must either be FindLastIV or FindFirstIV");
- assert(IsSigned && "Only FindFirstIV with SMax is currently supported");
- MinMaxKind = RecurKind::SMin;
- }
+ else
+ MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
for (unsigned Part = 1; Part < UF; ++Part)
ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
State.get(getOperand(3 + Part)));
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index d224da795997d..2186ebb067224 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -941,6 +941,205 @@ exit: ; preds = %loop
ret i16 %spec.select.lcssa
}
+; The signed sentinel value for decreasing-IV vectorization is LONG_MAX, and since
+; the IV hits this value with smin vectorization, it needs to be vectorized with a
+; an unsigned sentinel and umin instead.
+define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC1VF4-SAME: ptr [[A:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*]]:
+; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 9223372036854775807, i64 9223372036854775806, i64 9223372036854775805, i64 9223372036854775804>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC1VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC1VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
+; IC1VF4-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC1VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]])
+; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -1
+; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331
+; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC4VF4-SAME: ptr [[A:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*]]:
+; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 9223372036854775807, i64 9223372036854775806, i64 9223372036854775805, i64 9223372036854775804>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC4VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC4VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3
+; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -4
+; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -3
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -8
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -12
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[REVERSE5]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE7]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[REVERSE9]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[TMP14]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC4VF4-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; IC4VF4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP15]])
+; IC4VF4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX10]], <4 x i64> [[TMP16]])
+; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX11]])
+; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], -1
+; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331
+; IC4VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC4VF1-SAME: ptr [[A:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF1: [[VECTOR_PH]]:
+; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF1: [[VECTOR_BODY]]:
+; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -2
+; IC4VF1-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -3
+; IC4VF1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC4VF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; IC4VF1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; IC4VF1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; IC4VF1-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP3]], align 8
+; IC4VF1-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8
+; IC4VF1-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8
+; IC4VF1-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; IC4VF1-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[TMP7]], 3
+; IC4VF1-NEXT: [[TMP12:%.*]] = icmp sgt i64 [[TMP8]], 3
+; IC4VF1-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP9]], 3
+; IC4VF1-NEXT: [[TMP14:%.*]] = icmp sgt i64 [[TMP10]], 3
+; IC4VF1-NEXT: [[TMP15]] = select i1 [[TMP11]], i64 [[OFFSET_IDX]], i64 [[VEC_PHI]]
+; IC4VF1-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI1]]
+; IC4VF1-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI2]]
+; IC4VF1-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI3]]
+; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC4VF1-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF1: [[MIDDLE_BLOCK]]:
+; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP15]], i64 [[TMP16]])
+; IC4VF1-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX]], i64 [[TMP17]])
+; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP18]])
+; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1
+; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331
+; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1: [[SCALAR_PH]]:
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF1-NEXT: ret i64 [[SPEC_SELECT_LCS...
[truncated]
|
@llvm/pr-subscribers-llvm-analysis Author: Ramkumar Ramachandra (artagnon) ChangesExtend FindFirstIV vectorization to the unsigned case by introducing and handling FindFirstIVUMin. Patch is 23.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/146386.diff 5 Files Affected:
diff --git a/llvm/include/llvm/Analysis/IVDescriptors.h b/llvm/include/llvm/Analysis/IVDescriptors.h
index 3b92cbff28de4..b985292ccee40 100644
--- a/llvm/include/llvm/Analysis/IVDescriptors.h
+++ b/llvm/include/llvm/Analysis/IVDescriptors.h
@@ -57,6 +57,9 @@ enum class RecurKind {
FindFirstIVSMin, /// FindFirst reduction with select(icmp(),x,y) where one of
///< (x,y) is a decreasing loop induction, and both x and y
///< are integer type, producing a SMin reduction.
+ FindFirstIVUMin, /// FindFirst reduction with select(icmp(),x,y) where one of
+ ///< (x,y) is a decreasing loop induction, and both x and y
+ ///< are integer type, producing a UMin reduction.
FindLastIVSMax, ///< FindLast reduction with select(cmp(),x,y) where one of
///< (x,y) is increasing loop induction, and both x and y
///< are integer type, producing a SMax reduction.
@@ -265,7 +268,8 @@ class RecurrenceDescriptor {
/// Returns true if the recurrence kind is of the form
/// select(cmp(),x,y) where one of (x,y) is decreasing loop induction.
static bool isFindFirstIVRecurrenceKind(RecurKind Kind) {
- return Kind == RecurKind::FindFirstIVSMin;
+ return Kind == RecurKind::FindFirstIVSMin ||
+ Kind == RecurKind::FindFirstIVUMin;
}
/// Returns true if the recurrence kind is of the form
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index b275b1064cef2..ef9f5fc0d1afd 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -51,6 +51,7 @@ bool RecurrenceDescriptor::isIntegerRecurrenceKind(RecurKind Kind) {
case RecurKind::UMin:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
return true;
@@ -741,10 +742,9 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
: APInt::getMinValue(NumBits);
ValidRange = ConstantRange::getNonEmpty(Sentinel + 1, Sentinel);
} else {
- assert(IsSigned && "Only FindFirstIV with SMax is supported currently");
- ValidRange =
- ConstantRange::getNonEmpty(APInt::getSignedMinValue(NumBits),
- APInt::getSignedMaxValue(NumBits) - 1);
+ APInt Sentinel = IsSigned ? APInt::getSignedMaxValue(NumBits)
+ : APInt::getMaxValue(NumBits);
+ ValidRange = ConstantRange::getNonEmpty(Sentinel, Sentinel - 1);
}
LLVM_DEBUG(dbgs() << "LV: "
@@ -770,6 +770,8 @@ RecurrenceDescriptor::isFindIVPattern(RecurKind Kind, Loop *TheLoop,
if (CheckRange(true))
return RecurKind::FindFirstIVSMin;
+ if (CheckRange(false))
+ return RecurKind::FindFirstIVUMin;
return std::nullopt;
};
@@ -1183,6 +1185,7 @@ unsigned RecurrenceDescriptor::getOpcode(RecurKind Kind) {
return Instruction::Mul;
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::Or:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0941bf61953f1..2ee7e8815c72c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23181,6 +23181,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
@@ -23317,6 +23318,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
@@ -23418,6 +23420,7 @@ class HorizontalReduction {
case RecurKind::FMulAdd:
case RecurKind::AnyOf:
case RecurKind::FindFirstIVSMin:
+ case RecurKind::FindFirstIVUMin:
case RecurKind::FindLastIVSMax:
case RecurKind::FindLastIVUMax:
case RecurKind::FMaximumNum:
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 472b5700bd358..7ff22def3a409 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -743,14 +743,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
Value *ReducedPartRdx = State.get(getOperand(3));
RecurKind MinMaxKind;
bool IsSigned = RecurrenceDescriptor::isSignedRecurrenceKind(RK);
- if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
+ if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK))
MinMaxKind = IsSigned ? RecurKind::SMax : RecurKind::UMax;
- } else {
- assert(RecurrenceDescriptor::isFindFirstIVRecurrenceKind(RK) &&
- "Kind must either be FindLastIV or FindFirstIV");
- assert(IsSigned && "Only FindFirstIV with SMax is currently supported");
- MinMaxKind = RecurKind::SMin;
- }
+ else
+ MinMaxKind = IsSigned ? RecurKind::SMin : RecurKind::UMin;
for (unsigned Part = 1; Part < UF; ++Part)
ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
State.get(getOperand(3 + Part)));
diff --git a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
index d224da795997d..2186ebb067224 100644
--- a/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
+++ b/llvm/test/Transforms/LoopVectorize/iv-select-cmp-decreasing.ll
@@ -941,6 +941,205 @@ exit: ; preds = %loop
ret i16 %spec.select.lcssa
}
+; The signed sentinel value for decreasing-IV vectorization is LONG_MAX, and since
+; the IV hits this value with smin vectorization, it needs to be vectorized with a
+; an unsigned sentinel and umin instead.
+define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) {
+; IC1VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC1VF4-SAME: ptr [[A:%.*]]) {
+; IC1VF4-NEXT: [[ENTRY:.*]]:
+; IC1VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC1VF4: [[VECTOR_PH]]:
+; IC1VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC1VF4: [[VECTOR_BODY]]:
+; IC1VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 9223372036854775807, i64 9223372036854775806, i64 9223372036854775805, i64 9223372036854775804>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP4:%.*]], %[[VECTOR_BODY]] ]
+; IC1VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC1VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC1VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; IC1VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3
+; IC1VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; IC1VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC1VF4-NEXT: [[TMP3:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
+; IC1VF4-NEXT: [[TMP4]] = select <4 x i1> [[TMP3]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC1VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC1VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC1VF4-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC1VF4-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC1VF4: [[MIDDLE_BLOCK]]:
+; IC1VF4-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[TMP4]])
+; IC1VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP6]], -1
+; IC1VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP6]], i64 331
+; IC1VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC1VF4: [[SCALAR_PH]]:
+; IC1VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC1VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC1VF4-NEXT: br label %[[LOOP:.*]]
+; IC1VF4: [[LOOP]]:
+; IC1VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC1VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC1VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC1VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC1VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC1VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC1VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC1VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC1VF4: [[EXIT]]:
+; IC1VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC1VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]]
+;
+; IC4VF4-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC4VF4-SAME: ptr [[A:%.*]]) {
+; IC4VF4-NEXT: [[ENTRY:.*]]:
+; IC4VF4-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF4: [[VECTOR_PH]]:
+; IC4VF4-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF4: [[VECTOR_BODY]]:
+; IC4VF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 9223372036854775807, i64 9223372036854775806, i64 9223372036854775805, i64 9223372036854775804>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP13:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP14:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i64> [ splat (i64 -1), %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF4-NEXT: [[STEP_ADD:%.*]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_2:%.*]] = add <4 x i64> [[STEP_ADD]], splat (i64 -4)
+; IC4VF4-NEXT: [[STEP_ADD_3:%.*]] = add <4 x i64> [[STEP_ADD_2]], splat (i64 -4)
+; IC4VF4-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC4VF4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC4VF4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 0
+; IC4VF4-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 -3
+; IC4VF4-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -4
+; IC4VF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 -3
+; IC4VF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -8
+; IC4VF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP5]], i32 -3
+; IC4VF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i32 -12
+; IC4VF4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i32 -3
+; IC4VF4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8
+; IC4VF4-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8
+; IC4VF4-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD4]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8
+; IC4VF4-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD6]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP8]], align 8
+; IC4VF4-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x i64> [[WIDE_LOAD8]], <4 x i64> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; IC4VF4-NEXT: [[TMP9:%.*]] = icmp sgt <4 x i64> [[REVERSE]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP10:%.*]] = icmp sgt <4 x i64> [[REVERSE5]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP11:%.*]] = icmp sgt <4 x i64> [[REVERSE7]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP12:%.*]] = icmp sgt <4 x i64> [[REVERSE9]], splat (i64 3)
+; IC4VF4-NEXT: [[TMP13]] = select <4 x i1> [[TMP9]], <4 x i64> [[VEC_IND]], <4 x i64> [[VEC_PHI]]
+; IC4VF4-NEXT: [[TMP14]] = select <4 x i1> [[TMP10]], <4 x i64> [[STEP_ADD]], <4 x i64> [[VEC_PHI1]]
+; IC4VF4-NEXT: [[TMP15]] = select <4 x i1> [[TMP11]], <4 x i64> [[STEP_ADD_2]], <4 x i64> [[VEC_PHI2]]
+; IC4VF4-NEXT: [[TMP16]] = select <4 x i1> [[TMP12]], <4 x i64> [[STEP_ADD_3]], <4 x i64> [[VEC_PHI3]]
+; IC4VF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; IC4VF4-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD_3]], splat (i64 -4)
+; IC4VF4-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC4VF4-NEXT: br i1 [[TMP17]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF4: [[MIDDLE_BLOCK]]:
+; IC4VF4-NEXT: [[RDX_MINMAX:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[TMP13]], <4 x i64> [[TMP14]])
+; IC4VF4-NEXT: [[RDX_MINMAX10:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX]], <4 x i64> [[TMP15]])
+; IC4VF4-NEXT: [[RDX_MINMAX11:%.*]] = call <4 x i64> @llvm.umin.v4i64(<4 x i64> [[RDX_MINMAX10]], <4 x i64> [[TMP16]])
+; IC4VF4-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> [[RDX_MINMAX11]])
+; IC4VF4-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[TMP18]], -1
+; IC4VF4-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[TMP18]], i64 331
+; IC4VF4-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF4: [[SCALAR_PH]]:
+; IC4VF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF4-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF4-NEXT: br label %[[LOOP:.*]]
+; IC4VF4: [[LOOP]]:
+; IC4VF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC4VF4-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC4VF4-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF4-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC4VF4-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC4VF4-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF4-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC4VF4-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF4: [[EXIT]]:
+; IC4VF4-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF4-NEXT: ret i64 [[SPEC_SELECT_LCSSA]]
+;
+; IC4VF1-LABEL: define i64 @select_decreasing_induction_icmp_iv_unsigned(
+; IC4VF1-SAME: ptr [[A:%.*]]) {
+; IC4VF1-NEXT: [[ENTRY:.*]]:
+; IC4VF1-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; IC4VF1: [[VECTOR_PH]]:
+; IC4VF1-NEXT: br label %[[VECTOR_BODY:.*]]
+; IC4VF1: [[VECTOR_BODY]]:
+; IC4VF1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP15:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP16:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ -1, %[[VECTOR_PH]] ], [ [[TMP18:%.*]], %[[VECTOR_BODY]] ]
+; IC4VF1-NEXT: [[OFFSET_IDX:%.*]] = sub i64 9223372036854775807, [[INDEX]]
+; IC4VF1-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], -1
+; IC4VF1-NEXT: [[TMP1:%.*]] = add i64 [[OFFSET_IDX]], -2
+; IC4VF1-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], -3
+; IC4VF1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[OFFSET_IDX]]
+; IC4VF1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]]
+; IC4VF1-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]]
+; IC4VF1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]]
+; IC4VF1-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP3]], align 8
+; IC4VF1-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8
+; IC4VF1-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8
+; IC4VF1-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8
+; IC4VF1-NEXT: [[TMP11:%.*]] = icmp sgt i64 [[TMP7]], 3
+; IC4VF1-NEXT: [[TMP12:%.*]] = icmp sgt i64 [[TMP8]], 3
+; IC4VF1-NEXT: [[TMP13:%.*]] = icmp sgt i64 [[TMP9]], 3
+; IC4VF1-NEXT: [[TMP14:%.*]] = icmp sgt i64 [[TMP10]], 3
+; IC4VF1-NEXT: [[TMP15]] = select i1 [[TMP11]], i64 [[OFFSET_IDX]], i64 [[VEC_PHI]]
+; IC4VF1-NEXT: [[TMP16]] = select i1 [[TMP12]], i64 [[TMP0]], i64 [[VEC_PHI1]]
+; IC4VF1-NEXT: [[TMP17]] = select i1 [[TMP13]], i64 [[TMP1]], i64 [[VEC_PHI2]]
+; IC4VF1-NEXT: [[TMP18]] = select i1 [[TMP14]], i64 [[TMP2]], i64 [[VEC_PHI3]]
+; IC4VF1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; IC4VF1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], -9223372036854775808
+; IC4VF1-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; IC4VF1: [[MIDDLE_BLOCK]]:
+; IC4VF1-NEXT: [[RDX_MINMAX:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP15]], i64 [[TMP16]])
+; IC4VF1-NEXT: [[RDX_MINMAX4:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX]], i64 [[TMP17]])
+; IC4VF1-NEXT: [[RDX_MINMAX5:%.*]] = call i64 @llvm.umin.i64(i64 [[RDX_MINMAX4]], i64 [[TMP18]])
+; IC4VF1-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i64 [[RDX_MINMAX5]], -1
+; IC4VF1-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i64 [[RDX_MINMAX5]], i64 331
+; IC4VF1-NEXT: br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; IC4VF1: [[SCALAR_PH]]:
+; IC4VF1-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ -1, %[[MIDDLE_BLOCK]] ], [ 9223372036854775807, %[[ENTRY]] ]
+; IC4VF1-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ], [ 331, %[[ENTRY]] ]
+; IC4VF1-NEXT: br label %[[LOOP:.*]]
+; IC4VF1: [[LOOP]]:
+; IC4VF1-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[RDX:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SPEC_SELECT:%.*]], %[[LOOP]] ]
+; IC4VF1-NEXT: [[GEP_A_IV:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]]
+; IC4VF1-NEXT: [[LD_A:%.*]] = load i64, ptr [[GEP_A_IV]], align 8
+; IC4VF1-NEXT: [[CMP_A_3:%.*]] = icmp sgt i64 [[LD_A]], 3
+; IC4VF1-NEXT: [[SPEC_SELECT]] = select i1 [[CMP_A_3]], i64 [[IV]], i64 [[RDX]]
+; IC4VF1-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
+; IC4VF1-NEXT: [[EXIT_COND:%.*]] = icmp eq i64 [[IV]], 0
+; IC4VF1-NEXT: br i1 [[EXIT_COND]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP9:![0-9]+]]
+; IC4VF1: [[EXIT]]:
+; IC4VF1-NEXT: [[SPEC_SELECT_LCSSA:%.*]] = phi i64 [ [[SPEC_SELECT]], %[[LOOP]] ], [ [[RDX_SELECT]], %[[MIDDLE_BLOCK]] ]
+; IC4VF1-NEXT: ret i64 [[SPEC_SELECT_LCS...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
; The signed sentinel value for decreasing-IV vectorization is LONG_MAX, and since | ||
; the IV hits this value with smin vectorization, it needs to be vectorized with a | ||
; an unsigned sentinel and umin instead. | ||
define i64 @select_decreasing_induction_icmp_iv_unsigned(ptr %a) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just double-checking, do we have a negative test case where we hit unsigned max?
Extend FindFirstIV vectorization to the unsigned case by introducing and handling FindFirstIVUMin.