diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index cc37c8a5cba04c..22ed922e678e7a 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -336,6 +336,7 @@ bool X86TargetInfo::initFeatureMap( setFeatureEnabledImpl(Features, "popcnt", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "prfchw", true); + setFeatureEnabledImpl(Features, "cx16", true); LLVM_FALLTHROUGH; case CK_K8SSE3: setFeatureEnabledImpl(Features, "sse3", true); @@ -711,7 +712,7 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap &Features, setSSELevel(Features, SSE41, Enabled); } else if (Name == "xsave") { if (!Enabled) - Features["xsaveopt"] = false; + Features["xsaveopt"] = Features["xsavec"] = Features["xsaves"] = false; } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") { if (Enabled) Features["xsave"] = true; diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c index 9d9c33a891cd52..853da8783faa7b 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c @@ -1,5 +1,4 @@ // REQUIRES: aarch64-registered-target -// RUN: rm -f -- %S/acle_sve_adda.s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 4e1535c91c081a..91f6a99a29c27b 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -2397,6 +2397,7 @@ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64 // CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1 // CHECK_AMDFAM10_M64: #define __3dNOW__ 1 +// CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1 // CHECK_AMDFAM10_M64: #define __MMX__ 1 // CHECK_AMDFAM10_M64: #define __POPCNT__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 6f3c0cffb17ef8..dc7c8109b4da34 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -417,9 +417,11 @@ // XSAVES: #define __XSAVES__ 1 // XSAVES: #define __XSAVE__ 1 -// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mxsavec -mxsaves -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s +// NOXSAVE-NOT: #define __XSAVEC__ 1 // NOXSAVE-NOT: #define __XSAVEOPT__ 1 +// NOXSAVE-NOT: #define __XSAVES__ 1 // NOXSAVE-NOT: #define __XSAVE__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mclflushopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=CLFLUSHOPT %s diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 
161cbc6b118981..7ff0ea9d6a1335 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6914,6 +6914,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT))); return; } + case Intrinsic::get_active_lane_mask: { + auto DL = getCurSDLoc(); + SDValue Index = getValue(I.getOperand(0)); + SDValue BTC = getValue(I.getOperand(1)); + Type *ElementTy = I.getOperand(0)->getType(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + unsigned VecWidth = VT.getVectorNumElements(); + + SmallVector<SDValue, 16> OpsBTC; + SmallVector<SDValue, 16> OpsIndex; + SmallVector<SDValue, 16> OpsStepConstants; + for (unsigned i = 0; i < VecWidth; i++) { + OpsBTC.push_back(BTC); + OpsIndex.push_back(Index); + OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy))); + } + + EVT CCVT = MVT::i1; + CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth); + + auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth)); + SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex); + SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants); + SDValue VectorInduction = DAG.getNode( + ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); + SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC); + SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0), + VectorBTC, ISD::CondCode::SETULE); + setValue(&I, DAG.getNode(ISD::AND, DL, CCVT, + DAG.getNOT(DL, VectorInduction.getValue(1), CCVT), + SetCC)); + return; + } } } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index ca8f66d5f6c614..58ef87a21ecba6 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -792,7 +792,7 @@ Type *DataLayout::getIntPtrType(Type *Ty) const { unsigned NumBits = getPointerTypeSizeInBits(Ty); IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits); if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) - return FixedVectorType::get(IntTy, VecTy->getNumElements()); + return VectorType::get(IntTy, VecTy); return IntTy; } diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 7af20226434808..79713bd5cec570 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -86,8 +86,6 @@ class MVETailPredication : public LoopPass { TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; bool ClonedVCTPInExitBlock = false; - IntrinsicInst *ActiveLaneMask = nullptr; - FixedVectorType *VecTy = nullptr; public: static char ID; @@ -119,7 +117,8 @@ class MVETailPredication : public LoopPass { /// intrinsic: check if the first is a loop induction variable, and for the /// the second check that no overflow can occur in the expression that use /// this backedge-taken count. - bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy); + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, @@ -130,10 +129,6 @@ class MVETailPredication : public LoopPass { /// ARMLowOverheadLoops to better optimise away loop update statements inside /// hardware-loops. void RematerializeIterCount(); - - /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs - /// to be lowered to an icmp.
- void RevertActiveLaneMask(); }; } // end namespace @@ -167,83 +162,6 @@ void MVETailPredication::RematerializeIterCount() { DeadInsts); } -void MVETailPredication::RevertActiveLaneMask() { - if (!ActiveLaneMask) - return; - - int VectorWidth = VecTy->getElementCount().Min; - IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI()); - - // 1. Create the vector induction step. This %induction will be the LHS of - // the icmp: - // - // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0 - // %induction = add <4 x i32> %splat, - // - Value *Index = ActiveLaneMask->getOperand(0); - Value *SplatIndex = - Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask"); - - SmallVector Indices; - for (int i = 0; i < VectorWidth; ++i) - Indices.push_back(ConstantInt::get(Index->getType(), i)); - - Constant *CV = ConstantVector::get(Indices); - Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction"); - - LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; - dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); - - // 2. In the Preheader, first look if the splat BTC already exists. Find this - // %splat, which will be the RHS of the icmp: - // - // %TC.minus.1 = add i32 %N, -1 - // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0 - // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0 - // - auto *Preheader = L->getLoopPreheader(); - auto *BTC = ActiveLaneMask->getOperand(1); - Value *SplatBTC = nullptr; - - if (auto *C = dyn_cast(BTC)) { - Builder.SetInsertPoint(Preheader->getTerminator()); - SplatBTC = Builder.CreateVectorSplat(VectorWidth, C); - LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); - } else { - Instruction *InsertElem; - for (auto &V : *Preheader) { - InsertElem = dyn_cast(&V); - if (!InsertElem) - continue; - ConstantInt *CI = dyn_cast(InsertElem->getOperand(2)); - if (!CI) - continue; - if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0) - continue; - if ((SplatBTC = dyn_cast(*InsertElem->users().begin()))) - break; - } - } - // Or create the splat BTC if it doesn't exist. - if (!SplatBTC) { - Builder.SetInsertPoint(Preheader->getTerminator()); - Value *Undef = - UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth)); - Value *Insert = Builder.CreateInsertElement(Undef, - BTC, Builder.getInt32(0), "insert.btc"); - Value *Zero = ConstantInt::get(Insert->getType(), 0); - SplatBTC = Builder.CreateShuffleVector (Insert, Undef, Zero, "splat.btc"); - LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); - } - - Builder.SetInsertPoint(ActiveLaneMask); - Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC); - LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); - ActiveLaneMask->replaceAllUsesWith(ICmp); - ActiveLaneMask->eraseFromParent(); -} - bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || DisableTailPredication) return false; @@ -261,7 +179,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; - ActiveLaneMask = nullptr; // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m. 
@@ -318,15 +235,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - if (TryConvert(Setup->getArgOperand(0))) { - if (ClonedVCTPInExitBlock) - RematerializeIterCount(); - return true; - } else - RevertActiveLaneMask(); + if (!TryConvert(Setup->getArgOperand(0))) { + LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); + return false; + } - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; + if (ClonedVCTPInExitBlock) + RematerializeIterCount(); + return true; } static FixedVectorType *getVectorType(IntrinsicInst *I) { @@ -341,10 +257,27 @@ bool MVETailPredication::IsPredicatedVectorLoop() { // Check that the loop contains at least one masked load/store intrinsic. // We only support 'normal' vector instructions - other than masked // load/stores. + bool ActiveLaneMask = false; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + auto *Int = dyn_cast<IntrinsicInst>(&I); + if (!Int) + continue; + + switch (Int->getIntrinsicID()) { + case Intrinsic::get_active_lane_mask: + ActiveLaneMask = true; + LLVM_FALLTHROUGH; + case Intrinsic::fma: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + continue; + default: + break; + } + if (IsMasked(&I)) { - FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + auto *VecTy = getVectorType(Int); unsigned Lanes = VecTy->getNumElements(); unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. @@ -353,23 +286,20 @@ bool MVETailPredication::IsPredicatedVectorLoop() { if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { - switch (Int->getIntrinsicID()) { - case Intrinsic::fma: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - continue; - default: - break; - } - for (auto &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } + continue; + } + + for (const Use &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; } } } + if (!ActiveLaneMask) { + LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); + return false; + } return !MaskedInsts.empty(); } @@ -451,14 +381,15 @@ static bool Cleanup(DenseMap<Instruction *, Instruction *> &NewPredicates, // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount // 3) The IV must be an induction phi with an increment equal to the // vector width. -bool MVETailPredication::IsSafeActiveMask(Value *TripCount, - FixedVectorType *VecTy) { +bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, + Value *TripCount, FixedVectorType *VecTy) { // 1) Test whether entry to the loop is protected by a conditional // BTC + 1 < 0. In other words, if the scalar trip count overflows, // becomes negative, we shouldn't enter the loop and creating // tripcount expression BTC + 1 is not safe. So, check that BTC // isn't max. This is evaluated in unsigned, because the semantics // of @get.active.lane.mask is a ULE comparison.
+ int VectorWidth = VecTy->getNumElements(); auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); auto *BTC = SE->getSCEV(BackedgeTakenCount); @@ -570,8 +501,8 @@ bool MVETailPredication::IsSafeActiveMask(Value *TripCount, if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match " - "vector width : " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match " + "vector width " << VectorWidth << "\n"); return false; } @@ -614,6 +545,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); + unsigned VectorWidth = VecTy->getNumElements(); // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, // is one less than the trip count. So we need to find or create @@ -631,10 +563,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, // represent the effect of tail predication. Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { + switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -680,7 +612,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) { if (!Predicate || Predicates.count(Predicate)) continue; - ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); + auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); if (!ActiveLaneMask || ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; @@ -689,8 +621,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - VecTy = getVectorType(I); - if (!IsSafeActiveMask(TripCount, VecTy)) { + auto *VecTy = getVectorType(I); + if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 3e85d4ab2a4c3e..d68fb970b571c4 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -52,13 +52,16 @@ def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", "Support xsave instructions">; def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", - "Support xsaveopt instructions">; + "Support xsaveopt instructions", + [FeatureXSAVE]>; def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", - "Support xsavec instructions">; + "Support xsavec instructions", + [FeatureXSAVE]>; def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", - "Support xsaves instructions">; + "Support xsaves instructions", + [FeatureXSAVE]>; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions">; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll index 54ddf646833665..a00af0d6a9ec4e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i8* %tmp6 to <16 x i8>* tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32
4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -106,7 +106,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i16* %tmp6 to <8 x i16>* tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,92 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } +; TODO: Multiple intrinsics not yet supported. 
+; This is currently rejected, because if the vector body is unrolled, the step +; is not what we expect: +; +; Step value 16 doesn't match vector width 4 +; +; CHECK-LABEL: interleave4 +; CHECK: vector.body: +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call 
<4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +439,7 @@ declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index dc9da0c9f76414..13d750310a56cc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -266,16 +266,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_BTC_plus_1( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: 
%[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -316,8 +309,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_in_sub( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -366,8 +360,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_in_rounding_tripcount( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -413,15 +408,9 @@ for.cond.cleanup: ; CHECK-LABEL: @IV_not_an_induction( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -462,15 +451,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @IV_wrong_step( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -514,15 +497,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @IV_step_not_constant( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] 
= icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -563,15 +540,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @outerloop_phi( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll index 0b103ca54750b0..5c753134744d65 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -143,21 +143,10 @@ for.cond.cleanup: ; ; CHECK-LABEL: @reduction_not_guarded ; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; -; CHECK: entry: -; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1 -; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0 -; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer -; -; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) -; CHECK: ret +; CHECK: @llvm.get.active.lane.mask.v8i1.i32 +; CHECK: ret ; define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { entry: @@ -213,20 +202,9 @@ middle.block: ; preds = %vector.body ; ; CHECK-LABEL: @Correlation ; -; CHECK: entry: -; CHECK: for.body.lr.ph: ; preds = %entry -; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph -; CHECK: vector.ph: ; preds = %for.body -; CHECK: %trip.count.minus.1 = add i32 %8, -1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7) -; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 -; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc -; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} -; +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK: %active.lane.mask = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) ; ; FORCE-LABEL: @Correlation ; FORCE: vector.ph: ; preds = %for.body diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll new file mode 100644 index 00000000000000..e9dfccd320dae7 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve %s -o - | FileCheck %s + +define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { +; CHECK-LABEL: v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: adr.w r12, .LCPI0_0 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: add r0, sp, #8 +; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 cs, q1, q0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %BTC) + %select = select <4 x i1> %active.lane.mask, <4 x i32> %V1, <4 x i32> %V2 + ret <4 x i32> %select +} + +define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { +; CHECK-LABEL: v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI1_0 +; CHECK-NEXT: vdup.32 q5, r1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q5, q3 +; CHECK-NEXT: vpsel q4, q2, q1 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: adr r1, .LCPI1_1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 cs, q5, q4 +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov r1, s21 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r1, s22 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov r1, s23 +; CHECK-NEXT: vdup.32 q5, r0 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vcmp.u32 hi, q5, q3 +; CHECK-NEXT: vpsel q6, q2, q1 +; CHECK-NEXT: vcmp.u32 hi, q5, q4 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: add r0, sp, #56 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i16 ne, q0, zr +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldr d1, [sp, #48] 
+; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 7 @ 0x7 + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %BTC) + %select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2 + ret <8 x i16> %select +} + +define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { +; CHECK-LABEL: v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: adr.w r12, .LCPI2_0 +; CHECK-NEXT: vdup.32 q7, r1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.i8 q5, #0x0 +; CHECK-NEXT: vmov.i8 q4, #0xff +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q1 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: adr r1, .LCPI2_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q3 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.8 q2[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.8 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.8 q2[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[3], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[4], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[5], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.8 q2[6], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.8 q2[7], r1 +; CHECK-NEXT: adr r1, .LCPI2_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vpsel q6, q4, q5 +; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: adr r1, .LCPI2_3 +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vadd.i32 q6, q6, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q6 +; CHECK-NEXT: vpsel q7, q4, q5 +; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov r1, s29 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r1, s30 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov r1, s31 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vdup.32 q7, r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vcmp.u32 hi, q7, q1 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.8 q2[8], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; 
CHECK-NEXT: vmov.8 q2[9], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.8 q2[10], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[12], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[13], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.8 q2[14], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vcmp.u32 hi, q7, q3 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.8 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.8 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.8 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.8 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.8 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.8 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.8 q3[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q3[7], r0 +; CHECK-NEXT: vcmp.u32 hi, q7, q0 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vcmp.u32 hi, q7, q6 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.8 q3[8], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.8 q3[9], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.8 q3[10], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.8 q3[11], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.8 q3[12], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.8 q3[13], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.8 q3[14], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.8 q3[15], r0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: add r0, sp, #88 +; CHECK-NEXT: vcmp.i8 ne, q3, zr +; CHECK-NEXT: vldr d1, [sp, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i8 ne, q2, zr +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI2_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 
+; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .LCPI2_2: +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .LCPI2_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 15 @ 0xf + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %BTC) + %select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2 + ret <16 x i8> %select +} + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/Transforms/InstCombine/vector_gep1.ll b/llvm/test/Transforms/InstCombine/vector_gep1.ll index 8e5bcf963ea1c0..4eb449edb34807 100644 --- a/llvm/test/Transforms/InstCombine/vector_gep1.ll +++ b/llvm/test/Transforms/InstCombine/vector_gep1.ll @@ -62,3 +62,13 @@ define <2 x i32*> @test7(<2 x {i32, i32}*> %a) { ret <2 x i32*> %w } +define @test8() { +; CHECK-LABEL: @test8( +; CHECK-NEXT: ret icmp ult ( zext ( shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) to ), zeroinitializer) +; + %ins = insertelement undef, i32 1, i32 0 + %b = shufflevector %ins, undef, zeroinitializer + %c = inttoptr %b to + %d = icmp ult %c, zeroinitializer + ret %d +}
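
Note (reviewer illustration, not part of the patch): the new Intrinsic::get_active_lane_mask expansion in SelectionDAGBuilder adds the lane number to the scalar index with UADDO, compares the sum unsigned-less-or-equal against the backedge-taken count, and ANDs the compare with the negated overflow bit. A minimal scalar sketch of that semantics, assuming i32 operands as in the tests above (the function name and use of std::vector are illustrative only, not anything from the patch):

#include <cstdint>
#include <vector>

// Reference model: lane i is active iff Index + i neither wraps in i32
// (the UADDO overflow bit) nor exceeds the backedge-taken count BTC
// (the SETULE compare), mirroring the AND(NOT(overflow), setcc) node.
std::vector<bool> getActiveLaneMask(uint32_t Index, uint32_t BTC,
                                    unsigned VecWidth) {
  std::vector<bool> Mask(VecWidth);
  for (unsigned i = 0; i < VecWidth; ++i) {
    uint32_t Lane = Index + i;           // vector induction: Index plus step i
    bool Overflow = Lane < Index;        // carry out of the unsigned add
    Mask[i] = !Overflow && Lane <= BTC;  // unsigned <= backedge-taken count
  }
  return Mask;
}

This is also why MVETailPredication::IsSafeActiveMask requires the induction step to equal the vector width: each @llvm.get.active.lane.mask call covers exactly VecWidth consecutive lanes starting at its index operand, so an unrolled body such as interleave4 above (step 16, vector width 4) is rejected.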