diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index cc37c8a5cba04c..22ed922e678e7a 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -336,6 +336,7 @@ bool X86TargetInfo::initFeatureMap( setFeatureEnabledImpl(Features, "popcnt", true); setFeatureEnabledImpl(Features, "sahf", true); setFeatureEnabledImpl(Features, "prfchw", true); + setFeatureEnabledImpl(Features, "cx16", true); LLVM_FALLTHROUGH; case CK_K8SSE3: setFeatureEnabledImpl(Features, "sse3", true); @@ -711,7 +712,7 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap &Features, setSSELevel(Features, SSE41, Enabled); } else if (Name == "xsave") { if (!Enabled) - Features["xsaveopt"] = false; + Features["xsaveopt"] = Features["xsavec"] = Features["xsaves"] = false; } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") { if (Enabled) Features["xsave"] = true; diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c index 9d9c33a891cd52..853da8783faa7b 100644 --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c @@ -1,5 +1,4 @@ // REQUIRES: aarch64-registered-target -// RUN: rm -f -- %S/acle_sve_adda.s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 4e1535c91c081a..91f6a99a29c27b 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -2397,6 +2397,7 @@ // RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64 // CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1 // CHECK_AMDFAM10_M64: #define __3dNOW__ 1 +// CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1 // CHECK_AMDFAM10_M64: #define __MMX__ 1 // CHECK_AMDFAM10_M64: #define __POPCNT__ 1 diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 6f3c0cffb17ef8..dc7c8109b4da34 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -417,9 +417,11 @@ // XSAVES: #define __XSAVES__ 1 // XSAVES: #define __XSAVE__ 1 -// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s +// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mxsavec -mxsaves -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s +// NOXSAVE-NOT: #define __XSAVEC__ 1 // NOXSAVE-NOT: #define __XSAVEOPT__ 1 +// NOXSAVE-NOT: #define __XSAVES__ 1 // NOXSAVE-NOT: #define __XSAVE__ 1 // RUN: %clang -target i386-unknown-unknown -march=atom -mclflushopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=CLFLUSHOPT %s diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 
161cbc6b118981..7ff0ea9d6a1335 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6914,6 +6914,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT))); return; } + case Intrinsic::get_active_lane_mask: { + auto DL = getCurSDLoc(); + SDValue Index = getValue(I.getOperand(0)); + SDValue BTC = getValue(I.getOperand(1)); + Type *ElementTy = I.getOperand(0)->getType(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + unsigned VecWidth = VT.getVectorNumElements(); + + SmallVector<SDValue, 16> OpsBTC; + SmallVector<SDValue, 16> OpsIndex; + SmallVector<SDValue, 16> OpsStepConstants; + for (unsigned i = 0; i < VecWidth; i++) { + OpsBTC.push_back(BTC); + OpsIndex.push_back(Index); + OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy))); + } + + EVT CCVT = MVT::i1; + CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth); + + auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth)); + SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex); + SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants); + SDValue VectorInduction = DAG.getNode( + ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep); + SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC); + SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0), + VectorBTC, ISD::CondCode::SETULE); + setValue(&I, DAG.getNode(ISD::AND, DL, CCVT, + DAG.getNOT(DL, VectorInduction.getValue(1), CCVT), + SetCC)); + return; + } } } diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp index ca8f66d5f6c614..58ef87a21ecba6 100644 --- a/llvm/lib/IR/DataLayout.cpp +++ b/llvm/lib/IR/DataLayout.cpp @@ -792,7 +792,7 @@ Type *DataLayout::getIntPtrType(Type *Ty) const { unsigned NumBits = getPointerTypeSizeInBits(Ty); IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits); if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) - return FixedVectorType::get(IntTy, VecTy->getNumElements()); + return VectorType::get(IntTy, VecTy); return IntTy; } diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp index 7af20226434808..79713bd5cec570 100644 --- a/llvm/lib/Target/ARM/MVETailPredication.cpp +++ b/llvm/lib/Target/ARM/MVETailPredication.cpp @@ -86,8 +86,6 @@ class MVETailPredication : public LoopPass { TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; bool ClonedVCTPInExitBlock = false; - IntrinsicInst *ActiveLaneMask = nullptr; - FixedVectorType *VecTy = nullptr; public: static char ID; @@ -119,7 +117,8 @@ class MVETailPredication : public LoopPass { /// intrinsic: check if the first is a loop induction variable, and for the /// the second check that no overflow can occur in the expression that use /// this backedge-taken count. - bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy); + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, @@ -130,10 +129,6 @@ class MVETailPredication : public LoopPass { /// ARMLowOverheadLoops to better optimise away loop update statements inside /// hardware-loops. void RematerializeIterCount(); - - /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs - /// to be lowered to an icmp.
- void RevertActiveLaneMask(); }; } // end namespace @@ -167,83 +162,6 @@ void MVETailPredication::RematerializeIterCount() { DeadInsts); } -void MVETailPredication::RevertActiveLaneMask() { - if (!ActiveLaneMask) - return; - - int VectorWidth = VecTy->getElementCount().Min; - IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI()); - - // 1. Create the vector induction step. This %induction will be the LHS of - // the icmp: - // - // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 - // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0 - // %induction = add <4 x i32> %splat, - // - Value *Index = ActiveLaneMask->getOperand(0); - Value *SplatIndex = - Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask"); - - SmallVector Indices; - for (int i = 0; i < VectorWidth; ++i) - Indices.push_back(ConstantInt::get(Index->getType(), i)); - - Constant *CV = ConstantVector::get(Indices); - Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction"); - - LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; - dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); - - // 2. In the Preheader, first look if the splat BTC already exists. Find this - // %splat, which will be the RHS of the icmp: - // - // %TC.minus.1 = add i32 %N, -1 - // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0 - // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0 - // - auto *Preheader = L->getLoopPreheader(); - auto *BTC = ActiveLaneMask->getOperand(1); - Value *SplatBTC = nullptr; - - if (auto *C = dyn_cast(BTC)) { - Builder.SetInsertPoint(Preheader->getTerminator()); - SplatBTC = Builder.CreateVectorSplat(VectorWidth, C); - LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); - } else { - Instruction *InsertElem; - for (auto &V : *Preheader) { - InsertElem = dyn_cast(&V); - if (!InsertElem) - continue; - ConstantInt *CI = dyn_cast(InsertElem->getOperand(2)); - if (!CI) - continue; - if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0) - continue; - if ((SplatBTC = dyn_cast(*InsertElem->users().begin()))) - break; - } - } - // Or create the splat BTC if it doesn't exist. - if (!SplatBTC) { - Builder.SetInsertPoint(Preheader->getTerminator()); - Value *Undef = - UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth)); - Value *Insert = Builder.CreateInsertElement(Undef, - BTC, Builder.getInt32(0), "insert.btc"); - Value *Zero = ConstantInt::get(Insert->getType(), 0); - SplatBTC = Builder.CreateShuffleVector (Insert, Undef, Zero, "splat.btc"); - LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); - } - - Builder.SetInsertPoint(ActiveLaneMask); - Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC); - LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); - ActiveLaneMask->replaceAllUsesWith(ICmp); - ActiveLaneMask->eraseFromParent(); -} - bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { if (skipLoop(L) || DisableTailPredication) return false; @@ -261,7 +179,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; - ActiveLaneMask = nullptr; // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m. 
@@ -318,15 +235,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - if (TryConvert(Setup->getArgOperand(0))) { - if (ClonedVCTPInExitBlock) - RematerializeIterCount(); - return true; - } else - RevertActiveLaneMask(); + if (!TryConvert(Setup->getArgOperand(0))) { + LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); + return false; + } - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; + if (ClonedVCTPInExitBlock) + RematerializeIterCount(); + return true; } static FixedVectorType *getVectorType(IntrinsicInst *I) { @@ -341,10 +257,27 @@ bool MVETailPredication::IsPredicatedVectorLoop() { // Check that the loop contains at least one masked load/store intrinsic. // We only support 'normal' vector instructions - other than masked // load/stores. + bool ActiveLaneMask = false; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + auto *Int = dyn_cast<IntrinsicInst>(&I); + if (!Int) + continue; + + switch (Int->getIntrinsicID()) { + case Intrinsic::get_active_lane_mask: + ActiveLaneMask = true; + LLVM_FALLTHROUGH; + case Intrinsic::fma: + case Intrinsic::sadd_sat: + case Intrinsic::uadd_sat: + continue; + default: + break; + } + if (IsMasked(&I)) { - FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + auto *VecTy = getVectorType(Int); unsigned Lanes = VecTy->getNumElements(); unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. @@ -353,23 +286,20 @@ bool MVETailPredication::IsPredicatedVectorLoop() { if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { - switch (Int->getIntrinsicID()) { - case Intrinsic::fma: - case Intrinsic::sadd_sat: - case Intrinsic::uadd_sat: - continue; - default: - break; - } - for (auto &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } + continue; + } + + for (const Use &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; } } } + if (!ActiveLaneMask) { + LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); + return false; + } return !MaskedInsts.empty(); } @@ -451,14 +381,15 @@ static bool Cleanup(DenseMap<Instruction *, Instruction *> &NewPredicates, // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount // 3) The IV must be an induction phi with an increment equal to the // vector width. -bool MVETailPredication::IsSafeActiveMask(Value *TripCount, - FixedVectorType *VecTy) { +bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, + Value *TripCount, FixedVectorType *VecTy) { // 1) Test whether entry to the loop is protected by a conditional // BTC + 1 < 0. In other words, if the scalar trip count overflows, // becomes negative, we shouldn't enter the loop and creating // tripcount expression BTC + 1 is not safe. So, check that BTC // isn't max. This is evaluated in unsigned, because the semantics // of @get.active.lane.mask is a ULE comparison.
+ int VectorWidth = VecTy->getNumElements(); auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); auto *BTC = SE->getSCEV(BackedgeTakenCount); @@ -570,8 +501,8 @@ bool MVETailPredication::IsSafeActiveMask(Value *TripCount, if (VectorWidth == StepValue) return true; - LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match " - "vector width : " << VectorWidth << "\n"); + LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match " + "vector width " << VectorWidth << "\n"); return false; } @@ -614,6 +545,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); + unsigned VectorWidth = VecTy->getNumElements(); // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, // is one less than the trip count. So we need to find or create @@ -631,10 +563,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, // represent the effect of tail predication. Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { + switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -680,7 +612,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) { if (!Predicate || Predicates.count(Predicate)) continue; - ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); + auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); if (!ActiveLaneMask || ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; @@ -689,8 +621,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) { LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - VecTy = getVectorType(I); - if (!IsSafeActiveMask(TripCount, VecTy)) { + auto *VecTy = getVectorType(I); + if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td index 3e85d4ab2a4c3e..d68fb970b571c4 100644 --- a/llvm/lib/Target/X86/X86.td +++ b/llvm/lib/Target/X86/X86.td @@ -52,13 +52,16 @@ def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true", "Support xsave instructions">; def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true", - "Support xsaveopt instructions">; + "Support xsaveopt instructions", + [FeatureXSAVE]>; def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true", - "Support xsavec instructions">; + "Support xsavec instructions", + [FeatureXSAVE]>; def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true", - "Support xsaves instructions">; + "Support xsaves instructions", + [FeatureXSAVE]>; def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1", "Enable SSE instructions">; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll index 54ddf646833665..a00af0d6a9ec4e 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -49,7 +49,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i8* %tmp6 to <16 x i8>* tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32
4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -106,7 +106,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i16* %tmp6 to <8 x i16>* tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ vector.body: ; preds = %vector.body, %vecto %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,92 @@ for.cond.cleanup: ; preds = %vector.body, %entry ret void } +; TODO: Multiple intrinsics not yet supported. 
+; This is currently rejected, because if the vector body is unrolled, the step +; is not what we expect: +; +; Step value 16 doesn't match vector width 4 +; +; CHECK-LABEL: interleave4 +; CHECK: vector.body: +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call 
<4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +439,7 @@ declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll index dc9da0c9f76414..13d750310a56cc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -266,16 +266,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_BTC_plus_1( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: 
%[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -316,8 +309,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_in_sub( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -366,8 +360,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @overflow_in_rounding_tripcount( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -413,15 +408,9 @@ for.cond.cleanup: ; CHECK-LABEL: @IV_not_an_induction( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -462,15 +451,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @IV_wrong_step( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -514,15 +497,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @IV_step_not_constant( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] 
= icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -563,15 +540,9 @@ for.cond.cleanup: } ; CHECK-LABEL: @outerloop_phi( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll index 0b103ca54750b0..5c753134744d65 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -143,21 +143,10 @@ for.cond.cleanup: ; ; CHECK-LABEL: @reduction_not_guarded ; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; -; CHECK: entry: -; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1 -; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0 -; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer -; -; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) -; CHECK: ret +; CHECK: @llvm.get.active.lane.mask.v8i1.i32 +; CHECK: ret ; define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { entry: @@ -213,20 +202,9 @@ middle.block: ; preds = %vector.body ; ; CHECK-LABEL: @Correlation ; -; CHECK: entry: -; CHECK: for.body.lr.ph: ; preds = %entry -; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph -; CHECK: vector.ph: ; preds = %for.body -; CHECK: %trip.count.minus.1 = add i32 %8, -1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7) -; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 -; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc -; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} -; +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK: %active.lane.mask = call <4 x i1> 
@llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) ; ; FORCE-LABEL: @Correlation ; FORCE: vector.ph: ; preds = %for.body diff --git a/llvm/test/CodeGen/Thumb2/active_lane_mask.ll b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll new file mode 100644 index 00000000000000..e9dfccd320dae7 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/active_lane_mask.ll @@ -0,0 +1,338 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve %s -o - | FileCheck %s + +define <4 x i32> @v4i32(i32 %index, i32 %BTC, <4 x i32> %V1, <4 x i32> %V2) { +; CHECK-LABEL: v4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: adr.w r12, .LCPI0_0 +; CHECK-NEXT: vdup.32 q1, r0 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: add r0, sp, #8 +; CHECK-NEXT: vcmp.u32 hi, q1, q0 +; CHECK-NEXT: vdup.32 q1, r1 +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.u32 cs, q1, q0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldr d1, [sp] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %BTC) + %select = select <4 x i1> %active.lane.mask, <4 x i32> %V1, <4 x i32> %V2 + ret <4 x i32> %select +} + +define <8 x i16> @v8i16(i32 %index, i32 %BTC, <8 x i16> %V1, <8 x i16> %V2) { +; CHECK-LABEL: v8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI1_0 +; CHECK-NEXT: vdup.32 q5, r1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.i8 q1, #0x0 +; CHECK-NEXT: vmov.i8 q2, #0xff +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q5, q3 +; CHECK-NEXT: vpsel q4, q2, q1 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: adr r1, .LCPI1_1 +; CHECK-NEXT: vldrw.u32 q4, [r1] +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vcmp.u32 cs, q5, q4 +; CHECK-NEXT: vpsel q5, q2, q1 +; CHECK-NEXT: vmov r1, s20 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov r1, s21 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r1, s22 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov r1, s23 +; CHECK-NEXT: vdup.32 q5, r0 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vcmp.u32 hi, q5, q3 +; CHECK-NEXT: vpsel q6, q2, q1 +; CHECK-NEXT: vcmp.u32 hi, q5, q4 +; CHECK-NEXT: vmov r0, s24 +; CHECK-NEXT: vpsel q1, q2, q1 +; CHECK-NEXT: vmov.16 q3[0], r0 +; CHECK-NEXT: vmov r0, s25 +; CHECK-NEXT: vmov.16 q3[1], r0 +; CHECK-NEXT: vmov r0, s26 +; CHECK-NEXT: vmov.16 q3[2], r0 +; CHECK-NEXT: vmov r0, s27 +; CHECK-NEXT: vmov.16 q3[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q3[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q3[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q3[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q3[7], r0 +; CHECK-NEXT: add r0, sp, #56 +; CHECK-NEXT: vcmp.i16 ne, q3, zr +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i16 ne, q0, zr +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: vldr d1, [sp, #48] 
+; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 7 @ 0x7 + %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %BTC) + %select = select <8 x i1> %active.lane.mask, <8 x i16> %V1, <8 x i16> %V2 + ret <8 x i16> %select +} + +define <16 x i8> @v16i8(i32 %index, i32 %BTC, <16 x i8> %V1, <16 x i8> %V2) { +; CHECK-LABEL: v16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: adr.w r12, .LCPI2_0 +; CHECK-NEXT: vdup.32 q7, r1 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.i8 q5, #0x0 +; CHECK-NEXT: vmov.i8 q4, #0xff +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q1 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: adr r1, .LCPI2_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q3 +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vcmp.i16 ne, q2, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vmov.8 q2[0], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: vmov.8 q2[1], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.8 q2[2], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[3], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[4], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[5], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov.8 q2[6], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.8 q2[7], r1 +; CHECK-NEXT: adr r1, .LCPI2_2 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q0 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vpsel q6, q4, q5 +; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: adr r1, .LCPI2_3 +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vadd.i32 q6, q6, r0 +; CHECK-NEXT: vcmp.u32 cs, q7, q6 +; CHECK-NEXT: vpsel q7, q4, q5 +; CHECK-NEXT: vmov r1, s28 +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov r1, s29 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r1, s30 +; CHECK-NEXT: vmov.16 q0[6], r1 +; CHECK-NEXT: vmov r1, s31 +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vdup.32 q7, r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vcmp.u32 hi, q7, q1 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.8 q2[8], r1 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; 
CHECK-NEXT: vmov.8 q2[9], r1 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: vmov.8 q2[10], r1 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: vmov.8 q2[11], r1 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: vmov.8 q2[12], r1 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: vmov.8 q2[13], r1 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.8 q2[14], r1 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vcmp.u32 hi, q7, q3 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.8 q2[15], r1 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.8 q3[0], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.8 q3[1], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.8 q3[2], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.8 q3[3], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.8 q3[4], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.8 q3[5], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.8 q3[6], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q3[7], r0 +; CHECK-NEXT: vcmp.u32 hi, q7, q0 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vcmp.u32 hi, q7, q6 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vpsel q1, q4, q5 +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vcmp.i16 ne, q0, zr +; CHECK-NEXT: vpsel q0, q4, q5 +; CHECK-NEXT: vmov.u16 r0, q0[0] +; CHECK-NEXT: vmov.8 q3[8], r0 +; CHECK-NEXT: vmov.u16 r0, q0[1] +; CHECK-NEXT: vmov.8 q3[9], r0 +; CHECK-NEXT: vmov.u16 r0, q0[2] +; CHECK-NEXT: vmov.8 q3[10], r0 +; CHECK-NEXT: vmov.u16 r0, q0[3] +; CHECK-NEXT: vmov.8 q3[11], r0 +; CHECK-NEXT: vmov.u16 r0, q0[4] +; CHECK-NEXT: vmov.8 q3[12], r0 +; CHECK-NEXT: vmov.u16 r0, q0[5] +; CHECK-NEXT: vmov.8 q3[13], r0 +; CHECK-NEXT: vmov.u16 r0, q0[6] +; CHECK-NEXT: vmov.8 q3[14], r0 +; CHECK-NEXT: vmov.u16 r0, q0[7] +; CHECK-NEXT: vmov.8 q3[15], r0 +; CHECK-NEXT: vmov d0, r2, r3 +; CHECK-NEXT: add r0, sp, #88 +; CHECK-NEXT: vcmp.i8 ne, q3, zr +; CHECK-NEXT: vldr d1, [sp, #80] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vpnot +; CHECK-NEXT: vpst +; CHECK-NEXT: vcmpt.i8 ne, q2, zr +; CHECK-NEXT: vpsel q0, q0, q1 +; CHECK-NEXT: vmov r0, r1, d0 +; CHECK-NEXT: vmov r2, r3, d1 +; CHECK-NEXT: add sp, #16 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI2_1: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 
+; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .LCPI2_2: +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .LCPI2_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 15 @ 0xf + %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %BTC) + %select = select <16 x i1> %active.lane.mask, <16 x i8> %V1, <16 x i8> %V2 + ret <16 x i8> %select +} + +declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) +declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) +declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/Transforms/InstCombine/vector_gep1.ll b/llvm/test/Transforms/InstCombine/vector_gep1.ll index 8e5bcf963ea1c0..4eb449edb34807 100644 --- a/llvm/test/Transforms/InstCombine/vector_gep1.ll +++ b/llvm/test/Transforms/InstCombine/vector_gep1.ll @@ -62,3 +62,13 @@ define <2 x i32*> @test7(<2 x {i32, i32}*> %a) { ret <2 x i32*> %w } +define @test8() { +; CHECK-LABEL: @test8( +; CHECK-NEXT: ret icmp ult ( zext ( shufflevector ( insertelement ( undef, i32 1, i32 0), undef, zeroinitializer) to ), zeroinitializer) +; + %ins = insertelement undef, i32 1, i32 0 + %b = shufflevector %ins, undef, zeroinitializer + %c = inttoptr %b to + %d = icmp ult %c, zeroinitializer + ret %d +}
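
Note (reviewer illustration, not part of the patch): the new Intrinsic::get_active_lane_mask expansion in SelectionDAGBuilder adds the lane number to the scalar index with UADDO, compares the sum unsigned-less-or-equal against the backedge-taken count, and ANDs the compare with the negated overflow bit. A minimal scalar sketch of that semantics, assuming i32 operands as in the tests above (the function name and use of std::vector are illustrative only, not anything from the patch):

#include <cstdint>
#include <vector>

// Reference model: lane i is active iff Index + i neither wraps in i32
// (the UADDO overflow bit) nor exceeds the backedge-taken count BTC
// (the SETULE compare), mirroring the AND(NOT(overflow), setcc) node.
std::vector<bool> getActiveLaneMask(uint32_t Index, uint32_t BTC,
                                    unsigned VecWidth) {
  std::vector<bool> Mask(VecWidth);
  for (unsigned i = 0; i < VecWidth; ++i) {
    uint32_t Lane = Index + i;           // vector induction: Index plus step i
    bool Overflow = Lane < Index;        // carry out of the unsigned add
    Mask[i] = !Overflow && Lane <= BTC;  // unsigned <= backedge-taken count
  }
  return Mask;
}

This is also why MVETailPredication::IsSafeActiveMask requires the induction step to equal the vector width: each @llvm.get.active.lane.mask call covers exactly VecWidth consecutive lanes starting at its index operand, so an unrolled body such as interleave4 above (step 16, vector width 4) is rejected.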