Merged master:ae47d158a096 into amd-gfx:7b117dcd2448

Local branch amd-gfx 7b117dc Merged master:d4adac48320e into amd-gfx:a7ec3c4bc816 Remote branch master ae47d15 Remove "rm -f" workaround in acle_sve_adda.c
jaebaek · Jun 26, 2020 · 8504aad · 8504aad
2 parents 7b117dc + ae47d15
commit 8504aad
Show file tree

Hide file tree

Showing 13 changed files with 554 additions and 200 deletions.
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
@@ -336,6 +336,7 @@ bool X86TargetInfo::initFeatureMap(
     setFeatureEnabledImpl(Features, "popcnt", true);
     setFeatureEnabledImpl(Features, "sahf", true);
     setFeatureEnabledImpl(Features, "prfchw", true);
+    setFeatureEnabledImpl(Features, "cx16", true);
     LLVM_FALLTHROUGH;
   case CK_K8SSE3:
     setFeatureEnabledImpl(Features, "sse3", true);
@@ -711,7 +712,7 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
       setSSELevel(Features, SSE41, Enabled);
   } else if (Name == "xsave") {
     if (!Enabled)
-      Features["xsaveopt"] = false;
+      Features["xsaveopt"] = Features["xsavec"] = Features["xsaves"] = false;
   } else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
     if (Enabled)
       Features["xsave"] = true;

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
@@ -1,5 +1,4 @@
 // REQUIRES: aarch64-registered-target
-// RUN: rm -f -- %S/acle_sve_adda.s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t

diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c
@@ -2397,6 +2397,7 @@
 // RUN:   | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64
 // CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1
 // CHECK_AMDFAM10_M64: #define __3dNOW__ 1
+// CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
 // CHECK_AMDFAM10_M64: #define __LZCNT__ 1
 // CHECK_AMDFAM10_M64: #define __MMX__ 1
 // CHECK_AMDFAM10_M64: #define __POPCNT__ 1

diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c
@@ -417,9 +417,11 @@
 // XSAVES: #define __XSAVES__ 1
 // XSAVES: #define __XSAVE__ 1
 
-// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s
+// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mxsavec -mxsaves -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s
 
+// NOXSAVE-NOT: #define __XSAVEC__ 1
 // NOXSAVE-NOT: #define __XSAVEOPT__ 1
+// NOXSAVE-NOT: #define __XSAVES__ 1
 // NOXSAVE-NOT: #define __XSAVE__ 1
 
 // RUN: %clang -target i386-unknown-unknown -march=atom -mclflushopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=CLFLUSHOPT %s

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -6914,6 +6914,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
                              DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT)));
     return;
   }
+  case Intrinsic::get_active_lane_mask: {
+    auto DL = getCurSDLoc();
+    SDValue Index = getValue(I.getOperand(0));
+    SDValue BTC = getValue(I.getOperand(1));
+    Type *ElementTy = I.getOperand(0)->getType();
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    unsigned VecWidth = VT.getVectorNumElements();
+
+    SmallVector<SDValue, 16> OpsBTC;
+    SmallVector<SDValue, 16> OpsIndex;
+    SmallVector<SDValue, 16> OpsStepConstants;
+    for (unsigned i = 0; i < VecWidth; i++) {
+      OpsBTC.push_back(BTC);
+      OpsIndex.push_back(Index);
+      OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy)));
+    }
+
+    EVT CCVT = MVT::i1;
+    CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth);
+
+    auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth));
+    SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex);
+    SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants);
+    SDValue VectorInduction = DAG.getNode(
+       ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep);
+    SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC);
+    SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0),
+                                 VectorBTC, ISD::CondCode::SETULE);
+    setValue(&I, DAG.getNode(ISD::AND, DL, CCVT,
+                             DAG.getNOT(DL, VectorInduction.getValue(1), CCVT),
+                             SetCC));
+    return;
+  }
   }
 }
 

diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
@@ -792,7 +792,7 @@ Type *DataLayout::getIntPtrType(Type *Ty) const {
   unsigned NumBits = getPointerTypeSizeInBits(Ty);
   IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
   if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
-    return FixedVectorType::get(IntTy, VecTy->getNumElements());
+    return VectorType::get(IntTy, VecTy);
   return IntTy;
 }
 

diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -86,8 +86,6 @@ class MVETailPredication : public LoopPass {
   TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   bool ClonedVCTPInExitBlock = false;
-  IntrinsicInst *ActiveLaneMask = nullptr;
-  FixedVectorType *VecTy = nullptr;
 
 public:
   static char ID;
@@ -119,7 +117,8 @@ class MVETailPredication : public LoopPass {
   /// intrinsic: check if the first is a loop induction variable, and for the
   /// the second check that no overflow can occur in the expression that use
   /// this backedge-taken count.
-  bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
+  bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+                        FixedVectorType *VecTy);
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
@@ -130,10 +129,6 @@ class MVETailPredication : public LoopPass {
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
-
-  /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs
-  /// to be lowered to an icmp.
-  void RevertActiveLaneMask();
 };
 
 } // end namespace
@@ -167,83 +162,6 @@ void MVETailPredication::RematerializeIterCount() {
                         DeadInsts);
 }
 
-void MVETailPredication::RevertActiveLaneMask() {
-  if (!ActiveLaneMask)
-    return;
-
-  int VectorWidth = VecTy->getElementCount().Min;
-  IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
-
-  // 1. Create the vector induction step. This %induction will be the LHS of
-  // the icmp:
-  //
-  // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0
-  // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
-  //
-  Value *Index = ActiveLaneMask->getOperand(0);
-  Value *SplatIndex =
-      Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
-
-  SmallVector<Constant *, 8> Indices;
-  for (int i = 0; i < VectorWidth; ++i)
-    Indices.push_back(ConstantInt::get(Index->getType(), i));
-
-  Constant *CV = ConstantVector::get(Indices);
-  Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
-
-  LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
-             dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
-
-  // 2. In the Preheader, first look if the splat BTC already exists. Find this
-  //    %splat, which will be the RHS of the icmp:
-  //
-  //    %TC.minus.1 = add i32 %N, -1
-  //    %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
-  //    %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0
-  //
-  auto *Preheader = L->getLoopPreheader();
-  auto *BTC = ActiveLaneMask->getOperand(1);
-  Value *SplatBTC = nullptr;
-
-  if (auto *C = dyn_cast<ConstantInt>(BTC)) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  } else {
-    Instruction *InsertElem;
-    for (auto &V : *Preheader) {
-      InsertElem = dyn_cast<InsertElementInst>(&V);
-      if (!InsertElem)
-        continue;
-      ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
-      if (!CI)
-        continue;
-      if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
-        continue;
-      if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
-         break;
-    }
-  }
-  // Or create the splat BTC if it doesn't exist.
-  if (!SplatBTC) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    Value *Undef =
-        UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
-    Value *Insert = Builder.CreateInsertElement(Undef,
-        BTC, Builder.getInt32(0), "insert.btc");
-    Value *Zero = ConstantInt::get(Insert->getType(), 0);
-    SplatBTC = Builder.CreateShuffleVector (Insert, Undef, Zero, "splat.btc");
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  }
-
-  Builder.SetInsertPoint(ActiveLaneMask);
-  Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
-  LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
-  ActiveLaneMask->replaceAllUsesWith(ICmp);
-  ActiveLaneMask->eraseFromParent();
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -261,7 +179,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
   DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
-  ActiveLaneMask = nullptr;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
   // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -318,15 +235,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
              << *Decrement << "\n");
 
-  if (TryConvert(Setup->getArgOperand(0))) {
-    if (ClonedVCTPInExitBlock)
-      RematerializeIterCount();
-    return true;
-  } else
-    RevertActiveLaneMask();
+  if (!TryConvert(Setup->getArgOperand(0))) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
+    return false;
+  }
 
-  LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
-  return false;
+  if (ClonedVCTPInExitBlock)
+    RematerializeIterCount();
+  return true;
 }
 
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
@@ -341,10 +257,27 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
   // Check that the loop contains at least one masked load/store intrinsic.
   // We only support 'normal' vector instructions - other than masked
   // load/stores.
+  bool ActiveLaneMask = false;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
+      auto *Int = dyn_cast<IntrinsicInst>(&I);
+      if (!Int)
+        continue;
+
+      switch (Int->getIntrinsicID()) {
+      case Intrinsic::get_active_lane_mask:
+        ActiveLaneMask = true;
+        LLVM_FALLTHROUGH;
+      case Intrinsic::fma:
+      case Intrinsic::sadd_sat:
+      case Intrinsic::uadd_sat:
+        continue;
+      default:
+        break;
+      }
+
       if (IsMasked(&I)) {
-        FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+        auto *VecTy = getVectorType(Int);
         unsigned Lanes = VecTy->getNumElements();
         unsigned ElementWidth = VecTy->getScalarSizeInBits();
         // MVE vectors are 128-bit, but don't support 128 x i1.
@@ -353,23 +286,20 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
           return false;
         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
-      } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
-        switch (Int->getIntrinsicID()) {
-          case Intrinsic::fma:
-          case Intrinsic::sadd_sat:
-          case Intrinsic::uadd_sat:
-            continue;
-          default:
-            break;
-        }
-        for (auto &U : Int->args()) {
-          if (isa<VectorType>(U->getType()))
-            return false;
-        }
+        continue;
+      }
+
+      for (const Use &U : Int->args()) {
+        if (isa<VectorType>(U->getType()))
+          return false;
       }
     }
   }
 
+  if (!ActiveLaneMask) {
+    LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+    return false;
+  }
   return !MaskedInsts.empty();
 }
 
@@ -451,14 +381,15 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
 //        (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
 // 3) The IV must be an induction phi with an increment equal to the
 //    vector width.
-bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
-    FixedVectorType *VecTy) {
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+    Value *TripCount, FixedVectorType *VecTy) {
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
+
   int VectorWidth = VecTy->getNumElements();
   auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
   auto *BTC = SE->getSCEV(BackedgeTakenCount);
@@ -570,8 +501,8 @@ bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
   if (VectorWidth == StepValue)
     return true;
 
-  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
-             "vector width : " << VectorWidth << "\n");
+  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+             "vector width " << VectorWidth << "\n");
 
   return false;
 }
@@ -614,6 +545,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
+  unsigned VectorWidth = VecTy->getNumElements();
 
   // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
   // is one less than the trip count. So we need to find or create
@@ -631,10 +563,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
   // represent the effect of tail predication.
   Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-    ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+    ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
 
   Intrinsic::ID VCTPID;
-  switch (VecTy->getNumElements()) {
+  switch (VectorWidth) {
   default:
     llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -680,7 +612,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
     if (!Predicate || Predicates.count(Predicate))
       continue;
 
-    ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+    auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
     if (!ActiveLaneMask ||
         ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
       continue;
@@ -689,8 +621,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
     LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
                       << *ActiveLaneMask << "\n");
 
-    VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(TripCount, VecTy)) {
+    auto *VecTy = getVectorType(I);
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
@@ -52,13 +52,16 @@ def FeatureXSAVE   : SubtargetFeature<"xsave", "HasXSAVE", "true",
                                        "Support xsave instructions">;
 
 def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
-                                       "Support xsaveopt instructions">;
+                                       "Support xsaveopt instructions",
+                                       [FeatureXSAVE]>;
 
 def FeatureXSAVEC  : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
-                                       "Support xsavec instructions">;
+                                       "Support xsavec instructions",
+                                       [FeatureXSAVE]>;
 
 def FeatureXSAVES  : SubtargetFeature<"xsaves", "HasXSAVES", "true",
-                                       "Support xsaves instructions">;
+                                       "Support xsaves instructions",
+                                       [FeatureXSAVE]>;
 
 def FeatureSSE1    : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
                                       "Enable SSE instructions">;