Skip to content

Commit

Permalink
Merged master:ae47d158a096 into amd-gfx:7b117dcd2448
Browse files Browse the repository at this point in the history
Local branch amd-gfx 7b117dc Merged master:d4adac48320e into amd-gfx:a7ec3c4bc816
Remote branch master ae47d15 Remove "rm -f" workaround in acle_sve_adda.c
  • Loading branch information
Sw authored and Sw committed Jun 26, 2020
2 parents 7b117dc + ae47d15 commit 8504aad
Show file tree
Hide file tree
Showing 13 changed files with 554 additions and 200 deletions.
3 changes: 2 additions & 1 deletion clang/lib/Basic/Targets/X86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,7 @@ bool X86TargetInfo::initFeatureMap(
setFeatureEnabledImpl(Features, "popcnt", true);
setFeatureEnabledImpl(Features, "sahf", true);
setFeatureEnabledImpl(Features, "prfchw", true);
setFeatureEnabledImpl(Features, "cx16", true);
LLVM_FALLTHROUGH;
case CK_K8SSE3:
setFeatureEnabledImpl(Features, "sse3", true);
Expand Down Expand Up @@ -711,7 +712,7 @@ void X86TargetInfo::setFeatureEnabledImpl(llvm::StringMap<bool> &Features,
setSSELevel(Features, SSE41, Enabled);
} else if (Name == "xsave") {
if (!Enabled)
Features["xsaveopt"] = false;
Features["xsaveopt"] = Features["xsavec"] = Features["xsaves"] = false;
} else if (Name == "xsaveopt" || Name == "xsavec" || Name == "xsaves") {
if (Enabled)
Features["xsave"] = true;
Expand Down
1 change: 0 additions & 1 deletion clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_adda.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
// REQUIRES: aarch64-registered-target
// RUN: rm -f -- %S/acle_sve_adda.s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s >/dev/null 2>%t
Expand Down
1 change: 1 addition & 0 deletions clang/test/Preprocessor/predefined-arch-macros.c
Original file line number Diff line number Diff line change
Expand Up @@ -2397,6 +2397,7 @@
// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_AMDFAM10_M64
// CHECK_AMDFAM10_M64: #define __3dNOW_A__ 1
// CHECK_AMDFAM10_M64: #define __3dNOW__ 1
// CHECK_AMDFAM10_M64: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1
// CHECK_AMDFAM10_M64: #define __LZCNT__ 1
// CHECK_AMDFAM10_M64: #define __MMX__ 1
// CHECK_AMDFAM10_M64: #define __POPCNT__ 1
Expand Down
4 changes: 3 additions & 1 deletion clang/test/Preprocessor/x86_target_features.c
Original file line number Diff line number Diff line change
Expand Up @@ -417,9 +417,11 @@
// XSAVES: #define __XSAVES__ 1
// XSAVES: #define __XSAVE__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s
// RUN: %clang -target i386-unknown-unknown -march=atom -mxsaveopt -mxsavec -mxsaves -mno-xsave -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOXSAVE %s

// NOXSAVE-NOT: #define __XSAVEC__ 1
// NOXSAVE-NOT: #define __XSAVEOPT__ 1
// NOXSAVE-NOT: #define __XSAVES__ 1
// NOXSAVE-NOT: #define __XSAVE__ 1

// RUN: %clang -target i386-unknown-unknown -march=atom -mclflushopt -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=CLFLUSHOPT %s
Expand Down
33 changes: 33 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6914,6 +6914,39 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT)));
return;
}
case Intrinsic::get_active_lane_mask: {
auto DL = getCurSDLoc();
SDValue Index = getValue(I.getOperand(0));
SDValue BTC = getValue(I.getOperand(1));
Type *ElementTy = I.getOperand(0)->getType();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
unsigned VecWidth = VT.getVectorNumElements();

SmallVector<SDValue, 16> OpsBTC;
SmallVector<SDValue, 16> OpsIndex;
SmallVector<SDValue, 16> OpsStepConstants;
for (unsigned i = 0; i < VecWidth; i++) {
OpsBTC.push_back(BTC);
OpsIndex.push_back(Index);
OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy)));
}

EVT CCVT = MVT::i1;
CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth);

auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth));
SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex);
SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants);
SDValue VectorInduction = DAG.getNode(
ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep);
SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC);
SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0),
VectorBTC, ISD::CondCode::SETULE);
setValue(&I, DAG.getNode(ISD::AND, DL, CCVT,
DAG.getNOT(DL, VectorInduction.getValue(1), CCVT),
SetCC));
return;
}
}
}

Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/IR/DataLayout.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ Type *DataLayout::getIntPtrType(Type *Ty) const {
unsigned NumBits = getPointerTypeSizeInBits(Ty);
IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
return FixedVectorType::get(IntTy, VecTy->getNumElements());
return VectorType::get(IntTy, VecTy);
return IntTy;
}

Expand Down
164 changes: 48 additions & 116 deletions llvm/lib/Target/ARM/MVETailPredication.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,6 @@ class MVETailPredication : public LoopPass {
TargetTransformInfo *TTI = nullptr;
TargetLibraryInfo *TLI = nullptr;
bool ClonedVCTPInExitBlock = false;
IntrinsicInst *ActiveLaneMask = nullptr;
FixedVectorType *VecTy = nullptr;

public:
static char ID;
Expand Down Expand Up @@ -119,7 +117,8 @@ class MVETailPredication : public LoopPass {
/// intrinsic: check if the first is a loop induction variable, and for the
/// the second check that no overflow can occur in the expression that use
/// this backedge-taken count.
bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
FixedVectorType *VecTy);

/// Insert the intrinsic to represent the effect of tail predication.
void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
Expand All @@ -130,10 +129,6 @@ class MVETailPredication : public LoopPass {
/// ARMLowOverheadLoops to better optimise away loop update statements inside
/// hardware-loops.
void RematerializeIterCount();

/// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs
/// to be lowered to an icmp.
void RevertActiveLaneMask();
};

} // end namespace
Expand Down Expand Up @@ -167,83 +162,6 @@ void MVETailPredication::RematerializeIterCount() {
DeadInsts);
}

void MVETailPredication::RevertActiveLaneMask() {
if (!ActiveLaneMask)
return;

int VectorWidth = VecTy->getElementCount().Min;
IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());

// 1. Create the vector induction step. This %induction will be the LHS of
// the icmp:
//
// %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
// %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0
// %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
//
Value *Index = ActiveLaneMask->getOperand(0);
Value *SplatIndex =
Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");

SmallVector<Constant *, 8> Indices;
for (int i = 0; i < VectorWidth; ++i)
Indices.push_back(ConstantInt::get(Index->getType(), i));

Constant *CV = ConstantVector::get(Indices);
Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");

LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
dbgs() << "ARM TP: New Induction: " << *Induction << "\n");

// 2. In the Preheader, first look if the splat BTC already exists. Find this
// %splat, which will be the RHS of the icmp:
//
// %TC.minus.1 = add i32 %N, -1
// %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
// %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0
//
auto *Preheader = L->getLoopPreheader();
auto *BTC = ActiveLaneMask->getOperand(1);
Value *SplatBTC = nullptr;

if (auto *C = dyn_cast<ConstantInt>(BTC)) {
Builder.SetInsertPoint(Preheader->getTerminator());
SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
} else {
Instruction *InsertElem;
for (auto &V : *Preheader) {
InsertElem = dyn_cast<InsertElementInst>(&V);
if (!InsertElem)
continue;
ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
if (!CI)
continue;
if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
continue;
if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
break;
}
}
// Or create the splat BTC if it doesn't exist.
if (!SplatBTC) {
Builder.SetInsertPoint(Preheader->getTerminator());
Value *Undef =
UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
Value *Insert = Builder.CreateInsertElement(Undef,
BTC, Builder.getInt32(0), "insert.btc");
Value *Zero = ConstantInt::get(Insert->getType(), 0);
SplatBTC = Builder.CreateShuffleVector (Insert, Undef, Zero, "splat.btc");
LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
}

Builder.SetInsertPoint(ActiveLaneMask);
Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
ActiveLaneMask->replaceAllUsesWith(ICmp);
ActiveLaneMask->eraseFromParent();
}

bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (skipLoop(L) || DisableTailPredication)
return false;
Expand All @@ -261,7 +179,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
DL = &L->getHeader()->getModule()->getDataLayout();
this->L = L;
ActiveLaneMask = nullptr;

// The MVE and LOB extensions are combined to enable tail-predication, but
// there's nothing preventing us from generating VCTP instructions for v8.1m.
Expand Down Expand Up @@ -318,15 +235,14 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");

if (TryConvert(Setup->getArgOperand(0))) {
if (ClonedVCTPInExitBlock)
RematerializeIterCount();
return true;
} else
RevertActiveLaneMask();
if (!TryConvert(Setup->getArgOperand(0))) {
LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
return false;
}

LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
return false;
if (ClonedVCTPInExitBlock)
RematerializeIterCount();
return true;
}

static FixedVectorType *getVectorType(IntrinsicInst *I) {
Expand All @@ -341,10 +257,27 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
// Check that the loop contains at least one masked load/store intrinsic.
// We only support 'normal' vector instructions - other than masked
// load/stores.
bool ActiveLaneMask = false;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
auto *Int = dyn_cast<IntrinsicInst>(&I);
if (!Int)
continue;

switch (Int->getIntrinsicID()) {
case Intrinsic::get_active_lane_mask:
ActiveLaneMask = true;
LLVM_FALLTHROUGH;
case Intrinsic::fma:
case Intrinsic::sadd_sat:
case Intrinsic::uadd_sat:
continue;
default:
break;
}

if (IsMasked(&I)) {
FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
auto *VecTy = getVectorType(Int);
unsigned Lanes = VecTy->getNumElements();
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
Expand All @@ -353,23 +286,20 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
} else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
switch (Int->getIntrinsicID()) {
case Intrinsic::fma:
case Intrinsic::sadd_sat:
case Intrinsic::uadd_sat:
continue;
default:
break;
}
for (auto &U : Int->args()) {
if (isa<VectorType>(U->getType()))
return false;
}
continue;
}

for (const Use &U : Int->args()) {
if (isa<VectorType>(U->getType()))
return false;
}
}
}

if (!ActiveLaneMask) {
LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
return false;
}
return !MaskedInsts.empty();
}

Expand Down Expand Up @@ -451,14 +381,15 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
// (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
// 3) The IV must be an induction phi with an increment equal to the
// vector width.
bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
FixedVectorType *VecTy) {
bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
Value *TripCount, FixedVectorType *VecTy) {
// 1) Test whether entry to the loop is protected by a conditional
// BTC + 1 < 0. In other words, if the scalar trip count overflows,
// becomes negative, we shouldn't enter the loop and creating
// tripcount expression BTC + 1 is not safe. So, check that BTC
// isn't max. This is evaluated in unsigned, because the semantics
// of @get.active.lane.mask is a ULE comparison.

int VectorWidth = VecTy->getNumElements();
auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
auto *BTC = SE->getSCEV(BackedgeTakenCount);
Expand Down Expand Up @@ -570,8 +501,8 @@ bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
if (VectorWidth == StepValue)
return true;

LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
"vector width : " << VectorWidth << "\n");
LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
"vector width " << VectorWidth << "\n");

return false;
}
Expand Down Expand Up @@ -614,6 +545,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
unsigned VectorWidth = VecTy->getNumElements();

// The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
// is one less than the trip count. So we need to find or create
Expand All @@ -631,10 +563,10 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
// represent the effect of tail predication.
Builder.SetInsertPoint(ActiveLaneMask);
ConstantInt *Factor =
ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);

Intrinsic::ID VCTPID;
switch (VecTy->getNumElements()) {
switch (VectorWidth) {
default:
llvm_unreachable("unexpected number of lanes");
case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
Expand Down Expand Up @@ -680,7 +612,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
if (!Predicate || Predicates.count(Predicate))
continue;

ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
if (!ActiveLaneMask ||
ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
continue;
Expand All @@ -689,8 +621,8 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
<< *ActiveLaneMask << "\n");

VecTy = getVectorType(I);
if (!IsSafeActiveMask(TripCount, VecTy)) {
auto *VecTy = getVectorType(I);
if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
return false;
}
Expand Down
9 changes: 6 additions & 3 deletions llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,16 @@ def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
"Support xsave instructions">;

def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
"Support xsaveopt instructions">;
"Support xsaveopt instructions",
[FeatureXSAVE]>;

def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
"Support xsavec instructions">;
"Support xsavec instructions",
[FeatureXSAVE]>;

def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
"Support xsaves instructions">;
"Support xsaves instructions",
[FeatureXSAVE]>;

def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions">;
Expand Down
Loading

0 comments on commit 8504aad

Please sign in to comment.