Reintroduce "[AMDGPU][CostModel] Refine cost model for half- and quarter-rate instructions."

This patch seems to be fine now that the underlying issue with discards has been fixed.

This reverts commit 1a8972a.

Change-Id: I5e8f647473d1e3e0f0f6c63c36355d707612e616
perlfu authored and piotrAMD committed Nov 10, 2020
1 parent af97e00 commit 242c915
Showing 10 changed files with 417 additions and 270 deletions.
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp: 119 changes (103 additions, 16 deletions)
@@ -472,9 +472,50 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: We're having to query the throughput cost so that the basic
// implementation tries to generate legalize and scalarization costs. Maybe
// we could hoist the scalarization code here?
return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
Opd1Info, Opd2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
if (CostKind != TTI::TCK_CodeSize)
return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
Opd1Info, Opd2Info, Opd1PropInfo,
Opd2PropInfo, Args, CxtI);
// Scalarization

// Check if any of the operands are vector operands.
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");

std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

bool IsFloat = Ty->isFPOrFPVectorTy();
// Assume that floating point arithmetic operations cost twice as much as
// integer operations.
unsigned OpCost = (IsFloat ? 2 : 1);

if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
// The operation is legal. Assume it costs 1.
// TODO: Once we have extract/insert subvector cost we need to use them.
return LT.first * OpCost;
}

if (!TLI->isOperationExpand(ISD, LT.second)) {
// If the operation is custom lowered, then assume that the code is twice
// as expensive.
return LT.first * 2 * OpCost;
}

// Else, assume that we need to scalarize this op.
// TODO: If one of the types get legalized by splitting, handle this
// similarly to what getCastInstrCost() does.
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
unsigned Cost = getArithmeticInstrCost(
Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
return getScalarizationOverhead(VTy, Args) + Num * Cost;
}

// We don't know anything about this scalar instruction.
return OpCost;
}

// Legalize the type.
@@ -493,7 +534,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::SRL:
case ISD::SRA:
if (SLT == MVT::i64)
return get64BitInstrCost() * LT.first * NElts;
return get64BitInstrCost(CostKind) * LT.first * NElts;

if (ST->has16BitInsts() && SLT == MVT::i16)
NElts = (NElts + 1) / 2;
@@ -515,7 +556,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,

return LT.first * NElts * getFullRateInstrCost();
case ISD::MUL: {
const int QuarterRateCost = getQuarterRateInstrCost();
const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
if (SLT == MVT::i64) {
const int FullRateCost = getFullRateInstrCost();
return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
@@ -552,7 +593,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
case ISD::FADD:
case ISD::FSUB:
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost();
return LT.first * NElts * get64BitInstrCost(CostKind);

if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
@@ -565,7 +606,9 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// FIXME: frem should be handled separately. The fdiv in it is most of it,
// but the current lowering is also not entirely correct.
if (SLT == MVT::f64) {
int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
int Cost = 7 * get64BitInstrCost(CostKind) +
getQuarterRateInstrCost(CostKind) +
3 * getHalfRateInstrCost(CostKind);
// Add cost of workaround.
if (!ST->hasUsableDivScaleConditionOutput())
Cost += 3 * getFullRateInstrCost();
@@ -577,7 +620,7 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// TODO: This is more complicated, unsafe flags etc.
if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
return LT.first * getQuarterRateInstrCost() * NElts;
return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
}
}

@@ -587,12 +630,15 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// f32 fmul
// v_cvt_f16_f32
// f16 div_fixup
int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
int Cost =
4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
return LT.first * Cost * NElts;
}

if (SLT == MVT::f32 || SLT == MVT::f16) {
int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
// 4 more v_cvt_* insts without f16 insts support
int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
1 * getQuarterRateInstrCost(CostKind);

if (!HasFP32Denormals) {
// FP mode switches.
@@ -642,7 +688,48 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
if (CostKind != TTI::TCK_CodeSize)
return BaseT::getIntrinsicInstrCost(ICA, CostKind);

// TODO: Combine these two logic paths.
if (ICA.isTypeBasedOnly())
return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

Type *RetTy = ICA.getReturnType();
unsigned VF = ICA.getVectorFactor();
unsigned RetVF =
(RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
: 1);
assert((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
const IntrinsicInst *I = ICA.getInst();
const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
FastMathFlags FMF = ICA.getFlags();
// Assume that we need to scalarize this intrinsic.
SmallVector<Type *, 4> Types;
for (const Value *Op : Args) {
Type *OpTy = Op->getType();
assert(VF == 1 || !OpTy->isVectorTy());
Types.push_back(VF == 1 ? OpTy : FixedVectorType::get(OpTy, VF));
}

if (VF > 1 && !RetTy->isVoidTy())
RetTy = FixedVectorType::get(RetTy, VF);

// Compute the scalarization overhead based on Args for a vector
// intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
// CostModel will pass a vector RetTy and VF is 1.
unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
if (RetVF > 1 || VF > 1) {
ScalarizationCost = 0;
if (!RetTy->isVoidTy())
ScalarizationCost +=
getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
}

IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, Types, FMF,
ScalarizationCost, I);
return getIntrinsicInstrCost(Attrs, CostKind);
}

// Legalize the type.
@@ -654,16 +741,16 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost();
return LT.first * NElts * get64BitInstrCost(CostKind);

if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;

// TODO: Get more refined intrinsic costs?
unsigned InstRate = getQuarterRateInstrCost();
unsigned InstRate = getQuarterRateInstrCost(CostKind);
if (ICA.getID() == Intrinsic::fma) {
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
: getQuarterRateInstrCost();
InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
}

return LT.first * NElts * InstRate;
@@ -714,7 +801,7 @@ int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
CostKind);

std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost();
return LT.first * getHalfRateInstrCost(CostKind);
}

int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h: 23 changes (14 additions, 9 deletions)
@@ -115,21 +115,26 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
return TargetTransformInfo::TCC_Basic;
}

static inline int getHalfRateInstrCost() {
return 2 * TargetTransformInfo::TCC_Basic;
static inline int getHalfRateInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
return CostKind == TTI::TCK_CodeSize ? 2
: 2 * TargetTransformInfo::TCC_Basic;
}

// TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
// should be 2 or 4.
static inline int getQuarterRateInstrCost() {
return 3 * TargetTransformInfo::TCC_Basic;
static inline int getQuarterRateInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) {
return CostKind == TTI::TCK_CodeSize ? 2
: 4 * TargetTransformInfo::TCC_Basic;
}

// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
inline int get64BitInstrCost() const {
return ST->hasHalfRate64Ops() ?
getHalfRateInstrCost() : getQuarterRateInstrCost();
// On some parts, normal fp64 operations are half rate, and others
// quarter. This also applies to some integer operations.
inline int get64BitInstrCost(
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const {
return ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
: getQuarterRateInstrCost(CostKind);
}

public:
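For readers skimming the hunk above, the essential change is that the rate helpers now take a TTI::TargetCostKind and return a size-oriented value for TCK_CodeSize queries instead of always answering in reciprocal-throughput units. Below is a minimal standalone sketch of that pattern; the names mirror the GCNTTIImpl helpers, but this is only an illustration, not the real class (TCC_Basic actually lives in TargetTransformInfo, and the real get64BitInstrCost reads the subtarget rather than taking a flag).

// Standalone sketch of the cost-kind-aware rate helpers (illustrative only).
#include <iostream>

enum class CostKind { RecipThroughput, CodeSize };
constexpr int TCC_Basic = 1; // stand-in for TargetTransformInfo::TCC_Basic

// Full-rate instructions: one basic unit for both throughput and size.
int getFullRateInstrCost() { return TCC_Basic; }

// Half-rate instructions: 2 units of throughput, 2 units of size.
int getHalfRateInstrCost(CostKind Kind = CostKind::RecipThroughput) {
  return Kind == CostKind::CodeSize ? 2 : 2 * TCC_Basic;
}

// Quarter-rate instructions: 4 units of throughput, but only 2 of size
// (per the TODO above: usually an 8-byte encoding even though it takes 4x the cycles).
int getQuarterRateInstrCost(CostKind Kind = CostKind::RecipThroughput) {
  return Kind == CostKind::CodeSize ? 2 : 4 * TCC_Basic;
}

// 64-bit ops are half rate on some subtargets and quarter rate on others.
int get64BitInstrCost(bool HasHalfRate64Ops,
                      CostKind Kind = CostKind::RecipThroughput) {
  return HasHalfRate64Ops ? getHalfRateInstrCost(Kind)
                          : getQuarterRateInstrCost(Kind);
}

int main() {
  // Per-element cost of an f64 fadd, as exercised by fadd.ll below.
  std::cout << get64BitInstrCost(true) << '\n';                      // 2 (FASTF64)
  std::cout << get64BitInstrCost(false) << '\n';                     // 4 (SLOWF64)
  std::cout << get64BitInstrCost(false, CostKind::CodeSize) << '\n'; // 2 (SIZEALL)
}

These three values are exactly the per-element f64 fadd costs that the updated check lines in the test below expect.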
llvm/test/Analysis/CostModel/AMDGPU/fadd.ll: 43 changes (21 additions, 22 deletions)
@@ -1,9 +1,9 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF64,FASTF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s

; ALL: 'fadd_f32'
; ALL-LABEL: 'fadd_f32'
; ALL: estimated cost of 1 for {{.*}} fadd float
define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
@@ -12,7 +12,7 @@ define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)
ret void
}

; ALL: 'fadd_v2f32'
; ALL-LABEL: 'fadd_v2f32'
; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
@@ -21,59 +21,58 @@ define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float
ret void
}

; ALL: 'fadd_v3f32'
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
; ALL: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
; ALL-LABEL: 'fadd_v3f32'
; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fadd <3 x float> %vec, %b
store <3 x float> %add, <3 x float> addrspace(1)* %out
ret void
}

; ALL: 'fadd_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
; ALL: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
; ALL-LABEL: 'fadd_v5f32'
; ALL: estimated cost of 5 for {{.*}} fadd <5 x float>
define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fadd <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
ret void
}

; ALL: 'fadd_f64'
; ALL-LABEL: 'fadd_f64'
; FASTF64: estimated cost of 2 for {{.*}} fadd double
; SLOWF64: estimated cost of 3 for {{.*}} fadd double
; SLOWF64: estimated cost of 4 for {{.*}} fadd double
; SIZEALL: estimated cost of 2 for {{.*}} fadd double
define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fadd double %vec, %b
store double %add, double addrspace(1)* %out
ret void
}

; ALL: 'fadd_v2f64'
; ALL-LABEL: 'fadd_v2f64'
; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fadd <2 x double> %vec, %b
store <2 x double> %add, <2 x double> addrspace(1)* %out
ret void
}

; ALL: 'fadd_v3f64'
; ALL-LABEL: 'fadd_v3f64'
; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>
define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fadd <3 x double> %vec, %b
store <3 x double> %add, <3 x double> addrspace(1)* %out
ret void
}

; ALL: 'fadd_f16'
; ALL-LABEL: 'fadd_f16'
; ALL: estimated cost of 1 for {{.*}} fadd half
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
@@ -82,7 +81,7 @@ define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)*
ret void
}

; ALL: 'fadd_v2f16'
; ALL-LABEL: 'fadd_v2f16'
; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
@@ -92,7 +91,7 @@ define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
ret void
}

; ALL: 'fadd_v3f16'
; ALL-LABEL: 'fadd_v3f16'
; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
@@ -102,7 +101,7 @@ define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half>
ret void
}

; ALL: 'fadd_v4f16'
; ALL-LABEL: 'fadd_v4f16'
; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
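As a quick cross-check of the numbers in this file against the helpers shown earlier (taking TCC_Basic = 1, and remembering that getArithmeticInstrCost halves NElts for f16 vectors on targets with 16-bit instructions), the fadd check lines work out as follows:

  f32 fadd : full rate, 1 per element            -> 1, 2, 3, 5 for f32 / v2f32 / v3f32 / v5f32
  f16 fadd : full rate; elements paired on +16-bit-insts targets
                                                 -> FASTF16: 1, 1, 2, 2   SLOWF16: 1, 2, 4, 4
  f64 fadd : get64BitInstrCost(CostKind) per element
             throughput, +half-rate-64-ops = 2   -> FASTF64: 2, 4, 6
             throughput, -half-rate-64-ops = 4   -> SLOWF64: 4, 8, 12
             code-size (either subtarget)  = 2   -> SIZEALL: 2, 4, 6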
(The remaining 7 changed files are not shown.)
