Skip to content

Commit fe93eb9

Browse files
committed
[AArch64] Improve urem by constant costs
A urem by a constant, much like a udiv by a constant, can be expanded into a series of mul/add/shift instructions. The exact sequence of instructions depends on the constants and the types. If the constant is a power of 2 then a shift (for udiv) or an and (for urem) will be used, so the cost will be 1. This canonicalization happens relatively early, so this likely has very little effect in practice (it does help the cost of funnel shifts). For a non-power-of-2 constant the code for div will expand to a series of UMULH + Add + Shift + Add, depending on the constant. urem is generally udiv + mul + sub, so it involves a few extra instructions. The UMULH is not always available: i32 will use umull+shift, and vector types will use umull+shift or umull+umull2+uzp depending on the vector size. v2i64 will be scalarized because there is no mul available. SVE does have a UMULH instruction. The end result is that the costs should be closer to reality, with scalable types a little lower in cost than the fixed-width versions. (In the future we might be able to use umulh for fixed-width when the SVE instruction is available, but for the moment this should favour scalable vectorization a little.) I've tried to make this patch only apply to constant UREM/UDIV instructions. SDIV and SREM are left until a later patch to prevent this becoming too complex. The funnel shift costs are changing because the cost model believes it will need a urem to clamp the shift amount, which should be a power-of-2 value for most common types.
1 parent f08824b commit fe93eb9

File tree

8 files changed

+521
-483
lines changed

8 files changed

+521
-483
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

+50-12
Original file line numberDiff line numberDiff line change
@@ -3545,20 +3545,58 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35453545
return Cost;
35463546
}
35473547
[[fallthrough]];
3548-
case ISD::UDIV: {
3548+
case ISD::UDIV:
3549+
case ISD::UREM: {
35493550
auto VT = TLI->getValueType(DL, Ty);
3550-
if (Op2Info.isConstant() && Op2Info.isUniform()) {
3551+
if (Op2Info.isConstant()) {
3552+
// If the operand is a power of 2 we can use the shift or and cost.
3553+
if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
3554+
return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
3555+
Op1Info.getNoProps(),
3556+
Op2Info.getNoProps());
3557+
if (ISD == ISD::UREM && Op2Info.isPowerOf2())
3558+
return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
3559+
Op1Info.getNoProps(),
3560+
Op2Info.getNoProps());
3561+
3562+
if (ISD == ISD::UDIV || ISD == ISD::UREM) {
3563+
// Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
3564+
// The MULHU will be expanded to UMULL for the types not listed below,
3565+
// and will become a pair of UMULL+UMULL2 for 128bit vectors.
3566+
bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
3567+
LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
3568+
LT.second == MVT::nxv16i8;
3569+
bool Is128bit = LT.second.is128BitVector();
3570+
3571+
InstructionCost MulCost =
3572+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3573+
Op1Info.getNoProps(), Op2Info.getNoProps());
3574+
InstructionCost AddCost =
3575+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3576+
Op1Info.getNoProps(), Op2Info.getNoProps());
3577+
InstructionCost ShrCost =
3578+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3579+
Op1Info.getNoProps(), Op2Info.getNoProps());
3580+
InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
3581+
(HasMULH ? 0 : ShrCost) + // UMULL shift
3582+
AddCost * 2 + ShrCost;
3583+
return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
3584+
}
3585+
3586+
// TODO: Fix SDIV and SREM costs, similar to the above.
35513587
if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT) &&
3552-
!VT.isScalableVector()) {
3588+
Op2Info.isUniform() && !VT.isScalableVector()) {
35533589
// Vector signed division by constant are expanded to the
3554-
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3555-
// to MULHS + SUB + SRL + ADD + SRL.
3556-
InstructionCost MulCost = getArithmeticInstrCost(
3557-
Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3558-
InstructionCost AddCost = getArithmeticInstrCost(
3559-
Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3560-
InstructionCost ShrCost = getArithmeticInstrCost(
3561-
Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3590+
// sequence MULHS + ADD/SUB + SRA + SRL + ADD.
3591+
InstructionCost MulCost =
3592+
getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
3593+
Op1Info.getNoProps(), Op2Info.getNoProps());
3594+
InstructionCost AddCost =
3595+
getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
3596+
Op1Info.getNoProps(), Op2Info.getNoProps());
3597+
InstructionCost ShrCost =
3598+
getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3599+
Op1Info.getNoProps(), Op2Info.getNoProps());
35623600
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
35633601
}
35643602
}
@@ -3571,7 +3609,7 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
35713609

35723610
InstructionCost Cost = BaseT::getArithmeticInstrCost(
35733611
Opcode, Ty, CostKind, Op1Info, Op2Info);
3574-
if (Ty->isVectorTy()) {
3612+
if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
35753613
if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
35763614
// SDIV/UDIV operations are lowered using SVE, then we can have less
35773615
// costs.

llvm/test/Analysis/CostModel/AArch64/div.ll

+128-128
Large diffs are not rendered by default.

llvm/test/Analysis/CostModel/AArch64/div_cte.ll

+3-3
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ define <4 x i32> @sdiv32xi4(<4 x i32> %x) {
3434

3535
define <16 x i8> @udiv8xi16(<16 x i8> %x) {
3636
; CHECK-LABEL: 'udiv8xi16'
37-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <16 x i8> %x, splat (i8 9)
37+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <16 x i8> %x, splat (i8 9)
3838
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %div
3939
;
4040
%div = udiv <16 x i8> %x, <i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9, i8 9>
@@ -43,7 +43,7 @@ define <16 x i8> @udiv8xi16(<16 x i8> %x) {
4343

4444
define <8 x i16> @udiv16xi8(<8 x i16> %x) {
4545
; CHECK-LABEL: 'udiv16xi8'
46-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <8 x i16> %x, splat (i16 9)
46+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <8 x i16> %x, splat (i16 9)
4747
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %div
4848
;
4949
%div = udiv <8 x i16> %x, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
@@ -52,7 +52,7 @@ define <8 x i16> @udiv16xi8(<8 x i16> %x) {
5252

5353
define <4 x i32> @udiv32xi4(<4 x i32> %x) {
5454
; CHECK-LABEL: 'udiv32xi4'
55-
; CHECK-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %div = udiv <4 x i32> %x, splat (i32 9)
55+
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %div = udiv <4 x i32> %x, splat (i32 9)
5656
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %div
5757
;
5858
%div = udiv <4 x i32> %x, <i32 9, i32 9, i32 9, i32 9>

llvm/test/Analysis/CostModel/AArch64/fshl.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshl.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
224224

225225
define <4 x i30> @fshl_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
226226
; CHECK-LABEL: 'fshl_v4i30_3rd_arg_var'
227-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
227+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fshl = tail call <4 x i30> @llvm.fshl.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
228228
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshl
229229
;
230230
entry:

llvm/test/Analysis/CostModel/AArch64/fshr.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ declare <2 x i64> @llvm.fshr.v4i64(<2 x i64>, <2 x i64>, <2 x i64>)
224224

225225
define <4 x i30> @fshr_v4i30_3rd_arg_var(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c) {
226226
; CHECK-LABEL: 'fshr_v4i30_3rd_arg_var'
227-
; CHECK-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
227+
; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %fshr = tail call <4 x i30> @llvm.fshr.v4i30(<4 x i30> %a, <4 x i30> %b, <4 x i30> %c)
228228
; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i30> %fshr
229229
;
230230
entry:

0 commit comments

Comments
 (0)