Skip to content

Commit 9867d11

Browse files
Nikola Peric
authored and committed
NanoMips: div-rem optimization
Disable expansion of mod to mul-and-sub when Os or Oz. Generate a __udivmoddi4 libcall when div-rem pairs of type uint64 are present.
1 parent cf10d7e commit 9867d11

File tree

5 files changed

+289
-3
lines changed

5 files changed

+289
-3
lines changed

llvm/lib/Target/Mips/MipsSEISelLowering.cpp

Lines changed: 131 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,8 +208,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
208208
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
209209
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
210210

211-
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
212-
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
211+
if (!Subtarget.hasNanoMips()) {
212+
setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
213+
setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
214+
}
215+
213216
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
214217

215218
if (Subtarget.hasNanoMips()) {
@@ -324,6 +327,11 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM,
324327
setOperationAction(ISD::SREM, MVT::i32, Legal);
325328
setOperationAction(ISD::UDIV, MVT::i32, Legal);
326329
setOperationAction(ISD::UREM, MVT::i32, Legal);
330+
331+
setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
332+
setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
333+
setOperationAction(ISD::UDIV, MVT::i64, Custom);
334+
setOperationAction(ISD::UREM, MVT::i64, Custom);
327335
}
328336

329337
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -504,6 +512,9 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op,
504512
case ISD::SDIVREM: return lowerMulDiv(Op, MipsISD::DivRem, true, true, DAG);
505513
case ISD::UDIVREM: return lowerMulDiv(Op, MipsISD::DivRemU, true, true,
506514
DAG);
515+
case ISD::UDIV:
516+
case ISD::UREM:
517+
return lowerRemOrDiv(Op, DAG);
507518
case ISD::INTRINSIC_WO_CHAIN: return lowerINTRINSIC_WO_CHAIN(Op, DAG);
508519
case ISD::INTRINSIC_W_CHAIN: return lowerINTRINSIC_W_CHAIN(Op, DAG);
509520
case ISD::INTRINSIC_VOID: return lowerINTRINSIC_VOID(Op, DAG);
@@ -1315,6 +1326,59 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
13151326
// MIPS32r6/MIPS64r6 removed accumulator based multiplies.
13161327
assert(!Subtarget.hasMips32r6());
13171328

1329+
unsigned Opcode = Op.getOpcode();
1330+
MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
1331+
if (Subtarget.hasNanoMips() && Opcode == ISD::UDIVREM &&
1332+
SimpleVT == MVT::i64) {
1333+
bool isSigned = false;
1334+
RTLIB::Libcall LC = RTLIB::UDIVREM_I64;
1335+
1336+
SDValue InChain = DAG.getEntryNode();
1337+
1338+
EVT RetVT = Op.getValueType();
1339+
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
1340+
1341+
TargetLowering::ArgListTy Args;
1342+
TargetLowering::ArgListEntry Entry;
1343+
for (const SDValue &Operand : Op.getNode()->op_values()) {
1344+
EVT ArgVT = Operand.getValueType();
1345+
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
1346+
Entry.Node = Operand;
1347+
Entry.Ty = ArgTy;
1348+
Entry.IsSExt = isSigned;
1349+
Entry.IsZExt = !isSigned;
1350+
Args.push_back(Entry);
1351+
}
1352+
1353+
// Pass the return address of the remainder
1354+
SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
1355+
Entry.Node = FIPtr;
1356+
Entry.Ty = RetTy->getPointerTo();
1357+
Entry.IsSExt = isSigned;
1358+
Entry.IsZExt = !isSigned;
1359+
Args.push_back(Entry);
1360+
1361+
SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
1362+
getPointerTy(DAG.getDataLayout()));
1363+
1364+
SDLoc dl(Op);
1365+
TargetLowering::CallLoweringInfo CLI(DAG);
1366+
CLI.setDebugLoc(dl)
1367+
.setChain(InChain)
1368+
.setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
1369+
.setSExtResult(isSigned)
1370+
.setZExtResult(!isSigned);
1371+
1372+
std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
1373+
1374+
// Remainder is loaded back from the stack frame
1375+
SDValue Rem =
1376+
DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
1377+
1378+
SDValue Vals[] = {CallInfo.first, Rem};
1379+
return DAG.getMergeValues(Vals, dl);
1380+
}
1381+
13181382
EVT Ty = Op.getOperand(0).getValueType();
13191383
SDLoc DL(Op);
13201384
SDValue Mult = DAG.getNode(NewOpc, DL, MVT::Untyped,
@@ -1333,6 +1397,71 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
13331397
return DAG.getMergeValues(Vals, DL);
13341398
}
13351399

1400+
// This custom lowering hook prevents expansion of DIV and REM nodes
1401+
// with i64 value types into DIVREM node for NanoMips target and lowers them
1402+
// into appropriate libcall instead.
1403+
// During type legalization DIV and REM nodes are expanded into DIVREM node
1404+
// because i64 is ilegal value type and the action for DIVREM node is set to be
1405+
// "Custom" for NanoMips target. We want to lower DIV and REM nodes into
1406+
// appropriate libcalls instead of expanding them to DIVREM. In order to
1407+
// accomplish this we set the actions for DIV and REM nodes for MVT::i64 to be
1408+
// "Custom" instead of "LibCall". This results in calling this hook before
1409+
// expansion happens, bypassing the expansion but still lowering DIV and REM
1410+
// into appropriate libcalls.
1411+
SDValue MipsSETargetLowering::lowerRemOrDiv(SDValue Op,
1412+
SelectionDAG &DAG) const {
1413+
1414+
unsigned Opcode = Op.getOpcode();
1415+
MVT SimpleVT = Op.getSimpleValueType().SimpleTy;
1416+
if (Subtarget.hasNanoMips() && (Opcode == ISD::UDIV || Opcode == ISD::UREM) &&
1417+
SimpleVT == MVT::i64) {
1418+
1419+
SDLoc dl(Op.getNode());
1420+
EVT VT = Op.getNode()->getValueType(0);
1421+
SDValue Ops[2] = {Op.getNode()->getOperand(0), Op.getNode()->getOperand(1)};
1422+
SDValue Lo, Hi;
1423+
Lo = Hi = SDValue();
1424+
1425+
RTLIB::Libcall LC = Opcode == ISD::UDIV ? RTLIB::UDIV_I64 : RTLIB::UREM_I64;
1426+
1427+
TargetLowering::MakeLibCallOptions CallOptions;
1428+
1429+
SDValue LibcallOp = makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first;
1430+
1431+
EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
1432+
LibcallOp.getValueSizeInBits() / 2);
1433+
1434+
EVT LoVT, HiVT;
1435+
LoVT = HalfVT;
1436+
HiVT = HalfVT;
1437+
1438+
SDLoc DL(LibcallOp);
1439+
1440+
assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
1441+
LibcallOp.getValueSizeInBits() &&
1442+
"Invalid integer splitting!");
1443+
1444+
Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, LibcallOp);
1445+
1446+
unsigned ReqShiftAmountInBits =
1447+
Log2_32_Ceil(LibcallOp.getValueType().getSizeInBits());
1448+
1449+
MVT ShiftAmountTy =
1450+
getScalarShiftAmountTy(DAG.getDataLayout(), LibcallOp.getValueType());
1451+
1452+
assert(ReqShiftAmountInBits <= ShiftAmountTy.getSizeInBits());
1453+
1454+
Hi = DAG.getNode(ISD::SRL, DL, LibcallOp.getValueType(), LibcallOp,
1455+
DAG.getConstant(LoVT.getSizeInBits(), DL, ShiftAmountTy));
1456+
1457+
Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi);
1458+
1459+
SDValue Vals[] = {LibcallOp, Lo, Hi};
1460+
return DAG.getMergeValues(Vals, dl);
1461+
}
1462+
return SDValue();
1463+
}
1464+
13361465
static SDValue initAccumulator(SDValue In, const SDLoc &DL, SelectionDAG &DAG) {
13371466
SDValue InLo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, In,
13381467
DAG.getConstant(0, DL, MVT::i32));

llvm/lib/Target/Mips/MipsSEISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ class TargetRegisterClass;
7575
SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
7676
SDValue lowerBITCAST(SDValue Op, SelectionDAG &DAG) const;
7777

78+
SDValue lowerRemOrDiv(SDValue Op, SelectionDAG &DAG) const;
7879
SDValue lowerMulDiv(SDValue Op, unsigned NewOpc, bool HasLo, bool HasHi,
7980
SelectionDAG &DAG) const;
8081

llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,3 +97,7 @@ void NanoMipsTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
9797
UP.Threshold = 60;
9898
UP.OptSizeThreshold = 0;
9999
}
100+
101+
// Report a combined div/rem operation to the DivRemPairs pass: claim one
// whenever the function is optimized for size (prevents decomposing rem
// into mul-and-sub), or for unsigned 64-bit integers (lowered to a single
// __udivmoddi4 libcall).
bool NanoMipsTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  if (F->hasOptSize())
    return true;
  return !IsSigned && DataType->isIntegerTy(64);
}

llvm/lib/Target/Mips/NanoMipsTargetTransformInfo.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,10 +36,12 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
3636
const MipsSubtarget *getST() const { return ST; }
3737
const MipsTargetLowering *getTLI() const { return TLI; }
3838

39+
const Function *F;
40+
3941
public:
4042
explicit NanoMipsTTIImpl(const MipsTargetMachine *TM, const Function &F)
4143
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
42-
TLI(ST->getTargetLowering()) {}
44+
TLI(ST->getTargetLowering()), F(&F) {}
4345

4446
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty,
4547
TTI::TargetCostKind CostKind);
@@ -49,6 +51,7 @@ class NanoMipsTTIImpl : public BasicTTIImplBase<NanoMipsTTIImpl> {
4951
Instruction *Inst = nullptr);
5052
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
5153
TTI::UnrollingPreferences &UP);
54+
bool hasDivRemOp(Type *DataType, bool IsSigned);
5255
};
5356

5457
} // end namespace llvm
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
; RUN: llc -mtriple=nanomips -asm-show-inst -verify-machineinstrs < %s | FileCheck %s
2+
3+
; Make sure to generate __udivmoddi4 libcall when udiv and urem
4+
; instructions with the same operands are present
5+
; and the operands are of type int64
6+
define void @test1(i64 %a, i64 %b, i64* %divmod) {
7+
; CHECK: save 16, $ra, $s0
8+
; CHECK: move $s0, $a4
9+
; CHECK: move $a4, $sp
10+
; CHECK: balc __udivmoddi4
11+
; CHECK: swm $a0, 0($s0), 2
12+
; CHECK: lw $a0, 4($sp)
13+
; CHECK: sw $a0, 12($s0)
14+
; CHECK: lw $a0, 0($sp)
15+
; CHECK: sw $a0, 8($s0)
16+
; CHECK: restore.jrc 16, $ra, $s0
17+
%div = udiv i64 %a, %b
18+
store i64 %div, i64* %divmod, align 8
19+
%rem = urem i64 %a, %b
20+
%arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
21+
store i64 %rem, i64* %arrayidx1, align 8
22+
ret void
23+
}
24+
25+
; Make sure to generate __umoddi3 libcall when only urem is present
26+
; and the operands are of type int64
27+
define void @test2(i64 %a, i64 %b, i64* %divmod) {
28+
; CHECK: save 16, $ra, $s0
29+
; CHECK: move $s0, $a4
30+
; CHECK: balc __umoddi3
31+
; CHECK: swm $a0, 8($s0), 2
32+
; CHECK: restore.jrc 16, $ra, $s0
33+
%rem = urem i64 %a, %b
34+
%arrayidx = getelementptr inbounds i64, i64* %divmod, i32 1
35+
store i64 %rem, i64* %arrayidx, align 8
36+
ret void
37+
}
38+
39+
; Make sure to generate __udivdi3 libcall when only udiv is present
40+
; and the operands are of type int64
41+
define void @test3(i64 %a, i64 %b, i64* %divmod) {
42+
; CHECK: save 16, $ra, $s0
43+
; CHECK: move $s0, $a4
44+
; CHECK: balc __udivdi3
45+
; CHECK: swm $a0, 0($s0), 2
46+
; CHECK: restore.jrc 16, $ra, $s0
47+
%div = udiv i64 %a, %b
48+
store i64 %div, i64* %divmod, align 8
49+
ret void
50+
}
51+
52+
; If urem is expanded into mul+sub and the operands
53+
; are of type int64, make sure to stay that way
54+
define void @test4(i64 %a, i64 %b, i64* %divmod) {
55+
; CHECK: save 32, $ra, $s0, $s1, $s2, $s3, $s4
56+
; CHECK: movep $s1, $s0, $a3, $a4
57+
; CHECK: movep $s4, $s2, $a1, $a2
58+
; CHECK: move $s3, $a0
59+
; CHECK: balc __udivdi3
60+
; CHECK: mul $a2, $a0, $s2
61+
; CHECK: subu $a3, $s3, $a2
62+
; CHECK: sw $a3, 8($s0)
63+
; CHECK: mul $a3, $a0, $s1
64+
; CHECK: muhu $s1, $a0, $s2
65+
; CHECK: addu $a3, $s1, $a3
66+
; CHECK: swm $a0, 0($s0), 2
67+
; CHECK: mul $a0, $a1, $s2
68+
; CHECK: addu $a0, $a3, $a0
69+
; CHECK: subu $a0, $s4, $a0
70+
; CHECK: sltu $a1, $s3, $a2
71+
; CHECK: subu $a0, $a0, $a1
72+
; CHECK: sw $a0, 12($s0)
73+
; CHECK: restore.jrc 32, $ra, $s0, $s1, $s2, $s3, $s4
74+
%a.frozen = freeze i64 %a
75+
%b.frozen = freeze i64 %b
76+
%div = udiv i64 %a.frozen, %b.frozen
77+
store i64 %div, i64* %divmod, align 8
78+
%1 = mul i64 %div, %b.frozen
79+
%rem.decomposed = sub i64 %a.frozen, %1
80+
%arrayidx1 = getelementptr inbounds i64, i64* %divmod, i32 1
81+
store i64 %rem.decomposed, i64* %arrayidx1, align 8
82+
ret void
83+
}
84+
85+
; Make sure to generate divu and modu when udiv and urem
86+
; instructions with the same operands are present
87+
; and the operands are of type int32
88+
define void @test5(i32 %a, i32 %b, i32* %divmod) {
89+
; CHECK: modu $a3, $a0, $a1
90+
; CHECK: teq $zero, $a1, 7
91+
; CHECK: sw $a3, 4($a2)
92+
; CHECK: divu $a0, $a0, $a1
93+
; CHECK: teq $zero, $a1, 7
94+
; CHECK: sw $a0, 0($a2)
95+
; CHECK: jrc $ra
96+
%div = udiv i32 %a, %b
97+
store i32 %div, i32* %divmod, align 4
98+
%rem = urem i32 %a, %b
99+
%arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
100+
store i32 %rem, i32* %arrayidx1, align 4
101+
ret void
102+
}
103+
104+
; Make sure to generate modu when only urem is present
105+
; and the operands are of type int32
106+
define void @test6(i32 %a, i32 %b, i32* %divmod) {
107+
; CHECK: modu $a0, $a0, $a1
108+
; CHECK: teq $zero, $a1, 7
109+
; CHECK: sw $a0, 4($a2)
110+
; CHECK: jrc $ra
111+
%rem = urem i32 %a, %b
112+
%arrayidx = getelementptr inbounds i32, i32* %divmod, i32 1
113+
store i32 %rem, i32* %arrayidx, align 4
114+
ret void
115+
}
116+
117+
; Make sure to generate divu when only udiv is present
118+
; and the operands are of type int32
119+
define void @test7(i32 %a, i32 %b, i32* %divmod) {
120+
; CHECK: divu $a0, $a0, $a1
121+
; CHECK: teq $zero, $a1, 7
122+
; CHECK: sw $a0, 0($a2)
123+
; CHECK: jrc $ra
124+
%div = udiv i32 %a, %b
125+
store i32 %div, i32* %divmod, align 4
126+
ret void
127+
}
128+
129+
; If urem is expanded into mul+sub and the operands
130+
; are of type int32, make sure to stay that way.
131+
define void @test8(i32 %a, i32 %b, i32* %divmod) {
132+
; CHECK: divu $a3, $a0, $a1
133+
; CHECK: teq $zero, $a1, 7
134+
; CHECK: sw $a3, 0($a2)
135+
; CHECK: mul $a1, $a3, $a1
136+
; CHECK: subu $a0, $a0, $a1
137+
; CHECK: sw $a0, 4($a2)
138+
; CHECK: jrc $ra
139+
%a.frozen = freeze i32 %a
140+
%b.frozen = freeze i32 %b
141+
%div = udiv i32 %a.frozen, %b.frozen
142+
store i32 %div, i32* %divmod, align 4
143+
%1 = mul i32 %div, %b.frozen
144+
%rem.decomposed = sub i32 %a.frozen, %1
145+
%arrayidx1 = getelementptr inbounds i32, i32* %divmod, i32 1
146+
store i32 %rem.decomposed, i32* %arrayidx1, align 4
147+
ret void
148+
}
149+

0 commit comments

Comments
 (0)