Skip to content

Commit 8af39e0

Browse files
committed
[AMDGPU] Src1 of VOP3 DPP instructions can be SGPR on supported subtargets
In order to avoid duplicating every DPP pseudo opcode that has src1, we allow an SGPR src1 on all such opcodes and add manual checks on subtargets that do not support it.
1 parent 989173c commit 8af39e0

File tree

12 files changed

+157
-15
lines changed

12 files changed

+157
-15
lines changed

llvm/lib/Target/AMDGPU/AMDGPU.td

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,12 @@ def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
472472
"Support DPP (Data Parallel Primitives) extension in DP ALU"
473473
>;
474474

475+
// Subtarget feature "dpp-src1-sgpr": sets HasDPPSrc1SGPR to "true" on
// subtargets that support an SGPR as src1 of DPP instructions.
def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
476+
"HasDPPSrc1SGPR",
477+
"true",
478+
"Support SGPR for Src1 of DPP instructions"
479+
>;
480+
475481
def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
476482
"HasPackedFP32Ops",
477483
"true",
@@ -1383,11 +1389,13 @@ def FeatureISAVersion11_0_3 : FeatureSet<
13831389

13841390
def FeatureISAVersion11_5_0 : FeatureSet<
13851391
!listconcat(FeatureISAVersion11_Common.Features,
1386-
[FeatureSALUFloatInsts])>;
1392+
[FeatureSALUFloatInsts,
1393+
FeatureDPPSrc1SGPR])>;
13871394

13881395
def FeatureISAVersion11_5_1 : FeatureSet<
13891396
!listconcat(FeatureISAVersion11_Common.Features,
13901397
[FeatureSALUFloatInsts,
1398+
FeatureDPPSrc1SGPR,
13911399
FeatureGFX11FullVGPRs])>;
13921400

13931401
//===----------------------------------------------------------------------===//

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4231,16 +4231,33 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
42314231
const OperandVector &Operands) {
42324232
const unsigned Opc = Inst.getOpcode();
42334233
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
4234-
if (DppCtrlIdx < 0)
4235-
return true;
4236-
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
4234+
if (DppCtrlIdx >= 0) {
4235+
unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
4236+
4237+
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
4238+
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
4239+
// DP ALU DPP is supported for row_newbcast only on GFX9*
4240+
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
4241+
Error(S, "DP ALU dpp only supports row_newbcast");
4242+
return false;
4243+
}
4244+
}
42374245

4238-
if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
4239-
AMDGPU::isDPALU_DPP(MII.get(Opc))) {
4240-
// DP ALU DPP is supported for row_newbcast only on GFX9*
4241-
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
4242-
Error(S, "DP ALU dpp only supports row_newbcast");
4243-
return false;
4246+
int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
4247+
bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
4248+
4249+
if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
4250+
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
4251+
if (Src1Idx >= 0) {
4252+
const MCOperand &Src1 = Inst.getOperand(Src1Idx);
4253+
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
4254+
if (Src1.isImm() ||
4255+
(Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
4256+
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
4257+
Error(Op.getStartLoc(), "invalid operand for instruction");
4258+
return false;
4259+
}
4260+
}
42444261
}
42454262

42464263
return true;

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
191191
return &OldOpnd;
192192
}
193193

194+
/// Returns the size in bits of the register class that the instruction
/// descriptor of \p MI declares for operand \p Idx, or 0 when the descriptor
/// records no register class (RegClass == -1) for that operand.
///
/// \p MRI is used only to obtain the TargetRegisterInfo.
static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
195+
MachineRegisterInfo &MRI) {
196+
int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
197+
// No register class is recorded for this operand: report a size of 0.
if (RegClass == -1)
198+
return 0;
199+
200+
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
201+
return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
202+
}
203+
194204
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
195205
MachineInstr &MovMI,
196206
RegSubRegPair CombOldVGPR,
@@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
278288
}
279289
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
280290
assert(Src0);
291+
int Src0Idx = NumOperands;
281292
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
282293
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
283294
Fail = true;
@@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
301312
}
302313
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
303314
if (Src1) {
304-
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
315+
int OpNum = NumOperands;
316+
// If subtarget does not support SGPRs for src1 operand then the
317+
// requirements are the same as for src0. We check src0 instead because
318+
// pseudos are shared between subtargets and allow SGPR for src1 on all.
319+
if (!ST->hasDPPSrc1SGPR()) {
320+
assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
321+
getOperandSize(*DPPInst, NumOperands, *MRI) &&
322+
"Src0 and Src1 operands should have the same size");
323+
OpNum = Src0Idx;
324+
}
325+
if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
305326
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
306327
Fail = true;
307328
break;

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
128128
bool HasDPP = false;
129129
bool HasDPP8 = false;
130130
bool HasDPALU_DPP = false;
131+
bool HasDPPSrc1SGPR = false;
131132
bool HasPackedFP32Ops = false;
132133
bool HasImageInsts = false;
133134
bool HasExtendedImageInsts = false;
@@ -916,6 +917,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
916917
return HasDPALU_DPP;
917918
}
918919

920+
/// \returns true if the subtarget supports an SGPR for src1 of DPP
/// instructions (the "dpp-src1-sgpr" feature).
bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
921+
919922
bool hasPackedFP32Ops() const {
920923
return HasPackedFP32Ops;
921924
}

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2296,7 +2296,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
22962296
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
22972297
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
22982298
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
2299-
field RegisterOperand Src1VOP3DPP = VRegSrc_32;
2299+
field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
23002300
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
23012301
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
23022302
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2085,6 +2085,10 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
20852085
return STI.hasFeature(AMDGPU::FeatureVOPD);
20862086
}
20872087

2088+
/// Returns true if \p STI has FeatureDPPSrc1SGPR, i.e. the subtarget supports
/// an SGPR for src1 of DPP instructions.
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
2089+
return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
2090+
}
2091+
20882092
unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
20892093
return STI.hasFeature(AMDGPU::FeatureKernargPreload);
20902094
}

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,6 +1169,7 @@ bool isGFX940(const MCSubtargetInfo &STI);
11691169
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
11701170
bool hasMAIInsts(const MCSubtargetInfo &STI);
11711171
bool hasVOPD(const MCSubtargetInfo &STI);
1172+
bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
11721173
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
11731174
unsigned hasKernargPreload(const MCSubtargetInfo &STI);
11741175

llvm/test/CodeGen/AMDGPU/dpp_combine.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
22
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
33
; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
4+
; RUN: llc -march=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
45

56
; GCN-LABEL: {{^}}dpp_add:
67
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],

llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
1-
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN
1+
# RUN: llc -march=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1100
2+
# RUN: llc -march=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
23

34
---
45

56
# GCN-label: name: vop3
67
# GCN: %6:vgpr_32, %7:sreg_32_xm0_xexec = V_SUBBREV_U32_e64_dpp %3, %0, %1, %5, 1, 1, 15, 15, 1, implicit $exec
78
# GCN: %8:vgpr_32 = V_CVT_PK_U8_F32_e64_dpp %3, 4, %0, 2, %2, 2, %1, 1, 1, 15, 15, 1, implicit $mode, implicit $exec
89
# GCN: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
9-
# GCN: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
10+
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
11+
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %3, 0, %1, 0, 2, 0, %7, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
1012
name: vop3
1113
tracksRegLiveness: true
1214
body: |
@@ -28,10 +30,54 @@ body: |
2830
%9:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
2931
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %0, 0, 12345678, 0, 0, implicit $mode, implicit $exec
3032
31-
; should not be combined because src1 imm is illegal
33+
; should not be combined on subtargets where src1 imm is illegal
3234
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
3335
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 2, 0, %7, 0, 0, implicit $mode, implicit $exec
3436
...
37+
---
38+
39+
# GCN-label: name: vop3_sgpr_src1
40+
# GCN: %6:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %1, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
41+
# GFX1100: %8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
42+
# GFX1150: %8:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %1, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
43+
# GFX1100: %10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
44+
# GFX1150: %10:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, %2, 0, %3, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
45+
# GFX1100: %12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
46+
# GFX1150: %12:vgpr_32 = V_MED3_F32_e64_dpp %4, 0, %0, 0, 42, 0, %2, 0, 0, 1, 15, 15, 1, implicit $mode, implicit $exec
47+
# GCN: %14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
48+
name: vop3_sgpr_src1
49+
tracksRegLiveness: true
50+
body: |
51+
bb.0:
52+
liveins: $vgpr0, $vgpr1, $sgpr0, $sgpr1
53+
54+
%0:vgpr_32 = COPY $vgpr0
55+
%1:vgpr_32 = COPY $vgpr1
56+
%2:sgpr_32 = COPY $sgpr0
57+
%3:sgpr_32 = COPY $sgpr1
58+
%4:vgpr_32 = IMPLICIT_DEF
59+
60+
; should be combined because src2 allows sgpr
61+
%5:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
62+
%6:vgpr_32 = V_MED3_F32_e64 0, %5, 0, %1, 0, %2, 0, 0, implicit $mode, implicit $exec
63+
64+
; should be combined only on subtargets that allow sgpr for src1
65+
%7:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
66+
%8:vgpr_32 = V_MED3_F32_e64 0, %7, 0, %2, 0, %1, 0, 0, implicit $mode, implicit $exec
67+
68+
; should be combined only on subtargets that allow sgpr for src1
69+
%9:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
70+
%10:vgpr_32 = V_MED3_F32_e64 0, %9, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec
71+
72+
; should be combined only on subtargets that allow inlinable constants for src1
73+
%11:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
74+
%12:vgpr_32 = V_MED3_F32_e64 0, %11, 0, 42, 0, %2, 0, 0, implicit $mode, implicit $exec
75+
76+
; should not be combined when literal constants are used
77+
%13:vgpr_32 = V_MOV_B32_dpp %4, %0, 1, 15, 15, 1, implicit $exec
78+
%14:vgpr_32 = V_MED3_F32_e64 0, %13, 0, 4242, 0, %2, 0, 0, implicit $mode, implicit $exec
79+
...
80+
---
3581

3682
# Regression test for src_modifiers on base u16 opcode
3783
# GCN-label: name: vop3_u16
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1150 %s | FileCheck --check-prefix=GFX1150 %s
2+
// RUN: llvm-mc -arch=amdgcn -show-encoding -mcpu=gfx1151 %s | FileCheck --check-prefix=GFX1150 %s
3+
4+
//
5+
// These subtargets allow src1 of VOP3 DPP instructions to be an SGPR or an
6+
// inlinable constant.
7+
//
8+
9+
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
10+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
11+
12+
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
13+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
14+
15+
v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0]
16+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
17+
18+
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
19+
// GFX1150: encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]

llvm/test/MC/AMDGPU/gfx11_asm_err.s

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ v_add3_u32_e64_dpp v5, v1, v2, 49812340 dpp8:[7,6,5,4,3,2,1,0]
4545
v_add3_u32_e64_dpp v5, v1, s1, v0 dpp8:[7,6,5,4,3,2,1,0]
4646
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
4747

48+
v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0]
49+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
50+
51+
v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
52+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
53+
54+
v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf
55+
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
56+
4857
v_cvt_f32_i32_e64_dpp v5, s1 dpp8:[7,6,5,4,3,2,1,0]
4958
// GFX11: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
5059

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX1150 %s
2+
3+
# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff]
4+
0x05,0x00,0x55,0xd6,0xfa,0x04,0x0c,0x04,0x01,0x1b,0x00,0xff
5+
6+
# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v3 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff]
7+
0x05,0x00,0x55,0xd6,0xfa,0x54,0x0d,0x04,0x01,0x1b,0x00,0xff
8+
9+
# GFX1150: v_add3_u32_e64_dpp v5, v1, s2, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05]
10+
0x05,0x00,0x55,0xd6,0xe9,0x04,0x00,0x04,0x01,0x77,0x39,0x05
11+
12+
# GFX1150: v_add3_u32_e64_dpp v5, v1, 42, v0 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05]
13+
0x05,0x00,0x55,0xd6,0xe9,0x54,0x01,0x04,0x01,0x77,0x39,0x05

0 commit comments

Comments
 (0)