[AMDGPU] Implement vop3p complex pattern optimization for gisel #130234
Conversation
@llvm/pr-subscribers-backend-amdgpu
@llvm/pr-subscribers-llvm-globalisel

Author: None (Shoreshen)

Changes

Seeking opportunities to optimize VOP3P instructions by altering their opsel, opsel_hi, neg, and neg_hi bits.

Test differences:
CodeGen/AMDGPU/packed-fp32.ll
CodeGen/AMDGPU/strict_fsub.f16.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot8.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot2.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot8.ll
CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
Patch is 36.71 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/130234.diff

12 Files Affected:
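For context before the diff: a VOP3P source operand's modifiers are a small bitmask, and the patch folds surrounding G_FNEG, shift, and truncate instructions into that mask instead of emitting real instructions. A minimal sketch of how the mask composes, using the SISrcMods flags the patch manipulates (the helper function itself is hypothetical and assumes the AMDGPU backend's SIDefines.h is in scope):

#include "SIDefines.h" // assumption: provides the SISrcMods flags used below

// Hypothetical helper: builds the per-operand modifier mask that later
// surfaces as neg_lo/neg_hi/op_sel/op_sel_hi in the test diffs below.
static unsigned composeVOP3PMods(bool NegLo, bool NegHi, bool LoFromHiHalf,
                                 bool HiFromHiHalf) {
  unsigned Mods = 0;
  if (NegLo)
    Mods |= SISrcMods::NEG;      // negate the low 16-bit lane
  if (NegHi)
    Mods |= SISrcMods::NEG_HI;   // negate the high 16-bit lane
  if (LoFromHiHalf)
    Mods |= SISrcMods::OP_SEL_0; // low lane reads the source's high half
  if (HiFromHiHalf)
    Mods |= SISrcMods::OP_SEL_1; // high lane reads the source's high half
  return Mods;
}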
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 441fb5730a6d8..0dc47b957bdac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4282,30 +4282,346 @@ AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
}};
}
-std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3PModsImpl(
- Register Src, const MachineRegisterInfo &MRI, bool IsDOT) const {
- unsigned Mods = 0;
- MachineInstr *MI = MRI.getVRegDef(Src);
+enum srcStatus {
+ IS_SAME,
+ IS_UPPER_HALF,
+ IS_LOWER_HALF,
+ IS_NEG,
+ IS_UPPER_HALF_NEG,
+ IS_LOWER_HALF_NEG,
+ LAST_STAT = IS_LOWER_HALF_NEG
+};
+
+bool isTruncHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_TRUNC);
+ unsigned dstSize = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ return dstSize * 2 == srcSize;
+}
+
+bool isLshrHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_LSHR);
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(MI->getOperand(0).getReg(), MRI,
+ m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned shift = ShiftAmt->Value.getZExtValue();
+ return shift * 2 == srcSize;
+ }
+ return false;
+}
- if (MI->getOpcode() == AMDGPU::G_FNEG &&
- // It's possible to see an f32 fneg here, but unlikely.
- // TODO: Treat f32 fneg as only high bit.
- MRI.getType(Src) == LLT::fixed_vector(2, 16)) {
- Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
- Src = MI->getOperand(1).getReg();
- MI = MRI.getVRegDef(Src);
+bool isShlHalf(MachineInstr *MI, const MachineRegisterInfo &MRI) {
+ assert(MI->getOpcode() == AMDGPU::G_SHL);
+ Register ShiftSrc;
+ std::optional<ValueAndVReg> ShiftAmt;
+ if (mi_match(MI->getOperand(0).getReg(), MRI,
+ m_GShl(m_Reg(ShiftSrc), m_GCst(ShiftAmt)))) {
+ unsigned srcSize = MRI.getType(MI->getOperand(1).getReg()).getSizeInBits();
+ unsigned shift = ShiftAmt->Value.getZExtValue();
+ return shift * 2 == srcSize;
+ }
+ return false;
+}
+
+bool retOpStat(MachineOperand *Op, int stat,
+ std::pair<MachineOperand *, int> &curr) {
+ if ((Op->isReg() && !(Op->getReg().isPhysical())) || Op->isImm() ||
+ Op->isCImm() || Op->isFPImm()) {
+ curr = {Op, stat};
+ return true;
+ }
+ return false;
+}
+
+bool calcNextStatus(std::pair<MachineOperand *, int> &curr,
+ const MachineRegisterInfo &MRI) {
+ if (!curr.first->isReg()) {
+ return false;
+ }
+ MachineInstr *MI = nullptr;
+
+ if (!curr.first->isDef()) {
+  // MRI.getVRegDef falls into an infinite loop when called on a def operand's register
+ MI = MRI.getVRegDef(curr.first->getReg());
+ } else {
+ MI = curr.first->getParent();
+ }
+ if (!MI) {
+ return false;
+ }
+
+ unsigned Opc = MI->getOpcode();
+
+ // Handle general Opc cases
+ switch (Opc) {
+ case AMDGPU::G_BITCAST:
+ case AMDGPU::G_CONSTANT:
+ case AMDGPU::G_FCONSTANT:
+ case AMDGPU::COPY:
+ return retOpStat(&MI->getOperand(1), curr.second, curr);
+ case AMDGPU::G_FNEG:
+  // XXXX + 3 = XXXX_NEG, (XXXX_NEG + 3) mod 6 = XXXX
+ return retOpStat(&MI->getOperand(1),
+ (curr.second + ((LAST_STAT + 1) / 2)) % (LAST_STAT + 1),
+ curr);
+ }
+
+ // Calc next stat from current stat
+ switch (curr.second) {
+ case IS_SAME:
+ switch (Opc) {
+ case AMDGPU::G_TRUNC: {
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_NEG:
+ switch (Opc) {
+ case AMDGPU::G_TRUNC: {
+ if (isTruncHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_UPPER_HALF:
+ switch (Opc) {
+ case AMDGPU::G_SHL: {
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_LOWER_HALF:
+ switch (Opc) {
+ case AMDGPU::G_LSHR: {
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_UPPER_HALF_NEG:
+ switch (Opc) {
+ case AMDGPU::G_SHL: {
+ if (isShlHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_LOWER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ case IS_LOWER_HALF_NEG:
+ switch (Opc) {
+ case AMDGPU::G_LSHR: {
+ if (isLshrHalf(MI, MRI)) {
+ return retOpStat(&MI->getOperand(1), IS_UPPER_HALF_NEG, curr);
+ }
+ break;
+ }
+ }
+ break;
+ }
+ return false;
+}
+
+std::vector<std::pair<MachineOperand *, int>>
+getSrcStats(MachineOperand *Op, const MachineRegisterInfo &MRI,
+ bool onlyLastSameOrNeg = false, int maxDepth = 6) {
+ int depth = 0;
+ std::pair<MachineOperand *, int> curr = {Op, IS_SAME};
+ std::vector<std::pair<MachineOperand *, int>> statList;
+
+ while (true) {
+ depth++;
+ if (depth > maxDepth) {
+ break;
+ }
+ bool ret = calcNextStatus(curr, MRI);
+ if (!ret || (onlyLastSameOrNeg &&
+ (curr.second != IS_SAME && curr.second != IS_NEG))) {
+ break;
+ } else if (!onlyLastSameOrNeg) {
+ statList.push_back(curr);
+ }
}
+ if (onlyLastSameOrNeg) {
+ statList.push_back(curr);
+ }
+ return statList;
+}
- // TODO: Handle G_FSUB 0 as fneg
+bool isInlinableConstant(MachineOperand *Op, const SIInstrInfo &TII) {
+ switch (Op->getType()) {
+ case MachineOperand::MachineOperandType::MO_Immediate:
+ return TII.isInlineConstant(*Op);
+ case MachineOperand::MachineOperandType::MO_CImmediate:
+ return TII.isInlineConstant(Op->getCImm()->getValue());
+ case MachineOperand::MachineOperandType::MO_FPImmediate:
+ return TII.isInlineConstant(Op->getFPImm()->getValueAPF());
+ }
+ return false;
+}
- // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
- (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard()
+bool isSameBitWidth(MachineOperand *Op1, MachineOperand *Op2,
+ const MachineRegisterInfo &MRI) {
+ unsigned width1 = MRI.getType(Op1->getReg()).getSizeInBits();
+ unsigned width2 = MRI.getType(Op2->getReg()).getSizeInBits();
+ return width1 == width2;
+}
+bool isSameOperand(MachineOperand *Op1, MachineOperand *Op2) {
+ if (Op1->isReg()) {
+ if (Op2->isReg()) {
+ return Op1->getReg() == Op2->getReg();
+ }
+ return false;
+ }
+ return Op1->isIdenticalTo(*Op2);
+}
+
+bool validToPack(int HiStat, int LoStat, unsigned int &Mods,
+ MachineOperand *newOp, MachineOperand *RootOp,
+ const SIInstrInfo &TII, const MachineRegisterInfo &MRI) {
+ if (newOp->isReg()) {
+ if (isSameBitWidth(newOp, RootOp, MRI)) {
+      // IS_LOWER_HALF remains 0
+ if (HiStat == IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ Mods |= SISrcMods::OP_SEL_1;
+ } else if (HiStat == IS_UPPER_HALF) {
+ Mods |= SISrcMods::OP_SEL_1;
+ } else if (HiStat == IS_LOWER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ }
+ if (LoStat == IS_UPPER_HALF_NEG) {
+ Mods ^= SISrcMods::NEG;
+ Mods |= SISrcMods::OP_SEL_0;
+ } else if (LoStat == IS_UPPER_HALF) {
+ Mods |= SISrcMods::OP_SEL_0;
+      } else if (LoStat == IS_LOWER_HALF_NEG) {
+        Mods ^= SISrcMods::NEG;
+ }
+ return true;
+ }
+ } else {
+ if ((HiStat == IS_SAME || HiStat == IS_NEG) &&
+ (LoStat == IS_SAME || LoStat == IS_NEG) &&
+ isInlinableConstant(newOp, TII)) {
+ if (HiStat == IS_NEG) {
+ Mods ^= SISrcMods::NEG_HI;
+ }
+ if (LoStat == IS_NEG) {
+ Mods ^= SISrcMods::NEG;
+ }
+      // opsel = opsel_hi = 0, since the upper and lower halves are both
+      // the same as the target inlinable constant
+ return true;
+ }
+ }
+ return false;
+}
+
+std::pair<MachineOperand *, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(MachineOperand *Op,
+ const MachineRegisterInfo &MRI,
+ bool IsDOT) const {
+ unsigned Mods = 0;
+ MachineOperand *RootOp = Op;
+ std::pair<MachineOperand *, int> stat = getSrcStats(Op, MRI, true)[0];
+ if (!stat.first->isReg()) {
+ Mods |= SISrcMods::OP_SEL_1;
+ return {Op, Mods};
+ }
+ if (stat.second == IS_NEG) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ }
+ Op = stat.first;
+ MachineInstr *MI = MRI.getVRegDef(Op->getReg());
+ if (MI->getOpcode() == AMDGPU::G_BUILD_VECTOR && MI->getNumOperands() == 3 &&
+ (!IsDOT || !Subtarget->hasDOTOpSelHazard())) {
+ std::vector<std::pair<MachineOperand *, int>> statList_Hi;
+ std::vector<std::pair<MachineOperand *, int>> statList_Lo;
+ statList_Hi = getSrcStats(&MI->getOperand(2), MRI);
+ if (statList_Hi.size() != 0) {
+ statList_Lo = getSrcStats(&MI->getOperand(1), MRI);
+ if (statList_Lo.size() != 0) {
+ for (int i = statList_Hi.size() - 1; i >= 0; i--) {
+ for (int j = statList_Lo.size() - 1; j >= 0; j--) {
+ if (isSameOperand(statList_Hi[i].first, statList_Lo[j].first)) {
+ if (validToPack(statList_Hi[i].second, statList_Lo[j].second,
+ Mods, statList_Hi[i].first, RootOp, TII, MRI)) {
+ return {statList_Hi[i].first, Mods};
+ }
+ }
+ }
+ }
+ }
+ }
+ }
// Packed instructions do not have abs modifiers.
Mods |= SISrcMods::OP_SEL_1;
- return std::pair(Src, Mods);
+ return {Op, Mods};
+}
+
+int64_t getAllKindImm(MachineOperand *Op) {
+ switch (Op->getType()) {
+ case MachineOperand::MachineOperandType::MO_Immediate:
+ return Op->getImm();
+ case MachineOperand::MachineOperandType::MO_CImmediate:
+ return Op->getCImm()->getSExtValue();
+ case MachineOperand::MachineOperandType::MO_FPImmediate:
+ return Op->getFPImm()->getValueAPF().bitcastToAPInt().getSExtValue();
+ }
+ llvm_unreachable("not an imm type");
+}
+
+bool checkRB(MachineOperand *Op, int RBNo, const AMDGPURegisterBankInfo &RBI,
+ const MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI) {
+ const RegisterBank *RB = RBI.getRegBank(Op->getReg(), MRI, TRI);
+ return RB->getID() == RBNo;
+}
+
+MachineOperand *getVReg(MachineOperand *newOp, MachineOperand *RootOp,
+ const AMDGPURegisterBankInfo &RBI,
+ MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ // RootOp can only be VGPR or SGPR (some hand written cases such as
+ // inst-select-ashr.v2s16.mir::ashr_v2s16_vs)
+ if (checkRB(RootOp, AMDGPU::SGPRRegBankID, RBI, MRI, TRI) ||
+ checkRB(newOp, AMDGPU::VGPRRegBankID, RBI, MRI, TRI)) {
+ return newOp;
+ }
+ MachineInstr *MI = MRI.getVRegDef(RootOp->getReg());
+ if (MI->getOpcode() == AMDGPU::COPY &&
+ isSameOperand(newOp, &MI->getOperand(1))) {
+ // RootOp is VGPR, newOp is not VGPR, but RootOp = COPY newOp
+ return RootOp;
+ }
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(*RootOp, MRI);
+ Register dstReg = MRI.createVirtualRegister(DstRC);
+
+ MachineIRBuilder B(*RootOp->getParent());
+ MachineInstrBuilder MIB =
+ B.buildInstr(AMDGPU::COPY).addDef(dstReg).addUse(newOp->getReg());
+
+ // only accept VGPR
+ return &MIB->getOperand(0);
}
InstructionSelector::ComplexRendererFns
@@ -4313,13 +4629,17 @@ AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- Register Src;
- unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
-
+ std::pair<MachineOperand *, unsigned> res = selectVOP3PModsImpl(&Root, MRI);
+ if (!(res.first->isReg())) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ }};
+ }
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
@@ -4328,13 +4648,18 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
MachineRegisterInfo &MRI
= Root.getParent()->getParent()->getParent()->getRegInfo();
- Register Src;
- unsigned Mods;
- std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI, true);
-
+ std::pair<MachineOperand *, unsigned> res =
+ selectVOP3PModsImpl(&Root, MRI, true);
+ if (!(res.first->isReg())) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(getAllKindImm(res.first)); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
+ }};
+ }
+ res.first = getVReg(res.first, &Root, RBI, MRI, TRI);
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(res.first->getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(res.second); } // src_mods
}};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index cc7552868a056..2af4f55403acc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -187,8 +187,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector {
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
- std::pair<Register, unsigned>
- selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI,
+ std::pair<MachineOperand *, unsigned>
+ selectVOP3PModsImpl(MachineOperand *Op, const MachineRegisterInfo &MRI,
bool IsDOT = false) const;
InstructionSelector::ComplexRendererFns
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
index 1d9514c58ab9c..2243c57cf37ac 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.fdot2.ll
@@ -68,8 +68,7 @@ define float @v_fdot2_neg_c(<2 x half> %a, <2 x half> %b, float %c) {
; GFX906-LABEL: v_fdot2_neg_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_f32_f16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%r = call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %neg.c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
index e2dab03e410aa..7d6cfac52714e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot2.ll
@@ -248,8 +248,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX906-LABEL: v_sdot2_fnegf32_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegf32_c:
@@ -263,8 +262,7 @@ define i32 @v_sdot2_fnegf32_c(<2 x i16> %a, <2 x i16> %b, float %c) {
; GFX10-LABEL: v_sdot2_fnegf32_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg float %c
%cast.neg.c = bitcast float %neg.c to i32
@@ -276,8 +274,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX906-LABEL: v_sdot2_fnegv2f16_c:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_fnegv2f16_c:
@@ -291,8 +288,7 @@ define i32 @v_sdot2_fnegv2f16_c(<2 x i16> %a, <2 x i16> %b, <2 x half> %c) {
; GFX10-LABEL: v_sdot2_fnegv2f16_c:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_xor_b32_e32 v2, 0x80008000, v2
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 neg_lo:[0,0,1] neg_hi:[0,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.c = fneg <2 x half> %c
%cast.neg.c = bitcast <2 x half> %neg.c to i32
@@ -304,8 +300,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_a:
@@ -319,8 +314,7 @@ define i32 @v_sdot2_shuffle10_a(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_a:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v0, v0, v0, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[1,0,0] op_sel_hi:[0,1,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.a = shufflevector <2 x i16> %a, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %shuf.a, <2 x i16> %b, i32 %c, i1 false)
@@ -331,8 +325,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX906-LABEL: v_sdot2_shuffle10_b:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX906-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX906-NEXT: s_setpc_b64 s[30:31]
;
; GFX908-LABEL: v_sdot2_shuffle10_b:
@@ -346,8 +339,7 @@ define i32 @v_sdot2_shuffle10_b(<2 x i16> %a, <2 x i16> %b, i32 %c) {
; GFX10-LABEL: v_sdot2_shuffle10_b:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_alignbit_b32 v1, v1, v1, 16
-; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2
+; GFX10-NEXT: v_dot2_i32_i16 v0, v0, v1, v2 op_sel:[0,1,0] op_sel_hi:[1,0,1]
; GFX10-NEXT: s_setpc_b64 s[30:31]
%shuf.b = shufflevector <2 x i16> %b, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
%r = call i32 @llvm.amdgcn.sdot2(<2 x i16> %a, <2 x i16> %shuf.b, i32 %c, i1 false)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 06560afee3c9a..d6ef48e25cafb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -91,8 +91,7 @@ define i32 @v_sdot4_fnegf32_a(float %a, i32 %b, i32 %c) {
; GFX906-LABEL: v_sdot4_fnegf32_a:
; GFX906: ; %bb.0:
; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GFX90...
[truncated]
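One detail of the patch worth spelling out: the srcStatus enum lists the three plain states ahead of their negated twins in the same order, so the G_FNEG case toggles a status with modular arithmetic instead of a lookup table. A standalone restatement of the expression used in calcNextStatus:

// Mirrors the patch's enum layout: adding half the enum size, modulo the
// full size, flips any state to its negation and back again.
enum SrcStatus {
  IS_SAME, IS_UPPER_HALF, IS_LOWER_HALF,
  IS_NEG, IS_UPPER_HALF_NEG, IS_LOWER_HALF_NEG,
  LAST_STAT = IS_LOWER_HALF_NEG
};

static int negateStatus(int S) {
  return (S + (LAST_STAT + 1) / 2) % (LAST_STAT + 1); // (S + 3) % 6
}

// negateStatus(IS_SAME) == IS_NEG and negateStatus(IS_NEG) == IS_SAME;
// likewise IS_UPPER_HALF <-> IS_UPPER_HALF_NEG and
// IS_LOWER_HALF <-> IS_LOWER_HALF_NEG.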
First round of comments
  return false;
}

bool retOpStat(MachineOperand *Op, int stat,
This is trying to handle too many cases. The G_* instructions only accept virtual registers; you should not encounter anything else. The immediate types are only permitted with G_CONSTANT and G_FCONSTANT.
Hi @arsenm, this is because I have to handle COPY for cases that copy from SGPR to VGPR (this usually happens after build_vector).
While COPY can also be used to copy from physical registers, I need to block physical registers from being selected, since they are not SSA (e.g., one used as a return value causes an infinite loop).
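A minimal sketch of the narrowing the review asks for (hypothetical shape, not the code that eventually landed): once constants are peeled off at their G_CONSTANT/G_FCONSTANT producers, the common path only ever sees virtual registers:

// Hypothetical narrowed retOpStat: generic (G_*) instructions only carry
// virtual registers, so the physical-register and immediate checks become
// an assertion instead of runtime filtering.
static bool retOpStat(MachineOperand *Op, int Stat,
                      std::pair<MachineOperand *, int> &Curr) {
  assert(Op->isReg() && Op->getReg().isVirtual() &&
         "generic instructions should only use virtual registers");
  Curr = {Op, Stat};
  return true;
}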
  MachineInstr *MI = nullptr;

  if (!curr.first->isDef()) {
    // MRI.getVRegDef falls into an infinite loop when called on a def operand's register
I don't follow this.
Hi @arsenm, I fall into an infinite loop when running MRI.getVRegDef(curr.first->getReg()) if curr.first->isDef() == true, so I just block the defining operands.
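In other words, a condensed restatement of the guard in calcNextStatus (no new functionality):

// For a use operand, the defining instruction lives elsewhere and
// MRI.getVRegDef finds it; for a def operand, the defining instruction is
// the operand's own parent, and calling getVRegDef on it again would make
// the walk revisit the same instruction forever.
static MachineInstr *definingInstr(MachineOperand &MO,
                                   const MachineRegisterInfo &MRI) {
  return MO.isDef() ? MO.getParent() : MRI.getVRegDef(MO.getReg());
}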
  if (Op1->isReg()) {
    if (Op2->isReg()) {
      return Op1->getReg() == Op2->getReg();
    }
This is ignoring subregister uses, but you shouldn't encounter them either
Hi @arsenm, to be honest I'm not sure about the subreg. From the isIdenticalTo function, this compares the SubReg_TargetFlags member of the MachineOperand class, while in the AMD backend's TargetOperandFlags enumeration it seems more address-related, I guess.
Should I incorporate the subreg comparison? Can you explain what SubReg_TargetFlags is used for in the AMD backend?
Thanks a lot
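For reference, a hypothetical subregister-aware variant of isSameOperand; per the review, subregister uses should not reach this selector, so the extra check would be purely defensive:

// Hypothetical: also require matching subregister indices when comparing
// register operands (MachineOperand::getSubReg returns 0 when no
// subregister is used).
static bool isSameRegOrImm(const MachineOperand &A, const MachineOperand &B) {
  if (A.isReg() || B.isReg())
    return A.isReg() && B.isReg() && A.getReg() == B.getReg() &&
           A.getSubReg() == B.getSubReg();
  return A.isIdenticalTo(B);
}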
Just a drive-by: I noticed multiple code format issues. Can you check the LLVM coding standards and refine your code accordingly?
Hi @shiltian, thanks, and sorry for the code format issues. I'll read through the coding standards and make sure I follow them.
There are still some issues. For example:
Should be:
LGTM
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/203/builds/8017

LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/205/builds/6808

LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/33/builds/15076
Revert "[AMDGPU] Implement vop3p complex pattern optimization for gisel" (#136249)
Reverts llvm/llvm-project#130234
LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/16/builds/17570

LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/60/builds/25064

LLVM Buildbot has detected a new failure. Full details are available at: https://lab.llvm.org/buildbot/#/builders/56/builds/23757
  if (!Op->isReg() || Op->getReg().isPhysical())
    return TypeClass::NONE_OF_LISTED;
You should never need to handle these cases; these checks should be dead.
    // [CurrHi, CurrLo] = [-OpHi, -OpLo](2 x Type)
    // [SrcHi, SrcLo] = [-OpHi, -OpLo]
    return SrcStatus::IS_BOTH_NEG;
  } else if (NegType == TypeClass::SCALAR) {
No else after return
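The coding-standard point in miniature (illustrative only; classifyNeg is a made-up wrapper around the names in the quoted snippet):

// Before: 'else if' after a return. After: drop the 'else' so each early
// return stands alone, per the LLVM coding standards.
SrcStatus classifyNeg(TypeClass NegType) {
  if (NegType == TypeClass::VECTOR_OF_TWO)
    return SrcStatus::IS_BOTH_NEG;
  if (NegType == TypeClass::SCALAR) // no 'else' needed after a return
    return SrcStatus::IS_NEG;
  return SrcStatus::IS_SAME;
}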
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FCONSTANT:
  case AMDGPU::COPY:
    return retOpStat(&MI->getOperand(1), Curr.second, Curr);
This shouldn't be trying to push these cases into the same handling; the special-case constants should be handled directly here.
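A sketch of the restructuring that comment suggests (hypothetical; the actual follow-up may differ): peel the constant producers out of the generic forwarding path so immediates never ride through the register machinery:

// Hypothetical reshuffle of the first switch in calcNextStatus: constant
// producers record their immediate operand directly, and only genuine
// register forwarders go through retOpStat.
static bool stepThroughDef(MachineInstr *MI,
                           std::pair<MachineOperand *, int> &Curr) {
  switch (MI->getOpcode()) {
  case AMDGPU::G_CONSTANT:
  case AMDGPU::G_FCONSTANT:
    // Operand 1 of G_(F)CONSTANT is the immediate itself.
    Curr = {&MI->getOperand(1), Curr.second};
    return true;
  case AMDGPU::G_BITCAST:
  case AMDGPU::COPY:
    // These forward a virtual register unchanged.
    return retOpStat(&MI->getOperand(1), Curr.second, Curr);
  default:
    return false;
  }
}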