[AMDGPU] Simplify and improve codegen for llvm.amdgcn.set.inactive #107889

Merged: 5 commits, Sep 11, 2024
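
For context, here is a minimal sketch of how the intrinsic is typically used (the function below is hypothetical and not taken from this PR's tests; the calling convention and the strict.wwm consumer are illustrative assumptions). With this change, the selected V_SET_INACTIVE_B32 pseudo is expanded after register allocation into a single v_cndmask_b32 that picks the active value where the saved whole-wave mask is set and the inactive value elsewhere.

```llvm
; Hypothetical whole-wave prologue: give inactive lanes a known value (0)
; before a strict-WWM computation reads every lane.
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)

define amdgpu_cs i32 @set_inactive_example(i32 %active) {
  ; Active lanes keep %active; lanes inactive at the call site read the
  ; second operand (0 here).
  %v = call i32 @llvm.amdgcn.set.inactive.i32(i32 %active, i32 0)
  %r = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %v)
  ret i32 %r
}
```
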
14 changes: 10 additions & 4 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -5439,6 +5439,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,

bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
IID == Intrinsic::amdgcn_set_inactive_chain_arg;

auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
Register Src2, LLT VT) -> Register {
@@ -5448,6 +5450,8 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane64:
return LaneOp.getReg(0);
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return LaneOp.addUse(Src1).getReg(0);
case Intrinsic::amdgcn_writelane:
return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
@@ -5472,7 +5476,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
Register Src0 = MI.getOperand(2).getReg();
Register Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
IsSetInactive || IsPermLane16) {
Src1 = MI.getOperand(3).getReg();
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
Src2 = MI.getOperand(4).getReg();
@@ -5490,7 +5494,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
if (Size < 32) {
Src0 = B.buildAnyExt(S32, Src0).getReg(0);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);

if (IID == Intrinsic::amdgcn_writelane)
@@ -5526,7 +5530,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
MachineInstrBuilder Src1Parts, Src2Parts;

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1Parts = B.buildUnmerge(PartialResTy, Src1);

if (IID == Intrinsic::amdgcn_writelane)
@@ -5535,7 +5539,7 @@ bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
for (unsigned i = 0; i < NumParts; ++i) {
Src0 = Src0Parts.getReg(i);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = Src1Parts.getReg(i);

if (IID == Intrinsic::amdgcn_writelane)
@@ -7496,6 +7500,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return legalizeLaneOp(Helper, MI, IntrID);
case Intrinsic::amdgcn_s_buffer_prefetch_data:
return legalizeSBufferPrefetch(Helper, MI);
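
Since set.inactive is now legalized through legalizeLaneOp, values wider than 32 bits are unmerged into 32-bit pieces and one lane op is built per piece, the same path permlane16 already takes. A sketch of the 64-bit case under the same assumptions as above (hypothetical function, not from this PR's tests):

```llvm
; Hypothetical i64 input: the legalizer splits both the active and the
; inactive source into two i32 halves, emits two 32-bit set.inactive
; operations, and remerges the result.
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)

define amdgpu_cs i64 @set_inactive_i64(i64 %active, i64 %inactive) {
  %v = call i64 @llvm.amdgcn.set.inactive.i64(i64 %active, i64 %inactive)
  %r = call i64 @llvm.amdgcn.strict.wwm.i64(i64 %v)
  ret i64 %r
}
```
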
16 changes: 11 additions & 5 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6102,6 +6102,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
unsigned IID = N->getConstantOperandVal(0);
bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
IID == Intrinsic::amdgcn_permlanex16;
bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
IID == Intrinsic::amdgcn_set_inactive_chain_arg;
SDLoc SL(N);
MVT IntVT = MVT::getIntegerVT(ValSize);

@@ -6119,6 +6121,8 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Operands.push_back(Src2);
[[fallthrough]];
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
Operands.push_back(Src1);
[[fallthrough]];
case Intrinsic::amdgcn_readfirstlane:
@@ -6145,7 +6149,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
SDValue Src0 = N->getOperand(1);
SDValue Src1, Src2;
if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
IsPermLane16) {
IsSetInactive || IsPermLane16) {
Src1 = N->getOperand(2);
if (IID == Intrinsic::amdgcn_writelane || IsPermLane16)
Src2 = N->getOperand(3);
@@ -6161,7 +6165,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
SL, MVT::i32);

if (IsPermLane16) {
if (IsSetInactive || IsPermLane16) {
Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
SL, MVT::i32);
}
@@ -6237,7 +6241,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
DAG.getConstant(EltIdx, SL, MVT::i32));

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
DAG.getConstant(EltIdx, SL, MVT::i32));

@@ -6246,7 +6250,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
DAG.getConstant(EltIdx, SL, MVT::i32));

Pieces.push_back(
IsPermLane16
IsSetInactive || IsPermLane16
? createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT)
: createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
EltIdx += 2;
@@ -6262,7 +6266,7 @@ static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N,
MVT VecVT = MVT::getVectorVT(MVT::i32, ValSize / 32);
Src0 = DAG.getBitcast(VecVT, Src0);

if (IsPermLane16)
if (IsSetInactive || IsPermLane16)
Src1 = DAG.getBitcast(VecVT, Src1);

if (IID == Intrinsic::amdgcn_writelane)
@@ -8745,6 +8749,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_permlane16:
case Intrinsic::amdgcn_permlanex16:
case Intrinsic::amdgcn_permlane64:
case Intrinsic::amdgcn_set_inactive:
case Intrinsic::amdgcn_set_inactive_chain_arg:
return lowerLaneOp(*this, Op.getNode(), DAG);
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
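
The SelectionDAG path mirrors the GlobalISel change: sub-dword sources are any-extended to 32 bits before the lane op is created and truncated afterwards, and wider values are split per 32-bit element. A sketch of a sub-dword case, assuming a 16-bit overload of the intrinsic (hypothetical function, not from this PR's tests):

```llvm
; Hypothetical i16 input: both operands are any-extended to i32, the
; 32-bit lane op is built, and the result is truncated back to i16.
declare i16 @llvm.amdgcn.set.inactive.i16(i16, i16)
declare i16 @llvm.amdgcn.strict.wwm.i16(i16)

define amdgpu_cs i16 @set_inactive_i16(i16 %active, i16 %inactive) {
  %v = call i16 @llvm.amdgcn.set.inactive.i16(i16 %active, i16 %inactive)
  %r = call i16 @llvm.amdgcn.strict.wwm.i16(i16 %v)
  ret i16 %r
}
```
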
163 changes: 8 additions & 155 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2097,21 +2097,6 @@ unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) {
}
}

Register SIInstrInfo::findSetInactiveMask(const MachineInstr &MI) {
assert(MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64);
for (auto &Op : MI.implicit_operands()) {
if (Op.isDef())
continue;
Register OpReg = Op.getReg();
if (OpReg == AMDGPU::EXEC || OpReg == AMDGPU::EXEC_LO ||
OpReg == AMDGPU::SCC)
continue;
return OpReg;
}
return Register();
}

bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MBB.findDebugLoc(MI);
@@ -2286,147 +2271,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
case AMDGPU::V_SET_INACTIVE_B32:
case AMDGPU::V_SET_INACTIVE_B64: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
unsigned VMovOpc = MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64
? AMDGPU::V_MOV_B64_PSEUDO
: AMDGPU::V_MOV_B32_e32;
Register ExecReg = RI.getExec();
case AMDGPU::V_SET_INACTIVE_B32: {
// Lower V_SET_INACTIVE_B32 to V_CNDMASK_B32.
Register DstReg = MI.getOperand(0).getReg();
MachineOperand &ActiveSrc = MI.getOperand(1);
MachineOperand &InactiveSrc = MI.getOperand(2);

// Find implicit register defining lanes active outside WWM.
Register ExecSrcReg = findSetInactiveMask(MI);
assert(ExecSrcReg && "V_SET_INACTIVE must be in known WWM region");
// Note: default here is set to ExecReg so that functional MIR is still
// generated if implicit def is not found and assertions are disabled.
if (!ExecSrcReg)
ExecSrcReg = ExecReg;

// Ideally in WWM this operation is lowered to V_CNDMASK; however,
// constant bus constraints and the presence of literal constants
// present an issue.
// Fallback to V_MOV base lowering in all but the common cases.
const bool VMov64 = VMovOpc != AMDGPU::V_MOV_B32_e32;
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
const unsigned Opcode = AMDGPU::V_CNDMASK_B32_e64;
const MCInstrDesc &Desc = get(Opcode);

const APInt ActiveImm(64, ActiveSrc.isImm() ? ActiveSrc.getImm() : 0);
const APInt InactiveImm(64, InactiveSrc.isImm() ? InactiveSrc.getImm() : 0);
const APInt ActiveImmLo(32, ActiveImm.getLoBits(32).getZExtValue());
const APInt ActiveImmHi(32, ActiveImm.getHiBits(32).getZExtValue());
const APInt InactiveImmLo(32, InactiveImm.getLoBits(32).getZExtValue());
const APInt InactiveImmHi(32, InactiveImm.getHiBits(32).getZExtValue());

int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);

int ConstantBusLimit = ST.getConstantBusLimit(AMDGPU::V_CNDMASK_B32_e64);
int LiteralLimit = ST.hasVOP3Literal() ? 1 : 0;
int ConstantBusUses =
1 + // Starts at 1 for ExecSrcReg
(usesConstantBus(MRI, ActiveSrc, Desc.operands()[Src1Idx]) ? 1 : 0) +
(usesConstantBus(MRI, InactiveSrc, Desc.operands()[Src0Idx]) ? 1 : 0);
int LiteralConstants =
((ActiveSrc.isReg() ||
(ActiveSrc.isImm() && isInlineConstant(ActiveImm)))
? 0
: 1) +
((InactiveSrc.isReg() ||
(InactiveSrc.isImm() && isInlineConstant(InactiveImm)))
? 0
: 1);

bool UseVCndMask =
ConstantBusUses <= ConstantBusLimit && LiteralConstants <= LiteralLimit;
if (VMov64 && UseVCndMask) {
// Decomposition must not introduce new literals.
UseVCndMask &=
ActiveSrc.isReg() ||
(isInlineConstant(ActiveImmLo) && isInlineConstant(ActiveImmHi)) ||
(!isInlineConstant(ActiveImm));
UseVCndMask &= InactiveSrc.isReg() ||
(isInlineConstant(InactiveImmLo) &&
isInlineConstant(InactiveImmHi)) ||
(!isInlineConstant(InactiveImm));
}

if (UseVCndMask && VMov64) {
// Dual V_CNDMASK_B32
MachineOperand ActiveLo = buildExtractSubRegOrImm(
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub0, nullptr);
MachineOperand ActiveHi = buildExtractSubRegOrImm(
MI, MRI, ActiveSrc, nullptr, AMDGPU::sub1, nullptr);
MachineOperand InactiveLo = buildExtractSubRegOrImm(
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub0, nullptr);
MachineOperand InactiveHi = buildExtractSubRegOrImm(
MI, MRI, InactiveSrc, nullptr, AMDGPU::sub1, nullptr);
if (ActiveSrc.isReg())
ActiveHi.setIsKill(ActiveSrc.isKill());
if (InactiveSrc.isReg())
InactiveHi.setIsKill(InactiveSrc.isKill());
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub0))
.addImm(0)
.add(InactiveLo)
.addImm(0)
.add(ActiveLo)
.addReg(ExecSrcReg)
.addReg(DstReg, RegState::ImplicitDefine);
BuildMI(MBB, MI, DL, Desc, RI.getSubReg(DstReg, AMDGPU::sub1))
.addImm(0)
.add(InactiveHi)
.addImm(0)
.add(ActiveHi)
.addReg(ExecSrcReg)
.addReg(DstReg, RegState::ImplicitDefine);
} else if (UseVCndMask) {
// Single V_CNDMASK_B32
BuildMI(MBB, MI, DL, Desc, DstReg)
.addImm(0)
.add(InactiveSrc)
.addImm(0)
.add(ActiveSrc)
.addReg(ExecSrcReg);
} else {
// Fallback V_MOV case.
// Avoid unnecessary work if a source VGPR is also the destination.
// This can happen if WWM register allocation was efficient.
// Note: this assumes WWM execution.
bool DstIsActive = ActiveSrc.isReg() && ActiveSrc.getReg() == DstReg;
bool DstIsInactive =
InactiveSrc.isReg() && InactiveSrc.getReg() == DstReg;
if (!DstIsInactive) {
// Set exec mask to inactive lanes,
// but only if active lanes would be overwritten.
if (DstIsActive) {
BuildMI(MBB, MI, DL, get(NotOpc), ExecReg)
.addReg(ExecSrcReg)
.setOperandDead(3); // Dead scc
}
// Copy inactive lanes
MachineInstr *VMov =
BuildMI(MBB, MI, DL, get(VMovOpc), DstReg).add(InactiveSrc);
if (VMov64)
expandPostRAPseudo(*VMov);
}
if (!DstIsActive) {
// Set exec mask to active lanes
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addReg(ExecSrcReg);
// Copy active lanes
MachineInstr *VMov =
BuildMI(MBB, MI, DL, get(VMovOpc), MI.getOperand(0).getReg())
.add(ActiveSrc);
if (VMov64)
expandPostRAPseudo(*VMov);
}
// Restore WWM
BuildMI(MBB, MI, DL, get(MovOpc), ExecReg).addImm(-1);
}
BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
Contributor Author commented:

This would be even simpler if we had MachineInstr::swapOperands(), and simpler still if we swapped the order of the operands in the definition of V_SET_INACTIVE_B32 to match V_CNDMASK_B32 - but I'll leave that for a future cleanup.

.add(MI.getOperand(3))
.add(MI.getOperand(4))
.add(MI.getOperand(1))
.add(MI.getOperand(2))
.add(MI.getOperand(5));
MI.eraseFromParent();
break;
}
2 changes: 0 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1437,8 +1437,6 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.
void enforceOperandRCAlignment(MachineInstr &MI, unsigned OpName) const;

static Register findSetInactiveMask(const MachineInstr &MI);
};

/// \brief Returns true if a reg:subreg pair P has a TRC class
21 changes: 4 additions & 17 deletions llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -243,29 +243,16 @@ def : GCNPat <(f32 (fptrunc_round f64:$src0, (i32 SupportedRoundMode:$round))),

// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32:$inactive), []>;

def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64: $src, VSrc_b64:$inactive), []>;
} // End Defs = [SCC]
let isConvergent = 1 in
def V_SET_INACTIVE_B32 : VOP3_Pseudo<"v_set_inactive_b32", VOP2e_I32_I32_I32_I1>;

foreach vt = Reg32Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B32 VSrc_b32:$src, VSrc_b32:$inactive)>;
}

foreach vt = Reg64Types.types in {
def : GCNPat <(vt (int_amdgcn_set_inactive vt:$src, vt:$inactive)),
(V_SET_INACTIVE_B64 VSrc_b64:$src, VSrc_b64:$inactive)>;
(V_SET_INACTIVE_B32 0, VSrc_b32:$src, 0, VSrc_b32:$inactive, (IMPLICIT_DEF))>;
}

def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
(V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;

def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
(V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
(V_SET_INACTIVE_B32 0, VGPR_32:$src, 0, VGPR_32:$inactive, (IMPLICIT_DEF))>;

let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
3 changes: 1 addition & 2 deletions llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -215,8 +215,7 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : RPOT) {
bool InWWM = false;
for (MachineInstr &MI : *MBB) {
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
if (MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B32)
RegsAssigned |= processDef(MI.getOperand(0));

if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {