Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

reduce over divergent mask #133228

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

lalaniket8
Copy link
Contributor

No description provided.

@lalaniket8 lalaniket8 changed the title from "reduce over divergent wave" to "reduce over divergent mask" on Mar 27, 2025
@lalaniket8
Copy link
Contributor Author

lalaniket8 commented Mar 27, 2025

Command to run:

./build/bin/llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -print-after="finalize-isel" -verify-machineinstrs < ~/upstream_llvm_project/llvm-project/newreduceumax.ll 2>&1 | tee temp.ll

Copy link

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 78408fddccf34b7d79eb655fa2cb4dfacdfb8ae3 c5c1cc54524d839f148c1390b659772fcabc0a4a --extensions cpp -- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
View the diff from clang-format here.
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8bca356327..2578619ab3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -4980,11 +4980,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
   Register DstReg = MI.getOperand(0).getReg();
   llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(DstReg)) << "\n";
   Register MaskReg = MI.getOperand(2).getReg();
-  llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
+  llvm::errs() << TrgtRegInfo->getRegClassName(MRI.getRegClass(MaskReg))
+               << "\n";
 
-  // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg)) << "\n";
-  // llvm::errs() << "DstReg:" << MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n";
-  // llvm::errs() << "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
+  // llvm::errs() << "srcreg:" << MRI.getRegClassName(MRI.getRegClass(SrcReg))
+  // << "\n"; llvm::errs() << "DstReg:" <<
+  // MRI.getRegClassName(MRI.getRegClass(DstReg)) << "\n"; llvm::errs() <<
+  // "MaskReg:" << MRI.getRegClassName(MRI.getRegClass(MaskReg)) << "\n";
   MachineBasicBlock *RetBB = nullptr;
   if (isSGPR) {
     // These operations with a uniform value i.e. SGPR are idempotent.
@@ -5015,9 +5017,11 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
     const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
     Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
-    Register InitalValReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
+    Register InitalValReg =
+        MRI.createVirtualRegister(DstRegClass); // MRI.getRegClass(SrcReg)
 
-    Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);//MRI.getRegClass(SrcReg)
+    Register AccumulatorReg =
+        MRI.createVirtualRegister(DstRegClass); // MRI.getRegClass(SrcReg)
     Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
     Register TempRegMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
@@ -5037,12 +5041,13 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     // insert branch instr to newly created ComputeBlockk
     uint32_t InitalValue =
         (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
-    auto TmpSReg =
-        BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg); //s_mov_b64 s[2:3], exec
+    auto TmpSReg = BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator)
+                       .addReg(ExecReg); // s_mov_b64 s[2:3], exec
     // auto TmpMaskSReg =
-        // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg); //s_mov_b64 s[2:3], exec
+    // BuildMI(BB, I, DL, TII->get(MovOpc), TempRegMaskReg).addReg(MaskReg);
+    // //s_mov_b64 s[2:3], exec
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
-        .addImm(InitalValue);//s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
+        .addImm(InitalValue); // s_mov_b32 s4, 0 | %17:sgpr_32 = S_MOV_B32 0
     // clang-format off
     BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
         .addMBB(ComputeLoop);
@@ -5061,21 +5066,36 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
 
     // Perform the computations
     unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
-    auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
-                   .addReg(ActiveBits->getOperand(0).getReg());//%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
-    auto LaneValue = BuildMI(*ComputeLoop, I, DL,
-                             TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
-                         .addReg(SrcReg)
-                         .addReg(FF1->getOperand(0).getReg());//%value_at_lane_index.sreg = V_READLANE %value.vgpr %index.sgpr
-    auto MaskLaneValue = BuildMI(*ComputeLoop, I, DL,
-                          TII->get(AMDGPU::V_READLANE_B32), MaskLaneValueReg)
-                      .addReg(MaskReg)
-                      .addReg(FF1->getOperand(0).getReg());//%mask_at_lane_index.sreg = V_READLANE %mask.vgpr %index.sgpr
-    auto FF2 = BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
-                      .addReg(MaskLaneValue->getOperand(0).getReg());//%subgroupindex.sgpr = S_FF1_I32_B64 %mask_at_lane_index.sreg
-    auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
-                              .addReg(Accumulator->getOperand(0).getReg())
-                              .addReg(LaneValue->getOperand(0).getReg());//%acc.sgpr = max %acc.sgpr %value_at_lane_index.sreg
+    auto FF1 =
+        BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+            .addReg(
+                ActiveBits->getOperand(0)
+                    .getReg()); //%index.sgpr = S_FF1_I32_B64 %exec_copy.sreg
+    auto LaneValue =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                LaneValueReg)
+            .addReg(SrcReg)
+            .addReg(FF1->getOperand(0)
+                        .getReg()); //%value_at_lane_index.sreg = V_READLANE
+                                    //%value.vgpr %index.sgpr
+    auto MaskLaneValue =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
+                MaskLaneValueReg)
+            .addReg(MaskReg)
+            .addReg(FF1->getOperand(0)
+                        .getReg()); //%mask_at_lane_index.sreg = V_READLANE
+                                    //%mask.vgpr %index.sgpr
+    auto FF2 =
+        BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_FF1_I32_B64), FF1Reg)
+            .addReg(MaskLaneValue->getOperand(0)
+                        .getReg()); //%subgroupindex.sgpr = S_FF1_I32_B64
+                                    //%mask_at_lane_index.sreg
+    auto NewAccumulator =
+        BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+            .addReg(Accumulator->getOperand(0).getReg())
+            .addReg(
+                LaneValue->getOperand(0).getReg()); //%acc.sgpr = max %acc.sgpr
+                                                    //%value_at_lane_index.sreg
 
     // Manipulate the iterator to get the next active lane
     unsigned BITSETOpc =
@@ -5083,7 +5103,8 @@ static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
     auto NewActiveBits =
         BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
             .addReg(FF1->getOperand(0).getReg())
-            .addReg(ActiveBits->getOperand(0).getReg());//%bitsetresult = S_BITSET0_B64 %exec_copy
+            .addReg(ActiveBits->getOperand(0)
+                        .getReg()); //%bitsetresult = S_BITSET0_B64 %exec_copy
 
     // Add phi nodes
     Accumulator.addReg(NewAccumulator->getOperand(0).getReg())

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant