-
Notifications
You must be signed in to change notification settings - Fork 13.3k
[AMDGPU] add s_bitset[10]_b32 optimization for shl+[or,andn2] pattern #134155
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Baoshan (BaoshanPang) Changes: This PR is for #130245. Patch is 32.54 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/134155.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 73343e1c80f33..b0b0e46066bc7 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -15,6 +15,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOperand.h"
#define DEBUG_TYPE "si-shrink-instructions"
@@ -534,6 +535,56 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
MachineOperand *SrcReg = Src0;
MachineOperand *SrcImm = Src1;
+ // case 1:
+ // From:
+ // s_lshl_b32 s1, 1, s1
+ // s_or_b32 s0, s0, s1
+ // To:
+ // s_bitset1_b32 s0, s1
+ //
+ // case 2:
+ // s_lshl_b32 s1, 1, s1
+ // s_andn2_b32 s0, s0, s1
+ // To:
+ // s_bitset0_b32 s0, s1
+ if ((MI.getOpcode() == AMDGPU::S_OR_B32 ||
+ MI.getOpcode() == AMDGPU::S_ANDN2_B32) &&
+ Src0->isReg() && Src1->isReg() && Dest->getReg() == Src0->getReg()) {
+ MachineInstr *Shl = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Shl && Shl->getOpcode() == AMDGPU::S_LSHL_B32) {
+ MachineOperand &ShlDest = Shl->getOperand(0);
+ MachineOperand &ShlSrc0 = Shl->getOperand(1);
+ MachineOperand &ShlSrc1 = Shl->getOperand(2);
+ const bool IsUndef = SrcReg->isUndef();
+ const bool IsKill = SrcReg->isKill();
+ if (MI.getParent() == Shl->getParent() &&
+ MRI->hasAtMostUserInstrs(ShlDest.getReg(), 2) && ShlSrc0.isImm() &&
+ ShlSrc0.getImm() == 1 && ShlSrc1.isReg()) {
+ unsigned int NewOpc = (MI.getOpcode() == AMDGPU::S_OR_B32)
+ ? AMDGPU::S_BITSET1_B32
+ : AMDGPU::S_BITSET0_B32;
+ MI.setDesc(TII->get(NewOpc));
+ Src0->setReg(ShlSrc1.getReg());
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
+ /*isImp*/ false, IsKill,
+ /*isDead*/ false, IsUndef);
+ MI.tieOperands(0, 2);
+ for (auto IE = MI.getIterator(), I = std::next(Shl->getIterator());
+ I != IE; ++I) {
+ for (MachineOperand &MO : I->operands()) {
+ if (MO.isReg() && MO.getReg() == ShlSrc1.getReg()) {
+ if (MO.isKill())
+ MO.setIsKill(false);
+ }
+ }
+ }
+ Shl->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+ }
+
if (!SrcImm->isImm() ||
AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
return false;
@@ -577,8 +628,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
const bool IsUndef = SrcReg->isUndef();
const bool IsKill = SrcReg->isKill();
MI.setDesc(TII->get(Opc));
- if (Opc == AMDGPU::S_BITSET0_B32 ||
- Opc == AMDGPU::S_BITSET1_B32) {
+ if (Opc == AMDGPU::S_BITSET0_B32 || Opc == AMDGPU::S_BITSET1_B32) {
Src0->ChangeToImmediate(NewImm);
// Remove the immediate and add the tied input.
MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
@@ -945,6 +995,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) {
// Shrink scalar logic operations.
if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+ MI.getOpcode() == AMDGPU::S_ANDN2_B32 ||
MI.getOpcode() == AMDGPU::S_OR_B32 ||
MI.getOpcode() == AMDGPU::S_XOR_B32) {
if (shrinkScalarLogicOp(MI))
diff --git a/llvm/test/CodeGen/AMDGPU/130245.ll b/llvm/test/CodeGen/AMDGPU/130245.ll
new file mode 100644
index 0000000000000..54e7e0f022f59
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/130245.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+define amdgpu_ps i32 @s_bitset1_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset1_b32:
+; SI: ; %bb.0:
+; SI-NEXT: s_bitset1_b32 s0, s1
+; SI-NEXT: ; return to shader part epilog
+ %set.bit.at.index = shl i32 1, %bit.index
+ %or = or i32 %src0, %set.bit.at.index
+ ret i32 %or
+}
+
+define amdgpu_ps i32 @s_bitset0_b32(i32 inreg %src0, i32 inreg %bit.index) {
+; SI-LABEL: s_bitset0_b32:
+; SI: ; %bb.0:
+; SI-NEXT: s_bitset0_b32 s0, s1
+; SI-NEXT: ; return to shader part epilog
+ %set.bit.at.index = shl i32 1, %bit.index
+ %other.bits = xor i32 %set.bit.at.index, -1
+ %and = and i32 %src0, %other.bits
+ ret i32 %and
+}
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index 8319e112f526e..bada7eb3eba93 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -728,9 +728,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_bitset0_b32 s1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -813,9 +812,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_bitset0_b32 s1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1
@@ -898,9 +896,8 @@ define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_bitset0_b32 s1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -1120,9 +1117,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_bitset0_b32 s1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1211,9 +1207,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_bitset0_b32 s1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1
@@ -1301,9 +1296,8 @@ define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, p
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_bitset0_b32 s1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
@@ -2183,9 +2177,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX10W32-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX10W32-NEXT: s_ff1_i32_b32 s2, s1
; GFX10W32-NEXT: v_readlane_b32 s3, v0, s2
-; GFX10W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX10W32-NEXT: v_writelane_b32 v1, s0, s2
-; GFX10W32-NEXT: s_andn2_b32 s1, s1, s6
+; GFX10W32-NEXT: s_bitset0_b32 s1, s2
; GFX10W32-NEXT: s_add_i32 s0, s0, s3
; GFX10W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2268,9 +2261,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX11W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX11W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX11W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX11W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX11W32-NEXT: s_bitset0_b32 s1, s2
; GFX11W32-NEXT: s_add_i32 s0, s0, s3
; GFX11W32-NEXT: s_cmp_lg_u32 s1, 0
; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1
@@ -2354,9 +2346,8 @@ define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addr
; GFX12W32-NEXT: s_ctz_i32_b32 s2, s1
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: v_readlane_b32 s3, v1, s2
-; GFX12W32-NEXT: s_lshl_b32 s6, 1, s2
; GFX12W32-NEXT: v_writelane_b32 v0, s0, s2
-; GFX12W32-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX12W32-NEXT: s_bitset0_b32 s1, s2
; GFX12W32-NEXT: s_add_co_i32 s0, s0, s3
; GFX12W32-NEXT: s_wait_alu 0xfffe
; GFX12W32-NEXT: s_cmp_lg_u32 s1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 9775a37276dfd..15ad3c03a2b9c 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -620,9 +620,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -706,9 +705,8 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB2_1
@@ -4027,9 +4025,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1032_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -4113,9 +4110,8 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1132_ITERATIVE-NEXT: s_add_i32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB10_1
@@ -6436,9 +6432,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1032_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -6522,9 +6517,8 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1132_ITERATIVE-NEXT: s_and_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB15_1
@@ -7068,10 +7062,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3
; GFX1032_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
@@ -7166,10 +7159,9 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s2, s3
; GFX1132_ITERATIVE-NEXT: s_and_b64 s[0:1], s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB16_1
@@ -7797,9 +7789,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1032_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -7883,9 +7874,8 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1132_ITERATIVE-NEXT: s_or_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB17_1
@@ -8428,10 +8418,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3
; GFX1032_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
@@ -8526,10 +8515,9 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s6, v2, s3
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v1, s1, s3
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s3
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s2, s2, s8
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s2, s3
; GFX1132_ITERATIVE-NEXT: s_or_b64 s[0:1], s[0:1], s[6:7]
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB18_1
@@ -9157,9 +9145,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s2, s1
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s3, v0, s2
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s2
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s1, s1, s6
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1032_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -9243,9 +9230,8 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_ctz_i32_b32 s2, s1
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s3, v1, s2
-; GFX1132_ITERATIVE-NEXT: s_lshl_b32 s6, 1, s2
; GFX1132_ITERATIVE-NEXT: v_writelane_b32 v0, s0, s2
-; GFX1132_ITERATIVE-NEXT: s_and_not1_b32 s1, s1, s6
+; GFX1132_ITERATIVE-NEXT: s_bitset0_b32 s1, s2
; GFX1132_ITERATIVE-NEXT: s_xor_b32 s0, s0, s3
; GFX1132_ITERATIVE-NEXT: s_cmp_lg_u32 s1, 0
; GFX1132_ITERATIVE-NEXT: s_cbranch_scc1 .LBB19_1
@@ -9788,10 +9774,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1032_ITERATIVE-NEXT: s_ff1_i32_b32 s3, s2
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1032_ITERATIVE-NEXT: v_readlane_b32 s6, v0, s3
-; GFX1032_ITERATIVE-NEXT: s_lshl_b32 s8, 1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v2, s1, s3
; GFX1032_ITERATIVE-NEXT: v_writelane_b32 v1, s0, s3
-; GFX1032_ITERATIVE-NEXT: s_andn2_b32 s2, s2, s8
+; GFX1032_ITERATIVE-NEXT: s_bitset0_b32 s2, s3
; GFX1032_ITERATIVE-NEXT: s_xor_b64 s[0:1], s[0:1], s[6:7]
; GFX1032_ITERATIVE-NEXT: s_cmp_lg_u32 s2, 0
; GFX1032_ITERATIVE-NEXT: s_cbranch_scc1 .LBB20_1
@@ -9886,10 +9871,9 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
; GFX1132_ITERATIVE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s7, v3, s3
; GFX1132_ITERATIVE-NEXT: v_readlane_b32 s...
[truncated]
|
@arsenm Please help to review this PR. The PR is for 32-bit. I want to use a different PR for the 64-bit version if it is OK. |
The failing test case does not seem to be related to my change:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This more naturally belongs in a selection pattern
Do you mean I should do it in the .td file? |
Yes. This shouldn't require anything special that would require manual selection, or post-select folding (as you have here) |
I need help. This pattern is not working:
I want it to be used only when src and bit_idx are both SGPRs, but LLVM inserts a copy from VGPR to SGPR, and at a later stage LLVM also converts one SGPR back to a VGPR, which results in an error. What should I do in such a situation? |
I am able to find a way to do it. |
✅ With the latest revision this PR passed the C/C++ code formatter. |
@arsenm
Also, in later phases like RegBankSelect, some SGPRs would be converted to VGPRs, which increases the complexity:
Whereas if we do it in the post-select phase, this is all we would need to handle, which is much simpler and more straightforward:
|
With the knowledge I have so far, doing it in the SIShrinkInstructions pass is the easiest way. |
This is more difficult than selecting this as a tablegen pattern
This is not true
The opposite is true. One generic operation becomes many more complicated selected operations |
OK, I guess I need to get a better understanding of the system. For now, I will try to find some workable issues to fix. Please feel free to assign #130245 to others. |
This PR is for #130245