[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 #142188

jwanggit86 · 2025-05-30T17:40:53Z

In GFX9 and GFX10, the op_sel modifier should be allowed in the instructions v_align_bit_b32 and v_alignbyte_b32.

llvmbot · 2025-05-30T17:41:15Z

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes

In GFX9 and GFX10, the op_sel modifier should be allowed in the instructions v_align_bit_b32 and v_alignbyte_b32.

Full diff: https://github.com/llvm/llvm-project/pull/142188.diff

5 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+22)
(modified) llvm/test/MC/AMDGPU/gfx10_asm_vop3.s (+24)
(modified) llvm/test/MC/AMDGPU/gfx7_err_pos.s (+13)
(modified) llvm/test/MC/AMDGPU/gfx8_err_pos.s (+10)
(modified) llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s (+24)

diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0252c4f1b0929..defd203e6eec3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -218,6 +218,11 @@ defm V_ALIGNBIT_B32 : VOP3Inst_t16_with_profiles <"v_alignbit_b32",
                                                    fshr, null_frag>;
 
 defm V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+// In gfx9 and 10, opsel is allowed for V_ALIGNBIT_B32 and V_ALIGNBYTE_B32
+defm V_ALIGNBIT_B32_opsel : VOP3Inst <"v_alignbit_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+defm V_ALIGNBYTE_B32_opsel : VOP3Inst <"v_alignbyte_b32_opsel", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_OPSEL>>;
+
 let True16Predicate = UseRealTrue16Insts in
 defm V_ALIGNBYTE_B32_t16 : VOP3Inst <"v_alignbyte_b32_t16", VOP3_Profile_True16<VOP_I32_I32_I32_I16, VOP3_OPSEL>>;
 let True16Predicate = UseFakeTrue16Insts in
@@ -1863,6 +1868,9 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
   }
 } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
 
+defm V_ALIGNBIT_B32_opsel  : VOP3OpSel_Real_gfx10_with_name<0x14e, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel  : VOP3OpSel_Real_gfx10_with_name<0x14f, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
 defm V_READLANE_B32  : VOP3_Real_No_Suffix_gfx10<0x360>;
 
 let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
@@ -2159,6 +2167,17 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
   }
 }
 
+// Instructions such as v_alignbyte_b32 allows op_sel in gfx9, but not in vi.
+// The following is created to support that.
+multiclass VOP3OpSel_Real_gfx9_with_names<bits<10> op, string opName, string AsmName> {
+  defvar psName = opName#"_e64";
+  def _gfx9 : VOP3_Real<!cast<VOP3_Pseudo>(psName), SIEncodingFamily.VI>, // note: encoding family is VI
+            VOP3OpSel_gfx9 <op, !cast<VOP3_Pseudo>(psName).Pfl> {
+              VOP3_Pseudo ps = !cast<VOP3_Pseudo>(psName);
+              let AsmString = AsmName # ps.AsmOperands;
+            }
+}
+
 } // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
 
 defm V_MAD_U64_U32      : VOP3be_Real_vi <0x1E8>;
@@ -2224,6 +2243,9 @@ defm V_INTERP_P2_LEGACY_F16 : VOP3Interp_F16_Real_gfx9 <0x276, "V_INTERP_P2_F16"
 defm V_MAD_LEGACY_U16       : VOP3_F16_Real_gfx9 <0x1eb, "V_MAD_U16",       "v_mad_legacy_u16">;
 defm V_MAD_LEGACY_I16       : VOP3_F16_Real_gfx9 <0x1ec, "V_MAD_I16",       "v_mad_legacy_i16">;
 
+defm V_ALIGNBIT_B32_opsel   : VOP3OpSel_Real_gfx9_with_names <0x1ce, "V_ALIGNBIT_B32_opsel", "v_alignbit_b32">;
+defm V_ALIGNBYTE_B32_opsel  : VOP3OpSel_Real_gfx9_with_names <0x1cf, "V_ALIGNBYTE_B32_opsel", "v_alignbyte_b32">;
+
 defm V_MAD_F16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x203, "v_mad_f16">;
 defm V_MAD_U16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x204, "v_mad_u16">;
 defm V_MAD_I16_gfx9         : VOP3OpSel_F16_Real_gfx9 <0x205, "v_mad_i16">;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
index c151bf99b76c5..73055a67b0f46 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop3.s
@@ -3628,6 +3628,18 @@ v_alignbit_b32 v5, v1, v2, exec_lo
 v_alignbit_b32 v5, v1, v2, exec_hi
 // GFX10: encoding: [0x05,0x00,0x4e,0xd5,0x01,0x05,0xfe,0x01]
 
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// GFX10: v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4e,0xd5,0x01,0x05,0x0e,0x04]
+
 v_alignbyte_b32 v5, v1, v2, v3
 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0x0e,0x04]
 
@@ -3715,6 +3727,18 @@ v_alignbyte_b32 v5, v1, v2, exec_lo
 v_alignbyte_b32 v5, v1, v2, exec_hi
 // GFX10: encoding: [0x05,0x00,0x4f,0xd5,0x01,0x05,0xfe,0x01]
 
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// GFX10: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0x4f,0xd5,0x01,0x05,0x0e,0x04]
+
 v_mullit_f32 v5, v1, v2, v3
 // GFX10: encoding: [0x05,0x00,0x50,0xd5,0x01,0x05,0x0e,0x04]
 
diff --git a/llvm/test/MC/AMDGPU/gfx7_err_pos.s b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
index 9dcbd4a4074af..7b6b241e04707 100644
--- a/llvm/test/MC/AMDGPU/gfx7_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx7_err_pos.s
@@ -44,3 +44,16 @@ s_load_dword s5, s[2:3], glc
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: cache policy is not supported for SMRD instructions
 // CHECK-NEXT:{{^}}s_load_dword s5, s[2:3], glc
 // CHECK-NEXT:{{^}}                         ^
+
+//==============================================================================
+// not a valid operand
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}}                              ^
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}}                               ^
diff --git a/llvm/test/MC/AMDGPU/gfx8_err_pos.s b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
index 1e8457d54049a..a475c739e690d 100644
--- a/llvm/test/MC/AMDGPU/gfx8_err_pos.s
+++ b/llvm/test/MC/AMDGPU/gfx8_err_pos.s
@@ -49,3 +49,13 @@ v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERV
 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
 // CHECK-NEXT:{{^}}v_cndmask_b32_sdwa v5, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:BYTE_0 src1_sel:WORD_0
 // CHECK-NEXT:{{^}}                           ^
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}}                              ^
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+// CHECK-NEXT:{{^}}v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK-NEXT:{{^}}                               ^
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
index f3f4cae22538a..a1cd9ce8ef18e 100644
--- a/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop3_e64.s
@@ -2829,6 +2829,18 @@ v_alignbit_b32 v5, v1, v2, src_execz
 v_alignbit_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xce,0xd1,0x01,0x05,0xf6,0x03]
 
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x08,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x18,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x38,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbit_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+// CHECK: [0x05,0x78,0xce,0xd1,0x01,0x05,0x0e,0x04]
+
 v_alignbyte_b32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0x0e,0x04]
 
@@ -3000,6 +3012,18 @@ v_alignbyte_b32 v5, v1, v2, src_execz
 v_alignbyte_b32 v5, v1, v2, src_scc
 // CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x05,0xf6,0x03]
 
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,0,0,0] ; encoding: [0x05,0x08,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,0,0] ; encoding: [0x05,0x18,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,0] ; encoding: [0x05,0x38,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
+v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1]
+// CHECK: v_alignbyte_b32 v5, v1, v2, v3 op_sel:[1,1,1,1] ; encoding: [0x05,0x78,0xcf,0xd1,0x01,0x05,0x0e,0x04]
+
 v_min3_f32 v5, v1, v2, v3
 // CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]

rampitec

Hm... I see it is supported, but what does it do in a b32 instruction?

Sisyph

The types are not uniform across gfx9 and gfx10. For example, on gfx1010 and gfx900 src2 is 32 bits and on gfx1030 and gfx90a src2 is 16 bits. And the compiler won't be able to make use of opsel without true16 or some intrinsic change.
Why do you want to do this?

rampitec · 2025-05-30T18:13:49Z

The types are not uniform across gfx9 and gfx10. For example, on gfx1010 and gfx900 src2 is 32 bits and on gfx1030 and gfx90a src2 is 16 bits. And the compiler won't be able to make use of opsel without true16 or some intrinsic change. Why do you want to do this?

So this is for src2 only? This is confusing though as ISA spec says:

D0.u32 = 32'U(({ S0.u32, S1.u32 } >> S2.u32[4 : 0]) & 0xffffffffLL)

I.e., [4:0] is explicit w/o any mention of the opsel.

jwanggit86 · 2025-05-30T18:22:17Z

For issue 38650.

rampitec

Ok, I see this is supported: V_ALIGNBIT_B32 v1, v2, v3, v4.h.
I guess S2 in this case is v4.h.
LGTM.

Sisyph

How do you intend to address the fact that some versions of gfx9 and gfx10 claim to op_sel and some do not?

What will the behavior be when op_sel is applied to 32 bit registers?

The issue you claim #38650 only suggests the op_sel might be supported, it is not a justification. How can this new support be used?

jwanggit86 · 2025-05-30T19:04:06Z

The types are not uniform across gfx9 and gfx10. For example, on gfx1010 and gfx900 src2 is 32 bits and on gfx1030 and gfx90a src2 is 16 bits.

Where can I find this?

jwanggit86 · 2025-05-30T19:18:13Z

How do you intend to address the fact that some versions of gfx9 and gfx10 claim to op_sel and some do not?

Where can I find this in the specs?

What will the behavior be when op_sel is applied to 32 bit registers?

It's not clear from the documents. Maybe 0 for lower half and 1 for higher half?

The issue you claim #38650 only suggests the op_sel might be supported, it is not a justification. How can this new support be used?

This was mainly done from the assembler's point of view. Opsel is not allowed in those two instructions, but the docs seem to indicate it should be.

See related discussion for LLVM: llvm/llvm-project#142188

Sisyph · 2025-06-03T13:54:36Z

Discussed offline. The docs do seem permissive with regard to opsel being allowed on different subtargets. I still wonder what the behavior will be if opsel is applied to the various src0, src1, src2. Please let me know when there is an answer to that.

[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10

aa959a2

In GFX9 and GFX10, the op_sel modifier should be allowed in the instructions v_align_bit_b32 and v_alignbyte_b32.

jwanggit86 requested review from arsenm, kosarev and Sisyph May 30, 2025 17:40

jwanggit86 added backend:AMDGPU mc Machine (object) code labels May 30, 2025

Sisyph requested a review from rampitec May 30, 2025 18:04

rampitec reviewed May 30, 2025

View reviewed changes

Sisyph reviewed May 30, 2025

View reviewed changes

rampitec approved these changes May 30, 2025

View reviewed changes

Sisyph requested changes May 30, 2025

View reviewed changes

Sisyph requested a review from shiltian May 30, 2025 20:00

ms178 added a commit to ms178/archpkgbuilds that referenced this pull request Jun 1, 2025

Implement new opcodes for ACO

2804c49

See related discussion for LLVM: llvm/llvm-project#142188

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 #142188

[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 #142188

Uh oh!

jwanggit86 commented May 30, 2025

Uh oh!

llvmbot commented May 30, 2025 •

edited

Loading

Uh oh!

rampitec left a comment

Uh oh!

Sisyph left a comment

Uh oh!

rampitec commented May 30, 2025

Uh oh!

jwanggit86 commented May 30, 2025

Uh oh!

rampitec left a comment

Uh oh!

Sisyph left a comment

Uh oh!

jwanggit86 commented May 30, 2025 •

edited

Loading

Uh oh!

jwanggit86 commented May 30, 2025

Uh oh!

Sisyph commented Jun 3, 2025

Uh oh!

Uh oh!

[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 #142188

Are you sure you want to change the base?

[AMDGPU][MC] Allow op_sel in v_alignbit_b32 etc in GFX9 and GFX10 #142188

Uh oh!

Conversation

jwanggit86 commented May 30, 2025

Uh oh!

llvmbot commented May 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

rampitec commented May 30, 2025

Uh oh!

jwanggit86 commented May 30, 2025

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

jwanggit86 commented May 30, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

jwanggit86 commented May 30, 2025

Uh oh!

Sisyph commented Jun 3, 2025

Uh oh!

Uh oh!

llvmbot commented May 30, 2025 •

edited

Loading

jwanggit86 commented May 30, 2025 •

edited

Loading