[AArch64][GlobalISel] Perfect Shuffles #106446
base: main
Conversation
@llvm/pr-subscribers-backend-spir-v @llvm/pr-subscribers-llvm-globalisel

Author: David Green (davemgreen)

Changes: This is a port of the existing perfect shuffle generation code from SDAG, following the same structure. I wrote it a while ago and it has been sitting around. It brings the codegen for shuffles inline and avoids the need to generate a tbl and constant pool load.

Patch is 33.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/106446.diff

14 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 92e05ee858a755..2a4121e0be1d46 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -1302,6 +1302,24 @@ class MachineIRBuilder {
const SrcOp &Elt,
const SrcOp &Idx);
+ /// Build and insert \p Res = G_INSERT_VECTOR_ELT \p Val, \p Elt, \p Idx
+ ///
+ /// \pre setBasicBlock or setMI must have been called.
+ /// \pre \p Res must be a generic virtual register with scalar type.
+ /// \pre \p Val must be a generic virtual register with vector type.
+ /// \pre \p Elt must be a generic virtual register with scalar type.
+ ///
+ /// \return The newly created instruction.
+ MachineInstrBuilder buildInsertVectorElementConstant(const DstOp &Res,
+ const SrcOp &Val,
+ const SrcOp &Elt,
+ const int Idx) {
+ auto TLI = getMF().getSubtarget().getTargetLowering();
+ unsigned VecIdxWidth = TLI->getVectorIdxTy(getDataLayout()).getSizeInBits();
+ return buildInsertVectorElement(
+ Res, Val, Elt, buildConstant(LLT::scalar(VecIdxWidth), Idx));
+ }
+
/// Build and insert \p Res = G_EXTRACT_VECTOR_ELT \p Val, \p Idx
///
/// \pre setBasicBlock or setMI must have been called.
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 10cad6d1924407..3541ca423c0d1d 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -129,6 +129,13 @@ def shuf_to_ins: GICombineRule <
(apply [{ applyINS(*${root}, MRI, B, ${matchinfo}); }])
>;
+def perfect_shuffle: GICombineRule <
+ (defs root:$root),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchPerfectShuffle(*${root}, MRI); }]),
+ (apply [{ applyPerfectShuffle(*${root}, MRI, B); }])
+>;
+
def vashr_vlshr_imm_matchdata : GIDefMatchData<"int64_t">;
def vashr_vlshr_imm : GICombineRule<
(defs root:$root, vashr_vlshr_imm_matchdata:$matchinfo),
@@ -147,7 +154,8 @@ def form_duplane : GICombineRule <
>;
def shuffle_vector_lowering : GICombineGroup<[dup, rev, ext, zip, uzp, trn,
- form_duplane, shuf_to_ins]>;
+ form_duplane, shuf_to_ins,
+ perfect_shuffle]>;
// Turn G_UNMERGE_VALUES -> G_EXTRACT_VECTOR_ELT's
def vector_unmerge_lowering : GICombineRule <
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e31a27e9428e8a..2100d5d87e90bc 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12090,25 +12090,6 @@ static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1,
unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
- enum {
- OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
- OP_VREV,
- OP_VDUP0,
- OP_VDUP1,
- OP_VDUP2,
- OP_VDUP3,
- OP_VEXT1,
- OP_VEXT2,
- OP_VEXT3,
- OP_VUZPL, // VUZP, left result
- OP_VUZPR, // VUZP, right result
- OP_VZIPL, // VZIP, left result
- OP_VZIPR, // VZIP, right result
- OP_VTRNL, // VTRN, left result
- OP_VTRNR, // VTRN, right result
- OP_MOVLANE // Move lane. RHSID is the lane to move into
- };
-
if (OpNum == OP_COPY) {
if (LHSID == (1 * 9 + 2) * 9 + 3)
return LHS;
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 7b044cf7c238fd..3691b4787b1b66 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6588,6 +6588,25 @@ static const unsigned PerfectShuffleTable[6561 + 1] = {
835584U, // <u,u,u,u>: Cost 0 copy LHS
0};
+enum {
+ OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+ OP_VREV,
+ OP_VDUP0,
+ OP_VDUP1,
+ OP_VDUP2,
+ OP_VDUP3,
+ OP_VEXT1,
+ OP_VEXT2,
+ OP_VEXT3,
+ OP_VUZPL, // VUZP, left result
+ OP_VUZPR, // VUZP, right result
+ OP_VZIPL, // VZIP, left result
+ OP_VZIPR, // VZIP, right result
+ OP_VTRNL, // VTRN, left result
+ OP_VTRNR, // VTRN, right result
+ OP_MOVLANE // Move lane. RHSID is the lane to move into
+};
+
inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
assert(M.size() == 4 && "Expected a 4 entry perfect shuffle");
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 77b8cbe5793c39..7e63c4e8a7fe9f 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -504,6 +504,189 @@ void applyINS(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}
+/// Match 4 elemental G_SHUFFLE_VECTOR
+bool matchPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ return MRI.getType(MI.getOperand(0).getReg()).getNumElements() == 4;
+}
+
+static Register GeneratePerfectShuffle(unsigned ID, Register V1, Register V2,
+ unsigned PFEntry, Register LHS,
+ Register RHS, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ unsigned OpNum = (PFEntry >> 26) & 0x0F;
+ unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+ unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+ if (OpNum == OP_COPY) {
+ if (LHSID == (1 * 9 + 2) * 9 + 3)
+ return LHS;
+ assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+ return RHS;
+ }
+
+ if (OpNum == OP_MOVLANE) {
+ // Decompose a PerfectShuffle ID to get the Mask for lane Elt
+ auto getPFIDLane = [](unsigned ID, int Elt) -> int {
+ assert(Elt < 4 && "Expected Perfect Lanes to be less than 4");
+ Elt = 3 - Elt;
+ while (Elt > 0) {
+ ID /= 9;
+ Elt--;
+ }
+ return (ID % 9 == 8) ? -1 : ID % 9;
+ };
+
+ // For OP_MOVLANE shuffles, the RHSID represents the lane to move into. We
+ // get the lane to move from the PFID, which is always from the
+ // original vectors (V1 or V2).
+ Register OpLHS = GeneratePerfectShuffle(
+ LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS, RHS, MIB, MRI);
+ LLT VT = MRI.getType(OpLHS);
+ assert(RHSID < 8 && "Expected a lane index for RHSID!");
+ unsigned ExtLane = 0;
+ Register Input;
+
+ // OP_MOVLANE are either D movs (if bit 0x4 is set) or S movs. D movs
+ // convert into a higher type.
+ if (RHSID & 0x4) {
+ int MaskElt = getPFIDLane(ID, (RHSID & 0x01) << 1) >> 1;
+ if (MaskElt == -1)
+ MaskElt = (getPFIDLane(ID, ((RHSID & 0x01) << 1) + 1) - 1) >> 1;
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 2 ? MaskElt : (MaskElt - 2);
+ Input = MaskElt < 2 ? V1 : V2;
+ if (VT.getScalarSizeInBits() == 16 && VT != LLT::fixed_vector(2, 32)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 32)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 32)}, {OpLHS})
+ .getReg(0);
+ }
+ if (VT.getScalarSizeInBits() == 32 && VT != LLT::fixed_vector(2, 64)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 64)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(2, 64)}, {OpLHS})
+ .getReg(0);
+ }
+ } else {
+ int MaskElt = getPFIDLane(ID, RHSID);
+ assert(MaskElt >= 0 && "Didn't expect an undef movlane index!");
+ ExtLane = MaskElt < 4 ? MaskElt : (MaskElt - 4);
+ Input = MaskElt < 4 ? V1 : V2;
+ if (VT.getScalarSizeInBits() == 16 && VT != LLT::fixed_vector(4, 16)) {
+ Input = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(4, 16)}, {Input})
+ .getReg(0);
+ OpLHS = MIB.buildInstr(TargetOpcode::G_BITCAST,
+ {LLT::fixed_vector(4, 16)}, {OpLHS})
+ .getReg(0);
+ }
+ }
+ auto Ext = MIB.buildExtractVectorElementConstant(
+ MRI.getType(Input).getElementType(), Input, ExtLane);
+ auto Ins = MIB.buildInsertVectorElementConstant(MRI.getType(Input), OpLHS,
+ Ext, RHSID & 0x3);
+ if (MRI.getType(Ins.getReg(0)) != VT)
+ return MIB.buildInstr(TargetOpcode::G_BITCAST, {VT}, {Ins}).getReg(0);
+ return Ins.getReg(0);
+ }
+
+ Register OpLHS, OpRHS;
+ OpLHS = GeneratePerfectShuffle(LHSID, V1, V2, PerfectShuffleTable[LHSID], LHS,
+ RHS, MIB, MRI);
+ OpRHS = GeneratePerfectShuffle(RHSID, V1, V2, PerfectShuffleTable[RHSID], LHS,
+ RHS, MIB, MRI);
+ LLT VT = MRI.getType(OpLHS);
+
+ switch (OpNum) {
+ default:
+ llvm_unreachable("Unknown shuffle opcode!");
+ case OP_VREV: {
+ // VREV divides the vector in half and swaps within the half.
+ unsigned Opcode = VT.getScalarSizeInBits() == 32 ? AArch64::G_REV64
+ : VT.getScalarSizeInBits() == 16 ? AArch64::G_REV32
+ : AArch64::G_REV16;
+ return MIB.buildInstr(Opcode, {VT}, {OpLHS}).getReg(0);
+ }
+ case OP_VDUP0:
+ case OP_VDUP1:
+ case OP_VDUP2:
+ case OP_VDUP3: {
+ unsigned Opcode;
+ if (VT.getScalarSizeInBits() == 8)
+ Opcode = AArch64::G_DUPLANE8;
+ else if (VT.getScalarSizeInBits() == 16)
+ Opcode = AArch64::G_DUPLANE16;
+ else if (VT.getScalarSizeInBits() == 32)
+ Opcode = AArch64::G_DUPLANE32;
+ else if (VT.getScalarSizeInBits() == 64)
+ Opcode = AArch64::G_DUPLANE64;
+ else
+ llvm_unreachable("Invalid vector element type?");
+
+ if (VT.getSizeInBits() == 64)
+ OpLHS = MIB.buildConcatVectors(
+ VT.changeElementCount(VT.getElementCount() * 2),
+ {OpLHS, MIB.buildUndef(VT).getReg(0)})
+ .getReg(0);
+ Register Lane =
+ MIB.buildConstant(LLT::scalar(64), OpNum - OP_VDUP0).getReg(0);
+ return MIB.buildInstr(Opcode, {VT}, {OpLHS, Lane}).getReg(0);
+ }
+ case OP_VEXT1:
+ case OP_VEXT2:
+ case OP_VEXT3: {
+ unsigned Imm = (OpNum - OP_VEXT1 + 1) * VT.getScalarSizeInBits() / 8;
+ return MIB
+ .buildInstr(AArch64::G_EXT, {VT},
+ {OpLHS, OpRHS, MIB.buildConstant(LLT::scalar(64), Imm)})
+ .getReg(0);
+ }
+ case OP_VUZPL:
+ return MIB.buildInstr(AArch64::G_UZP1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VUZPR:
+ return MIB.buildInstr(AArch64::G_UZP2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VZIPL:
+ return MIB.buildInstr(AArch64::G_ZIP1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VZIPR:
+ return MIB.buildInstr(AArch64::G_ZIP2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VTRNL:
+ return MIB.buildInstr(AArch64::G_TRN1, {VT}, {OpLHS, OpRHS}).getReg(0);
+ case OP_VTRNR:
+ return MIB.buildInstr(AArch64::G_TRN2, {VT}, {OpLHS, OpRHS}).getReg(0);
+ }
+}
+
+void applyPerfectShuffle(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &Builder) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ assert(ShuffleMask.size() == 4 && "Expected 4 element mask");
+
+ unsigned PFIndexes[4];
+ for (unsigned i = 0; i != 4; ++i) {
+ if (ShuffleMask[i] < 0)
+ PFIndexes[i] = 8;
+ else
+ PFIndexes[i] = ShuffleMask[i];
+ }
+
+ // Compute the index in the perfect shuffle table.
+ unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+ PFIndexes[2] * 9 + PFIndexes[3];
+ unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+ Register Res = GeneratePerfectShuffle(PFTableIndex, LHS, RHS, PFEntry, LHS,
+ RHS, Builder, MRI);
+ Builder.buildCopy(Dst, Res);
+ MI.eraseFromParent();
+}
+
/// isVShiftRImm - Check if this is a valid vector for the immediate
/// operand of a vector shift right operation. The value must be in the range:
/// 1 <= Value <= ElementBits for a right shift.
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
index 9d12c3c32c7f8b..c426bb957d704d 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-shuffle-splat.mir
@@ -280,8 +280,16 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s32>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[DEF]], [[COPY]](s32), [[C]](s64)
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[IVEC]](<4 x s32>), [[DEF]], shufflemask(undef, 0, 0, 3)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C1]](s64)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; CHECK-NEXT: [[IVEC1:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[EVEC]](s32), [[C2]](s64)
+ ; CHECK-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC1:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[IVEC]](<4 x s32>), [[C3]](s64)
+ ; CHECK-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC2:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[IVEC1]], [[EVEC1]](s32), [[C4]](s64)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC2]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY1]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(s32) = COPY $s0
%2:_(<4 x s32>) = G_IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
index d1d5c6c29ba0df..d466df2a55c537 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-uzp.mir
@@ -67,8 +67,10 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(0, 1, 4, 6)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[UZP1_:%[0-9]+]]:_(<4 x s32>) = G_UZP1 [[ZIP1_]], [[COPY1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[UZP1_]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
@@ -92,8 +94,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(1, 4, 5, 7)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[UZP2_:%[0-9]+]]:_(<4 x s32>) = G_UZP2 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY1]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[UZP2_]], [[EVEC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
index bcf088287f46ae..afd5eaa8867bc4 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/postlegalizer-lowering-zip.mir
@@ -220,8 +220,13 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
- ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(3, 4, 1, 5)
- ; CHECK-NEXT: $q0 = COPY [[SHUF]](<4 x s32>)
+ ; CHECK-NEXT: [[ZIP1_:%[0-9]+]]:_(<4 x s32>) = G_ZIP1 [[COPY]], [[COPY1]]
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s32>), [[C]](s64)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+ ; CHECK-NEXT: [[IVEC:%[0-9]+]]:_(<4 x s32>) = G_INSERT_VECTOR_ELT [[ZIP1_]], [[EVEC]](s32), [[C1]](s64)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<4 x s32>) = COPY [[IVEC]](<4 x s32>)
+ ; CHECK-NEXT: $q0 = COPY [[COPY2]](<4 x s32>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%0:_(<4 x s32>) = COPY $q0
%1:_(<4 x s32>) = COPY $q1
diff --git a/llvm/test/CodeGen/AArch64/arm64-dup.ll b/llvm/test/CodeGen/AArch64/arm64-dup.ll
index 2bf5419e54830b..50520ce6987fab 100644
--- a/llvm/test/CodeGen/AArch64/arm64-dup.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-dup.ll
@@ -408,85 +408,45 @@ entry:
; Also test the DUP path in the PerfectShuffle generator.
define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4h v0, v0, v0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov.s v0[1], v1[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI33_0
-; CHECK-GI-NEXT: mov.d v0[1], v1[0]
-; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
-; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: trn1.4h v0, v0, v0
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: mov.s v0[1], v1[0]
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEXT: ret
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
ret <4 x i16> %r
}
define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
-; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-SD: // %bb.0:
-; CHECK-SD-NEXT: trn1.4h v0, v0, v0
-; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT: mov.s v0[1], v1[0]
-; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
-; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT: adrp x8, .LCPI34_0
-; CHECK-G...
[truncated]
@@ -129,6 +129,13 @@ def shuf_to_ins: GICombineRule <
(apply [{ applyINS(*${root}, MRI, B, ${matchinfo}); }])
>;

def perfect_shuffle: GICombineRule <
(defs root:$root),
(match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
Please avoid using wip_match_opcode.
Brilliant, thanks for doing this.
Reverse ping @davemgreen
Hi - I will hopefully get back to addressing this soon - I have been going through some other stuff recently (fp128, some of the missing combines and inefficiencies in how we lower small vectors).
19c2a5f to 80a233b
80a233b to 1b93362
This comment was marked as off-topic.
1b93362 to 79d7da7
Rebase and ping - thanks.
const SrcOp &Val,
const SrcOp &Elt,
const int Idx) {
auto TLI = getMF().getSubtarget().getTargetLowering();
no auto
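A minimal reading of this nit — the deduced type spelled out explicitly (sketch only, otherwise the same code as in the patch):

const TargetLowering *TLI = getMF().getSubtarget().getTargetLowering();
unsigned VecIdxWidth = TLI->getVectorIdxTy(getDataLayout()).getSizeInBits();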
const SrcOp &Elt,
const int Idx) {
auto TLI = getMF().getSubtarget().getTargetLowering();
unsigned VecIdxWidth = TLI->getVectorIdxTy(getDataLayout()).getSizeInBits();
Should have a version of this that directly returns LLT.
I'm also not sure how I feel about requiring TLI in these low level build functions. For the most part, you could just use i64 and let it legalize as required (but this does present an issue after legalization)
From llvm#106446, this adds a variant of getVectorIdxTy that returns an LLT. Many uses only look at the width, so a getVectorIdxWidth was added as the common base.
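A rough sketch of where that leads for the builder helper added in this PR. The name getVectorIdxLLT below is a guess at the LLT-returning variant, not necessarily what the referenced commit calls it:

MachineInstrBuilder buildInsertVectorElementConstant(const DstOp &Res,
                                                     const SrcOp &Val,
                                                     const SrcOp &Elt,
                                                     const int Idx) {
  // Assumed helper: hands back the vector index type directly as an LLT,
  // avoiding the EVT -> bit-width round trip in the current patch.
  const TargetLowering *TLI = getMF().getSubtarget().getTargetLowering();
  LLT VecIdxTy = TLI->getVectorIdxLLT(getDataLayout());
  return buildInsertVectorElement(Res, Val, Elt, buildConstant(VecIdxTy, Idx));
}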
79d7da7 to b5b8c57
✅ With the latest revision this PR passed the C/C++ code formatter.
b5b8c57 to d30a8d6
d30a8d6 to ae8a72e
I've rebased this over the other commit that was included in it - Thanks.
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtSrc.getValueType().getVectorElementType(),
ExtSrc, DAG.getVectorIdxConstant(ExtLane, dl));
SDValue Ins =
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtSrc.getValueType(), InsSrc,
Ext, DAG.getVectorIdxConstant(InsLane, dl));
can query the index type once instead of repeating it for each of these cases
auto Ext = MIB.buildExtractVectorElementConstant(
MRI.getType(ExtSrc).getElementType(), ExtSrc, ExtLane);
auto Ins = MIB.buildInsertVectorElementConstant(MRI.getType(ExtSrc), InsSrc,
Ext, InsLane);
ditto
LLT Ty = MRI.getType(OpLHS);
switch (OpNum) {
default:
llvm_unreachable("Unexpected perfect shuffle opcode\n");
Suggested change:
- llvm_unreachable("Unexpected perfect shuffle opcode\n");
+ llvm_unreachable("Unexpected perfect shuffle opcode");
ae8a72e to ee1e2e4
This is a port of the existing perfect shuffle generation code from SDAG, genericized to work for both SDAG and GISel. I wrote it a while ago and it has been sitting on my machine. It brings the codegen for certain shuffles inline and avoids the need to generate a tbl and constant pool load.
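For readers new to the perfect-shuffle tables, here is a small standalone sketch of the encoding that both the SDAG and GISel paths rely on. The helper names are illustrative (they are not part of the patch), but the arithmetic mirrors applyPerfectShuffle and GeneratePerfectShuffle above:

// A 4-lane shuffle mask is encoded base-9: values 0-7 pick a lane from the
// two source vectors, 8 stands for undef. The resulting index selects a
// precomputed PerfectShuffleTable entry.
unsigned getPerfectShuffleTableIndex(const int Mask[4]) {
  unsigned PFIndexes[4];
  for (unsigned I = 0; I != 4; ++I)
    PFIndexes[I] = Mask[I] < 0 ? 8 : Mask[I];
  return PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 + PFIndexes[2] * 9 +
         PFIndexes[3];
}

// Each table entry packs an operation kind plus two nested sub-shuffle IDs,
// which the generator expands recursively into dup/ext/zip/uzp/trn/mov-lane.
void decodePerfectShuffleEntry(unsigned PFEntry, unsigned &OpNum,
                               unsigned &LHSID, unsigned &RHSID) {
  OpNum = (PFEntry >> 26) & 0x0F;
  LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
  RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
}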
ee1e2e4 to c9a1c0a
I know nothing of aarch64 so I just have more nits
InsSrc = MIB.buildBitcast(LLT::fixed_vector(2, 32), InsSrc).getReg(0);
}
auto Ext = MIB.buildExtractVectorElement(
MRI.getType(ExtSrc).getElementType(), ExtSrc,
Avoid repeated getTypes
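One way to read that nit — hoist the type query so it is done once (sketch; the variable name is mine):

LLT ExtSrcTy = MRI.getType(ExtSrc); // query once, reuse for both builds
auto Ext = MIB.buildExtractVectorElementConstant(ExtSrcTy.getElementType(),
                                                 ExtSrc, ExtLane);
auto Ins = MIB.buildInsertVectorElementConstant(ExtSrcTy, InsSrc, Ext, InsLane);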
switch (OpNum) {
default:
llvm_unreachable("Unexpected perfect shuffle opcode");
case OP_VUZPL:
Opc = AArch64::G_UZP1;
break;
case OP_VUZPR:
Opc = AArch64::G_UZP2;
break;
case OP_VZIPL:
Opc = AArch64::G_ZIP1;
break;
case OP_VZIPR:
Opc = AArch64::G_ZIP2;
break;
case OP_VTRNL:
Opc = AArch64::G_TRN1;
break;
case OP_VTRNR:
Opc = AArch64::G_TRN2;
}
Move to standalone utility function with return instead of conditional assign and break
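A sketch of that suggested refactor — a small helper that returns the opcode directly instead of assigning and breaking (the function name is illustrative):

static unsigned getPerfectShuffleBinOpcode(unsigned OpNum) {
  switch (OpNum) {
  case OP_VUZPL: return AArch64::G_UZP1;
  case OP_VUZPR: return AArch64::G_UZP2;
  case OP_VZIPL: return AArch64::G_ZIP1;
  case OP_VZIPR: return AArch64::G_ZIP2;
  case OP_VTRNL: return AArch64::G_TRN1;
  case OP_VTRNR: return AArch64::G_TRN2;
  default: llvm_unreachable("Unexpected perfect shuffle opcode");
  }
}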