Skip to content

Commit

Permalink
[AArch64][GlobalISel] Combine MUL(AND(LSHR(X, 15), 0x10001), 0xffff)…
Browse files Browse the repository at this point in the history
… to CMLTz (#92915)

This patch mirrors the following SelectionDAG patch for GlobalISel:
https://reviews.llvm.org/D130874
  • Loading branch information
chuongg3 authored May 29, 2024
1 parent 9a28272 commit 23366d4
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 90 deletions.
11 changes: 10 additions & 1 deletion llvm/lib/Target/AArch64/AArch64Combine.td
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,14 @@ def or_to_bsp: GICombineRule <
(apply [{ applyOrToBSP(*${root}, MRI, B, ${matchinfo}); }])
>;

// Combines Mul(And(Srl(X, 15), 0x10001), 0xffff) into CMLTz.
// The rule is rooted at a G_MUL. matchCombineMulCMLT decides whether the root
// qualifies and, if so, records a register in $matchinfo;
// applyCombineMulCMLT then uses that register to build the replacement and
// erases the root.
def combine_mul_cmlt : GICombineRule<
(defs root:$root, register_matchinfo:$matchinfo),
(match (wip_match_opcode G_MUL):$root,
[{ return matchCombineMulCMLT(*${root}, MRI, ${matchinfo}); }]),
(apply [{ applyCombineMulCMLT(*${root}, MRI, B, ${matchinfo}); }])
>;

// Post-legalization combines which should happen at all optimization levels
// (e.g. ones that facilitate matching for the selector, such as matching
// pseudos).
Expand Down Expand Up @@ -296,5 +304,6 @@ def AArch64PostLegalizerCombiner
split_store_zero_128, undef_combines,
select_to_minmax, or_to_bsp, combine_concat_vector,
commute_constant_to_rhs,
push_freeze_to_prevent_poison_from_propagating]> {
push_freeze_to_prevent_poison_from_propagating,
combine_mul_cmlt]> {
}
55 changes: 55 additions & 0 deletions llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,61 @@ void applyOrToBSP(MachineInstr &MI, MachineRegisterInfo &MRI,
MI.eraseFromParent();
}

/// Match half of the combine Mul(And(Srl(X, 15), 0x10001), 0xffff) -> CMLTz.
///
/// With N = (scalar size of the G_MUL result) / 2, \p MI matches when:
///   * its result type is one of v2s64, v2s32, v4s32, v4s16 or v8s16;
///   * operand 1 is defined (looking through copies) by a G_AND whose
///     operand 1 is in turn defined (looking through copies) by a G_LSHR;
///   * operand 2 of the G_MUL is a constant / constant splat that is a mask of
///     the low N bits;
///   * operand 2 of the G_AND is a constant / constant splat equal to
///     (1 << N) | 1;
///   * operand 2 of the G_LSHR is a constant / constant splat equal to N - 1.
///
/// \param MI      The candidate G_MUL.
/// \param MRI     Register info used to look up types and definitions.
/// \param SrcReg  On success, set to the value being shifted (operand 1 of the
///                G_LSHR). Left untouched on failure.
/// \returns true if the pattern matched.
bool matchCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         Register &SrcReg) {
  const LLT MulTy = MRI.getType(MI.getOperand(0).getReg());

  // Only this fixed set of vector types is accepted.
  const bool IsHandledTy =
      MulTy == LLT::fixed_vector(2, 64) || MulTy == LLT::fixed_vector(2, 32) ||
      MulTy == LLT::fixed_vector(4, 32) || MulTy == LLT::fixed_vector(4, 16) ||
      MulTy == LLT::fixed_vector(8, 16);
  if (!IsHandledTy)
    return false;

  // Walk up the left-hand operands: G_MUL <- G_AND <- G_LSHR.
  MachineInstr *AndDef = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
  if (AndDef->getOpcode() != TargetOpcode::G_AND)
    return false;

  MachineInstr *ShiftDef =
      getDefIgnoringCopies(AndDef->getOperand(1).getReg(), MRI);
  if (ShiftDef->getOpcode() != TargetOpcode::G_LSHR)
    return false;

  // Fetch the constant (or splat constant) feeding operand 2 of an
  // instruction, if there is one.
  auto SplatOfRHS = [&MRI](const MachineInstr &Inst) {
    return isConstantOrConstantSplatVector(
        *MRI.getVRegDef(Inst.getOperand(2).getReg()), MRI);
  };

  const auto MulConst = SplatOfRHS(MI);
  const auto AndConst = SplatOfRHS(*AndDef);
  const auto ShiftConst = SplatOfRHS(*ShiftDef);
  if (!(MulConst && AndConst && ShiftConst))
    return false;

  // All three constants are derived from half the element width.
  const unsigned HalfBits = MulTy.getScalarSizeInBits() / 2;
  const bool ConstantsFit = MulConst->isMask(HalfBits) &&
                            *AndConst == ((1ULL << HalfBits) | 1ULL) &&
                            *ShiftConst == (HalfBits - 1);
  if (!ConstantsFit)
    return false;

  SrcReg = ShiftDef->getOperand(1).getReg();
  return true;
}

/// Apply half of the Mul(And(Srl(X, 15), 0x10001), 0xffff) -> CMLTz combine.
///
/// Replaces \p MI with a signed "less than zero" compare performed on a vector
/// type that has twice as many elements, each half as wide, as the result of
/// \p MI. The emitted sequence is:
///   zero = G_CONSTANT 0            (narrow type)
///   cast = G_BITCAST SrcReg        (to narrow type)
///   cmp  = G_ICMP slt, cast, zero  (result in narrow type)
///   dst  = G_BITCAST cmp           (back to the original result register)
///
/// \param MI      The G_MUL being replaced; erased on return.
/// \param MRI     Register info used to look up the result type.
/// \param B       Builder used to emit the replacement instructions.
/// \param SrcReg  The register recorded by matchCombineMulCMLT.
void applyCombineMulCMLT(MachineInstr &MI, MachineRegisterInfo &MRI,
                         MachineIRBuilder &B, Register &SrcReg) {
  const Register ResultReg = MI.getOperand(0).getReg();
  const LLT WideTy = MRI.getType(ResultReg);

  // Same total width, but split into twice as many half-sized elements.
  const auto NarrowCount = WideTy.getElementCount().multiplyCoefficientBy(2);
  const unsigned NarrowBits = WideTy.getScalarSizeInBits() / 2;
  const LLT NarrowTy =
      WideTy.changeElementCount(NarrowCount).changeElementSize(NarrowBits);

  auto Zero = B.buildConstant(NarrowTy, 0);
  auto Narrowed = B.buildInstr(TargetOpcode::G_BITCAST, {NarrowTy}, {SrcReg});
  auto IsNegative =
      B.buildICmp(CmpInst::Predicate::ICMP_SLT, NarrowTy, Narrowed.getReg(0),
                  Zero.getReg(0));

  // Reinterpret the per-half compare result as the original result type.
  B.buildInstr(TargetOpcode::G_BITCAST, {ResultReg}, {IsNegative.getReg(0)});
  MI.eraseFromParent();
}

class AArch64PostLegalizerCombinerImpl : public Combiner {
protected:
// TODO: Make CombinerHelper methods const.
Expand Down
114 changes: 25 additions & 89 deletions llvm/test/CodeGen/AArch64/mulcmle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -24,130 +24,66 @@ define <1 x i64> @v1i64(<1 x i64> %a) {
}

; Per 64-bit lane: the lshr by 31 moves the top bit of each 32-bit half to the
; bottom of that half, the and with 0x100000001 keeps only those two bits, and
; the mul by 0xffffffff expands each kept bit to fill its 32-bit half.
define <2 x i64> @v2i64(<2 x i64> %a) {
; CHECK-SD-LABEL: v2i64:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i64:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.4s, #1
; CHECK-GI-NEXT: ushr v0.2d, v0.2d, #31
; CHECK-GI-NEXT: movi v2.2d, #0x000000ffffffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: fmov x11, d2
; CHECK-GI-NEXT: mov x9, v2.d[1]
; CHECK-GI-NEXT: fmov x10, d0
; CHECK-GI-NEXT: mov x8, v0.d[1]
; CHECK-GI-NEXT: mul x10, x10, x11
; CHECK-GI-NEXT: mul x8, x8, x9
; CHECK-GI-NEXT: fmov d0, x10
; CHECK-GI-NEXT: mov v0.d[1], x8
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4s, v0.4s, #0
; CHECK-NEXT: ret
%b = lshr <2 x i64> %a, <i64 31, i64 31>
%c = and <2 x i64> %b, <i64 4294967297, i64 4294967297>
%d = mul nuw <2 x i64> %c, <i64 4294967295, i64 4294967295>
ret <2 x i64> %d
}

; Per 32-bit lane: the lshr by 15 moves the top bit of each 16-bit half to the
; bottom of that half, the and with 0x10001 keeps only those two bits, and the
; mul by 0xffff expands each kept bit to fill its 16-bit half.
define <2 x i32> @v2i32(<2 x i32> %a) {
; CHECK-SD-LABEL: v2i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v2i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.4h, #1
; CHECK-GI-NEXT: ushr v0.2s, v0.2s, #15
; CHECK-GI-NEXT: movi d2, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.2s, v0.2s, v2.2s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
; CHECK-NEXT: ret
%b = lshr <2 x i32> %a, <i32 15, i32 15>
%c = and <2 x i32> %b, <i32 65537, i32 65537>
%d = mul nuw <2 x i32> %c, <i32 65535, i32 65535>
ret <2 x i32> %d
}

; Same pattern as the two-lane 32-bit case, on four lanes: lshr by 15, and with
; 0x10001, mul by 0xffff, so each 16-bit half is filled with its own top bit.
define <4 x i32> @v4i32(<4 x i32> %a) {
; CHECK-SD-LABEL: v4i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v4i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8h, #1
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
; CHECK-GI-NEXT: movi v2.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: ret
%b = lshr <4 x i32> %a, <i32 15, i32 15, i32 15, i32 15>
%c = and <4 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537>
%d = mul nuw <4 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535>
ret <4 x i32> %d
}

; Eight-lane variant of the 32-bit pattern (lshr by 15, and with 0x10001, mul
; by 0xffff). The argument and result span two vector registers (v0 and v1) in
; the expected output below.
define <8 x i32> @v8i32(<8 x i32> %a) {
; CHECK-SD-LABEL: v8i32:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v8i32:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v2.8h, #1
; CHECK-GI-NEXT: ushr v0.4s, v0.4s, #15
; CHECK-GI-NEXT: ushr v1.4s, v1.4s, #15
; CHECK-GI-NEXT: movi v3.2d, #0x00ffff0000ffff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v2.16b
; CHECK-GI-NEXT: and v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: mul v0.4s, v0.4s, v3.4s
; CHECK-GI-NEXT: mul v1.4s, v1.4s, v3.4s
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8h, v0.8h, #0
; CHECK-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-NEXT: ret
%b = lshr <8 x i32> %a, <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
%c = and <8 x i32> %b, <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
%d = mul nuw <8 x i32> %c, <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
ret <8 x i32> %d
}

; Per 16-bit lane: the lshr by 7 moves the top bit of each 8-bit half to the
; bottom of that half, the and with 0x101 keeps only those two bits, and the
; mul by 0xff expands each kept bit to fill its 8-bit half.
define <4 x i16> @v4i16(<4 x i16> %a) {
; CHECK-SD-LABEL: v4i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v4i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.8b, #1
; CHECK-GI-NEXT: ushr v0.4h, v0.4h, #7
; CHECK-GI-NEXT: movi d2, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT: mul v0.4h, v0.4h, v2.4h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
; CHECK-NEXT: ret
%b = lshr <4 x i16> %a, <i16 7, i16 7, i16 7, i16 7>
%c = and <4 x i16> %b, <i16 257, i16 257, i16 257, i16 257>
%d = mul nuw <4 x i16> %c, <i16 255, i16 255, i16 255, i16 255>
ret <4 x i16> %d
}

define <8 x i16> @v8i16(<8 x i16> %a) {
; CHECK-SD-LABEL: v8i16:
; CHECK-SD: // %bb.0:
; CHECK-SD-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: v8i16:
; CHECK-GI: // %bb.0:
; CHECK-GI-NEXT: movi v1.16b, #1
; CHECK-GI-NEXT: ushr v0.8h, v0.8h, #7
; CHECK-GI-NEXT: movi v2.2d, #0xff00ff00ff00ff
; CHECK-GI-NEXT: and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT: mul v0.8h, v0.8h, v2.8h
; CHECK-GI-NEXT: ret
; CHECK-LABEL: v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: cmlt v0.16b, v0.16b, #0
; CHECK-NEXT: ret
%b = lshr <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
%c = and <8 x i16> %b, <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
%d = mul nuw <8 x i16> %c, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
Expand Down

0 comments on commit 23366d4

Please sign in to comment.