-
Notifications
You must be signed in to change notification settings - Fork 13.6k
[PowerPC] Intrinsics and tests for dmr insert/extract #135653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-powerpc Author: None (RolandF77) Changes: Add some intrinsics and LIT tests for PPC dmr insert/extract instructions. Full diff: https://github.com/llvm/llvm-project/pull/135653.diff 4 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index e4d39134a4a25..a1f1a1707013f 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1661,6 +1661,22 @@ let TargetPrefix = "ppc" in {
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty,
llvm_v1024i1_ty], [IntrNoMem]>;
+ def int_ppc_mma_dmxxextfdmr512 :
+ DefaultAttrsIntrinsic<[llvm_v256i1_ty, llvm_v256i1_ty], [llvm_v1024i1_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxinstdmr512 :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+ llvm_v256i1_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxextfdmr256 :
+ DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_v1024i1_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxinstdmr256 :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 03a034182ae15..76dbecb45d7a6 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -135,6 +135,10 @@ MCRegister PPC::getRegNumForOperand(const MCInstrDesc &Desc, MCRegister Reg,
if (PPC::isVRRegister(Reg))
return PPC::VSX32 + (Reg - PPC::V0);
break;
+ case PPC::DMRROWpRCRegClassID: {
+ // A dmrrow-pair operand is mapped to a dmr register number: with four
+ // dmrrow pairs per dmr, the pair's offset from DMRROWp0 divided by 4
+ // gives the offset from DMR0.
+ const unsigned RowPairOffset = Reg - PPC::DMRROWp0;
+ return PPC::DMR0 + (RowPairOffset / 4);
+ }
// Other RegClass doesn't need mapping
default:
break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1f75425752a78..0800ed5dfce2c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11146,6 +11146,116 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}
+ case Intrinsic::ppc_mma_dmxxextfdmr512: {
+ // Lower dmxxextfdmr512. Operand 1 is the v1024i1 dmr value; operand 2 is
+ // the constant P that selects which v512i1 half is extracted
+ // (0 -> sub_wacc_lo, 1 -> sub_wacc_hi). The machine node yields two
+ // v256i1 results.
+ assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+ "Specify P of 0 or 1 for lower or upper 512 bits");
+ unsigned HiLo = Idx->getSExtValue();
+ unsigned Opcode;
+ unsigned Subx;
+ // The opcode and the sub-register index must agree on the selected half.
+ if (HiLo == 0) {
+ Opcode = PPC::DMXXEXTFDMR512;
+ Subx = PPC::sub_wacc_lo;
+ } else {
+ Opcode = PPC::DMXXEXTFDMR512_HI;
+ Subx = PPC::sub_wacc_hi;
+ }
+ // Narrow the dmr to the selected v512i1 half; that half is the extract's
+ // only input.
+ SDValue Subreg(
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+ Op.getOperand(1),
+ DAG.getTargetConstant(Subx, dl, MVT::i32)),
+ 0);
+ EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
+ return SDValue(DAG.getMachineNode(Opcode, dl, ReturnTypes, Subreg), 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxextfdmr256: {
+ // Lower dmxxextfdmr256. Operand 1 is the v1024i1 dmr value; operand 2 is
+ // the constant dmr row-pair index (0-3). The result is one v256i1 value.
+ assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ // Both bounds have to hold for the index to name a valid row pair.
+ assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+ "Specify a dmr row pair 0-3");
+ unsigned IdxVal = Idx->getSExtValue();
+ unsigned Subx;
+ // Row pairs 0/1 are addressed directly; row pairs 2/3 are addressed
+ // through sub_wacc_hi. The switch has no default: it relies on the range
+ // assert above, since any other index would leave Subx unset.
+ switch (IdxVal) {
+ case 0:
+ Subx = PPC::sub_dmrrowp0;
+ break;
+ case 1:
+ Subx = PPC::sub_dmrrowp1;
+ break;
+ case 2:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+ break;
+ case 3:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+ break;
+ }
+ // Narrow the dmr to the selected v256i1 row pair.
+ SDValue Subreg(
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v256i1,
+ Op.getOperand(1),
+ DAG.getTargetConstant(Subx, dl, MVT::i32)),
+ 0);
+ // The row-pair index is also passed to the instruction as an immediate.
+ SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+ return SDValue(
+ DAG.getMachineNode(PPC::DMXXEXTFDMR256, dl, MVT::v256i1, {Subreg, P}),
+ 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxinstdmr512: {
+ // Lower dmxxinstdmr512. Operand 1 is the v1024i1 dmr being updated,
+ // operands 2 and 3 are the two values to insert, and operand 4 is the
+ // constant P selecting the v512i1 half that receives them
+ // (0 -> sub_wacc_lo, 1 -> sub_wacc_hi).
+ assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4));
+ assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+ "Specify P of 0 or 1 for lower or upper 512 bits");
+ unsigned HiLo = Idx->getSExtValue();
+ unsigned Opcode;
+ unsigned Subx;
+ // The opcode and the sub-register index must agree on the selected half.
+ if (HiLo == 0) {
+ Opcode = PPC::DMXXINSTDMR512;
+ Subx = PPC::sub_wacc_lo;
+ } else {
+ Opcode = PPC::DMXXINSTDMR512_HI;
+ Subx = PPC::sub_wacc_hi;
+ }
+ // Build the new v512i1 half from the two inserted values.
+ SDValue Ops[] = {Op.getOperand(2), Op.getOperand(3)};
+ SDValue Wacc = SDValue(DAG.getMachineNode(Opcode, dl, MVT::v512i1, Ops), 0);
+ SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+ // Insert into the incoming dmr (operand 1) so that the half which is not
+ // being written is carried over from the input value.
+ return SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::v1024i1, Op.getOperand(1), Wacc,
+ SubReg),
+ 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxinstdmr256: {
+ // Lower dmxxinstdmr256. Operand 1 is the v1024i1 dmr being updated,
+ // operand 2 is the value to insert, and operand 3 is the constant dmr
+ // row-pair index (0-3) that receives it.
+ assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ // Both bounds have to hold for the index to name a valid row pair.
+ assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+ "Specify a dmr row pair 0-3");
+ unsigned IdxVal = Idx->getSExtValue();
+ unsigned Subx;
+ // Row pairs 0/1 are addressed directly; row pairs 2/3 are addressed
+ // through sub_wacc_hi. The switch has no default: it relies on the range
+ // assert above, since any other index would leave Subx unset.
+ switch (IdxVal) {
+ case 0:
+ Subx = PPC::sub_dmrrowp0;
+ break;
+ case 1:
+ Subx = PPC::sub_dmrrowp1;
+ break;
+ case 2:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+ break;
+ case 3:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+ break;
+ }
+ SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+ // The row-pair index is also passed to the instruction as an immediate.
+ SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+ SDValue Ops[] = {Op.getOperand(2), P};
+ SDValue DMRRowp = SDValue(
+ DAG.getMachineNode(PPC::DMXXINSTDMR256, dl, MVT::v256i1, Ops), 0);
+ // Insert into the incoming dmr (operand 1) so that the row pairs which
+ // are not being written are carried over from the input value.
+ return SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::v1024i1, Op.getOperand(1), DMRRowp,
+ SubReg),
+ 0);
+ }
+
case Intrinsic::ppc_mma_xxmfacc:
case Intrinsic::ppc_mma_xxmtacc: {
// Allow pre-isa-future subtargets to lower as normal.
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index a6c99a751e2c5..303ca60fc62d8 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -129,6 +129,248 @@ entry:
ret void
}
+define void @text512(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text512:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxv v2, 16(r4)
+; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxv v2, 16(r6)
+; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: text512:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxv v3, 16(r4)
+; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxv v3, 16(r6)
+; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %x = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 0)
+ %p = extractvalue { <256 x i1>, <256 x i1 > } %x, 0
+ store <256 x i1> %p, ptr %rp1, align 16
+ %y = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 1)
+ %q = extractvalue { <256 x i1>, <256 x i1 > } %y, 0
+ store <256 x i1> %q, ptr %rp3, align 16
+ ret void
+}
+
+define void @text256(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text256:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp0, 0
+; CHECK-NEXT: stxv v2, 16(r4)
+; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp1, 1
+; CHECK-NEXT: stxv v2, 16(r5)
+; CHECK-NEXT: stxv v3, 0(r5)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp2, 2
+; CHECK-NEXT: stxv v2, 16(r6)
+; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp3, 3
+; CHECK-NEXT: stxv v2, 16(r7)
+; CHECK-NEXT: stxv v3, 0(r7)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: text256:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp0, 0
+; CHECK-BE-NEXT: stxv v3, 16(r4)
+; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp1, 1
+; CHECK-BE-NEXT: stxv v3, 16(r5)
+; CHECK-BE-NEXT: stxv v2, 0(r5)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp2, 2
+; CHECK-BE-NEXT: stxv v3, 16(r6)
+; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp3, 3
+; CHECK-BE-NEXT: stxv v3, 16(r7)
+; CHECK-BE-NEXT: stxv v2, 0(r7)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %x = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 0)
+ store <256 x i1> %x, ptr %rp1, align 16
+ %q = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 1)
+ store <256 x i1> %q, ptr %rp2, align 16
+ %w = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 2)
+ store <256 x i1> %w, ptr %rp3, align 16
+ %y = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 3)
+ store <256 x i1> %y, ptr %rp4, align 16
+ ret void
+}
+
+define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2) {
+; CHECK-LABEL: tins512:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxv v4, 16(r4)
+; CHECK-NEXT: lxv v5, 0(r4)
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r7)
+; CHECK-NEXT: stxvp vsp36, 64(r7)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r7)
+; CHECK-NEXT: stxvp vsp36, 0(r7)
+; CHECK-NEXT: lxv v2, 16(r5)
+; CHECK-NEXT: lxv v4, 16(r6)
+; CHECK-NEXT: lxv v3, 0(r5)
+; CHECK-NEXT: lxv v5, 0(r6)
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r8)
+; CHECK-NEXT: stxvp vsp36, 64(r8)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r8)
+; CHECK-NEXT: stxvp vsp36, 0(r8)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tins512:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: lxv v4, 0(r4)
+; CHECK-BE-NEXT: lxv v5, 16(r4)
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT: lxv v2, 0(r5)
+; CHECK-BE-NEXT: lxv v4, 0(r6)
+; CHECK-BE-NEXT: lxv v3, 16(r5)
+; CHECK-BE-NEXT: lxv v5, 16(r6)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r8)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r8)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r8)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r8)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %l1 = load <256 x i1>, ptr %vp1, align 16
+ %r1 = load <256 x i1>, ptr %vp2, align 16
+ %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %z, <256 x i1> %l1, <256 x i1> %r1, i32 0)
+ store <1024 x i1> %a, ptr %rp1, align 16
+ %l2 = load <256 x i1>, ptr %vp3, align 16
+ %r2 = load <256 x i1>, ptr %vp4, align 16
+ %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %a, <256 x i1> %l2, <256 x i1> %r2, i32 1)
+ store <1024 x i1> %b, ptr %rp2, align 16
+ ret void
+}
+
+define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: tins256:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp0, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r7)
+; CHECK-NEXT: stxvp vsp36, 64(r7)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r7)
+; CHECK-NEXT: stxvp vsp36, 0(r7)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp1, vsp34, 1
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT: stxvp vsp36, 96(r8)
+; CHECK-NEXT: stxvp vsp32, 64(r8)
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp36, 32(r8)
+; CHECK-NEXT: stxvp vsp32, 0(r8)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp2, vsp34, 2
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT: stxvp vsp36, 96(r9)
+; CHECK-NEXT: stxvp vsp32, 64(r9)
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp36, 32(r9)
+; CHECK-NEXT: stxvp vsp32, 0(r9)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp3, vsp34, 3
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r10)
+; CHECK-NEXT: stxvp vsp36, 64(r10)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r10)
+; CHECK-NEXT: stxvp vsp36, 0(r10)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tins256:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp0, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp1, vsp34, 1
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp32, 96(r8)
+; CHECK-BE-NEXT: stxvp vsp36, 64(r8)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp32, 32(r8)
+; CHECK-BE-NEXT: stxvp vsp36, 0(r8)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp2, vsp34, 2
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp32, 96(r9)
+; CHECK-BE-NEXT: stxvp vsp36, 64(r9)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp32, 32(r9)
+; CHECK-BE-NEXT: stxvp vsp36, 0(r9)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp3, vsp34, 3
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r10)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r10)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r10)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r10)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %l1 = load <256 x i1>, ptr %vp1, align 16
+ %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %z, <256 x i1> %l1, i32 0)
+ store <1024 x i1> %a, ptr %rp1, align 16
+ %l2 = load <256 x i1>, ptr %vp2, align 16
+ %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %a, <256 x i1> %l2, i32 1)
+ store <1024 x i1> %b, ptr %rp2, align 16
+ %l3 = load <256 x i1>, ptr %vp3, align 16
+ %c = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %b, <256 x i1> %l2, i32 2)
+ store <1024 x i1> %c, ptr %rp3, align 16
+ %l4 = load <256 x i1>, ptr %vp4, align 16
+ %d = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %c, <256 x i1> %l2, i32 3)
+ store <1024 x i1> %d, ptr %rp4, align 16
+ ret void
+}
+
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1>, <256 x i1>, <256 x i1>, i32)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1>, <256 x i1>, i32)
+declare { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1>, i32)
+declare <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1>, i32)
|
@llvm/pr-subscribers-llvm-ir Author: None (RolandF77) Changes: Add some intrinsics and LIT tests for PPC dmr insert/extract instructions. Full diff: https://github.com/llvm/llvm-project/pull/135653.diff 4 Files Affected:
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index e4d39134a4a25..a1f1a1707013f 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1661,6 +1661,22 @@ let TargetPrefix = "ppc" in {
DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty,
llvm_v1024i1_ty], [IntrNoMem]>;
+ def int_ppc_mma_dmxxextfdmr512 :
+ DefaultAttrsIntrinsic<[llvm_v256i1_ty, llvm_v256i1_ty], [llvm_v1024i1_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxinstdmr512 :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+ llvm_v256i1_ty, llvm_i32_ty], [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxextfdmr256 :
+ DefaultAttrsIntrinsic<[llvm_v256i1_ty], [llvm_v1024i1_ty, llvm_i32_ty],
+ [IntrNoMem]>;
+
+ def int_ppc_mma_dmxxinstdmr256 :
+ DefaultAttrsIntrinsic<[llvm_v1024i1_ty], [llvm_v1024i1_ty, llvm_v256i1_ty,
+ llvm_i32_ty], [IntrNoMem]>;
+
// MMA Reduced-Precision: Outer Product Intrinsic Definitions.
defm int_ppc_mma_xvi4ger8 :
PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 03a034182ae15..76dbecb45d7a6 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -135,6 +135,10 @@ MCRegister PPC::getRegNumForOperand(const MCInstrDesc &Desc, MCRegister Reg,
if (PPC::isVRRegister(Reg))
return PPC::VSX32 + (Reg - PPC::V0);
break;
+ case PPC::DMRROWpRCRegClassID: {
+ // A dmrrow-pair operand is mapped to a dmr register number: with four
+ // dmrrow pairs per dmr, the pair's offset from DMRROWp0 divided by 4
+ // gives the offset from DMR0.
+ const unsigned RowPairOffset = Reg - PPC::DMRROWp0;
+ return PPC::DMR0 + (RowPairOffset / 4);
+ }
// Other RegClass doesn't need mapping
default:
break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 1f75425752a78..0800ed5dfce2c 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -11146,6 +11146,116 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getMergeValues(RetOps, dl);
}
+ case Intrinsic::ppc_mma_dmxxextfdmr512: {
+ // Lower dmxxextfdmr512. Operand 1 is the v1024i1 dmr value; operand 2 is
+ // the constant P that selects which v512i1 half is extracted
+ // (0 -> sub_wacc_lo, 1 -> sub_wacc_hi). The machine node yields two
+ // v256i1 results.
+ assert(Subtarget.isISAFuture() && "dmxxextfdmr512 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+ "Specify P of 0 or 1 for lower or upper 512 bits");
+ unsigned HiLo = Idx->getSExtValue();
+ unsigned Opcode;
+ unsigned Subx;
+ // The opcode and the sub-register index must agree on the selected half.
+ if (HiLo == 0) {
+ Opcode = PPC::DMXXEXTFDMR512;
+ Subx = PPC::sub_wacc_lo;
+ } else {
+ Opcode = PPC::DMXXEXTFDMR512_HI;
+ Subx = PPC::sub_wacc_hi;
+ }
+ // Narrow the dmr to the selected v512i1 half; that half is the extract's
+ // only input.
+ SDValue Subreg(
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v512i1,
+ Op.getOperand(1),
+ DAG.getTargetConstant(Subx, dl, MVT::i32)),
+ 0);
+ EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
+ return SDValue(DAG.getMachineNode(Opcode, dl, ReturnTypes, Subreg), 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxextfdmr256: {
+ // Lower dmxxextfdmr256. Operand 1 is the v1024i1 dmr value; operand 2 is
+ // the constant dmr row-pair index (0-3). The result is one v256i1 value.
+ assert(Subtarget.isISAFuture() && "dmxxextfdmr256 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ // Both bounds have to hold for the index to name a valid row pair.
+ assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+ "Specify a dmr row pair 0-3");
+ unsigned IdxVal = Idx->getSExtValue();
+ unsigned Subx;
+ // Row pairs 0/1 are addressed directly; row pairs 2/3 are addressed
+ // through sub_wacc_hi. The switch has no default: it relies on the range
+ // assert above, since any other index would leave Subx unset.
+ switch (IdxVal) {
+ case 0:
+ Subx = PPC::sub_dmrrowp0;
+ break;
+ case 1:
+ Subx = PPC::sub_dmrrowp1;
+ break;
+ case 2:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+ break;
+ case 3:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+ break;
+ }
+ // Narrow the dmr to the selected v256i1 row pair.
+ SDValue Subreg(
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, MVT::v256i1,
+ Op.getOperand(1),
+ DAG.getTargetConstant(Subx, dl, MVT::i32)),
+ 0);
+ // The row-pair index is also passed to the instruction as an immediate.
+ SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+ return SDValue(
+ DAG.getMachineNode(PPC::DMXXEXTFDMR256, dl, MVT::v256i1, {Subreg, P}),
+ 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxinstdmr512: {
+ // Lower dmxxinstdmr512. Operand 1 is the v1024i1 dmr being updated,
+ // operands 2 and 3 are the two values to insert, and operand 4 is the
+ // constant P selecting the v512i1 half that receives them
+ // (0 -> sub_wacc_lo, 1 -> sub_wacc_hi).
+ assert(Subtarget.isISAFuture() && "dmxxinstdmr512 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4));
+ assert(Idx && (Idx->getSExtValue() == 0 || Idx->getSExtValue() == 1) &&
+ "Specify P of 0 or 1 for lower or upper 512 bits");
+ unsigned HiLo = Idx->getSExtValue();
+ unsigned Opcode;
+ unsigned Subx;
+ // The opcode and the sub-register index must agree on the selected half.
+ if (HiLo == 0) {
+ Opcode = PPC::DMXXINSTDMR512;
+ Subx = PPC::sub_wacc_lo;
+ } else {
+ Opcode = PPC::DMXXINSTDMR512_HI;
+ Subx = PPC::sub_wacc_hi;
+ }
+ // Build the new v512i1 half from the two inserted values.
+ SDValue Ops[] = {Op.getOperand(2), Op.getOperand(3)};
+ SDValue Wacc = SDValue(DAG.getMachineNode(Opcode, dl, MVT::v512i1, Ops), 0);
+ SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+ // Insert into the incoming dmr (operand 1) so that the half which is not
+ // being written is carried over from the input value.
+ return SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::v1024i1, Op.getOperand(1), Wacc,
+ SubReg),
+ 0);
+ }
+
+ case Intrinsic::ppc_mma_dmxxinstdmr256: {
+ // Lower dmxxinstdmr256. Operand 1 is the v1024i1 dmr being updated,
+ // operand 2 is the value to insert, and operand 3 is the constant dmr
+ // row-pair index (0-3) that receives it.
+ assert(Subtarget.isISAFuture() && "dmxxinstdmr256 requires ISA Future");
+ auto *Idx = dyn_cast<ConstantSDNode>(Op.getOperand(3));
+ // Both bounds have to hold for the index to name a valid row pair.
+ assert(Idx && (Idx->getSExtValue() >= 0 && Idx->getSExtValue() <= 3) &&
+ "Specify a dmr row pair 0-3");
+ unsigned IdxVal = Idx->getSExtValue();
+ unsigned Subx;
+ // Row pairs 0/1 are addressed directly; row pairs 2/3 are addressed
+ // through sub_wacc_hi. The switch has no default: it relies on the range
+ // assert above, since any other index would leave Subx unset.
+ switch (IdxVal) {
+ case 0:
+ Subx = PPC::sub_dmrrowp0;
+ break;
+ case 1:
+ Subx = PPC::sub_dmrrowp1;
+ break;
+ case 2:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp0;
+ break;
+ case 3:
+ Subx = PPC::sub_wacc_hi_then_sub_dmrrowp1;
+ break;
+ }
+ SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32);
+ // The row-pair index is also passed to the instruction as an immediate.
+ SDValue P = DAG.getTargetConstant(IdxVal, dl, MVT::i32);
+ SDValue Ops[] = {Op.getOperand(2), P};
+ SDValue DMRRowp = SDValue(
+ DAG.getMachineNode(PPC::DMXXINSTDMR256, dl, MVT::v256i1, Ops), 0);
+ // Insert into the incoming dmr (operand 1) so that the row pairs which
+ // are not being written are carried over from the input value.
+ return SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
+ MVT::v1024i1, Op.getOperand(1), DMRRowp,
+ SubReg),
+ 0);
+ }
+
case Intrinsic::ppc_mma_xxmfacc:
case Intrinsic::ppc_mma_xxmtacc: {
// Allow pre-isa-future subtargets to lower as normal.
diff --git a/llvm/test/CodeGen/PowerPC/dmr-enable.ll b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
index a6c99a751e2c5..303ca60fc62d8 100644
--- a/llvm/test/CodeGen/PowerPC/dmr-enable.ll
+++ b/llvm/test/CodeGen/PowerPC/dmr-enable.ll
@@ -129,6 +129,248 @@ entry:
ret void
}
+define void @text512(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text512:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxv v2, 16(r4)
+; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxv v2, 16(r6)
+; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: text512:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxv v3, 16(r4)
+; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxv v3, 16(r6)
+; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %x = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 0)
+ %p = extractvalue { <256 x i1>, <256 x i1 > } %x, 0
+ store <256 x i1> %p, ptr %rp1, align 16
+ %y = call { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1> %z, i32 1)
+ %q = extractvalue { <256 x i1>, <256 x i1 > } %y, 0
+ store <256 x i1> %q, ptr %rp3, align 16
+ ret void
+}
+
+define void @text256(ptr %vp1, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: text256:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp0, 0
+; CHECK-NEXT: stxv v2, 16(r4)
+; CHECK-NEXT: stxv v3, 0(r4)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp1, 1
+; CHECK-NEXT: stxv v2, 16(r5)
+; CHECK-NEXT: stxv v3, 0(r5)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp2, 2
+; CHECK-NEXT: stxv v2, 16(r6)
+; CHECK-NEXT: stxv v3, 0(r6)
+; CHECK-NEXT: dmxxextfdmr256 vsp34, dmrrowp3, 3
+; CHECK-NEXT: stxv v2, 16(r7)
+; CHECK-NEXT: stxv v3, 0(r7)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: text256:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp0, 0
+; CHECK-BE-NEXT: stxv v3, 16(r4)
+; CHECK-BE-NEXT: stxv v2, 0(r4)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp1, 1
+; CHECK-BE-NEXT: stxv v3, 16(r5)
+; CHECK-BE-NEXT: stxv v2, 0(r5)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp2, 2
+; CHECK-BE-NEXT: stxv v3, 16(r6)
+; CHECK-BE-NEXT: stxv v2, 0(r6)
+; CHECK-BE-NEXT: dmxxextfdmr256 vsp34, dmrrowp3, 3
+; CHECK-BE-NEXT: stxv v3, 16(r7)
+; CHECK-BE-NEXT: stxv v2, 0(r7)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %x = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 0)
+ store <256 x i1> %x, ptr %rp1, align 16
+ %q = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 1)
+ store <256 x i1> %q, ptr %rp2, align 16
+ %w = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 2)
+ store <256 x i1> %w, ptr %rp3, align 16
+ %y = call <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1> %z, i32 3)
+ store <256 x i1> %y, ptr %rp4, align 16
+ ret void
+}
+
+define void @tins512(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2) {
+; CHECK-LABEL: tins512:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: lxv v4, 16(r4)
+; CHECK-NEXT: lxv v5, 0(r4)
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r7)
+; CHECK-NEXT: stxvp vsp36, 64(r7)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r7)
+; CHECK-NEXT: stxvp vsp36, 0(r7)
+; CHECK-NEXT: lxv v2, 16(r5)
+; CHECK-NEXT: lxv v4, 16(r6)
+; CHECK-NEXT: lxv v3, 0(r5)
+; CHECK-NEXT: lxv v5, 0(r6)
+; CHECK-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r8)
+; CHECK-NEXT: stxvp vsp36, 64(r8)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r8)
+; CHECK-NEXT: stxvp vsp36, 0(r8)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tins512:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: lxv v4, 0(r4)
+; CHECK-BE-NEXT: lxv v5, 16(r4)
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc0, vsp34, vsp36, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT: lxv v2, 0(r5)
+; CHECK-BE-NEXT: lxv v4, 0(r6)
+; CHECK-BE-NEXT: lxv v3, 16(r5)
+; CHECK-BE-NEXT: lxv v5, 16(r6)
+; CHECK-BE-NEXT: dmxxinstdmr512 wacc_hi0, vsp34, vsp36, 1
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r8)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r8)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r8)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r8)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %l1 = load <256 x i1>, ptr %vp1, align 16
+ %r1 = load <256 x i1>, ptr %vp2, align 16
+ %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %z, <256 x i1> %l1, <256 x i1> %r1, i32 0)
+ store <1024 x i1> %a, ptr %rp1, align 16
+ %l2 = load <256 x i1>, ptr %vp3, align 16
+ %r2 = load <256 x i1>, ptr %vp4, align 16
+ %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1> %a, <256 x i1> %l2, <256 x i1> %r2, i32 1)
+ store <1024 x i1> %b, ptr %rp2, align 16
+ ret void
+}
+
+define void @tins256(ptr %vp1, ptr %vp2, ptr %vp3, ptr %vp4, ptr %rp1, ptr %rp2, ptr %rp3, ptr %rp4) {
+; CHECK-LABEL: tins256:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: lxv v2, 16(r3)
+; CHECK-NEXT: lxv v3, 0(r3)
+; CHECK-NEXT: dmsetdmrz dmr0
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp0, vsp34, 0
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r7)
+; CHECK-NEXT: stxvp vsp36, 64(r7)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r7)
+; CHECK-NEXT: stxvp vsp36, 0(r7)
+; CHECK-NEXT: lxv v2, 16(r4)
+; CHECK-NEXT: lxv v3, 0(r4)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp1, vsp34, 1
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT: stxvp vsp36, 96(r8)
+; CHECK-NEXT: stxvp vsp32, 64(r8)
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp36, 32(r8)
+; CHECK-NEXT: stxvp vsp32, 0(r8)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp2, vsp34, 2
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-NEXT: stxvp vsp36, 96(r9)
+; CHECK-NEXT: stxvp vsp32, 64(r9)
+; CHECK-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp36, 32(r9)
+; CHECK-NEXT: stxvp vsp32, 0(r9)
+; CHECK-NEXT: dmxxinstdmr256 dmrrowp3, vsp34, 3
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-NEXT: stxvp vsp34, 96(r10)
+; CHECK-NEXT: stxvp vsp36, 64(r10)
+; CHECK-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-NEXT: stxvp vsp34, 32(r10)
+; CHECK-NEXT: stxvp vsp36, 0(r10)
+; CHECK-NEXT: blr
+;
+; CHECK-BE-LABEL: tins256:
+; CHECK-BE: # %bb.0: # %entry
+; CHECK-BE-NEXT: lxv v2, 0(r3)
+; CHECK-BE-NEXT: lxv v3, 16(r3)
+; CHECK-BE-NEXT: dmsetdmrz dmr0
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp0, vsp34, 0
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r7)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r7)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r7)
+; CHECK-BE-NEXT: lxv v2, 0(r4)
+; CHECK-BE-NEXT: lxv v3, 16(r4)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp1, vsp34, 1
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp32, 96(r8)
+; CHECK-BE-NEXT: stxvp vsp36, 64(r8)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp32, 32(r8)
+; CHECK-BE-NEXT: stxvp vsp36, 0(r8)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp2, vsp34, 2
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp32, 96(r9)
+; CHECK-BE-NEXT: stxvp vsp36, 64(r9)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp36, vsp32, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp32, 32(r9)
+; CHECK-BE-NEXT: stxvp vsp36, 0(r9)
+; CHECK-BE-NEXT: dmxxinstdmr256 dmrrowp3, vsp34, 3
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc_hi0, 1
+; CHECK-BE-NEXT: stxvp vsp36, 96(r10)
+; CHECK-BE-NEXT: stxvp vsp34, 64(r10)
+; CHECK-BE-NEXT: dmxxextfdmr512 vsp34, vsp36, wacc0, 0
+; CHECK-BE-NEXT: stxvp vsp36, 32(r10)
+; CHECK-BE-NEXT: stxvp vsp34, 0(r10)
+; CHECK-BE-NEXT: blr
+entry:
+ %z = call <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
+ %l1 = load <256 x i1>, ptr %vp1, align 16
+ %a = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %z, <256 x i1> %l1, i32 0)
+ store <1024 x i1> %a, ptr %rp1, align 16
+ %l2 = load <256 x i1>, ptr %vp2, align 16
+ %b = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %a, <256 x i1> %l2, i32 1)
+ store <1024 x i1> %b, ptr %rp2, align 16
+ %l3 = load <256 x i1>, ptr %vp3, align 16
+ %c = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %b, <256 x i1> %l2, i32 2)
+ store <1024 x i1> %c, ptr %rp3, align 16
+ %l4 = load <256 x i1>, ptr %vp4, align 16
+ %d = call <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1> %c, <256 x i1> %l2, i32 3)
+ store <1024 x i1> %d, ptr %rp4, align 16
+ ret void
+}
+
declare <1024 x i1> @llvm.ppc.mma.dmsetdmrz()
declare <1024 x i1> @llvm.ppc.mma.dmmr(<1024 x i1>)
declare <1024 x i1> @llvm.ppc.mma.dmxor(<1024 x i1>, <1024 x i1>)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr512(<1024 x i1>, <256 x i1>, <256 x i1>, i32)
+declare <1024 x i1> @llvm.ppc.mma.dmxxinstdmr256(<1024 x i1>, <256 x i1>, i32)
+declare { <256 x i1>, <256 x i1> } @llvm.ppc.mma.dmxxextfdmr512(<1024 x i1>, i32)
+declare <256 x i1> @llvm.ppc.mma.dmxxextfdmr256(<1024 x i1>, i32)
|
SDValue Wacc = SDValue(DAG.getMachineNode(Opcode, dl, MVT::v512i1, Ops), 0); | ||
SDValue SubReg = DAG.getTargetConstant(Subx, dl, MVT::i32); | ||
return SDValue(DAG.getMachineNode(PPC::INSERT_SUBREG, dl, MVT::v1024i1, | ||
Op.getOperand(1), Wacc, SubReg), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Op.getOperand(1) is the 1024i1 operand, which technically could be just the output; it was added as an input type in the int_ppc_mma_dmxxinstdmr512 intrinsic definition only because INSERT_SUBREG uses it here. Since it seems we only care about its RegClass, could we create an IMPLICIT_DEF 1024i1 here instead and remove 1024i1 from the input list of int_ppc_mma_dmxxinstdmr512?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We want to be able to insert both the top half and bottom half of the same object. Without an input parameter for the object, we would create two different objects when we set the upper and lower halves, with no obvious way to put them together.
SDValue DMRRowp = SDValue( | ||
DAG.getMachineNode(PPC::DMXXINSTDMR256, dl, MVT::v256i1, Ops), 0); | ||
return SDValue(DAG.getMachineNode(PPC::INSERT_SUBREG, dl, MVT::v1024i1, | ||
Op.getOperand(1), DMRRowp, SubReg), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same comment as for Intrinsic::ppc_mma_dmxxinstdmr512.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Add some intrinsics and LIT tests for PPC dmr insert/extract instructions.
Add some intrinsics and LIT tests for PPC dmr insert/extract instructions.
Add some intrinsics and LIT tests for PPC dmr insert/extract instructions.
Add some intrinsics and LIT tests for PPC dmr insert/extract instructions.
Add some intrinsics and LIT tests for PPC dmr insert/extract instructions.