[PowerPC] Add paired vector load and store builtins and intrinsics

This patch adds the Clang builtins and LLVM intrinsics to load and store vector pairs. Differential Revision: https://reviews.llvm.org/D90799
jaebaek · Nov 13, 2020 · 3f78605 · 3f78605
1 parent 66b876c
commit 3f78605
Show file tree

Hide file tree

Showing 15 changed files with 848 additions and 8 deletions.
diff --git a/clang/include/clang/Basic/BuiltinsPPC.def b/clang/include/clang/Basic/BuiltinsPPC.def
@@ -738,6 +738,8 @@ MMA_BUILTIN(pmxvbf16ger2pp, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2pn, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2np, "vW512*VVi15i15i3", true)
 MMA_BUILTIN(pmxvbf16ger2nn, "vW512*VVi15i15i3", true)
+MMA_BUILTIN(lxvp, "W256SLLiW256C*", false)
+MMA_BUILTIN(stxvp, "vW256SLLiW256C*", false)
 
 // FIXME: Obviously incomplete.
 

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14776,6 +14776,19 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
       break;
   #include "clang/Basic/BuiltinsPPC.def"
     }
+    if (BuiltinID == PPC::BI__builtin_mma_lxvp ||
+        BuiltinID == PPC::BI__builtin_mma_stxvp) {
+      if (BuiltinID == PPC::BI__builtin_mma_lxvp) {
+        Ops[1] = Builder.CreateBitCast(Ops[1], Int8PtrTy);
+        Ops[0] = Builder.CreateGEP(Ops[1], Ops[0]);
+      } else {
+        Ops[2] = Builder.CreateBitCast(Ops[2], Int8PtrTy);
+        Ops[1] = Builder.CreateGEP(Ops[2], Ops[1]);
+      }
+      Ops.pop_back();
+      llvm::Function *F = CGM.getIntrinsic(ID);
+      return Builder.CreateCall(F, Ops, "");
+    }
     SmallVector<Value*, 4> CallOps;
     if (Accumulate) {
       Address Addr = EmitPointerWithAlignment(E->getArg(0));

diff --git a/clang/test/CodeGen/builtins-ppc-mma.c b/clang/test/CodeGen/builtins-ppc-mma.c
@@ -1036,3 +1036,162 @@ void test65(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns
   __builtin_mma_pmxvbf16ger2nn(&vq, vc, vc, 0, 0, 0);
   *((__vector_quad *)resp) = vq;
 }
+
+// CHECK-LABEL: @test66(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP0]])
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP1]], i8* [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+void test66(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(0LL, vpp);
+  __builtin_mma_stxvp(vp, 0LL, vp2);
+}
+
+// CHECK-LABEL: @test67(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 [[OFFSET:%.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 [[OFFSET]]
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test67(const __vector_pair *vpp, signed long long offset, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(offset, vpp);
+  __builtin_mma_stxvp(vp, offset, vp2);
+}
+
+// CHECK-LABEL: @test68(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 18
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 18
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test68(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(18LL, vpp);
+  __builtin_mma_stxvp(vp, 18LL, vp2);
+}
+
+// CHECK-LABEL: @test69(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 1
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 1
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test69(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(1LL, vpp);
+  __builtin_mma_stxvp(vp, 1LL, vp2);
+}
+
+// CHECK-LABEL: @test70(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 42
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 42
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test70(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(42LL, vpp);
+  __builtin_mma_stxvp(vp, 42LL, vp2);
+}
+
+// CHECK-LABEL: @test71(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VPP:%.*]], i64 128
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <256 x i1>* [[TMP0]] to i8*
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr <256 x i1>, <256 x i1>* [[VP2:%.*]], i64 128
+// CHECK-NEXT:    [[TMP4:%.*]] = bitcast <256 x i1>* [[TMP3]] to i8*
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test71(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(32768LL, vpp);
+  __builtin_mma_stxvp(vp, 32768LL, vp2);
+}
+
+// CHECK-LABEL: @test72(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr i8, i8* [[TMP0]], i64 32799
+// CHECK-NEXT:    [[TMP2:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP1]])
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <256 x i1>* [[VP2:%.*]] to i8*
+// CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, i8* [[TMP3]], i64 32799
+// CHECK-NEXT:    tail call void @llvm.ppc.mma.stxvp(<256 x i1> [[TMP2]], i8* [[TMP4]])
+// CHECK-NEXT:    ret void
+//
+void test72(const __vector_pair *vpp, const __vector_pair *vp2) {
+  __vector_pair vp = __builtin_mma_lxvp(32799LL, vpp);
+  __builtin_mma_stxvp(vp, 32799LL, vp2);
+}
+
+// CHECK-LABEL: @test73(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 8
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.pmxvf64gernn(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]], i32 0, i32 0)
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+void test73(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = __builtin_mma_lxvp(8LL, vpp);
+  __builtin_mma_pmxvf64gernn(&vq, vp, vc, 0, 0);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test74(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP2]])
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP3]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP4]], <512 x i1>* [[TMP5]], align 64, [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+void test74(unsigned char *vqp, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = __builtin_mma_lxvp(0LL, vpp);
+  __builtin_mma_xvf64gernp(&vq, vp, vc);
+  *((__vector_quad *)resp) = vq;
+}
+
+// CHECK-LABEL: @test75(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[VQP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    [[TMP1:%.*]] = load <512 x i1>, <512 x i1>* [[TMP0]], align 64, [[TBAA2:!tbaa !.*]]
+// CHECK-NEXT:    [[TMP2:%.*]] = bitcast <256 x i1>* [[VPP:%.*]] to i8*
+// CHECK-NEXT:    [[TMP3:%.*]] = getelementptr i8, i8* [[TMP2]], i64 [[OFFS:%.*]]
+// CHECK-NEXT:    [[TMP4:%.*]] = tail call <256 x i1> @llvm.ppc.mma.lxvp(i8* [[TMP3]])
+// CHECK-NEXT:    [[TMP5:%.*]] = tail call <512 x i1> @llvm.ppc.mma.xvf64gernp(<512 x i1> [[TMP1]], <256 x i1> [[TMP4]], <16 x i8> [[VC:%.*]])
+// CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[RESP:%.*]] to <512 x i1>*
+// CHECK-NEXT:    store <512 x i1> [[TMP5]], <512 x i1>* [[TMP6]], align 64, [[TBAA2]]
+// CHECK-NEXT:    ret void
+//
+void test75(unsigned char *vqp, signed long long offs, const __vector_pair *vpp, vector unsigned char vc, unsigned char *resp) {
+  __vector_quad vq = *((__vector_quad *)vqp);
+  __vector_pair vp = __builtin_mma_lxvp(offs, vpp);
+  __builtin_mma_xvf64gernp(&vq, vp, vc);
+  *((__vector_quad *)resp) = vq;
+}
diff --git a/clang/test/Sema/ppc-mma-types.c b/clang/test/Sema/ppc-mma-types.c
@@ -319,3 +319,17 @@ void testVPOperators4(int v, void *ptr) {
   __vector_pair vp2 = (__vector_pair)vpp; // expected-error {{used type '__vector_pair' where arithmetic or pointer type is required}}
 }
 
+void testBuiltinTypes1(const __vector_pair *vpp, const __vector_pair *vp2, float f) {
+  __vector_pair vp = __builtin_mma_lxvp(f, vpp); // expected-error {{passing 'float' to parameter of incompatible type 'long long'}}
+  __builtin_mma_stxvp(vp, 32799, vp2);           // expected-error {{passing 'int' to parameter of incompatible type 'long long'}}
+}
+
+void testBuiltinTypes2(__vector_pair *vpp, const __vector_pair *vp2, unsigned char c) {
+  __vector_pair vp = __builtin_mma_lxvp(6LL, vpp); // expected-error {{passing '__vector_pair *' to parameter of incompatible type 'const __vector_pair *'}}
+  __builtin_mma_stxvp(vp, c, vp2);                 // expected-error {{passing 'unsigned char' to parameter of incompatible type 'long long'}}
+}
+
+void testBuiltinTypes3(vector int v, __vector_pair *vp2, signed long long ll, unsigned short s) {
+  __vector_pair vp = __builtin_mma_lxvp(ll, v); // expected-error {{passing '__vector int' (vector of 4 'int' values) to parameter of incompatible type 'const __vector_pair *'}}
+  __builtin_mma_stxvp(vp, ll, s);               // expected-error {{passing 'unsigned short' to parameter of incompatible type 'const __vector_pair *'}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -1422,6 +1422,14 @@ let TargetPrefix = "ppc" in {
   def int_ppc_mma_xxsetaccz :
         Intrinsic<[llvm_v512i1_ty], [], [IntrNoMem]>;
 
+  def int_ppc_mma_lxvp :
+        Intrinsic<[llvm_v256i1_ty], [llvm_ptr_ty],
+                  [IntrReadMem, IntrArgMemOnly]>;
+
+  def int_ppc_mma_stxvp :
+        Intrinsic<[], [llvm_v256i1_ty, llvm_ptr_ty],
+                  [IntrWriteMem, IntrArgMemOnly]>;
+
   // MMA Reduced-Precision: Outer Product Intrinsic Definitions.
   defm int_ppc_mma_xvi4ger8 :
         PowerPC_MMA_ACC_PP_Intrinsic<[llvm_v16i8_ty, llvm_v16i8_ty]>;

diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -293,6 +293,13 @@ namespace {
                                               Align(16));
     }
 
+    /// SelectAddrImmX34 - Returns true if the address N can be represented by
+    /// a base register plus a signed 34-bit displacement. Suitable for use by
+    /// PSTXVP and friends.
+    bool SelectAddrImmX34(SDValue N, SDValue &Disp, SDValue &Base) {
+      return PPCLowering->SelectAddressRegImm34(N, Disp, Base, *CurDAG);
+    }
+
     // Select an address into a single register.
     bool SelectAddr(SDValue N, SDValue &Base) {
       Base = N;

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2399,6 +2399,20 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
   return false;
 }
 
+/// isIntS34Immediate - This method tests if value of node given can be
+/// accurately represented as a sign extension from a 34-bit value.  If so,
+/// this returns true and the immediate.
+bool llvm::isIntS34Immediate(SDNode *N, int64_t &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  Imm = (int64_t)cast<ConstantSDNode>(N)->getZExtValue();
+  return isInt<34>(Imm);
+}
+bool llvm::isIntS34Immediate(SDValue Op, int64_t &Imm) {
+  return isIntS34Immediate(Op.getNode(), Imm);
+}
+
 /// SelectAddressRegReg - Given the specified addressed, check to see if it
 /// can be represented as an indexed [r+r] operation.  Returns false if it
 /// can be more efficiently represented as [r+imm]. If \p EncodingAlignment is
@@ -2599,6 +2613,55 @@ bool PPCTargetLowering::SelectAddressRegImm(
   return true;      // [r+0]
 }
 
+/// Similar to the 16-bit case but for instructions that take a 34-bit
+/// displacement field (prefixed loads/stores).
+bool PPCTargetLowering::SelectAddressRegImm34(SDValue N, SDValue &Disp,
+                                              SDValue &Base,
+                                              SelectionDAG &DAG) const {
+  // Only on 64-bit targets.
+  if (N.getValueType() != MVT::i64)
+    return false;
+
+  SDLoc dl(N);
+  int64_t Imm = 0;
+
+  if (N.getOpcode() == ISD::ADD) {
+    if (!isIntS34Immediate(N.getOperand(1), Imm))
+      return false;
+    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+    else
+      Base = N.getOperand(0);
+    return true;
+  }
+
+  if (N.getOpcode() == ISD::OR) {
+    if (!isIntS34Immediate(N.getOperand(1), Imm))
+      return false;
+    // If this is an or of disjoint bitfields, we can codegen this as an add
+    // (for better address arithmetic) if the LHS and RHS of the OR are
+    // provably disjoint.
+    KnownBits LHSKnown = DAG.computeKnownBits(N.getOperand(0));
+    if ((LHSKnown.Zero.getZExtValue() | ~(uint64_t)Imm) != ~0ULL)
+      return false;
+    if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
+      Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
+    else
+      Base = N.getOperand(0);
+    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+    return true;
+  }
+
+  if (isIntS34Immediate(N, Imm)) { // If the address is a 34-bit const.
+    Disp = DAG.getTargetConstant(Imm, dl, N.getValueType());
+    Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
+    return true;
+  }
+
+  return false;
+}
+
 /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
 /// represented as an indexed [r+r] operation.
 bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -770,6 +770,8 @@ namespace llvm {
     bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
                              SelectionDAG &DAG,
                              MaybeAlign EncodingAlignment) const;
+    bool SelectAddressRegImm34(SDValue N, SDValue &Disp, SDValue &Base,
+                               SelectionDAG &DAG) const;
 
     /// SelectAddressRegRegOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
@@ -1325,6 +1327,8 @@ namespace llvm {
 
   bool isIntS16Immediate(SDNode *N, int16_t &Imm);
   bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+  bool isIntS34Immediate(SDNode *N, int64_t &Imm);
+  bool isIntS34Immediate(SDValue Op, int64_t &Imm);
 
   bool convertToNonDenormSingle(APInt &ArgAPInt);
   bool convertToNonDenormSingle(APFloat &ArgAPFloat);

diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -1031,11 +1031,13 @@ def pred : Operand<OtherVT> {
 // Define PowerPC specific addressing mode.
 
 // d-form
-def iaddr    : ComplexPattern<iPTR, 2, "SelectAddrImm",     [], []>;  // "stb"
+def iaddr    : ComplexPattern<iPTR, 2, "SelectAddrImm",     [], []>; // "stb"
 // ds-form
-def iaddrX4  : ComplexPattern<iPTR, 2, "SelectAddrImmX4",   [], []>;  // "std"
+def iaddrX4  : ComplexPattern<iPTR, 2, "SelectAddrImmX4",   [], []>; // "std"
 // dq-form
-def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16",  [], []>;  // "stxv"
+def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16",  [], []>; // "stxv"
+// 8LS:d-form
+def iaddrX34 : ComplexPattern<iPTR, 2, "SelectAddrImmX34",  [], []>; // "pstxvp"
 
 // Below forms are all x-form addressing mode, use three different ones so we
 // can make a accurate check for x-form instructions in ISEL.

diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -1654,6 +1654,24 @@ let mayLoad = 0, mayStore = 1, Predicates = [PairedVectorMemops, PrefixInstrs] i
                                 "pstxvp $XTp, $D_RA", IIC_LdStLFD>;
 }
 
+let Predicates = [PairedVectorMemops] in {
+  // Intrinsics for Paired Vector Loads.
+  def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX16:$src)), (LXVP memrix16:$src)>;
+  def : Pat<(v256i1 (int_ppc_mma_lxvp xaddrX16:$src)), (LXVPX xaddrX16:$src)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(v256i1 (int_ppc_mma_lxvp iaddrX34:$src)), (PLXVP memri34:$src)>;
+  }
+  // Intrinsics for Paired Vector Stores.
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX16:$dst),
+            (STXVP $XSp, memrix16:$dst)>;
+  def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, xaddrX16:$dst),
+            (STXVPX $XSp, xaddrX16:$dst)>;
+  let Predicates = [PairedVectorMemops, PrefixInstrs] in {
+    def : Pat<(int_ppc_mma_stxvp v256i1:$XSp, iaddrX34:$dst),
+              (PSTXVP $XSp, memri34:$dst)>;
+  }
+}
+
 // TODO: We have an added complexity of 500 here. This is only a temporary
 // solution to have tablegen consider these patterns first. The way we do
 // addressing for PowerPC is complex depending on available D form, X form, or