diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index d079951221d082..c94cae04523980 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -108,10 +108,10 @@ struct StreamState { return StreamState{L, Opened, ES}; } static StreamState getClosed(const FnDescription *L) { - return StreamState{L, Closed}; + return StreamState{L, Closed, {}}; } static StreamState getOpenFailed(const FnDescription *L) { - return StreamState{L, OpenFailed}; + return StreamState{L, OpenFailed, {}}; } void Profile(llvm::FoldingSetNodeID &ID) const { diff --git a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py index b20490f3cefdcc..931326b322911b 100644 --- a/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py +++ b/lldb/test/API/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py @@ -51,6 +51,7 @@ def setUp(self): self.thread = lldbutil.get_one_thread_stopped_at_breakpoint(self.process, self.breakpoint1) self.assertIsNotNone(self.thread, "Didn't stop at breakpoint 1.") + @skipIfReproducer def test_step_instruction(self): # Count instructions between breakpoint_1 and breakpoint_4 contextList = self.target.FindFunctions('main', lldb.eFunctionNameTypeAuto) diff --git a/lldb/test/API/functionalities/deleted-executable/TestDeletedExecutable.py b/lldb/test/API/functionalities/deleted-executable/TestDeletedExecutable.py index ed17d9b36b6b0f..78f3feae6ff637 100644 --- a/lldb/test/API/functionalities/deleted-executable/TestDeletedExecutable.py +++ b/lldb/test/API/functionalities/deleted-executable/TestDeletedExecutable.py @@ -20,6 +20,7 @@ class TestDeletedExecutable(TestBase): triple=no_match('aarch64-.*-android')) # determining the architecture of the process fails @expectedFailureNetBSD + @skipIfReproducer # File synchronization is not supported during replay. def test(self): self.build() exe = self.getBuildArtifact("a.out") diff --git a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py index 7188fa32a154e9..e0013ccd93fa60 100644 --- a/lldb/test/API/functionalities/load_unload/TestLoadUnload.py +++ b/lldb/test/API/functionalities/load_unload/TestLoadUnload.py @@ -95,6 +95,7 @@ def setSvr4Support(self, enabled): @not_remote_testsuite_ready @skipIfWindows # Windows doesn't have dlopen and friends, dynamic libraries work differently @expectedFailureNetBSD + @skipIfReproducer # VFS is a snapshot. def test_modules_search_paths(self): """Test target modules list after loading a different copy of the library libd.dylib, and verifies that it works with 'target modules search-paths add'.""" if self.platformIsDarwin(): diff --git a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py index 63bb02e5eb60f3..e0046f7108898e 100644 --- a/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py +++ b/lldb/test/API/functionalities/postmortem/elf-core/TestLinuxCore.py @@ -38,29 +38,34 @@ class LinuxCoreTestCase(TestBase): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("AArch64") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. 
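The StreamChecker hunk at the top of this patch only adds an explicit `{}` for the trailing error-state member. A standalone analogue (not the checker's real type) of what that spelling buys: omitted trailing members of an aggregate are value-initialized anyway, so behavior is unchanged, but the explicit `{}` documents the intent and keeps missing-field-initializer warnings quiet on compilers that emit them.

struct State {
  const void *LastOp;   // stands in for the FnDescription pointer
  int Kind;             // stands in for the Opened/Closed/OpenFailed kind
  unsigned ErrorBits;   // stands in for the error-state member
};

State closedImplicit(const void *Op) { return State{Op, 1}; }     // ErrorBits == 0, but may warn
State closedExplicit(const void *Op) { return State{Op, 1, {}}; } // ErrorBits == 0, no warning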
def test_aarch64(self): """Test that lldb can read the process information from an aarch64 linux core file.""" self.do_test("linux-aarch64", self._aarch64_pid, self._aarch64_regions, "a.out") @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_i386(self): """Test that lldb can read the process information from an i386 linux core file.""" self.do_test("linux-i386", self._i386_pid, self._i386_regions, "a.out") @skipIfLLVMTargetMissing("Mips") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_mips_o32(self): """Test that lldb can read the process information from an MIPS O32 linux core file.""" self.do_test("linux-mipsel-gnuabio32", self._mips_o32_pid, self._mips_regions, "linux-mipsel-gn") @skipIfLLVMTargetMissing("Mips") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_mips_n32(self): """Test that lldb can read the process information from an MIPS N32 linux core file """ self.do_test("linux-mips64el-gnuabin32", self._mips64_n32_pid, self._mips_regions, "linux-mips64el-") @skipIfLLVMTargetMissing("Mips") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_mips_n64(self): """Test that lldb can read the process information from an MIPS N64 linux core file """ self.do_test("linux-mips64el-gnuabi64", self._mips64_n64_pid, @@ -68,6 +73,7 @@ def test_mips_n64(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("PowerPC") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_ppc64le(self): """Test that lldb can read the process information from an ppc64le linux core file.""" self.do_test("linux-ppc64le", self._ppc64le_pid, self._ppc64le_regions, @@ -75,6 +81,7 @@ def test_ppc64le(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_x86_64(self): """Test that lldb can read the process information from an x86_64 linux core file.""" self.do_test("linux-x86_64", self._x86_64_pid, self._x86_64_regions, @@ -82,6 +89,7 @@ def test_x86_64(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("SystemZ") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_s390x(self): """Test that lldb can read the process information from an s390x linux core file.""" self.do_test("linux-s390x", self._s390x_pid, self._s390x_regions, @@ -89,6 +97,7 @@ def test_s390x(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_same_pid_running(self): """Test that we read the information from the core correctly even if we have a running process with the same PID around""" @@ -117,6 +126,7 @@ def test_same_pid_running(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_two_cores_same_pid(self): """Test that we handle the situation if we have two core files with the same PID around""" @@ -197,6 +207,7 @@ def test_FPR_SSE(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. 
def test_i386_sysroot(self): """Test that lldb can find the exe for an i386 linux core file using the sysroot.""" @@ -221,6 +232,7 @@ def test_i386_sysroot(self): @skipIf(triple='^mips') @skipIfLLVMTargetMissing("X86") @skipIfWindows + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_x86_64_sysroot(self): """Test that sysroot has more priority then local filesystem.""" diff --git a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py index f967a57e4ea719..6ecd2673534474 100644 --- a/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py +++ b/lldb/test/API/functionalities/postmortem/netbsd-core/TestNetBSDCore.py @@ -159,11 +159,13 @@ def check_stack(self, process, pid, filename): self.check_backtrace(thread, filename, backtrace) @skipIfLLVMTargetMissing("AArch64") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_aarch64(self): """Test single-threaded aarch64 core dump.""" self.do_test("1lwp_SIGSEGV.aarch64", pid=8339, region_count=32) @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_amd64(self): """Test single-threaded amd64 core dump.""" self.do_test("1lwp_SIGSEGV.amd64", pid=693, region_count=21) @@ -189,11 +191,13 @@ def check_stack(self, process, pid, filename): self.assertEqual(thread.GetStopReasonDataAtIndex(0), 0) @skipIfLLVMTargetMissing("AArch64") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_aarch64(self): """Test double-threaded aarch64 core dump where thread 2 is signalled.""" self.do_test("2lwp_t2_SIGSEGV.aarch64", pid=14142, region_count=31) @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_amd64(self): """Test double-threaded amd64 core dump where thread 2 is signalled.""" self.do_test("2lwp_t2_SIGSEGV.amd64", pid=622, region_count=24) @@ -219,11 +223,13 @@ def check_stack(self, process, pid, filename): self.assertEqual(thread.GetStopReasonDataAtIndex(0), signal.SIGSEGV) @skipIfLLVMTargetMissing("AArch64") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_aarch64(self): """Test double-threaded aarch64 core dump where process is signalled.""" self.do_test("2lwp_process_SIGSEGV.aarch64", pid=1403, region_count=30) @skipIfLLVMTargetMissing("X86") + @skipIfReproducer # lldb::FileSP used in typemap cannot be instrumented. def test_amd64(self): """Test double-threaded amd64 core dump where process is signalled.""" self.do_test("2lwp_process_SIGSEGV.amd64", pid=665, region_count=24) diff --git a/llvm/include/llvm/Analysis/InlineAdvisor.h b/llvm/include/llvm/Analysis/InlineAdvisor.h index 115bca1d32192c..118fd236bee486 100644 --- a/llvm/include/llvm/Analysis/InlineAdvisor.h +++ b/llvm/include/llvm/Analysis/InlineAdvisor.h @@ -123,12 +123,12 @@ class InlineAdvisor { /// This must be called when the Inliner pass is entered, to allow the /// InlineAdvisor update internal state, as result of function passes run /// between Inliner pass runs (for the same module). - virtual void OnPassEntry() {} + virtual void onPassEntry() {} /// This must be called when the Inliner pass is exited, as function passes /// may be run subsequently. This allows an implementation of InlineAdvisor /// to prepare for a partial update. 
- virtual void OnPassExit() {} + virtual void onPassExit() {} protected: InlineAdvisor() = default; @@ -163,7 +163,7 @@ class DefaultInlineAdvisor : public InlineAdvisor { std::unique_ptr getAdvice(CallBase &CB, FunctionAnalysisManager &FAM) override; - void OnPassExit() override { freeDeletedFunctions(); } + void onPassExit() override { freeDeletedFunctions(); } InlineParams Params; }; diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp index 64a8ff31624ce8..a0ed5eea065152 100644 --- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp +++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp @@ -673,9 +673,9 @@ struct VectorInfo { ElementInfo *EI; /// Vector Type - VectorType *const VTy; + FixedVectorType *const VTy; - VectorInfo(VectorType *VTy) + VectorInfo(FixedVectorType *VTy) : BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) { EI = new ElementInfo[VTy->getNumElements()]; } @@ -735,7 +735,7 @@ struct VectorInfo { if (!Op) return false; - VectorType *VTy = dyn_cast(Op->getType()); + FixedVectorType *VTy = dyn_cast(Op->getType()); if (!VTy) return false; @@ -785,8 +785,8 @@ struct VectorInfo { /// \returns false if no sensible information can be gathered. static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result, const DataLayout &DL) { - VectorType *ArgTy = dyn_cast(SVI->getOperand(0)->getType()); - assert(ArgTy && "ShuffleVector Operand is not a VectorType"); + FixedVectorType *ArgTy = + cast(SVI->getOperand(0)->getType()); // Compute the left hand vector information. VectorInfo LHS(ArgTy); @@ -1201,7 +1201,7 @@ bool InterleavedLoadCombineImpl::combine(std::list &InterleavedLoad, Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType(); unsigned ElementsPerSVI = InterleavedLoad.front().SVI->getType()->getNumElements(); - VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI); + FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI); SmallVector Indices; for (unsigned i = 0; i < Factor; i++) @@ -1265,8 +1265,11 @@ bool InterleavedLoadCombineImpl::run() { for (BasicBlock &BB : F) { for (Instruction &I : BB) { if (auto SVI = dyn_cast(&I)) { + // We don't support scalable vectors in this pass. 
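The InlineAdvisor hunks above only lower-case the hook names, but the surrounding comments describe a real contract: onPassEntry() lets an advisor refresh state invalidated by function passes run between inliner invocations, and onPassExit() is the point to release anything deferred during the run, as DefaultInlineAdvisor does with freeDeletedFunctions(). A hypothetical subclass, sketched against the signatures visible in this header; MyInlineAdvisor is not part of the patch.

#include "llvm/Analysis/InlineAdvisor.h"
using namespace llvm;

class MyInlineAdvisor : public InlineAdvisor {
public:
  // Defined elsewhere in a real advisor; declared here only for the sketch.
  std::unique_ptr<InlineAdvice> getAdvice(CallBase &CB,
                                          FunctionAnalysisManager &FAM) override;

  void onPassEntry() override {
    // Re-sync any cached per-module or per-SCC state here.
  }
  void onPassExit() override {
    // Function passes may run next; drop deferred bookkeeping now.
    freeDeletedFunctions();
  }
};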
+ if (isa(SVI->getType())) + continue; - Candidates.emplace_back(SVI->getType()); + Candidates.emplace_back(cast(SVI->getType())); if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) { Candidates.pop_back(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f0e81c56bc1aee..7229f1793a9dbe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1135,15 +1135,20 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI, MachineIRBuilder B(MI); - unsigned SplitElts = - MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits(); - const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType()); + unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize; + const LLT LoadSplitTy = LoadTy.divide(NumSplitParts); ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank); GISelObserverWrapper Observer(&O); B.setChangeObserver(Observer); LegalizerHelper Helper(B.getMF(), Observer, B); - if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) - return false; + + if (LoadTy.isVector()) { + if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + } else { + if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized) + return false; + } MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank); return true; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 5c0a67ef61a6f7..e99dec44c65b89 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -4712,13 +4712,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, Target = BR->getOperand(1); } - // FIXME: This changes the types of the intrinsics instead of introducing new - // nodes with the correct types. - // e.g. llvm.amdgcn.loop - - // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 - // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch - unsigned CFNode = isCFIntrinsic(Intr); if (CFNode == 0) { // This is a uniform branch so we don't need to legalize. diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 0ce6f317722438..7a8b5249255f9c 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -4474,7 +4474,8 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL, else IndexNodes[I] = DAG.getUNDEF(MVT::i32); SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes); - return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2); + return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], + (!Ops[1].isUndef() ? Ops[1] : Ops[0]), Op2); } namespace { diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp index 8f1f77e23b8e31..655e30a29eff45 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp @@ -86,9 +86,9 @@ bool WebAssemblyDebugFixup::runOnMachineFunction(MachineFunction &MF) { // Search for register rather than assume it is on top (which it // typically is if it appears right after the def), since // DBG_VALUE's may shift under some circumstances. 
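For the applyMappingWideLoad change above, the split type is now derived by dividing the whole load type instead of rebuilding a vector from its scalar element, which is what makes the scalar (narrowScalar) path possible at all. A worked example, assuming the 128-bit non-SMRD load limit used by the surrounding code (the constant is restated here only for illustration):

#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;

void wideLoadSplitExamples() {
  const unsigned MaxNonSmrdLoadSize = 128; // assumption for this example

  // Vector case: <8 x s32> is 256 bits, so two parts of <4 x s32>,
  // handled by fewerElementsVector.
  LLT V8S32 = LLT::vector(8, 32);
  LLT VecPart = V8S32.divide(V8S32.getSizeInBits() / MaxNonSmrdLoadSize);

  // Scalar case: an s256 load also splits into two parts, but of type s128,
  // which only narrowScalar can produce (see the new i256 regbankselect test
  // added later in this patch).
  LLT S256 = LLT::scalar(256);
  LLT ScalarPart = S256.divide(S256.getSizeInBits() / MaxNonSmrdLoadSize);

  (void)VecPart;    // <4 x s32>
  (void)ScalarPart; // s128
}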
- size_t Depth = 0; - for (auto &Elem : Stack) { + for (auto &Elem : reverse(Stack)) { if (MO.getReg() == Elem.Reg) { + auto Depth = static_cast(&Elem - &Stack[0]); LLVM_DEBUG(dbgs() << "Debug Value VReg " << MO.getReg() << " -> Stack Relative " << Depth << "\n"); MO.ChangeToTargetIndex(WebAssembly::TI_OPERAND_STACK, Depth); @@ -98,7 +98,6 @@ bool WebAssemblyDebugFixup::runOnMachineFunction(MachineFunction &MF) { Elem.DebugValue = &MI; break; } - Depth++; } // If the Reg was not found, we have a DBG_VALUE outside of its // def-use range, and we leave it unmodified as reg, which means diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp index 770ca2ea913094..862385d044815b 100644 --- a/llvm/lib/Transforms/IPO/Inliner.cpp +++ b/llvm/lib/Transforms/IPO/Inliner.cpp @@ -696,9 +696,9 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, ProfileSummaryInfo *PSI = MAMProxy.getCachedResult(M); InlineAdvisor &Advisor = getAdvisor(MAMProxy, M); - Advisor.OnPassEntry(); + Advisor.onPassEntry(); - auto AdvisorOnExit = make_scope_exit([&] { Advisor.OnPassExit(); }); + auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); }); if (!ImportedFunctionsStats && InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) { @@ -808,7 +808,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC, return FAM.getResult(F); }; - // Now process as many calls as we have within this caller in the sequnece. + // Now process as many calls as we have within this caller in the sequence. // We bail out as soon as the caller has to change so we can update the // call graph and prepare the context of that new caller. bool DidInline = false; diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index ae4ef97b2fd0f6..545413c1fe035c 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1698,13 +1698,14 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I, }; if (auto *CI = dyn_cast(&I)) { - // No-op casts and zexts are irrelevant for debug info. - if (CI->isNoopCast(DL) || isa(&I)) + // No-op casts are irrelevant for debug info. + if (CI->isNoopCast(DL)) return SrcDIExpr; Type *Type = CI->getType(); - // Casts other than Trunc or SExt to scalar types cannot be salvaged. - if (Type->isVectorTy() || (!isa(&I) && !isa(&I))) + // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged. 
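The WebAssemblyDebugFixup hunk above switches to scanning the value stack from the top while still reporting the depth relative to the bottom, recovering the index from the element's address rather than keeping a running counter. A small self-contained illustration of that idiom; StackElem and findDepthFromBottom are invented for the sketch, not names from the pass.

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

struct StackElem { unsigned Reg; };

// Returns the matching element's distance from the bottom of the stack,
// searching top-down so the most recently pushed match wins.
static unsigned findDepthFromBottom(llvm::SmallVectorImpl<StackElem> &Stack,
                                    unsigned Reg) {
  for (auto &Elem : llvm::reverse(Stack))
    if (Elem.Reg == Reg)
      return static_cast<unsigned>(&Elem - &Stack[0]);
  return ~0u; // not found
}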
+ if (Type->isVectorTy() || + !(isa(&I) || isa(&I) || isa(&I))) return nullptr; Value *FromValue = CI->getOperand(0); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir index 998094d622dac0..53302f9554e392 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir @@ -1,3 +1,4 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s @@ -8,6 +9,7 @@ %tmp2 = load <8 x i32>, <8 x i32> addrspace(1)* %global.not.uniform.v8i32 ret void } + define amdgpu_kernel void @load_global_v4i64_non_uniform(<4 x i64> addrspace(1)* %in) { %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 %global.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tmp0 @@ -36,6 +38,21 @@ %tmp2 = load <8 x i32>, <8 x i32> addrspace(4)* %constant.not.uniform.v8i32 ret void } + + define amdgpu_kernel void @load_constant_i256_non_uniform(i256 addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform = getelementptr i256, i256 addrspace(4)* %in, i32 %tmp0 + %tmp2 = load i256, i256 addrspace(4)* %constant.not.uniform + ret void + } + + define amdgpu_kernel void @load_constant_v16i16_non_uniform(<16 x i16> addrspace(4)* %in) { + %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 + %constant.not.uniform = getelementptr <16 x i16>, <16 x i16> addrspace(4)* %in, i32 %tmp0 + %tmp2 = load <16 x i16>, <16 x i16> addrspace(4)* %constant.not.uniform + ret void + } + define amdgpu_kernel void @load_constant_v4i64_non_uniform(<4 x i64> addrspace(4)* %in) { %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0 %constant.not.uniform.v4i64 = getelementptr <4 x i64>, <4 x i64> addrspace(4)* %in, i32 %tmp0 @@ -56,6 +73,7 @@ } define amdgpu_kernel void @load_constant_v8i32_uniform() {ret void} + define amdgpu_kernel void @load_constant_v16i16_uniform() {ret void} define amdgpu_kernel void @load_constant_v4i64_uniform() {ret void} define amdgpu_kernel void @load_constant_v16i32_uniform() {ret void} define amdgpu_kernel void @load_constant_v8i64_uniform() {ret void} @@ -84,12 +102,13 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v8i32_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v8i32, align 32, addrspace 1) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v8i32 + 16, align 32, addrspace 1) - ; CHECK: %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16 from %ir.global.not.uniform.v8i32, align 32, addrspace 1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD 
[[PTR_ADD]](p1) :: (load 16 from %ir.global.not.uniform.v8i32 + 16, align 32, addrspace 1) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v8i32) ... @@ -101,13 +120,15 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v4i64, align 32, addrspace 1) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v4i64 + 16, align 32, addrspace 1) - ; CHECK: %1:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>) + ; CHECK-LABEL: name: load_global_v4i64_non_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p1) :: (load 16 from %ir.global.not.uniform.v4i64, align 32, addrspace 1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from %ir.global.not.uniform.v4i64 + 16, align 32, addrspace 1) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.global.not.uniform.v4i64) ... @@ -120,18 +141,19 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v16i32_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p1) :: (load 16 from %ir.global.not.uniform.v16i32, align 64, addrspace 1) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 16, align 64, addrspace 1) - ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[PTR]], [[OFFSET32]](s64) - ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 32, align 64, addrspace 1) - ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[PTR]], [[OFFSET48]](s64) - ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 48, align 64, addrspace 1) - ; CHECK: %1:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p1) = COPY [[COPY]](p1) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p1) :: (load 16 from %ir.global.not.uniform.v16i32, align 64, addrspace 1) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 16, 
align 64, addrspace 1) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 32, align 64, addrspace 1) + ; CHECK: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 16 from %ir.global.not.uniform.v16i32 + 48, align 64, addrspace 1) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.global.not.uniform.v16i32) ... @@ -167,7 +189,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v8i32_uniform - ; CHECK: (<8 x s32>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load 32, addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (invariant load 32, addrspace 1) ... @@ -180,7 +203,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v4i64_uniform - ; CHECK: (<4 x s64>) = G_LOAD %0(p1) :: (invariant load 32, addrspace 1) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load 32, addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (invariant load 32, addrspace 1) ... @@ -193,7 +217,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v16i32_uniform - ; CHECK: (<16 x s32>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p1) :: (invariant load 64, addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (invariant load 64, addrspace 1) ... @@ -206,7 +231,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_global_v8i64_uniform - ; CHECK: (<8 x s64>) = G_LOAD %0(p1) :: (invariant load 64, addrspace 1) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p1) :: (invariant load 64, addrspace 1) %0:_(p1) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (invariant load 64, addrspace 1) ... 
@@ -219,16 +245,56 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v8i32_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32 + 16, align 32, addrspace 4) - ; CHECK: %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform.v8i32 + 16, align 32, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v8i32) ... +--- +name: load_constant_i256_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_i256_non_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s128) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(s128) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform + 16, align 32, addrspace 4) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s256) = G_MERGE_VALUES [[LOAD]](s128), [[LOAD1]](s128) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(s256) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform) +... + +--- +name: load_constant_v16i16_non_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + + ; CHECK-LABEL: name: load_constant_v16i16_non_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<8 x s16>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform + 16, align 32, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s16>) = G_CONCAT_VECTORS [[LOAD]](<8 x s16>), [[LOAD1]](<8 x s16>) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<16 x s16>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform) +... 
+ --- name: load_constant_v4i64_non_uniform legalized: true @@ -237,12 +303,13 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v4i64_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64 + 16, align 32, addrspace 4) - ; CHECK: %1:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform.v4i64 + 16, align 32, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<4 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load 32 from %ir.constant.not.uniform.v4i64) ... @@ -255,18 +322,19 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v16i32_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 16, align 64, addrspace 4) - ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET32]](s64) - ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 32, align 64, addrspace 4) - ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET48]](s64) - ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 48, align 64, addrspace 4) - ; CHECK: %1:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD0]](<4 x s32>), [[LOAD16]](<4 x s32>), [[LOAD32]](<4 x s32>), [[LOAD48]](<4 x s32>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32, align 64, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 16, align 64, addrspace 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from 
%ir.constant.not.uniform.v16i32 + 32, align 64, addrspace 4) + ; CHECK: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p4) :: (load 16 from %ir.constant.not.uniform.v16i32 + 48, align 64, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v16i32) ... @@ -279,18 +347,19 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v8i64_non_uniform - ; CHECK: [[PTR:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: [[LOAD0:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) - ; CHECK: [[OFFSET16:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 - ; CHECK: [[GEP16:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET16]](s64) - ; CHECK: [[LOAD16:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP16]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 16, align 64, addrspace 4) - ; CHECK: [[OFFSET32:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 - ; CHECK: [[GEP32:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET32]](s64) - ; CHECK: [[LOAD32:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP32]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 32, align 64, addrspace 4) - ; CHECK: [[OFFSET48:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 - ; CHECK: [[GEP48:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PTR]], [[OFFSET48]](s64) - ; CHECK: [[LOAD48:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[GEP48]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 48, align 64, addrspace 4) - ; CHECK: %1:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD0]](<2 x s64>), [[LOAD16]](<2 x s64>), [[LOAD32]](<2 x s64>), [[LOAD48]](<2 x s64>) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[COPY]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64, align 64, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 16, align 64, addrspace 4) + ; CHECK: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32 + ; CHECK: [[PTR_ADD1:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD1]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 32, align 64, addrspace 4) + ; CHECK: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48 + ; CHECK: [[PTR_ADD2:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:vgpr(<2 x s64>) = G_LOAD [[PTR_ADD2]](p4) :: (load 16 from %ir.constant.not.uniform.v8i64 + 48, align 64, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s64>) = G_CONCAT_VECTORS [[LOAD]](<2 x s64>), [[LOAD1]](<2 x s64>), [[LOAD2]](<2 x s64>), [[LOAD3]](<2 x s64>) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load 64 from %ir.constant.not.uniform.v8i64) ... 
@@ -303,11 +372,26 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v8i32_uniform - ; CHECK: (<8 x s32>) = G_LOAD %0(p4) :: (load 32, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<8 x s32>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) ... +--- +name: load_constant_v16i16_uniform +legalized: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1 + ; CHECK-LABEL: name: load_constant_v16i16_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<16 x s16>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4) + %0:_(p4) = COPY $sgpr0_sgpr1 + %1:_(<16 x s16>) = G_LOAD %0 :: (load 32, addrspace 4) +... + --- name: load_constant_v4i64_uniform legalized: true @@ -316,7 +400,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v4i64_uniform - ; CHECK: (<4 x s64>) = G_LOAD %0(p4) :: (load 32, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<4 x s64>) = G_LOAD [[COPY]](p4) :: (load 32, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<4 x s64>) = G_LOAD %0 :: (load 32, addrspace 4) ... @@ -329,7 +414,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v16i32_uniform - ; CHECK: (<16 x s32>) = G_LOAD %0(p4) :: (load 64, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<16 x s32>) = G_LOAD [[COPY]](p4) :: (load 64, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<16 x s32>) = G_LOAD %0 :: (load 64, addrspace 4) ... @@ -342,7 +428,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_v8i64_uniform - ; CHECK: (<8 x s64>) = G_LOAD %0(p4) :: (load 64, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(<8 x s64>) = G_LOAD [[COPY]](p4) :: (load 64, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(<8 x s64>) = G_LOAD %0 :: (load 64, addrspace 4) ... 
@@ -353,11 +440,11 @@ legalized: true body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: load_local_uniform - ; CHECK: %0:sgpr(p3) = COPY $sgpr0 - ; CHECK: %2:vgpr(p3) = COPY %0(p3) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p3) :: (load 4, addrspace 3) + ; CHECK-LABEL: name: load_local_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load 4, addrspace 3) %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, addrspace 3) @@ -368,11 +455,11 @@ legalized: true body: | bb.0: liveins: $sgpr0 - ; CHECK-LABEL: load_region_uniform - ; CHECK: %0:sgpr(p3) = COPY $sgpr0 - ; CHECK: %2:vgpr(p3) = COPY %0(p3) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p3) :: (load 4, addrspace 5) + ; CHECK-LABEL: name: load_region_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p3) = COPY $sgpr0 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p3) = COPY [[COPY]](p3) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p3) :: (load 4, addrspace 5) %0:_(p3) = COPY $sgpr0 %1:_(s32) = G_LOAD %0 :: (load 4, addrspace 5) @@ -386,9 +473,9 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: extload_constant_i8_to_i32_uniform - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 1, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 1, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 1, addrspace 4, align 1) ... @@ -401,10 +488,10 @@ body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_global_i8_to_i32_uniform{{$}} - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 1, addrspace 1) + ; CHECK-LABEL: name: extload_global_i8_to_i32_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 1, addrspace 1) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 1, addrspace 1, align 1) ... @@ -416,11 +503,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_constant_i16_to_i32_uniform - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 2, addrspace 4) + ; CHECK-LABEL: name: extload_constant_i16_to_i32_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 2, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 2, addrspace 4, align 2) ... @@ -432,11 +519,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: extload_global_i16_to_i32_uniform - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 2, addrspace 1) + ; CHECK-LABEL: name: extload_global_i16_to_i32_uniform + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 2, addrspace 1) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 2, addrspace 1, align 2) ... 
@@ -449,8 +536,8 @@ body: | bb.0: liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_i32_uniform_align4 - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %1:sgpr(s32) = G_LOAD %0(p4) :: (load 4, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:sgpr(s32) = G_LOAD [[COPY]](p4) :: (load 4, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, addrspace 4, align 4) ... @@ -462,11 +549,11 @@ legalized: true body: | bb.0: liveins: $sgpr0_sgpr1 - ; CHECK-LABEL: name: load_constant_i32_uniform_align2 - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 4, align 2, addrspace 4) + ; CHECK-LABEL: name: load_constant_i32_uniform_align2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 4, align 2, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, addrspace 4, align 2) ... @@ -480,9 +567,9 @@ body: | liveins: $sgpr0_sgpr1 ; CHECK-LABEL: name: load_constant_i32_uniform_align1 - ; CHECK: %0:sgpr(p4) = COPY $sgpr0_sgpr1 - ; CHECK: %2:vgpr(p4) = COPY %0(p4) - ; CHECK: %1:vgpr(s32) = G_LOAD %2(p4) :: (load 4, align 1, addrspace 4) + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(p4) = COPY [[COPY]](p4) + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(s32) = G_LOAD [[COPY1]](p4) :: (load 4, align 1, addrspace 4) %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(s32) = G_LOAD %0 :: (load 4, addrspace 4, align 1) ... @@ -513,10 +600,13 @@ body: | liveins: $vgpr0_vgpr1 ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash - ; CHECK: %0:vgpr(p4) = COPY $vgpr0_vgpr1 - ; CHECK: vgpr(<4 x s32>) = G_LOAD %0(p4) - ; CHECK: vgpr(<4 x s32>) = G_LOAD - ; CHECK: G_CONCAT_VECTORS + ; CHECK: liveins: $vgpr0_vgpr1 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(p4) = COPY $vgpr0_vgpr1 + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[COPY]](p4) :: (load 16, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[COPY]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 + 16, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) %0:_(p4) = COPY $vgpr0_vgpr1 %1:_(<8 x s32>) = G_LOAD %0 :: (load 32, addrspace 4) ... 
@@ -527,14 +617,26 @@ legalized: true tracksRegLiveness: true body: | + ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(p4) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(p4) = COPY $sgpr2_sgpr3 + ; CHECK: G_BR %bb.1 + ; CHECK: bb.1: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[PHI:%[0-9]+]]:vgpr(p4) = G_PHI [[COPY]](p4), %bb.0, %3(p4), %bb.1 + ; CHECK: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PHI]](p4) :: (load 16, align 32, addrspace 4) + ; CHECK: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16 + ; CHECK: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD [[PHI]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 + 16, addrspace 4) + ; CHECK: [[CONCAT_VECTORS:%[0-9]+]]:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>) + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(p4) = COPY [[COPY1]](p4) + ; CHECK: G_BR %bb.1 bb.0: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3 - ; CHECK-LABEL: name: load_constant_v8i32_vgpr_crash_loop_phi - ; CHECK: G_PHI - ; CHECK: vgpr(<4 x s32>) = G_LOAD - ; CHECK: vgpr(<4 x s32>) = G_LOAD - ; CHECK: G_CONCAT_VECTORS %0:_(p4) = COPY $sgpr0_sgpr1 %1:_(p4) = COPY $sgpr2_sgpr3 diff --git a/llvm/test/CodeGen/SystemZ/vec-perm-14.ll b/llvm/test/CodeGen/SystemZ/vec-perm-14.ll new file mode 100644 index 00000000000000..0cf3c6ef7a064c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/vec-perm-14.ll @@ -0,0 +1,27 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z14 | FileCheck %s +; +; Test that only one vperm of the vector compare is needed for both extracts. + +define void @fun() { +; CHECK-LABEL: fun +; CHECK: vperm +; CHECK-NOT: vperm +bb: + %tmp = load <4 x i8>, <4 x i8>* undef + %tmp1 = icmp eq <4 x i8> zeroinitializer, %tmp + %tmp2 = extractelement <4 x i1> %tmp1, i32 0 + br i1 %tmp2, label %bb1, label %bb2 + +bb1: + unreachable + +bb2: + %tmp3 = extractelement <4 x i1> %tmp1, i32 1 + br i1 %tmp3, label %bb3, label %bb4 + +bb3: + unreachable + +bb4: + unreachable +} diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll index 7cea2ff8eb9c06..b7cbac89db31e9 100644 --- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll @@ -5377,12 +5377,12 @@ define void @constrained_vector_fptrunc_v3f64(<3 x double>* %src, <3 x float>* % ; SZ13-LABEL: constrained_vector_fptrunc_v3f64: ; SZ13: # %bb.0: # %entry ; SZ13-NEXT: vl %v1, 0(%r2), 4 +; SZ13-NEXT: ld %f0, 16(%r2) ; SZ13-NEXT: vledb %v1, %v1, 0, 0 ; SZ13-NEXT: larl %r1, .LCPI97_0 -; SZ13-NEXT: ld %f0, 16(%r2) -; SZ13-NEXT: vl %v2, 0(%r1), 3 -; SZ13-NEXT: vperm %v1, %v1, %v0, %v2 ; SZ13-NEXT: ledbra %f0, 0, %f0, 0 +; SZ13-NEXT: vl %v2, 0(%r1), 3 +; SZ13-NEXT: vperm %v1, %v1, %v1, %v2 ; SZ13-NEXT: ste %f0, 8(%r3) ; SZ13-NEXT: vsteg %v1, 0(%r3), 0 ; SZ13-NEXT: br %r14 diff --git a/llvm/test/Transforms/InstCombine/cast-mul-select.ll b/llvm/test/Transforms/InstCombine/cast-mul-select.ll index f82d2fd285fe8f..e68f3830b5a9d5 100644 --- a/llvm/test/Transforms/InstCombine/cast-mul-select.ll +++ b/llvm/test/Transforms/InstCombine/cast-mul-select.ll @@ -13,8 +13,8 @@ define i32 @mul(i32 %x, i32 %y) { ; we preserve the debug information in the resulting ; instruction. 
; DBGINFO-LABEL: @mul( -; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %x -; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %y +; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %x, {{.*}} !DIExpression(DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value)) +; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 %y, {{.*}} !DIExpression(DW_OP_LLVM_convert, 32, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value)) ; DBGINFO-NEXT: [[C:%.*]] = mul i32 {{.*}} ; DBGINFO-NEXT: [[D:%.*]] = and i32 {{.*}} ; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i32 [[C]] @@ -175,7 +175,7 @@ exit: ; Check that we don't drop debug info when a zext is removed. define i1 @foo(i1 zeroext %b) { ; DBGINFO-LABEL: @foo( -; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i1 %b +; DBGINFO-NEXT: call void @llvm.dbg.value(metadata i1 %b, {{.*}} !DIExpression(DW_OP_LLVM_convert, 1, DW_ATE_unsigned, DW_OP_LLVM_convert, 8, DW_ATE_unsigned, DW_OP_stack_value)) ; DBGINFO-NEXT: ret i1 %b %frombool = zext i1 %b to i8 diff --git a/mlir/CMakeLists.txt b/mlir/CMakeLists.txt index 3671a97395d4ec..7c2c5978c44e62 100644 --- a/mlir/CMakeLists.txt +++ b/mlir/CMakeLists.txt @@ -34,6 +34,10 @@ add_definitions(-DMLIR_CUDA_CONVERSIONS_ENABLED=${MLIR_CUDA_CONVERSIONS_ENABLED} set(MLIR_CUDA_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir CUDA runner") set(MLIR_VULKAN_RUNNER_ENABLED 0 CACHE BOOL "Enable building the mlir Vulkan runner") +option(MLIR_INCLUDE_TESTS + "Generate build targets for the MLIR unit tests." + ${LLVM_INCLUDE_TESTS}) + include_directories( "include") include_directories( ${MLIR_INCLUDE_DIR}) @@ -44,8 +48,11 @@ add_subdirectory(tools/mlir-tblgen) add_subdirectory(include/mlir) add_subdirectory(lib) -add_subdirectory(unittests) -add_subdirectory(test) +if (MLIR_INCLUDE_TESTS) + add_definitions(-DMLIR_INCLUDE_TESTS) + add_subdirectory(unittests) + add_subdirectory(test) +endif() # Tools needs to come late to ensure that MLIR_ALL_LIBS is populated. # Generally things after this point may depend on MLIR_ALL_LIBS or libMLIR.so. add_subdirectory(tools) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 70c3f00f52161b..e93977185fb356 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -11,6 +11,7 @@ #include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/IR/PatternMatch.h" +#include "llvm/ADT/SmallBitVector.h" namespace mlir { namespace linalg { @@ -97,6 +98,28 @@ struct LinalgPromotionOptions { operandsToPromote->insert(operands.begin(), operands.end()); return *this; } + /// If ith element of `useFullTiles` is true the full view should be used for + /// the promoted buffer of the ith operand in `operandsToPromote`. Otherwise + /// the partial view will be used. + /// The decision is defaulted to `useFullTileBuffersDefault` when + /// `useFullTileBuffers` is None and for operands missing from + /// `useFullTileBuffers`. + Optional useFullTileBuffers = None; + LinalgPromotionOptions &setUseFullTileBuffers(ArrayRef useFullTiles) { + unsigned size = useFullTiles.size(); + llvm::SmallBitVector tmp(size, false); + for (unsigned i = 0; i < size; ++i) + tmp[i] = useFullTiles[i]; + useFullTileBuffers = tmp; + return *this; + } + /// If true all operands unspecified by `useFullTileBuffers` will use the full + /// view, otherwise the partial view. 
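The LinalgPromotionOptions additions above compose with the existing fluent setters. A usage sketch, not taken from the patch, assuming the setOperandsToPromote setter whose tail is visible in the context above: promote operands 0 and 1, give only operand 0 a full-tile buffer, and leave operand 1 on the partial view.

#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
using namespace mlir::linalg;

LinalgPromotionOptions makePromotionOptions() {
  return LinalgPromotionOptions()
      .setOperandsToPromote({0, 1})
      .setUseFullTileBuffers({true, false});
}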
+ bool useFullTileBuffersDefault = false; + LinalgPromotionOptions &useFullTileBuffersByDefault() { + useFullTileBuffersDefault = true; + return *this; + } /// Allow the use of dynamicaly-sized buffers. bool dynamicBuffers = false; LinalgPromotionOptions &setDynamicBuffers(unsigned dynamic) { diff --git a/mlir/include/mlir/Dialect/Vector/EDSC/Intrinsics.h b/mlir/include/mlir/Dialect/Vector/EDSC/Intrinsics.h index 7fa9099a6a90ff..6b5c4be7b2f409 100644 --- a/mlir/include/mlir/Dialect/Vector/EDSC/Intrinsics.h +++ b/mlir/include/mlir/Dialect/Vector/EDSC/Intrinsics.h @@ -16,6 +16,9 @@ namespace intrinsics { using vector_broadcast = ValueBuilder; using vector_contract = ValueBuilder; +using vector_insert = ValueBuilder; +using vector_fma = ValueBuilder; +using vector_extract = ValueBuilder; using vector_matmul = ValueBuilder; using vector_print = OperationBuilder; using vector_transfer_read = ValueBuilder; diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.h b/mlir/include/mlir/Dialect/Vector/VectorOps.h index a3376d53fc9590..6394fae2137507 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.h +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.h @@ -13,6 +13,7 @@ #ifndef MLIR_DIALECT_VECTOR_VECTOROPS_H #define MLIR_DIALECT_VECTOR_VECTOROPS_H +#include "mlir/IR/AffineMap.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Dialect.h" #include "mlir/IR/OpDefinition.h" @@ -71,6 +72,14 @@ IntegerType getVectorSubscriptType(Builder &builder); /// the integer type required for subscripts in the vector dialect. ArrayAttr getVectorSubscriptAttr(Builder &b, ArrayRef values); +namespace impl { +/// Build the default minor identity map suitable for a vector transfer. This +/// also handles the case memref<... x vector<...>> -> vector<...> in which the +/// rank of the identity map must take the vector element type into account. 
+AffineMap getTransferMinorIdentityMap(MemRefType memRefType, + VectorType vectorType); +} // namespace impl + #define GET_OP_CLASSES #include "mlir/Dialect/Vector/VectorOps.h.inc" diff --git a/mlir/include/mlir/Dialect/Vector/VectorOps.td b/mlir/include/mlir/Dialect/Vector/VectorOps.td index 4c71a168dae737..29e72857b291e4 100644 --- a/mlir/include/mlir/Dialect/Vector/VectorOps.td +++ b/mlir/include/mlir/Dialect/Vector/VectorOps.td @@ -863,17 +863,35 @@ def Vector_ExtractStridedSliceOp : let assemblyFormat = "$vector attr-dict `:` type($vector) `to` type(results)"; } +def Vector_TransferOpUtils { + code extraTransferDeclaration = [{ + static StringRef getMaskedAttrName() { return "masked"; } + static StringRef getPermutationMapAttrName() { return "permutation_map"; } + bool isMaskedDim(unsigned dim) { + return !masked() || + masked()->cast()[dim].cast().getValue(); + } + MemRefType getMemRefType() { + return memref().getType().cast(); + } + VectorType getVectorType() { + return vector().getType().cast(); + } + }]; +} + def Vector_TransferReadOp : Vector_Op<"transfer_read">, Arguments<(ins AnyMemRef:$memref, Variadic:$indices, - AffineMapAttr:$permutation_map, AnyType:$padding)>, + AffineMapAttr:$permutation_map, AnyType:$padding, + OptionalAttr:$masked)>, Results<(outs AnyVector:$vector)> { let summary = "Reads a supervector from memory into an SSA vector value."; let description = [{ - The `vector.transfer_read` op performs a blocking read from a slice within - a [MemRef](../LangRef.md#memref-type) supplied as its first operand + The `vector.transfer_read` op performs a read from a slice within a + [MemRef](../LangRef.md#memref-type) supplied as its first operand into a [vector](../LangRef.md#vector-type) of the same base elemental type. A memref operand with vector element type, must have its vector element @@ -881,18 +899,31 @@ def Vector_TransferReadOp : memref<3x2x6x4x3xf32>, vector<1x1x4x3xf32>). The slice is further defined by a full-rank index within the MemRef, - supplied as the operands `2 .. 1 + rank(memref)`. The permutation_map - [attribute](../LangRef.md#attributes) is an + supplied as the operands `2 .. 1 + rank(memref)`. + + The permutation_map [attribute](../LangRef.md#attributes) is an [affine-map](Affine.md#affine-maps) which specifies the transposition on the - slice to match the vector shape. The size of the slice is specified by the - size of the vector, given as the return type. An `ssa-value` of the same - elemental type as the MemRef is provided as the last operand to specify - padding in the case of out-of-bounds accesses. This operation is called - 'read' by opposition to 'load' because the super-vector granularity is - generally not representable with a single hardware register. - A `vector.transfer_read` is thus a mid-level - abstraction that supports super-vectorization with non-effecting padding for - full-tile-only code. + slice to match the vector shape. The permutation map may be implicit and + ommitted from parsing and printing if it is the canonical minor identity map + (i.e. if it does not permute or broadcast any dimension). + + The size of the slice is specified by the size of the vector, given as the + return type. + + An `ssa-value` of the same elemental type as the MemRef is provided as the + last operand to specify padding in the case of out-of-bounds accesses. + + An optional boolean array attribute is provided to specify which dimensions + of the transfer need masking. 
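The transfer_read description above says the permutation map may be omitted when it is the canonical minor identity map. A worked example of what that default looks like, built with the generic AffineMap helper; the shapes are made up, and this does not cover the memref-of-vector case that getTransferMinorIdentityMap handles.

#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
using namespace mlir;

// For a transfer of vector<16x32xf32> out of memref<8x16x32xf32>, the minor
// identity map keeps the two fastest-varying memref dimensions:
//   (d0, d1, d2) -> (d1, d2)
AffineMap minorIdentityExample(MLIRContext *ctx) {
  return AffineMap::getMinorIdentityMap(/*dims=*/3, /*results=*/2, ctx);
}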
When a dimension is specified as not requiring + masking, the `vector.transfer_read` may be lowered to simple loads. The + absence of this `masked` attribute signifies that all dimensions of the + transfer need to be masked. + + This operation is called 'read' by opposition to 'load' because the + super-vector granularity is generally not representable with a single + hardware register. A `vector.transfer_read` is thus a mid-level abstraction + that supports super-vectorization with non-effecting padding for full-tile + only operations. More precisely, let's dive deeper into the permutation_map for the following MLIR: @@ -995,19 +1026,27 @@ def Vector_TransferReadOp : }]; let builders = [ - // Builder that sets permutation map and padding to 'getMinorIdentityMap' - // and zero, respectively, by default. + // Builder that sets padding to zero. + OpBuilder<"OpBuilder &builder, OperationState &result, VectorType vector, " + "Value memref, ValueRange indices, AffineMap permutationMap, " + "ArrayRef maybeMasked = {}">, + // Builder that sets permutation map (resp. padding) to + // 'getMinorIdentityMap' (resp. zero). OpBuilder<"OpBuilder &builder, OperationState &result, VectorType vector, " - "Value memref, ValueRange indices"> + "Value memref, ValueRange indices, " + "ArrayRef maybeMasked = {}"> ]; - let extraClassDeclaration = [{ - MemRefType getMemRefType() { - return memref().getType().cast(); - } - VectorType getVectorType() { - return vector().getType().cast(); - } + let extraClassDeclaration = Vector_TransferOpUtils.extraTransferDeclaration # + [{ + /// Build the default minor identity map suitable for a vector transfer. + /// This also handles the case memref<... x vector<...>> -> vector<...> in + /// which the rank of the identity map must take the vector element type + /// into account. + static AffineMap getTransferMinorIdentityMap( + MemRefType memRefType, VectorType vectorType) { + return impl::getTransferMinorIdentityMap(memRefType, vectorType); + } }]; } @@ -1015,12 +1054,13 @@ def Vector_TransferWriteOp : Vector_Op<"transfer_write">, Arguments<(ins AnyVector:$vector, AnyMemRef:$memref, Variadic:$indices, - AffineMapAttr:$permutation_map)> { + AffineMapAttr:$permutation_map, + OptionalAttr:$masked)> { let summary = "The vector.transfer_write op writes a supervector to memory."; let description = [{ - The `vector.transfer_write` performs a blocking write from a + The `vector.transfer_write` op performs a write from a [vector](../LangRef.md#vector-type), supplied as its first operand, into a slice within a [MemRef](../LangRef.md#memref-type) of the same base elemental type, supplied as its second operand. @@ -1031,12 +1071,24 @@ def Vector_TransferWriteOp : The slice is further defined by a full-rank index within the MemRef, supplied as the operands `3 .. 2 + rank(memref)`. + The permutation_map [attribute](../LangRef.md#attributes) is an [affine-map](Affine.md#affine-maps) which specifies the transposition on the - slice to match the vector shape. The size of the slice is specified by the - size of the vector. This operation is called 'write' by opposition to - 'store' because the super-vector granularity is generally not representable - with a single hardware register. A `vector.transfer_write` is thus a + slice to match the vector shape. The permutation map may be implicit and + omitted from parsing and printing if it is the canonical minor identity map + (i.e. if it does not permute or broadcast any dimension). + + The size of the slice is specified by the size of the vector.
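As a quick illustration of the implicit permutation map and the optional `masked` attribute described here, consider the following minimal sketch; it is an editorial example, not taken from the patch or its tests, and the function name, shapes, and SSA values are made up. The inferred minor identity map affine_map<(d0, d1) -> (d1)> is elided from the printed form, and masking is disabled on the single transferred dimension.

func @unmasked_minor_identity_example(%A : memref<?x?xf32>, %i : index) {
  %f0 = constant 0.0 : f32
  // The permutation map defaults to the minor identity (d0, d1) -> (d1) and
  // is therefore not printed; `masked = [false]` marks the one transferred
  // dimension as not needing masking, so it may lower to plain loads/stores.
  %v = vector.transfer_read %A[%i, %i], %f0 {masked = [false]}
      : memref<?x?xf32>, vector<8xf32>
  vector.transfer_write %v, %A[%i, %i] {masked = [false]}
      : vector<8xf32>, memref<?x?xf32>
  return
}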
+ + An optional boolean array attribute is provided to specify which dimensions + of the transfer need masking. When a dimension is specified as not requiring + masking, the `vector.transfer_write` may be lowered to simple stores. The + absence of this `masked` attribute signifies that all dimensions of the + transfer need to be masked. + + This operation is called 'write' by opposition to 'store' because the + super-vector granularity is generally not representable with a single + hardware register. A `vector.transfer_write` is thus a mid-level abstraction that supports super-vectorization with non-effecting padding for full-tile-only code. It is the responsibility of `vector.transfer_write`'s implementation to ensure the memory writes are @@ -1066,23 +1118,24 @@ def Vector_TransferWriteOp : }]; let builders = [ - // Builder that sets permutation map and padding to 'getMinorIdentityMap' - // by default. + // Builder that sets permutation map to 'getMinorIdentityMap'. + OpBuilder<"OpBuilder &builder, OperationState &result, Value vector, " + "Value memref, ValueRange indices, " + "ArrayRef maybeMasked = {}">, OpBuilder<"OpBuilder &builder, OperationState &result, Value vector, " - "Value memref, ValueRange indices"> + "Value memref, ValueRange indices, AffineMap permutationMap">, ]; - let extraClassDeclaration = [{ - VectorType getVectorType() { - return vector().getType().cast(); - } - MemRefType getMemRefType() { - return memref().getType().cast(); - } - }]; - let assemblyFormat = [{ - $vector `,` $memref `[` $indices `]` attr-dict `:` type($vector) `,` - type($memref) + let extraClassDeclaration = Vector_TransferOpUtils.extraTransferDeclaration # + [{ + /// Build the default minor identity map suitable for a vector transfer. + /// This also handles the case memref<... x vector<...>> -> vector<...> in + /// which the rank of the identity map must take the vector element type + /// into account.
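A hedged sketch of what this default map inference means for a plain write, with an invented function name and shapes: writing a rank-1 vector into a rank-3 memref uses the minor identity map affine_map<(d0, d1, d2) -> (d2)>, which the printer then elides.

func @inferred_write_map_example(%A : memref<?x?x?xf32>, %i : index,
                                 %v : vector<16xf32>) {
  // Rank-3 memref, rank-1 vector: the builder/parser fall back to the minor
  // identity map (d0, d1, d2) -> (d2), so no permutation_map is printed.
  vector.transfer_write %v, %A[%i, %i, %i] : vector<16xf32>, memref<?x?x?xf32>
  return
}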
+ static AffineMap getTransferMinorIdentityMap( + MemRefType memRefType, VectorType vectorType) { + return impl::getTransferMinorIdentityMap(memRefType, vectorType); + } }]; } diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index eb25bf3abf85ed..975807ca86712f 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -746,12 +746,6 @@ class VectorTypeCastOpConversion : public ConvertToLLVMPattern { } }; -template -LogicalResult replaceTransferOp(ConversionPatternRewriter &rewriter, - LLVMTypeConverter &typeConverter, Location loc, - Operation *op, ArrayRef operands, - Value dataPtr, Value mask); - LogicalResult getLLVMTypeAndAlignment(LLVMTypeConverter &typeConverter, Type type, LLVM::LLVMType &llvmType, unsigned &align) { @@ -765,12 +759,25 @@ LogicalResult getLLVMTypeAndAlignment(LLVMTypeConverter &typeConverter, return success(); } -template <> -LogicalResult replaceTransferOp( - ConversionPatternRewriter &rewriter, LLVMTypeConverter &typeConverter, - Location loc, Operation *op, ArrayRef operands, Value dataPtr, - Value mask) { - auto xferOp = cast(op); +LogicalResult +replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter, + LLVMTypeConverter &typeConverter, Location loc, + TransferReadOp xferOp, + ArrayRef operands, Value dataPtr) { + LLVM::LLVMType vecTy; + unsigned align; + if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(), + vecTy, align))) + return failure(); + rewriter.replaceOpWithNewOp(xferOp, dataPtr); + return success(); +} + +LogicalResult replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter, + LLVMTypeConverter &typeConverter, + Location loc, TransferReadOp xferOp, + ArrayRef operands, + Value dataPtr, Value mask) { auto toLLVMTy = [&](Type t) { return typeConverter.convertType(t); }; VectorType fillType = xferOp.getVectorType(); Value fill = rewriter.create(loc, fillType, xferOp.padding()); @@ -783,19 +790,32 @@ LogicalResult replaceTransferOp( return failure(); rewriter.replaceOpWithNewOp( - op, vecTy, dataPtr, mask, ValueRange{fill}, + xferOp, vecTy, dataPtr, mask, ValueRange{fill}, rewriter.getI32IntegerAttr(align)); return success(); } -template <> -LogicalResult replaceTransferOp( - ConversionPatternRewriter &rewriter, LLVMTypeConverter &typeConverter, - Location loc, Operation *op, ArrayRef operands, Value dataPtr, - Value mask) { +LogicalResult +replaceTransferOpWithLoadOrStore(ConversionPatternRewriter &rewriter, + LLVMTypeConverter &typeConverter, Location loc, + TransferWriteOp xferOp, + ArrayRef operands, Value dataPtr) { auto adaptor = TransferWriteOpOperandAdaptor(operands); + LLVM::LLVMType vecTy; + unsigned align; + if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(), + vecTy, align))) + return failure(); + rewriter.replaceOpWithNewOp(xferOp, adaptor.vector(), dataPtr); + return success(); +} - auto xferOp = cast(op); +LogicalResult replaceTransferOpWithMasked(ConversionPatternRewriter &rewriter, + LLVMTypeConverter &typeConverter, + Location loc, TransferWriteOp xferOp, + ArrayRef operands, + Value dataPtr, Value mask) { + auto adaptor = TransferWriteOpOperandAdaptor(operands); LLVM::LLVMType vecTy; unsigned align; if (failed(getLLVMTypeAndAlignment(typeConverter, xferOp.getVectorType(), @@ -803,7 +823,8 @@ LogicalResult replaceTransferOp( return failure(); rewriter.replaceOpWithNewOp( - op, adaptor.vector(), dataPtr, mask, 
rewriter.getI32IntegerAttr(align)); + xferOp, adaptor.vector(), dataPtr, mask, + rewriter.getI32IntegerAttr(align)); return success(); } @@ -877,6 +898,10 @@ class VectorTransferConversion : public ConvertToLLVMPattern { vectorDataPtr = rewriter.create( loc, vecTy.getPointerTo(), dataPtr); + if (!xferOp.isMaskedDim(0)) + return replaceTransferOpWithLoadOrStore(rewriter, typeConverter, loc, + xferOp, operands, vectorDataPtr); + // 2. Create a vector with linear indices [ 0 .. vector_length - 1 ]. unsigned vecWidth = vecTy.getVectorNumElements(); VectorType vectorCmpType = VectorType::get(vecWidth, i64Type); @@ -910,8 +935,8 @@ class VectorTransferConversion : public ConvertToLLVMPattern { mask); // 5. Rewrite as a masked read / write. - return replaceTransferOp(rewriter, typeConverter, loc, op, - operands, vectorDataPtr, mask); + return replaceTransferOpWithMasked(rewriter, typeConverter, loc, xferOp, + operands, vectorDataPtr, mask); } }; diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp index c9cd605afb84cd..03b78491fa1222 100644 --- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp +++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp @@ -157,25 +157,34 @@ void NDTransferOpHelper::emitInBounds( ValueRange majorIvs, ValueRange majorOffsets, MemRefBoundsCapture &memrefBounds, LambdaThen thenBlockBuilder, LambdaElse elseBlockBuilder) { - Value inBounds = std_constant_int(/*value=*/1, /*width=*/1); + Value inBounds; SmallVector majorIvsPlusOffsets; majorIvsPlusOffsets.reserve(majorIvs.size()); + unsigned idx = 0; for (auto it : llvm::zip(majorIvs, majorOffsets, memrefBounds.getUbs())) { Value iv = std::get<0>(it), off = std::get<1>(it), ub = std::get<2>(it); using namespace mlir::edsc::op; majorIvsPlusOffsets.push_back(iv + off); - Value inBounds2 = majorIvsPlusOffsets.back() < ub; - inBounds = inBounds && inBounds2; + if (xferOp.isMaskedDim(leadingRank + idx)) { + Value inBounds2 = majorIvsPlusOffsets.back() < ub; + inBounds = (inBounds) ? (inBounds && inBounds2) : inBounds2; + } + ++idx; } - auto ifOp = ScopedContext::getBuilderRef().create( - ScopedContext::getLocation(), TypeRange{}, inBounds, - /*withElseRegion=*/std::is_same()); - BlockBuilder(&ifOp.thenRegion().front(), - Append())([&] { thenBlockBuilder(majorIvsPlusOffsets); }); - if (std::is_same()) - BlockBuilder(&ifOp.elseRegion().front(), - Append())([&] { elseBlockBuilder(majorIvsPlusOffsets); }); + if (inBounds) { + auto ifOp = ScopedContext::getBuilderRef().create( + ScopedContext::getLocation(), TypeRange{}, inBounds, + /*withElseRegion=*/std::is_same()); + BlockBuilder(&ifOp.thenRegion().front(), + Append())([&] { thenBlockBuilder(majorIvsPlusOffsets); }); + if (std::is_same()) + BlockBuilder(&ifOp.elseRegion().front(), + Append())([&] { elseBlockBuilder(majorIvsPlusOffsets); }); + } else { + // Just build the body of the then block right here. + thenBlockBuilder(majorIvsPlusOffsets); + } } template <> @@ -187,18 +196,23 @@ LogicalResult NDTransferOpHelper::doReplace() { MemRefBoundsCapture &memrefBounds) { // If in-bounds, index into memref and lower to 1-D transfer read. auto thenBlockBuilder = [&](ValueRange majorIvsPlusOffsets) { - auto map = AffineMap::getMinorIdentityMap( - xferOp.getMemRefType().getRank(), minorRank, xferOp.getContext()); - // Lower to 1-D vector_transfer_read and let recursion handle it. 
- Value memref = xferOp.memref(); SmallVector indexing; indexing.reserve(leadingRank + majorRank + minorRank); indexing.append(leadingOffsets.begin(), leadingOffsets.end()); indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); indexing.append(minorOffsets.begin(), minorOffsets.end()); - auto loaded1D = - vector_transfer_read(minorVectorType, memref, indexing, - AffineMapAttr::get(map), xferOp.padding()); + + Value memref = xferOp.memref(); + auto map = TransferReadOp::getTransferMinorIdentityMap( + xferOp.getMemRefType(), minorVectorType); + ArrayAttr masked; + if (xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) { + OpBuilder &b = ScopedContext::getBuilderRef(); + masked = b.getBoolArrayAttr({true}); + } + auto loaded1D = vector_transfer_read(minorVectorType, memref, indexing, + AffineMapAttr::get(map), + xferOp.padding(), masked); // Store the 1-D vector. std_store(loaded1D, alloc, majorIvs); }; @@ -229,17 +243,22 @@ LogicalResult NDTransferOpHelper::doReplace() { ValueRange majorOffsets, ValueRange minorOffsets, MemRefBoundsCapture &memrefBounds) { auto thenBlockBuilder = [&](ValueRange majorIvsPlusOffsets) { - // Lower to 1-D vector_transfer_write and let recursion handle it. - Value loaded1D = std_load(alloc, majorIvs); - auto map = AffineMap::getMinorIdentityMap( - xferOp.getMemRefType().getRank(), minorRank, xferOp.getContext()); SmallVector indexing; indexing.reserve(leadingRank + majorRank + minorRank); indexing.append(leadingOffsets.begin(), leadingOffsets.end()); indexing.append(majorIvsPlusOffsets.begin(), majorIvsPlusOffsets.end()); indexing.append(minorOffsets.begin(), minorOffsets.end()); + // Lower to 1-D vector_transfer_write and let recursion handle it. + Value loaded1D = std_load(alloc, majorIvs); + auto map = TransferWriteOp::getTransferMinorIdentityMap( + xferOp.getMemRefType(), minorVectorType); + ArrayAttr masked; + if (xferOp.isMaskedDim(xferOp.getVectorType().getRank() - 1)) { + OpBuilder &b = ScopedContext::getBuilderRef(); + masked = b.getBoolArrayAttr({true}); + } vector_transfer_write(loaded1D, xferOp.memref(), indexing, - AffineMapAttr::get(map)); + AffineMapAttr::get(map), masked); }; // Don't write anything when out of bounds. 
auto elseBlockBuilder = [&](ValueRange majorIvsPlusOffsets) {}; diff --git a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp index fe669624f6cb62..f5b98f9bf0653d 100644 --- a/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp +++ b/mlir/lib/Dialect/Affine/Transforms/SuperVectorize.cpp @@ -793,10 +793,7 @@ static LogicalResult vectorizeRootOrTerminal(Value iv, LLVM_DEBUG(permutationMap.print(dbgs())); auto transfer = b.create( opInst->getLoc(), vectorType, memoryOp.getMemRef(), indices, - AffineMapAttr::get(permutationMap), - // TODO(b/144455320) add a proper padding value, not just 0.0 : f32 - state->folder->create(b, opInst->getLoc(), - APFloat(0.0f), b.getF32Type())); + permutationMap); state->registerReplacement(opInst, transfer.getOperation()); } else { state->registerTerminal(opInst); @@ -1020,8 +1017,7 @@ static Operation *vectorizeOneOperation(Operation *opInst, LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ permutationMap: "); LLVM_DEBUG(permutationMap.print(dbgs())); auto transfer = b.create( - opInst->getLoc(), vectorValue, memRef, indices, - AffineMapAttr::get(permutationMap)); + opInst->getLoc(), vectorValue, memRef, indices, permutationMap); auto *res = transfer.getOperation(); LLVM_DEBUG(dbgs() << "\n[early-vect]+++++ vectorized store: " << *res); // "Terminals" (i.e. AffineStoreOps) are erased on the spot. diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index 5cbaa2f426dbcf..44de2a1021c274 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -56,6 +56,8 @@ struct LinalgOpInstancePromotionOptions { const LinalgPromotionOptions &options); /// SubViews to promote. SetVector subViews; + /// True if the full view should be used for the promoted buffer. + DenseMap useFullTileBuffers; /// Allow the use of dynamicaly-sized buffers. bool dynamicBuffers; /// Alignment of promoted buffer. 
@@ -65,20 +67,28 @@ struct LinalgOpInstancePromotionOptions { LinalgOpInstancePromotionOptions::LinalgOpInstancePromotionOptions( LinalgOp linalgOp, const LinalgPromotionOptions &options) - : subViews(), dynamicBuffers(options.dynamicBuffers), + : subViews(), useFullTileBuffers(), dynamicBuffers(options.dynamicBuffers), alignment(options.alignment) { + unsigned nBuffers = linalgOp.getNumInputsAndOutputBuffers(); + auto vUseFullTileBuffers = + options.useFullTileBuffers.getValueOr(llvm::SmallBitVector()); + vUseFullTileBuffers.resize(nBuffers, options.useFullTileBuffersDefault); + if (options.operandsToPromote.hasValue()) { - for (unsigned idx : options.operandsToPromote.getValue()) { - auto *op = linalgOp.getBuffer(idx).getDefiningOp(); - if (auto sv = dyn_cast_or_null(op)) + for (auto it : llvm::enumerate(options.operandsToPromote.getValue())) { + auto *op = linalgOp.getBuffer(it.value()).getDefiningOp(); + if (auto sv = dyn_cast_or_null(op)) { subViews.insert(sv); + useFullTileBuffers[sv] = vUseFullTileBuffers[it.index()]; + } } } else { - unsigned nBuffers = linalgOp.getNumInputsAndOutputBuffers(); for (unsigned idx = 0; idx < nBuffers; ++idx) { auto *op = linalgOp.getBuffer(idx).getDefiningOp(); - if (auto sv = dyn_cast_or_null(op)) + if (auto sv = dyn_cast_or_null(op)) { subViews.insert(sv); + useFullTileBuffers[sv] = vUseFullTileBuffers[idx]; + } } } } @@ -201,6 +211,9 @@ promoteSubViews(OpBuilder &b, Location loc, auto info = promotionInfoMap.find(v); if (info == promotionInfoMap.end()) continue; + // Only fill the buffer if the full local view is used + if (!options.useFullTileBuffers[v]) + continue; Value fillVal; if (auto t = subView.getType().getElementType().dyn_cast()) fillVal = folded_std_constant(folder, FloatAttr::get(t, 0.0)); @@ -244,7 +257,10 @@ static void promoteSubViews(OpBuilder &b, LinalgOp op, unsigned promotedIdx = 0; for (auto view : op.getInputsAndOutputBuffers()) { if (options.subViews.count(view) != 0) { - opViews.push_back(promotedBufferAndViews[promotedIdx].fullLocalView); + if (options.useFullTileBuffers[view]) + opViews.push_back(promotedBufferAndViews[promotedIdx].fullLocalView); + else + opViews.push_back(promotedBufferAndViews[promotedIdx].partialLocalView); writebackViews.emplace_back(std::make_pair( view, promotedBufferAndViews[promotedIdx].partialLocalView)); promotedIdx++; diff --git a/mlir/lib/Dialect/Vector/VectorOps.cpp b/mlir/lib/Dialect/Vector/VectorOps.cpp index 8385b253e9fb60..f347a564f446b8 100644 --- a/mlir/lib/Dialect/Vector/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/VectorOps.cpp @@ -1202,6 +1202,23 @@ void ExtractStridedSliceOp::getCanonicalizationPatterns( //===----------------------------------------------------------------------===// // TransferReadOp //===----------------------------------------------------------------------===// + +/// Build the default minor identity map suitable for a vector transfer. This +/// also handles the case memref<... x vector<...>> -> vector<...> in which the +/// rank of the identity map must take the vector element type into account. 
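Before the implementation that follows, a small hedged example of the vector-element-type case this comment mentions; the shapes mirror the style of the tests but the function and value names are made up. For memref<?x?xvector<4x3xf32>> the element vector contributes rank 2, so transferring a vector<1x1x4x3xf32> yields the rank-2 minor identity affine_map<(d0, d1) -> (d0, d1)>, which the printer elides.

func @vector_element_minor_identity(%m : memref<?x?xvector<4x3xf32>>,
                                    %i : index) {
  %f0 = constant 0.0 : f32
  %pad = splat %f0 : vector<4x3xf32>
  // memref rank 2, element vector rank 2, result vector rank 4:
  // 4 - 2 = 2 map results, i.e. the rank-2 minor identity (d0, d1) -> (d0, d1).
  %v = vector.transfer_read %m[%i, %i], %pad
      : memref<?x?xvector<4x3xf32>>, vector<1x1x4x3xf32>
  return
}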
+AffineMap +mlir::vector::impl::getTransferMinorIdentityMap(MemRefType memRefType, + VectorType vectorType) { + int64_t elementVectorRank = 0; + VectorType elementVectorType = + memRefType.getElementType().dyn_cast(); + if (elementVectorType) + elementVectorRank += elementVectorType.getRank(); + return AffineMap::getMinorIdentityMap( + memRefType.getRank(), vectorType.getRank() - elementVectorRank, + memRefType.getContext()); +} + template static LogicalResult verifyPermutationMap(AffineMap permutationMap, EmitFun emitOpError) { @@ -1233,7 +1250,8 @@ static LogicalResult verifyPermutationMap(AffineMap permutationMap, static LogicalResult verifyTransferOp(Operation *op, MemRefType memrefType, VectorType vectorType, - AffineMap permutationMap) { + AffineMap permutationMap, + ArrayAttr optionalMasked) { auto memrefElementType = memrefType.getElementType(); if (auto memrefVectorElementType = memrefElementType.dyn_cast()) { // Memref has vector element type. @@ -1281,28 +1299,68 @@ static LogicalResult verifyTransferOp(Operation *op, MemRefType memrefType, if (permutationMap.getNumInputs() != memrefType.getRank()) return op->emitOpError("requires a permutation_map with input dims of the " "same rank as the memref type"); + + if (optionalMasked) { + if (permutationMap.getNumResults() != + static_cast(optionalMasked.size())) + return op->emitOpError("expects the optional masked attr of same rank as " + "permutation_map results: ") + << AffineMapAttr::get(permutationMap); + } + return success(); } -/// Builder that sets permutation map and padding to 'getMinorIdentityMap' and -/// zero, respectively, by default. +/// Builder that sets padding to zero. void TransferReadOp::build(OpBuilder &builder, OperationState &result, - VectorType vector, Value memref, - ValueRange indices) { - auto permMap = AffineMap::getMinorIdentityMap( - memref.getType().cast().getRank(), vector.getRank(), - builder.getContext()); + VectorType vector, Value memref, ValueRange indices, + AffineMap permutationMap, + ArrayRef maybeMasked) { Type elemType = vector.cast().getElementType(); Value padding = builder.create(result.location, elemType, builder.getZeroAttr(elemType)); + if (maybeMasked.empty()) + return build(builder, result, vector, memref, indices, permutationMap, + padding, ArrayAttr()); + ArrayAttr maskedArrayAttr = builder.getBoolArrayAttr(maybeMasked); + build(builder, result, vector, memref, indices, permutationMap, padding, + maskedArrayAttr); +} - build(builder, result, vector, memref, indices, permMap, padding); +/// Builder that sets permutation map (resp. padding) to 'getMinorIdentityMap' +/// (resp. zero). 
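To make the masked-rank check above concrete, here is a hedged sketch (invented names and shapes) of an op that satisfies it; shrinking `masked` to a single entry on this op would trip the new "expects the optional masked attr of same rank as permutation_map results" diagnostic.

func @masked_rank_matches_map(%A : memref<?x?xf32>, %i : index,
                              %v : vector<4x8xf32>) {
  // The inferred map (d0, d1) -> (d0, d1) has two results, so `masked` needs
  // two entries; only the outer dimension still requires masking here.
  vector.transfer_write %v, %A[%i, %i] {masked = [true, false]}
      : vector<4x8xf32>, memref<?x?xf32>
  return
}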
+void TransferReadOp::build(OpBuilder &builder, OperationState &result, + VectorType vectorType, Value memref, + ValueRange indices, ArrayRef maybeMasked) { + auto permMap = getTransferMinorIdentityMap( + memref.getType().cast(), vectorType); + build(builder, result, vectorType, memref, indices, permMap, maybeMasked); +} + +template +void printTransferAttrs(OpAsmPrinter &p, TransferOp op) { + SmallVector elidedAttrs; + if (op.permutation_map() == TransferOp::getTransferMinorIdentityMap( + op.getMemRefType(), op.getVectorType())) + elidedAttrs.push_back(op.getPermutationMapAttrName()); + bool elideMasked = true; + if (auto maybeMasked = op.masked()) { + for (auto attr : *maybeMasked) { + if (!attr.template cast().getValue()) { + elideMasked = false; + break; + } + } + } + if (elideMasked) + elidedAttrs.push_back(op.getMaskedAttrName()); + p.printOptionalAttrDict(op.getAttrs(), elidedAttrs); } static void print(OpAsmPrinter &p, TransferReadOp op) { p << op.getOperationName() << " " << op.memref() << "[" << op.indices() - << "], " << op.padding() << " "; - p.printOptionalAttrDict(op.getAttrs()); + << "], " << op.padding(); + printTransferAttrs(p, op); p << " : " << op.getMemRefType() << ", " << op.getVectorType(); } @@ -1313,7 +1371,7 @@ static ParseResult parseTransferReadOp(OpAsmParser &parser, SmallVector indexInfo; OpAsmParser::OperandType paddingInfo; SmallVector types; - // Parsing with support for optional paddingValue. + // Parsing with support for paddingValue. if (parser.parseOperand(memrefInfo) || parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || parser.parseComma() || parser.parseOperand(paddingInfo) || @@ -1321,12 +1379,21 @@ static ParseResult parseTransferReadOp(OpAsmParser &parser, parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) return failure(); if (types.size() != 2) - return parser.emitError(typesLoc, "two types required"); + return parser.emitError(typesLoc, "requires two types"); auto indexType = parser.getBuilder().getIndexType(); MemRefType memRefType = types[0].dyn_cast(); if (!memRefType) - return parser.emitError(typesLoc, "memref type required"), failure(); - Type vectorType = types[1]; + return parser.emitError(typesLoc, "requires memref type"); + VectorType vectorType = types[1].dyn_cast(); + if (!vectorType) + return parser.emitError(typesLoc, "requires vector type"); + auto permutationAttrName = TransferReadOp::getPermutationMapAttrName(); + auto attr = result.attributes.get(permutationAttrName); + if (!attr) { + auto permMap = + TransferReadOp::getTransferMinorIdentityMap(memRefType, vectorType); + result.attributes.set(permutationAttrName, AffineMapAttr::get(permMap)); + } return failure( parser.resolveOperand(memrefInfo, memRefType, result.operands) || parser.resolveOperands(indexInfo, indexType, result.operands) || @@ -1347,7 +1414,8 @@ static LogicalResult verify(TransferReadOp op) { return op.emitOpError("requires ") << memrefType.getRank() << " indices"; if (failed(verifyTransferOp(op.getOperation(), memrefType, vectorType, - permutationMap))) + permutationMap, + op.masked() ? *op.masked() : ArrayAttr()))) return failure(); if (auto memrefVectorElementType = memrefElementType.dyn_cast()) { @@ -1376,15 +1444,67 @@ static LogicalResult verify(TransferReadOp op) { // TransferWriteOp //===----------------------------------------------------------------------===// -/// Builder that sets permutation map and padding to 'getMinorIdentityMap' by -/// default. 
+/// Builder that sets permutation map to 'getMinorIdentityMap'. void TransferWriteOp::build(OpBuilder &builder, OperationState &result, - Value vector, Value memref, ValueRange indices) { + Value vector, Value memref, ValueRange indices, + ArrayRef maybeMasked) { auto vectorType = vector.getType().cast(); - auto permMap = AffineMap::getMinorIdentityMap( - memref.getType().cast().getRank(), vectorType.getRank(), - builder.getContext()); - build(builder, result, vector, memref, indices, permMap); + auto permMap = getTransferMinorIdentityMap( + memref.getType().cast(), vectorType); + if (maybeMasked.empty()) + return build(builder, result, vector, memref, indices, permMap, + ArrayAttr()); + ArrayAttr maskedArrayAttr = builder.getBoolArrayAttr(maybeMasked); + build(builder, result, vector, memref, indices, permMap, maskedArrayAttr); +} + +/// Builder that sets permutation map to 'getMinorIdentityMap'. +void TransferWriteOp::build(OpBuilder &builder, OperationState &result, + Value vector, Value memref, ValueRange indices, + AffineMap permutationMap) { + build(builder, result, vector, memref, indices, + /*maybeMasked=*/ArrayRef{}); +} + +static ParseResult parseTransferWriteOp(OpAsmParser &parser, + OperationState &result) { + llvm::SMLoc typesLoc; + OpAsmParser::OperandType vectorInfo, memrefInfo; + SmallVector indexInfo; + SmallVector types; + if (parser.parseOperand(vectorInfo) || parser.parseComma() || + parser.parseOperand(memrefInfo) || + parser.parseOperandList(indexInfo, OpAsmParser::Delimiter::Square) || + parser.parseOptionalAttrDict(result.attributes) || + parser.getCurrentLocation(&typesLoc) || parser.parseColonTypeList(types)) + return failure(); + if (types.size() != 2) + return parser.emitError(typesLoc, "requires two types"); + auto indexType = parser.getBuilder().getIndexType(); + VectorType vectorType = types[0].dyn_cast(); + if (!vectorType) + return parser.emitError(typesLoc, "requires vector type"); + MemRefType memRefType = types[1].dyn_cast(); + if (!memRefType) + return parser.emitError(typesLoc, "requires memref type"); + auto permutationAttrName = TransferWriteOp::getPermutationMapAttrName(); + auto attr = result.attributes.get(permutationAttrName); + if (!attr) { + auto permMap = + TransferWriteOp::getTransferMinorIdentityMap(memRefType, vectorType); + result.attributes.set(permutationAttrName, AffineMapAttr::get(permMap)); + } + return failure( + parser.resolveOperand(vectorInfo, vectorType, result.operands) || + parser.resolveOperand(memrefInfo, memRefType, result.operands) || + parser.resolveOperands(indexInfo, indexType, result.operands)); +} + +static void print(OpAsmPrinter &p, TransferWriteOp op) { + p << op.getOperationName() << " " << op.vector() << ", " << op.memref() << "[" + << op.indices() << "]"; + printTransferAttrs(p, op); + p << " : " << op.getVectorType() << ", " << op.getMemRefType(); } static LogicalResult verify(TransferWriteOp op) { @@ -1397,7 +1517,8 @@ static LogicalResult verify(TransferWriteOp op) { return op.emitOpError("requires ") << memrefType.getRank() << " indices"; if (failed(verifyTransferOp(op.getOperation(), memrefType, vectorType, - permutationMap))) + permutationMap, + op.masked() ? 
*op.masked() : ArrayAttr()))) return failure(); return verifyPermutationMap(permutationMap, diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp index af7e5ad86af854..cf1bdede90271e 100644 --- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp @@ -564,9 +564,12 @@ struct SplitTransferReadOp : public OpRewritePattern { // Get VectorType for slice 'i'. auto sliceVectorType = resultTupleType.getType(index); // Create split TransferReadOp for 'sliceUser'. + // `masked` attribute propagates conservatively: if the coarse op didn't + // need masking, the fine op doesn't either. vectorTupleValues[index] = rewriter.create( loc, sliceVectorType, xferReadOp.memref(), sliceIndices, - xferReadOp.permutation_map(), xferReadOp.padding()); + xferReadOp.permutation_map(), xferReadOp.padding(), + xferReadOp.masked() ? *xferReadOp.masked() : ArrayAttr()); }; generateTransferOpSlices(memrefElementType, sourceVectorType, resultTupleType, sizes, strides, indices, rewriter, @@ -620,9 +623,12 @@ struct SplitTransferWriteOp : public OpRewritePattern { xferWriteOp.indices().end()); auto createSlice = [&](unsigned index, ArrayRef sliceIndices) { // Create split TransferWriteOp for source vector 'tupleOp.operand[i]'. + // `masked` attribute propagates conservatively: if the coarse op didn't + // need masking, the fine op doesn't either. rewriter.create( loc, tupleOp.getOperand(index), xferWriteOp.memref(), sliceIndices, - xferWriteOp.permutation_map()); + xferWriteOp.permutation_map(), + xferWriteOp.masked() ? *xferWriteOp.masked() : ArrayAttr()); }; generateTransferOpSlices(memrefElementType, resultVectorType, sourceTupleType, sizes, strides, indices, rewriter, diff --git a/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir b/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir index f9a78aa495a5c3..7fba0996d8f560 100644 --- a/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir +++ b/mlir/test/Conversion/AffineToStandard/lower-affine-to-vector.mlir @@ -1,6 +1,5 @@ // RUN: mlir-opt -lower-affine --split-input-file %s | FileCheck %s -// CHECK: #[[perm_map:.*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @affine_vector_load func @affine_vector_load(%arg0 : index) { %0 = alloc() : memref<100xf32> @@ -12,13 +11,12 @@ func @affine_vector_load(%arg0 : index) { // CHECK-NEXT: %[[c7:.*]] = constant 7 : index // CHECK-NEXT: %[[b:.*]] = addi %[[a]], %[[c7]] : index // CHECK-NEXT: %[[pad:.*]] = constant 0.0 -// CHECK-NEXT: vector.transfer_read %[[buf]][%[[b]]], %[[pad]] {permutation_map = #[[perm_map]]} : memref<100xf32>, vector<8xf32> +// CHECK-NEXT: vector.transfer_read %[[buf]][%[[b]]], %[[pad]] : memref<100xf32>, vector<8xf32> return } // ----- -// CHECK: #[[perm_map:.*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @affine_vector_store func @affine_vector_store(%arg0 : index) { %0 = alloc() : memref<100xf32> @@ -33,13 +31,12 @@ func @affine_vector_store(%arg0 : index) { // CHECK-NEXT: %[[b:.*]] = addi %{{.*}}, %[[a]] : index // CHECK-NEXT: %[[c7:.*]] = constant 7 : index // CHECK-NEXT: %[[c:.*]] = addi %[[b]], %[[c7]] : index -// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[c]]] {permutation_map = #[[perm_map]]} : vector<4xf32>, memref<100xf32> +// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[c]]] : vector<4xf32>, memref<100xf32> return } // ----- -// CHECK: #[[perm_map:.*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @affine_vector_load func 
@affine_vector_load(%arg0 : index) { %0 = alloc() : memref<100xf32> @@ -51,13 +48,12 @@ func @affine_vector_load(%arg0 : index) { // CHECK-NEXT: %[[c7:.*]] = constant 7 : index // CHECK-NEXT: %[[b:.*]] = addi %[[a]], %[[c7]] : index // CHECK-NEXT: %[[pad:.*]] = constant 0.0 -// CHECK-NEXT: vector.transfer_read %[[buf]][%[[b]]], %[[pad]] {permutation_map = #[[perm_map]]} : memref<100xf32>, vector<8xf32> +// CHECK-NEXT: vector.transfer_read %[[buf]][%[[b]]], %[[pad]] : memref<100xf32>, vector<8xf32> return } // ----- -// CHECK: #[[perm_map:.*]] = affine_map<(d0) -> (d0)> // CHECK-LABEL: func @affine_vector_store func @affine_vector_store(%arg0 : index) { %0 = alloc() : memref<100xf32> @@ -72,13 +68,12 @@ func @affine_vector_store(%arg0 : index) { // CHECK-NEXT: %[[b:.*]] = addi %{{.*}}, %[[a]] : index // CHECK-NEXT: %[[c7:.*]] = constant 7 : index // CHECK-NEXT: %[[c:.*]] = addi %[[b]], %[[c7]] : index -// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[c]]] {permutation_map = #[[perm_map]]} : vector<4xf32>, memref<100xf32> +// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[c]]] : vector<4xf32>, memref<100xf32> return } // ----- -// CHECK: #[[perm_map:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @vector_load_2d func @vector_load_2d() { %0 = alloc() : memref<100x100xf32> @@ -89,7 +84,7 @@ func @vector_load_2d() { // CHECK: scf.for %[[i0:.*]] = // CHECK: scf.for %[[i1:.*]] = // CHECK-NEXT: %[[pad:.*]] = constant 0.0 -// CHECK-NEXT: vector.transfer_read %[[buf]][%[[i0]], %[[i1]]], %[[pad]] {permutation_map = #[[perm_map]]} : memref<100x100xf32>, vector<2x8xf32> +// CHECK-NEXT: vector.transfer_read %[[buf]][%[[i0]], %[[i1]]], %[[pad]] : memref<100x100xf32>, vector<2x8xf32> } } return @@ -97,7 +92,6 @@ func @vector_load_2d() { // ----- -// CHECK: #[[perm_map:.*]] = affine_map<(d0, d1) -> (d0, d1)> // CHECK-LABEL: func @vector_store_2d func @vector_store_2d() { %0 = alloc() : memref<100x100xf32> @@ -109,7 +103,7 @@ func @vector_store_2d() { // CHECK: %[[val:.*]] = constant dense // CHECK: scf.for %[[i0:.*]] = // CHECK: scf.for %[[i1:.*]] = -// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[i0]], %[[i1]]] {permutation_map = #[[perm_map]]} : vector<2x8xf32>, memref<100x100xf32> +// CHECK-NEXT: vector.transfer_write %[[val]], %[[buf]][%[[i0]], %[[i1]]] : vector<2x8xf32>, memref<100x100xf32> } } return diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 1c23072b61092a..26e3e9dbe2b1e5 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -918,6 +918,24 @@ func @transfer_read_1d_non_zero_addrspace(%A : memref, %base: index) - // CHECK: %[[vecPtr_b:.*]] = llvm.addrspacecast %[[gep_b]] : // CHECK-SAME: !llvm<"float addrspace(3)*"> to !llvm<"<17 x float>*"> +func @transfer_read_1d_not_masked(%A : memref, %base: index) -> vector<17xf32> { + %f7 = constant 7.0: f32 + %f = vector.transfer_read %A[%base], %f7 {masked = [false]} : + memref, vector<17xf32> + return %f: vector<17xf32> +} +// CHECK-LABEL: func @transfer_read_1d_not_masked +// CHECK-SAME: %[[BASE:[a-zA-Z0-9]*]]: !llvm.i64) -> !llvm<"<17 x float>"> +// +// 1. Bitcast to vector form. +// CHECK: %[[gep:.*]] = llvm.getelementptr {{.*}} : +// CHECK-SAME: (!llvm<"float*">, !llvm.i64) -> !llvm<"float*"> +// CHECK: %[[vecPtr:.*]] = llvm.bitcast %[[gep]] : +// CHECK-SAME: !llvm<"float*"> to !llvm<"<17 x float>*"> +// +// 2. Rewrite as a load. 
+// CHECK: %[[loaded:.*]] = llvm.load %[[vecPtr]] : !llvm<"<17 x float>*"> + func @genbool_1d() -> vector<8xi1> { %0 = vector.constant_mask [4] : vector<8xi1> return %0 : vector<8xi1> diff --git a/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir b/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir index 491196c91efb0f..c0bc5542e21d2a 100644 --- a/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir +++ b/mlir/test/Conversion/VectorToLoops/vector-to-loops.mlir @@ -220,16 +220,14 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<17 // CHECK: %[[cst:.*]] = constant 7.000000e+00 : f32 %f7 = constant 7.0: f32 - // CHECK-DAG: %[[cond0:.*]] = constant 1 : i1 // CHECK-DAG: %[[splat:.*]] = constant dense<7.000000e+00> : vector<15xf32> // CHECK-DAG: %[[alloc:.*]] = alloc() : memref<17xvector<15xf32>> // CHECK-DAG: %[[dim:.*]] = dim %[[A]], 0 : memref // CHECK: affine.for %[[I:.*]] = 0 to 17 { // CHECK: %[[add:.*]] = affine.apply #[[MAP0]](%[[I]])[%[[base]]] - // CHECK: %[[cmp:.*]] = cmpi "slt", %[[add]], %[[dim]] : index - // CHECK: %[[cond1:.*]] = and %[[cmp]], %[[cond0]] : i1 + // CHECK: %[[cond1:.*]] = cmpi "slt", %[[add]], %[[dim]] : index // CHECK: scf.if %[[cond1]] { - // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %[[cst]] {permutation_map = #[[MAP1]]} : memref, vector<15xf32> + // CHECK: %[[vec_1d:.*]] = vector.transfer_read %[[A]][%[[add]], %[[base]]], %[[cst]] : memref, vector<15xf32> // CHECK: store %[[vec_1d]], %[[alloc]][%[[I]]] : memref<17xvector<15xf32>> // CHECK: } else { // CHECK: store %[[splat]], %[[alloc]][%[[I]]] : memref<17xvector<15xf32>> @@ -253,7 +251,6 @@ func @transfer_read_progressive(%A : memref, %base: index) -> vector<17 // CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index, // CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<17x15xf32> func @transfer_write_progressive(%A : memref, %base: index, %vec: vector<17x15xf32>) { - // CHECK: %[[cond0:.*]] = constant 1 : i1 // CHECK: %[[alloc:.*]] = alloc() : memref<17xvector<15xf32>> // CHECK: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<17xvector<15xf32>> to memref> // CHECK: store %[[vec]], %[[vmemref]][] : memref> @@ -261,13 +258,35 @@ func @transfer_write_progressive(%A : memref, %base: index, %vec: vecto // CHECK: affine.for %[[I:.*]] = 0 to 17 { // CHECK: %[[add:.*]] = affine.apply #[[MAP0]](%[[I]])[%[[base]]] // CHECK: %[[cmp:.*]] = cmpi "slt", %[[add]], %[[dim]] : index - // CHECK: %[[cond1:.*]] = and %[[cmp]], %[[cond0]] : i1 - // CHECK: scf.if %[[cond1]] { + // CHECK: scf.if %[[cmp]] { // CHECK: %[[vec_1d:.*]] = load %0[%[[I]]] : memref<17xvector<15xf32>> - // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] {permutation_map = #[[MAP1]]} : vector<15xf32>, memref + // CHECK: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] : vector<15xf32>, memref // CHECK: } vector.transfer_write %vec, %A[%base, %base] {permutation_map = affine_map<(d0, d1) -> (d0, d1)>} : vector<17x15xf32>, memref return } + +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0)[s0] -> (d0 + s0)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (d1)> + +// CHECK-LABEL: transfer_write_progressive_not_masked( +// CHECK-SAME: %[[A:[a-zA-Z0-9]+]]: memref, +// CHECK-SAME: %[[base:[a-zA-Z0-9]+]]: index, +// CHECK-SAME: %[[vec:[a-zA-Z0-9]+]]: vector<17x15xf32> +func @transfer_write_progressive_not_masked(%A : memref, %base: index, %vec: vector<17x15xf32>) { + // CHECK-NOT: scf.if + // CHECK-NEXT: %[[alloc:.*]] = alloc() : memref<17xvector<15xf32>> + // 
CHECK-NEXT: %[[vmemref:.*]] = vector.type_cast %[[alloc]] : memref<17xvector<15xf32>> to memref> + // CHECK-NEXT: store %[[vec]], %[[vmemref]][] : memref> + // CHECK-NEXT: affine.for %[[I:.*]] = 0 to 17 { + // CHECK-NEXT: %[[add:.*]] = affine.apply #[[MAP0]](%[[I]])[%[[base]]] + // CHECK-NEXT: %[[vec_1d:.*]] = load %0[%[[I]]] : memref<17xvector<15xf32>> + // CHECK-NEXT: vector.transfer_write %[[vec_1d]], %[[A]][%[[add]], %[[base]]] : vector<15xf32>, memref + vector.transfer_write %vec, %A[%base, %base] {masked = [false, false]} : + vector<17x15xf32>, memref + return +} diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir index b577e229ba7639..10bf5009d5f630 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_1d.mlir @@ -2,7 +2,6 @@ // Permutation maps used in vectorization. // CHECK: #[[map_proj_d0d1_0:map[0-9]+]] = affine_map<(d0, d1) -> (0)> -// CHECK: #[[map_proj_d0d1_d1:map[0-9]+]] = affine_map<(d0, d1) -> (d1)> #map0 = affine_map<(d0) -> (d0)> #mapadd1 = affine_map<(d0) -> (d0 + 1)> @@ -13,7 +12,6 @@ // Maps introduced to vectorize fastest varying memory index. // CHECK-LABEL: func @vec1d_1 func @vec1d_1(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref @@ -22,10 +20,11 @@ func @vec1d_1(%A : memref, %B : memref) { %N = dim %A, 1 : memref %P = dim %B, 2 : memref %cst0 = constant 0 : index -// + // CHECK: for {{.*}} step 128 // CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) // CHECK-NEXT: %{{.*}} = affine.apply #map0(%[[C0]]) +// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i0 = 0 to %M { // vectorized due to scalar -> vector %a0 = affine.load %A[%cst0, %cst0] : memref @@ -35,7 +34,6 @@ func @vec1d_1(%A : memref, %B : memref) { // CHECK-LABEL: func @vec1d_2 func @vec1d_2(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref @@ -46,7 +44,8 @@ func @vec1d_2(%A : memref, %B : memref) { %cst0 = constant 0 : index // // CHECK:for [[IV3:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref, vector<128xf32> +// CHECK-NEXT: %[[CST:.*]] = constant 0.0{{.*}}: f32 +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %[[CST]] : memref, vector<128xf32> affine.for %i3 = 0 to %M { // vectorized %a3 = affine.load %A[%cst0, %i3] : memref } @@ -55,7 +54,6 @@ func @vec1d_2(%A : memref, %B : memref) { // CHECK-LABEL: func @vec1d_3 func @vec1d_3(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %arg0, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %arg0, 1 : memref @@ -69,7 +67,8 @@ func @vec1d_3(%A : memref, %B : memref) { // CHECK-NEXT: for [[IV9:%[arg0-9]*]] = 0 to [[ARG_N]] { // CHECK-NEXT: %[[APP9_0:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) // 
CHECK-NEXT: %[[APP9_1:[0-9]+]] = affine.apply {{.*}}([[IV9]], [[IV8]]) -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref, vector<128xf32> +// CHECK-NEXT: %[[CST:.*]] = constant 0.0{{.*}}: f32 +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%[[APP9_0]], %[[APP9_1]]], %[[CST]] : memref, vector<128xf32> affine.for %i8 = 0 to %M { // vectorized affine.for %i9 = 0 to %N { %a9 = affine.load %A[%i9, %i8 + %i9] : memref @@ -87,31 +86,31 @@ func @vector_add_2d(%M : index, %N : index) -> f32 { %f2 = constant 2.0 : f32 affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { - // CHECK: [[C1:%.*]] = constant dense<1.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write [[C1]], {{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : vector<128xf32>, memref + // CHECK: %[[C1:.*]] = constant dense<1.000000e+00> : vector<128xf32> + // CHECK: vector.transfer_write %[[C1]], {{.*}} : vector<128xf32>, memref // non-scoped %f1 affine.store %f1, %A[%i0, %i1] : memref } } affine.for %i2 = 0 to %M { affine.for %i3 = 0 to %N { - // CHECK: [[C3:%.*]] = constant dense<2.000000e+00> : vector<128xf32> - // CHECK: vector.transfer_write [[C3]], {{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : vector<128xf32>, memref + // CHECK: %[[C3:.*]] = constant dense<2.000000e+00> : vector<128xf32> + // CHECK: vector.transfer_write %[[C3]], {{.*}} : vector<128xf32>, memref // non-scoped %f2 affine.store %f2, %B[%i2, %i3] : memref } } affine.for %i4 = 0 to %M { affine.for %i5 = 0 to %N { - // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref, vector<128xf32> - // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref, vector<128xf32> - // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<128xf32> - // CHECK: [[SPLAT1:%.*]] = constant dense<1.000000e+00> : vector<128xf32> - // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<128xf32> - // CHECK: [[SPLAT2:%.*]] = constant dense<2.000000e+00> : vector<128xf32> - // CHECK: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<128xf32> - // CHECK: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<128xf32> - // CHECK: vector.transfer_write [[S8]], {{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : vector<128xf32>, memref + // CHECK: %[[A5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> + // CHECK: %[[B5:.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> + // CHECK: %[[S5:.*]] = addf %[[A5]], %[[B5]] : vector<128xf32> + // CHECK: %[[SPLAT1:.*]] = constant dense<1.000000e+00> : vector<128xf32> + // CHECK: %[[S6:.*]] = addf %[[S5]], %[[SPLAT1]] : vector<128xf32> + // CHECK: %[[SPLAT2:.*]] = constant dense<2.000000e+00> : vector<128xf32> + // CHECK: %[[S7:.*]] = addf %[[S5]], %[[SPLAT2]] : vector<128xf32> + // CHECK: %[[S8:.*]] = addf %[[S7]], %[[S6]] : vector<128xf32> + // CHECK: vector.transfer_write %[[S8]], {{.*}} : vector<128xf32>, memref %a5 = affine.load %A[%i4, %i5] : memref %b5 = affine.load %B[%i4, %i5] : memref %s5 = addf %a5, %b5 : f32 @@ -168,7 +167,6 @@ func @vec_rejected_2(%A : memref, %B : memref) { // CHECK-LABEL: func @vec_rejected_3 func @vec_rejected_3(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: [[C0:%[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim 
%{{.*}}, 1 : memref @@ -180,7 +178,8 @@ func @vec_rejected_3(%A : memref, %B : memref) { // // CHECK:for [[IV4:%[arg0-9]+]] = 0 to [[ARG_M]] step 128 { // CHECK-NEXT: for [[IV5:%[arg0-9]*]] = 0 to [[ARG_N]] { -// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_d1]]} : memref, vector<128xf32> +// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 +// CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{[a-zA-Z0-9_]*}} : memref, vector<128xf32> affine.for %i4 = 0 to %M { // vectorized affine.for %i5 = 0 to %N { // not vectorized, would vectorize with --test-fastest-varying=1 %a5 = affine.load %A[%i5, %i4] : memref @@ -277,7 +276,6 @@ func @vec_rejected_7(%A : memref, %B : memref) { // CHECK-LABEL: func @vec_rejected_8 func @vec_rejected_8(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref @@ -291,6 +289,7 @@ func @vec_rejected_8(%A : memref, %B : memref) { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 // CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) // CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %{{.*}} in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector @@ -302,7 +301,6 @@ func @vec_rejected_8(%A : memref, %B : memref) { // CHECK-LABEL: func @vec_rejected_9 func @vec_rejected_9(%A : memref, %B : memref) { -// CHECK-DAG: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-DAG: %[[C0:[a-z0-9_]+]] = constant 0 : index // CHECK-DAG: [[ARG_M:%[0-9]+]] = dim %{{.*}}, 0 : memref // CHECK-DAG: [[ARG_N:%[0-9]+]] = dim %{{.*}}, 1 : memref @@ -316,6 +314,7 @@ func @vec_rejected_9(%A : memref, %B : memref) { // CHECK: for [[IV18:%[a-zA-Z0-9]+]] = 0 to [[ARG_M]] step 128 // CHECK: %{{.*}} = affine.apply #map0(%{{.*}}) // CHECK-NEXT: %{{.*}} = affine.apply #map0(%{{.*}}) +// CHECK-NEXT: %{{.*}} = constant 0.0{{.*}}: f32 // CHECK-NEXT: {{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1_0]]} : memref, vector<128xf32> affine.for %i17 = 0 to %M { // not vectorized, the 1-D pattern that matched %i18 in DFS post-order prevents vectorizing %{{.*}} affine.for %i18 = 0 to %M { // vectorized due to scalar -> vector diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir index 884907024bb115..3352644da63d8b 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_2d.mlir @@ -54,7 +54,7 @@ func @vector_add_2d(%M : index, %N : index) -> f32 { affine.for %i0 = 0 to %M { affine.for %i1 = 0 to %N { // CHECK: [[C1:%.*]] = constant dense<1.000000e+00> : vector<32x256xf32> - // CHECK: vector.transfer_write [[C1]], {{.*}} {permutation_map = #[[map_id2]]} : vector<32x256xf32>, memref + // CHECK: vector.transfer_write [[C1]], {{.*}} : vector<32x256xf32>, memref // non-scoped %f1 affine.store %f1, %A[%i0, %i1] : memref } @@ -62,22 +62,22 @@ func @vector_add_2d(%M : index, %N : index) -> f32 { affine.for %i2 = 0 to %M { affine.for %i3 = 0 to 
%N { // CHECK: [[C3:%.*]] = constant dense<2.000000e+00> : vector<32x256xf32> - // CHECK: vector.transfer_write [[C3]], {{.*}} {permutation_map = #[[map_id2]]} : vector<32x256xf32>, memref + // CHECK: vector.transfer_write [[C3]], {{.*}} : vector<32x256xf32>, memref // non-scoped %f2 affine.store %f2, %B[%i2, %i3] : memref } } affine.for %i4 = 0 to %M { affine.for %i5 = 0 to %N { - // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_id2]]} : memref, vector<32x256xf32> - // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} {permutation_map = #[[map_id2]]} : memref, vector<32x256xf32> + // CHECK: [[A5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} : memref, vector<32x256xf32> + // CHECK: [[B5:%.*]] = vector.transfer_read %{{.*}}[{{.*}}], %{{.*}} : memref, vector<32x256xf32> // CHECK: [[S5:%.*]] = addf [[A5]], [[B5]] : vector<32x256xf32> // CHECK: [[SPLAT1:%.*]] = constant dense<1.000000e+00> : vector<32x256xf32> // CHECK: [[S6:%.*]] = addf [[S5]], [[SPLAT1]] : vector<32x256xf32> // CHECK: [[SPLAT2:%.*]] = constant dense<2.000000e+00> : vector<32x256xf32> // CHECK: [[S7:%.*]] = addf [[S5]], [[SPLAT2]] : vector<32x256xf32> // CHECK: [[S8:%.*]] = addf [[S7]], [[S6]] : vector<32x256xf32> - // CHECK: vector.transfer_write [[S8]], {{.*}} {permutation_map = #[[map_id2]]} : vector<32x256xf32>, memref + // CHECK: vector.transfer_write [[S8]], {{.*}} : vector<32x256xf32>, memref // %a5 = affine.load %A[%i4, %i5] : memref %b5 = affine.load %B[%i4, %i5] : memref @@ -110,7 +110,7 @@ func @vectorize_matmul(%arg0: memref, %arg1: memref, %arg2: me // VECT: {{.*}} #[[map_id1]](%[[M]]) step 4 { // VECT-NEXT: {{.*}} #[[map_id1]](%[[N]]) step 8 { // VECT: %[[VC0:.*]] = constant dense<0.000000e+00> : vector<4x8xf32> - // VECT-NEXT: vector.transfer_write %[[VC0]], %{{.*}}[%{{.*}}, %{{.*}}] {permutation_map = #[[map_id2]]} : vector<4x8xf32>, memref + // VECT-NEXT: vector.transfer_write %[[VC0]], %{{.*}}[%{{.*}}, %{{.*}}] : vector<4x8xf32>, memref affine.for %i0 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) { affine.for %i1 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) { %cst = constant 0.000000e+00 : f32 @@ -120,12 +120,12 @@ func @vectorize_matmul(%arg0: memref, %arg1: memref, %arg2: me // VECT: affine.for %[[I2:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[M]]) step 4 { // VECT-NEXT: affine.for %[[I3:.*]] = #[[map_id1]](%[[C0]]) to #[[map_id1]](%[[N]]) step 8 { // VECT-NEXT: affine.for %[[I4:.*]] = #map5(%[[C0]]) to #[[map_id1]](%[[K]]) { - // VECT-NEXT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> - // VECT-NEXT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> + // VECT: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[I4]], %[[I3]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_zerod1]]} : memref, vector<4x8xf32> + // VECT: %[[B:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I4]]], %{{.*}} {permutation_map = #[[map_proj_d0d1_d0zero]]} : memref, vector<4x8xf32> // VECT-NEXT: %[[C:.*]] = mulf %[[B]], %[[A]] : vector<4x8xf32> - // VECT-NEXT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} {permutation_map = #[[map_id2]]} : memref, vector<4x8xf32> + // VECT: %[[D:.*]] = vector.transfer_read %{{.*}}[%[[I2]], %[[I3]]], %{{.*}} : memref, vector<4x8xf32> // VECT-NEXT: %[[E:.*]] = addf %[[D]], %[[C]] : vector<4x8xf32> 
- // VECT-NEXT: vector.transfer_write %[[E]], %{{.*}}[%[[I2]], %[[I3]]] {permutation_map = #[[map_id2]]} : vector<4x8xf32>, memref + // VECT: vector.transfer_write %[[E]], %{{.*}}[%[[I2]], %[[I3]]] : vector<4x8xf32>, memref affine.for %i2 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%M) { affine.for %i3 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%N) { affine.for %i4 = affine_map<(d0) -> (d0)>(%c0) to affine_map<(d0) -> (d0)>(%K) { diff --git a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir index 2980ee30d90868..5b6517ea390e5d 100644 --- a/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir +++ b/mlir/test/Dialect/Affine/SuperVectorize/vectorize_3d.mlir @@ -12,7 +12,7 @@ func @vec3d(%A : memref) { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 32 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 64 { // CHECK: affine.for %{{.*}} = 0 to %{{.*}} step 256 { - // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} {permutation_map = #[[map_proj_d0d1d2_d0d1d2]]} : memref, vector<32x64x256xf32> + // CHECK: %{{.*}} = vector.transfer_read %{{.*}}[%{{.*}}, %{{.*}}, %{{.*}}], %{{.*}} : memref, vector<32x64x256xf32> affine.for %t0 = 0 to %0 { affine.for %t1 = 0 to %0 { affine.for %i0 = 0 to %0 { diff --git a/mlir/test/Dialect/Linalg/promote.mlir b/mlir/test/Dialect/Linalg/promote.mlir index 64534733846a13..27364b05f3bd90 100644 --- a/mlir/test/Dialect/Linalg/promote.mlir +++ b/mlir/test/Dialect/Linalg/promote.mlir @@ -56,14 +56,11 @@ func @matmul_f32(%A: memref, %M: index, %N: index, %K: index) { // DYNAMIC: std.view %{{.*}}[{{.*}}][{{.*}}] : memref to memref // CHECK: %[[partialC:.*]] = subview %[[fullC]]{{.*}} : memref to memref -// CHECK: linalg.fill(%[[fullA]], {{.*}}) : memref, f32 -// CHECK: linalg.fill(%[[fullB]], {{.*}}) : memref, f32 -// CHECK: linalg.fill(%[[fullC]], {{.*}}) : memref, f32 // CHECK: linalg.copy(%[[vA]], %[[partialA]]) : memref, memref // CHECK: linalg.copy(%[[vB]], %[[partialB]]) : memref, memref // CHECK: linalg.copy(%[[vC]], %[[partialC]]) : memref, memref // -// CHECK: linalg.matmul(%[[fullA]], %[[fullB]], %[[fullC]]) : memref, memref, memref +// CHECK: linalg.matmul(%[[partialA]], %[[partialB]], %[[partialC]]) : memref, memref, memref // // CHECK: linalg.copy(%[[partialC]], %[[vC]]) : memref, memref // @@ -121,14 +118,11 @@ func @matmul_f64(%A: memref, %M: index, %N: index, %K: index) { // DYNAMIC: std.view %{{.*}}[{{.*}}][{{.*}}] : memref to memref // CHECK: %[[partialC_f64:.*]] = subview %[[fullC_f64]][%{{.*}}, %{{.*}}] : memref to memref -// CHECK: linalg.fill(%[[fullA_f64]], {{.*}}) : memref, f64 -// CHECK: linalg.fill(%[[fullB_f64]], {{.*}}) : memref, f64 -// CHECK: linalg.fill(%[[fullC_f64]], {{.*}}) : memref, f64 // CHECK: linalg.copy(%[[vA_f64]], %[[partialA_f64]]) : memref, memref // CHECK: linalg.copy(%[[vB_f64]], %[[partialB_f64]]) : memref, memref // CHECK: linalg.copy(%[[vC_f64]], %[[partialC_f64]]) : memref, memref // -// CHECK: linalg.matmul(%[[fullA_f64]], %[[fullB_f64]], %[[fullC_f64]]) : memref, memref, memref +// CHECK: linalg.matmul(%[[partialA_f64]], %[[partialB_f64]], %[[partialC_f64]]) : memref, memref, memref // // CHECK: linalg.copy(%[[partialC_f64]], %[[vC_f64]]) : memref, memref // @@ -186,14 +180,11 @@ func @matmul_i32(%A: memref, %M: index, %N: index, %K: index) { // DYNAMIC: std.view %{{.*}}[{{.*}}][{{.*}}] : memref to memref // CHECK: %[[partialC_i32:.*]] = subview %[[fullC_i32]][%{{.*}}, %{{.*}}] : 
memref to memref -// CHECK: linalg.fill(%[[fullA_i32]], {{.*}}) : memref, i32 -// CHECK: linalg.fill(%[[fullB_i32]], {{.*}}) : memref, i32 -// CHECK: linalg.fill(%[[fullC_i32]], {{.*}}) : memref, i32 // CHECK: linalg.copy(%[[vA_i32]], %[[partialA_i32]]) : memref, memref // CHECK: linalg.copy(%[[vB_i32]], %[[partialB_i32]]) : memref, memref // CHECK: linalg.copy(%[[vC_i32]], %[[partialC_i32]]) : memref, memref // -// CHECK: linalg.matmul(%[[fullA_i32]], %[[fullB_i32]], %[[fullC_i32]]) : memref, memref, memref +// CHECK: linalg.matmul(%[[partialA_i32]], %[[partialB_i32]], %[[partialC_i32]]) : memref, memref, memref // // CHECK: linalg.copy(%[[partialC_i32]], %[[vC_i32]]) : memref, memref // diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index ab50c566f9b3a1..c18cf38edfc90f 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -238,17 +238,28 @@ func @outerproduct_operand_3_result_type_generic(%arg0: vector<4xf32>, %arg1: ve func @test_vector.transfer_read(%arg0: memref) { %c3 = constant 3 : index %cst = constant 3.0 : f32 - // expected-error@+1 {{two types required}} + // expected-error@+1 {{requires two types}} %0 = vector.transfer_read %arg0[%c3, %c3], %cst { permutation_map = affine_map<()->(0)> } : memref } // ----- -func @test_vector.transfer_read(%arg0: memref) { +func @test_vector.transfer_read(%arg0: vector<4x3xf32>) { %c3 = constant 3 : index - %cst = constant 3.0 : f32 - // expected-error@+1 {{requires 2 indices}} - %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst { permutation_map = affine_map<()->(0)> } : memref, vector<128xf32> + %f0 = constant 0.0 : f32 + %vf0 = splat %f0 : vector<4x3xf32> + // expected-error@+1 {{ requires memref type}} + %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 : vector<4x3xf32>, vector<1x1x2x3xf32> +} + +// ----- + +func @test_vector.transfer_read(%arg0: memref<4x3xf32>) { + %c3 = constant 3 : index + %f0 = constant 0.0 : f32 + %vf0 = splat %f0 : vector<4x3xf32> + // expected-error@+1 {{ requires vector type}} + %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 : memref<4x3xf32>, f32 } // ----- @@ -256,8 +267,8 @@ func @test_vector.transfer_read(%arg0: memref) { func @test_vector.transfer_read(%arg0: memref) { %c3 = constant 3 : index %cst = constant 3.0 : f32 - // expected-error@+1 {{requires attribute 'permutation_map'}} - %0 = vector.transfer_read %arg0[%c3, %c3], %cst {perm = affine_map<(d0)->(d0)>} : memref, vector<128xf32> + // expected-error@+1 {{requires 2 indices}} + %0 = vector.transfer_read %arg0[%c3, %c3, %c3], %cst { permutation_map = affine_map<()->(0)> } : memref, vector<128xf32> } // ----- @@ -337,11 +348,41 @@ func @test_vector.transfer_read(%arg0: memref>) { // ----- +func @test_vector.transfer_read(%arg0: memref>) { + %c3 = constant 3 : index + %f0 = constant 0.0 : f32 + %vf0 = splat %f0 : vector<2x3xf32> + // expected-error@+1 {{ expects the optional masked attr of same rank as permutation_map results: affine_map<(d0, d1) -> (d0, d1)>}} + %0 = vector.transfer_read %arg0[%c3, %c3], %vf0 {masked = [false], permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x2x3xf32> +} + +// ----- + func @test_vector.transfer_write(%arg0: memref) { %c3 = constant 3 : index - %cst = constant dense<3.0> : vector<128 x f32> - // expected-error@+1 {{expected 5 operand types but had 4}} - %0 = "vector.transfer_write"(%cst, %arg0, %c3, %c3, %c3) {permutation_map = affine_map<()->(0)>} : (vector<128xf32>, memref, index, index) -> () + %cst = 
constant 3.0 : f32 + // expected-error@+1 {{requires two types}} + vector.transfer_write %arg0, %arg0[%c3, %c3] : memref +} + +// ----- + +func @test_vector.transfer_write(%arg0: memref>) { + %c3 = constant 3 : index + %f0 = constant 0.0 : f32 + %vf0 = splat %f0 : vector<4x3xf32> + // expected-error@+1 {{ requires vector type}} + vector.transfer_write %arg0, %arg0[%c3, %c3] : memref>, vector<4x3xf32> +} + +// ----- + +func @test_vector.transfer_write(%arg0: vector<4x3xf32>) { + %c3 = constant 3 : index + %f0 = constant 0.0 : f32 + %vf0 = splat %f0 : vector<4x3xf32> + // expected-error@+1 {{ requires memref type}} + vector.transfer_write %arg0, %arg0[%c3, %c3] : vector<4x3xf32>, f32 } // ----- @@ -349,8 +390,8 @@ func @test_vector.transfer_write(%arg0: memref) { func @test_vector.transfer_write(%arg0: memref) { %c3 = constant 3 : index %cst = constant dense<3.0> : vector<128 x f32> - // expected-error@+1 {{requires 2 indices}} - vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<()->(0)>} : vector<128xf32>, memref + // expected-error@+1 {{expected 5 operand types but had 4}} + %0 = "vector.transfer_write"(%cst, %arg0, %c3, %c3, %c3) {permutation_map = affine_map<()->(0)>} : (vector<128xf32>, memref, index, index) -> () } // ----- @@ -358,8 +399,8 @@ func @test_vector.transfer_write(%arg0: memref) { func @test_vector.transfer_write(%arg0: memref) { %c3 = constant 3 : index %cst = constant dense<3.0> : vector<128 x f32> - // expected-error@+1 {{requires attribute 'permutation_map'}} - vector.transfer_write %cst, %arg0[%c3, %c3] {perm = affine_map<(d0)->(d0)>} : vector<128xf32>, memref + // expected-error@+1 {{requires 2 indices}} + vector.transfer_write %cst, %arg0[%c3, %c3, %c3] {permutation_map = affine_map<()->(0)>} : vector<128xf32>, memref } // ----- diff --git a/mlir/test/Dialect/Vector/ops.mlir b/mlir/test/Dialect/Vector/ops.mlir index 73690d6ebcc86a..c194cbe2381172 100644 --- a/mlir/test/Dialect/Vector/ops.mlir +++ b/mlir/test/Dialect/Vector/ops.mlir @@ -20,15 +20,19 @@ func @vector_transfer_ops(%arg0: memref, %2 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d0)>} : memref, vector<128xf32> // CHECK: vector.transfer_read %3 = vector.transfer_read %arg0[%c3, %c3], %cst {permutation_map = affine_map<(d0, d1)->(d1)>} : memref, vector<128xf32> - // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref>, vector<1x1x4x3xf32> + // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} : memref>, vector<1x1x4x3xf32> %4 = vector.transfer_read %arg1[%c3, %c3], %vf0 {permutation_map = affine_map<(d0, d1)->(d0, d1)>} : memref>, vector<1x1x4x3xf32> + // CHECK: vector.transfer_read %{{.*}}[%[[C3]], %[[C3]]], %{{.*}} {masked = [true, false]} : memref>, vector<1x1x4x3xf32> + %5 = vector.transfer_read %arg1[%c3, %c3], %vf0 {masked = [true, false]} : memref>, vector<1x1x4x3xf32> // CHECK: vector.transfer_write vector.transfer_write %0, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0)>} : vector<128xf32>, memref // CHECK: vector.transfer_write vector.transfer_write %1, %arg0[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d1, d0)>} : vector<3x7xf32>, memref - // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] {permutation_map = #[[MAP0]]} : vector<1x1x4x3xf32>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> vector.transfer_write %4, %arg1[%c3, %c3] {permutation_map = affine_map<(d0, d1)->(d0, 
d1)>} : vector<1x1x4x3xf32>, memref> + // CHECK: vector.transfer_write %{{.*}}, %{{.*}}[%[[C3]], %[[C3]]] : vector<1x1x4x3xf32>, memref> + vector.transfer_write %5, %arg1[%c3, %c3] {masked = [true, true]} : vector<1x1x4x3xf32>, memref> return } diff --git a/mlir/test/Dialect/Vector/vector-transforms.mlir b/mlir/test/Dialect/Vector/vector-transforms.mlir index 2e4e9033fb81e6..8de153adf73108 100644 --- a/mlir/test/Dialect/Vector/vector-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-transforms.mlir @@ -231,26 +231,26 @@ func @contraction4x4_ikj(%arg0 : vector<4x2xf32>, %arg1 : vector<2x4xf32>, // Check LHS vector.transfer read is split for each user. -// CHECK: %[[VTR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x2xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x2xf32>, vector<2x2xf32> +// CHECK: %[[VTR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x2xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x2xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<2x4xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C2]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<2x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR2:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<2x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR3:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<2x4xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x4xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR5:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C2]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x4xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR6:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x4xf32>, vector<2x2xf32> -// CHECK-NEXT: %[[VTR7:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C2]]], %{{.*}} {permutation_map = #[[MAP0]]} : memref<4x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR4:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR5:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR6:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C0]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> +// CHECK-NEXT: %[[VTR7:.*]] = vector.transfer_read %{{.*}}[%[[C2]], %[[C2]]], %{{.*}} : memref<4x4xf32>, vector<2x2xf32> // CHECK-NEXT: %[[R0:.*]] = vector.contract {indexing_maps = [#map2, #map3, #map0], iterator_types = ["parallel", "reduction", "parallel"]} %[[VTR0]], %[[VTR2]], %[[VTR4]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> // CHECK-NEXT: %[[R1:.*]] = vector.contract {indexing_maps = [#map2, #map3, #map0], iterator_types = ["parallel", "reduction", "parallel"]} %[[VTR0]], %[[VTR3]], %[[VTR5]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> // CHECK-NEXT: %[[R2:.*]] = vector.contract {indexing_maps = [#map2, #map3, #map0], iterator_types = ["parallel", "reduction", "parallel"]} %[[VTR1]], %[[VTR2]], %[[VTR6]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> // CHECK-NEXT: %[[R3:.*]] 
= vector.contract {indexing_maps = [#map2, #map3, #map0], iterator_types = ["parallel", "reduction", "parallel"]} %[[VTR1]], %[[VTR3]], %[[VTR7]] : vector<2x2xf32>, vector<2x2xf32> into vector<2x2xf32> -// CHECK-NEXT: vector.transfer_write %[[R0]], %{{.*}}[%[[C0]], %[[C0]]] {permutation_map = #[[MAP0]]} : vector<2x2xf32>, memref<4x4xf32> -// CHECK-NEXT: vector.transfer_write %[[R1]], %{{.*}}[%[[C0]], %[[C2]]] {permutation_map = #[[MAP0]]} : vector<2x2xf32>, memref<4x4xf32> -// CHECK-NEXT: vector.transfer_write %[[R2]], %{{.*}}[%[[C2]], %[[C0]]] {permutation_map = #[[MAP0]]} : vector<2x2xf32>, memref<4x4xf32> -// CHECK-NEXT: vector.transfer_write %[[R3]], %{{.*}}[%[[C2]], %[[C2]]] {permutation_map = #[[MAP0]]} : vector<2x2xf32>, memref<4x4xf32> +// CHECK-NEXT: vector.transfer_write %[[R0]], %{{.*}}[%[[C0]], %[[C0]]] : vector<2x2xf32>, memref<4x4xf32> +// CHECK-NEXT: vector.transfer_write %[[R1]], %{{.*}}[%[[C0]], %[[C2]]] : vector<2x2xf32>, memref<4x4xf32> +// CHECK-NEXT: vector.transfer_write %[[R2]], %{{.*}}[%[[C2]], %[[C0]]] : vector<2x2xf32>, memref<4x4xf32> +// CHECK-NEXT: vector.transfer_write %[[R3]], %{{.*}}[%[[C2]], %[[C2]]] : vector<2x2xf32>, memref<4x4xf32> // CHECK-NEXT: return func @contraction4x4_ikj_xfer_read(%arg0 : memref<4x2xf32>, @@ -425,10 +425,10 @@ func @cancelling_shape_cast_ops(%arg0 : vector<2x4xf32>) -> vector<2x4xf32> { // CHECK-LABEL: func @vector_transfers_vector_element_type // CHECK: %[[C0:.*]] = constant 0 : index // CHECK: %[[C1:.*]] = constant 1 : index -// CHECK: %[[VTR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP1]]} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> -// CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C1]], %[[C0]]], %{{.*}} {permutation_map = #[[MAP1]]} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> -// CHECK-NEXT: vector.transfer_write %[[VTR0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] {permutation_map = #[[MAP1]]} : vector<1x1x2x4xf32>, memref<6x2x1xvector<2x4xf32>> -// CHECK-NEXT: vector.transfer_write %[[VTR1]], %{{.*}}[%[[C0]], %[[C1]], %[[C0]]] {permutation_map = #[[MAP1]]} : vector<1x1x2x4xf32>, memref<6x2x1xvector<2x4xf32>> +// CHECK: %[[VTR0:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C0]], %[[C0]]], %{{.*}} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> +// CHECK-NEXT: %[[VTR1:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[C1]], %[[C0]]], %{{.*}} : memref<6x2x1xvector<2x4xf32>>, vector<1x1x2x4xf32> +// CHECK-NEXT: vector.transfer_write %[[VTR0]], %{{.*}}[%[[C0]], %[[C0]], %[[C0]]] : vector<1x1x2x4xf32>, memref<6x2x1xvector<2x4xf32>> +// CHECK-NEXT: vector.transfer_write %[[VTR1]], %{{.*}}[%[[C0]], %[[C1]], %[[C0]]] : vector<1x1x2x4xf32>, memref<6x2x1xvector<2x4xf32>> func @vector_transfers_vector_element_type() { %c0 = constant 0 : index diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index f3bbf0e50dca70..eb2ff83fdddfb8 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -1214,9 +1214,15 @@ def FormatAttrOp : TEST_Op<"format_attr_op"> { let assemblyFormat = "$attr attr-dict"; } +// Test that we elide optional attributes that are within the syntax. +def FormatOptAttrOp : TEST_Op<"format_opt_attr_op"> { + let arguments = (ins OptionalAttr:$opt_attr); + let assemblyFormat = "(`(`$opt_attr^`)`)? attr-dict"; +} + // Test that we elide attributes that are within the syntax. 
def FormatAttrDictWithKeywordOp : TEST_Op<"format_attr_dict_w_keyword"> { - let arguments = (ins I64Attr:$attr); + let arguments = (ins I64Attr:$attr, OptionalAttr:$opt_attr); let assemblyFormat = "attr-dict-with-keyword"; } diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp index 0390ac945d2f7d..87191d3e87d2b4 100644 --- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp +++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp @@ -132,13 +132,20 @@ static void applyPatterns(FuncOp funcOp) { // Linalg subview operands promotion. //===--------------------------------------------------------------------===// patterns.insert>( - ctx, LinalgPromotionOptions(), + ctx, LinalgPromotionOptions().useFullTileBuffersByDefault(), LinalgMarker({"_promote_views_"}, "_views_promoted_")); patterns.insert>( - ctx, LinalgPromotionOptions().setOperandsToPromote({0}), + ctx, + LinalgPromotionOptions() + .setOperandsToPromote({0}) + .useFullTileBuffersByDefault(), LinalgMarker({"_promote_first_view_"}, "_first_view_promoted_")); patterns.insert>( - ctx, LinalgPromotionOptions().setOperandsToPromote({0}).setAlignment(32), + ctx, + LinalgPromotionOptions() + .setOperandsToPromote({0}) + .setUseFullTileBuffers({true}) + .setAlignment(32), LinalgMarker({"_promote_views_aligned_"}, "_views_aligned_promoted_")); applyPatternsAndFoldGreedily(funcOp, patterns); @@ -171,7 +178,8 @@ void fillL1TilingAndMatmulToVectorPatterns( LinalgMarker({startMarker}, "L1"))); patternsVector.emplace_back(LinalgPromotionPattern( - context, LinalgPromotionOptions(), LinalgMarker({"L1"}, "VEC"))); + context, LinalgPromotionOptions().useFullTileBuffersByDefault(), + LinalgMarker({"L1"}, "VEC"))); patternsVector.emplace_back( LinalgVectorizationPattern(context, LinalgMarker({"VEC"}))); diff --git a/mlir/test/mlir-tblgen/op-format.mlir b/mlir/test/mlir-tblgen/op-format.mlir index 8d55768aced79b..066e548e17083e 100644 --- a/mlir/test/mlir-tblgen/op-format.mlir +++ b/mlir/test/mlir-tblgen/op-format.mlir @@ -12,9 +12,16 @@ test.format_literal_op keyword_$. 
-> :, = <> () [] {foo.some_attr} // CHECK-NOT: {attr test.format_attr_op 10 +// CHECK: test.format_opt_attr_op(10) +// CHECK-NOT: {opt_attr +test.format_opt_attr_op(10) + // CHECK: test.format_attr_dict_w_keyword attributes {attr = 10 : i64} test.format_attr_dict_w_keyword attributes {attr = 10 : i64} +// CHECK: test.format_attr_dict_w_keyword attributes {attr = 10 : i64, opt_attr = 10 : i64} +test.format_attr_dict_w_keyword attributes {attr = 10 : i64, opt_attr = 10 : i64} + // CHECK: test.format_buildable_type_op %[[I64]] %ignored = test.format_buildable_type_op %i64 diff --git a/mlir/tools/mlir-opt/CMakeLists.txt b/mlir/tools/mlir-opt/CMakeLists.txt index 3e8ed0ebee7b51..d509b23505d12b 100644 --- a/mlir/tools/mlir-opt/CMakeLists.txt +++ b/mlir/tools/mlir-opt/CMakeLists.txt @@ -10,24 +10,30 @@ set(LLVM_LINK_COMPONENTS AsmParser ) +if(MLIR_INCLUDE_TESTS) + set(test_libs + MLIRAffineTransformsTestPasses + MLIRSPIRVTestPasses + MLIRTestDialect + MLIRTestIR + MLIRTestPass + MLIRTestTransforms + ) +endif() + set(LIBS ${dialect_libs} ${conversion_libs} + ${test_libs} MLIRLoopAnalysis - MLIRAffineTransformsTestPasses MLIRAnalysis MLIRDialect MLIREDSC MLIROptLib MLIRParser MLIRPass - MLIRSPIRVTestPasses MLIRTransforms MLIRTransformUtils - MLIRTestDialect - MLIRTestIR - MLIRTestPass - MLIRTestTransforms MLIRSupport MLIRIR ) diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp index 218d6c03b4b80d..69b1d8d57bc56b 100644 --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -93,6 +93,7 @@ static cl::opt allowUnregisteredDialects( "allow-unregistered-dialect", cl::desc("Allow operation with no registered dialects"), cl::init(false)); +#ifdef MLIR_INCLUDE_TESTS void registerTestPasses() { registerConvertToTargetEnvPass(); registerInliner(); @@ -131,6 +132,7 @@ void registerTestPasses() { registerTestVectorToSCFPass(); registerVectorizerTestPass(); } +#endif static cl::opt showDialects("show-dialects", @@ -140,7 +142,9 @@ static cl::opt int main(int argc, char **argv) { registerAllDialects(); registerAllPasses(); +#ifdef MLIR_INCLUDE_TESTS registerTestPasses(); +#endif InitLLVM y(argc, argv); // Register any command line options. diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 127b6b976cd53d..9fa87e3a842771 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -886,9 +886,17 @@ static void genAttrDictPrinter(OperationFormat &fmt, Operator &op, OpMethodBody &body, bool withKeyword) { // Collect all of the attributes used in the format, these will be elided. SmallVector usedAttributes; - for (auto &it : fmt.elements) + for (auto &it : fmt.elements) { if (auto *attr = dyn_cast(it.get())) usedAttributes.push_back(attr->getVar()); + // Collect the optional attributes. + if (auto *opt = dyn_cast(it.get())) { + for (auto &elem : opt->getElements()) { + if (auto *attr = dyn_cast(&elem)) + usedAttributes.push_back(attr->getVar()); + } + } + } body << " p.printOptionalAttrDict" << (withKeyword ? "WithKeyword" : "") << "(getAttrs(), /*elidedAttrs=*/{";