diff --git a/clang-tools-extra/clangd/IncludeFixer.cpp b/clang-tools-extra/clangd/IncludeFixer.cpp index 7704ccb82c0f02..0fd8db0d116796 100644 --- a/clang-tools-extra/clangd/IncludeFixer.cpp +++ b/clang-tools-extra/clangd/IncludeFixer.cpp @@ -68,10 +68,10 @@ class VisitedContextCollector : public VisibleDeclConsumer { std::vector IncludeFixer::fix(DiagnosticsEngine::Level DiagLevel, const clang::Diagnostic &Info) const { switch (Info.getID()) { - case diag::err_incomplete_type: - case diag::err_incomplete_member_access: - case diag::err_incomplete_base_class: case diag::err_incomplete_nested_name_spec: + case diag::err_incomplete_base_class: + case diag::err_incomplete_member_access: + case diag::err_incomplete_type: // Incomplete type diagnostics should have a QualType argument for the // incomplete type. for (unsigned Idx = 0; Idx < Info.getNumArgs(); ++Idx) { diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index 8fd1fe385a1c8d..e80a135c0a999f 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -708,9 +708,11 @@ class Y : $base[[public ns::X]] {}; int main() { ns::X *x; x$access[[->]]f(); + auto& $type[[[]]a] = *x; } )cpp"); auto TU = TestTU::withCode(Test.code()); + TU.ExtraArgs.push_back("-std=c++17"); auto Index = buildIndexWithSymbol( {SymbolWithHeader{"ns::X", "unittest:///x.h", "\"x.h\""}}); TU.ExternalIndex = Index.get(); @@ -731,7 +733,13 @@ int main() { "member access into incomplete type 'ns::X'"), DiagName("incomplete_member_access"), WithFix(Fix(Test.range("insert"), "#include \"x.h\"\n", - "Add include \"x.h\" for symbol ns::X"))))); + "Add include \"x.h\" for symbol ns::X"))), + AllOf( + Diag(Test.range("type"), + "incomplete type 'ns::X' where a complete type is required"), + DiagName("incomplete_type"), + WithFix(Fix(Test.range("insert"), "#include \"x.h\"\n", + "Add include \"x.h\" for 
symbol ns::X"))))); } TEST(IncludeFixerTest, NoSuggestIncludeWhenNoDefinitionInHeader) { diff --git a/clang/include/clang/Driver/Job.h b/clang/include/clang/Driver/Job.h index 6173b9d314b4df..8a348c8048d066 100644 --- a/clang/include/clang/Driver/Job.h +++ b/clang/include/clang/Driver/Job.h @@ -122,6 +122,9 @@ class Command { /// The list of program arguments which are inputs. llvm::opt::ArgStringList InputFilenames; + /// The list of program arguments which are outputs. May be empty. + std::vector OutputFilenames; + /// Response file name, if this command is set to use one, or nullptr /// otherwise const char *ResponseFile = nullptr; @@ -158,8 +161,8 @@ class Command { Command(const Action &Source, const Tool &Creator, ResponseFileSupport ResponseSupport, const char *Executable, - const llvm::opt::ArgStringList &Arguments, - ArrayRef Inputs); + const llvm::opt::ArgStringList &Arguments, ArrayRef Inputs, + ArrayRef Outputs = None); // FIXME: This really shouldn't be copyable, but is currently copied in some // error handling in Driver::generateCompilationDiagnostics. 
Command(const Command &) = default; @@ -201,6 +204,14 @@ class Command { const llvm::opt::ArgStringList &getArguments() const { return Arguments; } + const llvm::opt::ArgStringList &getInputFilenames() const { + return InputFilenames; + } + + const std::vector &getOutputFilenames() const { + return OutputFilenames; + } + protected: /// Optionally print the filenames to be compiled void PrintFileNames() const; @@ -212,7 +223,7 @@ class CC1Command : public Command { CC1Command(const Action &Source, const Tool &Creator, ResponseFileSupport ResponseSupport, const char *Executable, const llvm::opt::ArgStringList &Arguments, - ArrayRef Inputs); + ArrayRef Inputs, ArrayRef Outputs = None); void Print(llvm::raw_ostream &OS, const char *Terminator, bool Quote, CrashReportInfo *CrashInfo = nullptr) const override; @@ -230,7 +241,7 @@ class FallbackCommand : public Command { FallbackCommand(const Action &Source_, const Tool &Creator_, ResponseFileSupport ResponseSupport, const char *Executable_, const llvm::opt::ArgStringList &Arguments_, - ArrayRef Inputs, + ArrayRef Inputs, ArrayRef Outputs, std::unique_ptr Fallback_); void Print(llvm::raw_ostream &OS, const char *Terminator, bool Quote, @@ -250,7 +261,8 @@ class ForceSuccessCommand : public Command { ResponseFileSupport ResponseSupport, const char *Executable_, const llvm::opt::ArgStringList &Arguments_, - ArrayRef Inputs); + ArrayRef Inputs, + ArrayRef Outputs = None); void Print(llvm::raw_ostream &OS, const char *Terminator, bool Quote, CrashReportInfo *CrashInfo = nullptr) const override; diff --git a/clang/lib/Driver/Job.cpp b/clang/lib/Driver/Job.cpp index 4808a9f4628d5e..de2c2350f8d1d2 100644 --- a/clang/lib/Driver/Job.cpp +++ b/clang/lib/Driver/Job.cpp @@ -38,12 +38,15 @@ using namespace driver; Command::Command(const Action &Source, const Tool &Creator, ResponseFileSupport ResponseSupport, const char *Executable, const llvm::opt::ArgStringList &Arguments, - ArrayRef Inputs) + ArrayRef Inputs, ArrayRef Outputs) : 
Source(Source), Creator(Creator), ResponseSupport(ResponseSupport), Executable(Executable), Arguments(Arguments) { for (const auto &II : Inputs) if (II.isFilename()) InputFilenames.push_back(II.getFilename()); + for (const auto &II : Outputs) + if (II.isFilename()) + OutputFilenames.push_back(II.getFilename()); } /// Check if the compiler flag in question should be skipped when @@ -357,8 +360,9 @@ CC1Command::CC1Command(const Action &Source, const Tool &Creator, ResponseFileSupport ResponseSupport, const char *Executable, const llvm::opt::ArgStringList &Arguments, - ArrayRef Inputs) - : Command(Source, Creator, ResponseSupport, Executable, Arguments, Inputs) { + ArrayRef Inputs, ArrayRef Outputs) + : Command(Source, Creator, ResponseSupport, Executable, Arguments, Inputs, + Outputs) { InProcess = true; } @@ -415,9 +419,10 @@ FallbackCommand::FallbackCommand(const Action &Source_, const Tool &Creator_, const char *Executable_, const llvm::opt::ArgStringList &Arguments_, ArrayRef Inputs, + ArrayRef Outputs, std::unique_ptr Fallback_) : Command(Source_, Creator_, ResponseSupport, Executable_, Arguments_, - Inputs), + Inputs, Outputs), Fallback(std::move(Fallback_)) {} void FallbackCommand::Print(raw_ostream &OS, const char *Terminator, @@ -456,9 +461,10 @@ int FallbackCommand::Execute(ArrayRef> Redirects, ForceSuccessCommand::ForceSuccessCommand( const Action &Source_, const Tool &Creator_, ResponseFileSupport ResponseSupport, const char *Executable_, - const llvm::opt::ArgStringList &Arguments_, ArrayRef Inputs) + const llvm::opt::ArgStringList &Arguments_, ArrayRef Inputs, + ArrayRef Outputs) : Command(Source_, Creator_, ResponseSupport, Executable_, Arguments_, - Inputs) {} + Inputs, Outputs) {} void ForceSuccessCommand::Print(raw_ostream &OS, const char *Terminator, bool Quote, CrashReportInfo *CrashInfo) const { diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp index 351b34e8bf90fe..b833ebaebf9257 100644 --- 
a/clang/lib/Driver/ToolChains/AIX.cpp +++ b/clang/lib/Driver/ToolChains/AIX.cpp @@ -71,7 +71,7 @@ void aix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -170,7 +170,7 @@ void aix::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } /// AIX - AIX tool chain which can call as(1) and ld(1) directly. diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index c6087156642b27..6781045886f20b 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -356,9 +356,9 @@ void amdgpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-shared"); CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), - Args.MakeArgString(Linker), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Linker), + CmdArgs, Inputs, Output)); } void amdgpu::getAMDGPUTargetFeatures(const Driver &D, diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index 092bade53c6352..02b745c6a2056b 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -142,9 +142,9 @@ void AVR::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(std::string("-m") + *FamilyName)); } - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), 
- Args.MakeArgString(Linker), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Linker), + CmdArgs, Inputs, Output)); } llvm::Optional AVRToolChain::findAVRLibcInstallation() const { diff --git a/clang/lib/Driver/ToolChains/Ananas.cpp b/clang/lib/Driver/ToolChains/Ananas.cpp index a4141a57acccb1..e5e33fe24874eb 100644 --- a/clang/lib/Driver/ToolChains/Ananas.cpp +++ b/clang/lib/Driver/ToolChains/Ananas.cpp @@ -39,8 +39,9 @@ void ananas::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -124,8 +125,9 @@ void ananas::Linker::ConstructJob(Compilation &C, const JobAction &JA, } const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } // Ananas - Ananas tool chain which can call as(1) and ld(1) directly. 
diff --git a/clang/lib/Driver/ToolChains/BareMetal.cpp b/clang/lib/Driver/ToolChains/BareMetal.cpp index 61839a9e31b0b5..6ed81c1e34a120 100644 --- a/clang/lib/Driver/ToolChains/BareMetal.cpp +++ b/clang/lib/Driver/ToolChains/BareMetal.cpp @@ -202,5 +202,5 @@ void baremetal::Linker::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Args.MakeArgString(TC.GetLinkerPath()), - CmdArgs, Inputs)); + CmdArgs, Inputs, Output)); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 630b39d1e769ed..b37dcfee1a3ec5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4356,9 +4356,9 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, II.getInputArg().renderAsInput(Args, CmdArgs); } - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileUTF8(), - D.getClangProgramPath(), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileUTF8(), D.getClangProgramPath(), + CmdArgs, Inputs, Output)); return; } @@ -6314,20 +6314,23 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, getCLFallback()->GetCommand(C, JA, Output, Inputs, Args, LinkingOutput); C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs, - std::move(CLCommand))); + Output, std::move(CLCommand))); } else if (Args.hasArg(options::OPT__SLASH_fallback) && isa(JA)) { // In /fallback builds, run the main compilation even if the pch generation // fails, so that the main compilation's fallback to cl.exe runs. 
C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs, + Output)); } else if (D.CC1Main && !D.CCGenDiagnostics) { // Invoke the CC1 directly in this process - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } else { - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } // Make the compile command echo its inputs for /showFilenames. @@ -7074,8 +7077,9 @@ void ClangAs::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Input.getFilename()); const char *Exec = getToolChain().getDriver().getClangProgramPath(); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } // Begin OffloadBundler @@ -7161,7 +7165,7 @@ void OffloadBundler::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::None(), TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, None)); + CmdArgs, None, Output)); } void OffloadBundler::ConstructJobMultipleOutputs( @@ -7227,7 +7231,7 @@ void OffloadBundler::ConstructJobMultipleOutputs( C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::None(), TCArgs.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, None)); + CmdArgs, None, Outputs)); } void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, @@ -7257,5 +7261,5 @@ void OffloadWrapper::ConstructJob(Compilation &C, const JobAction &JA, 
C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::None(), Args.MakeArgString(getToolChain().GetProgramPath(getShortName())), - CmdArgs, Inputs)); + CmdArgs, Inputs, Output)); } diff --git a/clang/lib/Driver/ToolChains/CloudABI.cpp b/clang/lib/Driver/ToolChains/CloudABI.cpp index 8dcfd4951bbfe4..3efca8776260a6 100644 --- a/clang/lib/Driver/ToolChains/CloudABI.cpp +++ b/clang/lib/Driver/ToolChains/CloudABI.cpp @@ -92,8 +92,9 @@ void cloudabi::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Args.MakeArgString(ToolChain.GetFilePath("crtend.o"))); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } // CloudABI - CloudABI tool chain which can call ld(1) directly. diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index e3723e213c52f6..692d0600bad35e 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -951,12 +951,13 @@ void tools::SplitDebugInfo(const ToolChain &TC, Compilation &C, const Tool &T, InputInfo II(types::TY_Object, Output.getFilename(), Output.getFilename()); // First extract the dwo sections. - C.addCommand(std::make_unique( - JA, T, ResponseFileSupport::AtFileCurCP(), Exec, ExtractArgs, II)); + C.addCommand(std::make_unique(JA, T, + ResponseFileSupport::AtFileCurCP(), + Exec, ExtractArgs, II, Output)); // Then remove them from the original .o file. C.addCommand(std::make_unique( - JA, T, ResponseFileSupport::AtFileCurCP(), Exec, StripArgs, II)); + JA, T, ResponseFileSupport::AtFileCurCP(), Exec, StripArgs, II, Output)); } // Claim options we don't want to warn if they are unused. 
We do this for diff --git a/clang/lib/Driver/ToolChains/CrossWindows.cpp b/clang/lib/Driver/ToolChains/CrossWindows.cpp index 127a8a5f24cce5..28ad6c59c655cd 100644 --- a/clang/lib/Driver/ToolChains/CrossWindows.cpp +++ b/clang/lib/Driver/ToolChains/CrossWindows.cpp @@ -58,7 +58,7 @@ void tools::CrossWindows::Assembler::ConstructJob( Exec = Args.MakeArgString(Assembler); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void tools::CrossWindows::Linker::ConstructJob( @@ -203,8 +203,9 @@ void tools::CrossWindows::Linker::ConstructJob( Exec = Args.MakeArgString(TC.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } CrossWindowsToolChain::CrossWindowsToolChain(const Driver &D, diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp b/clang/lib/Driver/ToolChains/Cuda.cpp index d7933534a5d3d1..217a0155a52d3c 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -427,7 +427,7 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA, JA, *this, ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, "--options-file"}, - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) { @@ -496,7 +496,7 @@ void NVPTX::Linker::ConstructJob(Compilation &C, const JobAction &JA, JA, *this, ResponseFileSupport{ResponseFileSupport::RF_Full, llvm::sys::WEM_UTF8, "--options-file"}, - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, @@ -577,7 +577,7 @@ void NVPTX::OpenMPLinker::ConstructJob(Compilation &C, const JobAction &JA, JA, *this, ResponseFileSupport{ResponseFileSupport::RF_Full, 
llvm::sys::WEM_UTF8, "--options-file"}, - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } /// CUDA toolchain. Our assembler is ptxas, and our "linker" is fatbinary, diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 8f2be2a343cc5e..0d9e471ec07096 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -149,7 +149,7 @@ void darwin::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void darwin::MachOTool::anchor() {} @@ -522,7 +522,7 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString(getToolChain().GetProgramPath("touch")); CmdArgs.push_back(Output.getFilename()); C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, None)); + JA, *this, ResponseFileSupport::None(), Exec, CmdArgs, None, Output)); return; } @@ -695,7 +695,7 @@ void darwin::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); std::unique_ptr Cmd = std::make_unique( - JA, *this, ResponseSupport, Exec, CmdArgs, Inputs); + JA, *this, ResponseSupport, Exec, CmdArgs, Inputs, Output); Cmd->setInputFileList(std::move(InputFileList)); C.addCommand(std::move(Cmd)); } @@ -720,7 +720,7 @@ void darwin::Lipo::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("lipo")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void darwin::Dsymutil::ConstructJob(Compilation &C, const JobAction &JA, @@ -741,7 +741,7 @@ void darwin::Dsymutil::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec 
= Args.MakeArgString(getToolChain().GetProgramPath("dsymutil")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void darwin::VerifyDebug::ConstructJob(Compilation &C, const JobAction &JA, @@ -765,7 +765,7 @@ void darwin::VerifyDebug::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("dwarfdump")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } MachO::MachO(const Driver &D, const llvm::Triple &Triple, const ArgList &Args) diff --git a/clang/lib/Driver/ToolChains/DragonFly.cpp b/clang/lib/Driver/ToolChains/DragonFly.cpp index 88dd0c899d8a89..08176e507eed07 100644 --- a/clang/lib/Driver/ToolChains/DragonFly.cpp +++ b/clang/lib/Driver/ToolChains/DragonFly.cpp @@ -45,8 +45,9 @@ void dragonfly::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void dragonfly::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -170,8 +171,9 @@ void dragonfly::Linker::ConstructJob(Compilation &C, const JobAction &JA, getToolChain().addProfileRTLibs(Args, CmdArgs); const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } /// DragonFly - DragonFly tool chain which can call as(1) and ld(1) directly. 
diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index 93401c66266304..f8633b988faa50 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -72,8 +72,9 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, // TODO: Replace flang-new with flang once the new driver replaces the // throwaway driver const char *Exec = Args.MakeArgString(D.GetProgramPath("flang-new", TC)); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } Flang::Flang(const ToolChain &TC) : Tool("flang-new", "flang frontend", TC) {} diff --git a/clang/lib/Driver/ToolChains/FreeBSD.cpp b/clang/lib/Driver/ToolChains/FreeBSD.cpp index 909ac5e992129c..5854defca48819 100644 --- a/clang/lib/Driver/ToolChains/FreeBSD.cpp +++ b/clang/lib/Driver/ToolChains/FreeBSD.cpp @@ -128,8 +128,9 @@ void freebsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -359,8 +360,9 @@ void freebsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, ToolChain.addProfileRTLibs(Args, CmdArgs); const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } /// FreeBSD - FreeBSD tool chain which can call as(1) and 
ld(1) directly. diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 781179be39a364..79d3a8d554ded2 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -167,7 +167,7 @@ void fuchsia::Linker::ConstructJob(Compilation &C, const JobAction &JA, } C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } /// Fuchsia - Fuchsia tool chain which can call as(1) and ld(1) directly. diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index f3843685a522be..7d75e90c6092f5 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -171,8 +171,9 @@ void tools::gcc::Common::ConstructJob(Compilation &C, const JobAction &JA, GCCName = "gcc"; const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(GCCName)); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void tools::gcc::Preprocessor::RenderExtraToolArgs( @@ -364,8 +365,9 @@ void tools::gnutools::StaticLibTool::ConstructJob( } const char *Exec = Args.MakeArgString(getToolChain().GetStaticLibToolPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -662,8 +664,9 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.AddAllArgs(CmdArgs, options::OPT_T); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, 
Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void tools::gnutools::Assembler::ConstructJob(Compilation &C, @@ -930,8 +933,9 @@ void tools::gnutools::Assembler::ConstructJob(Compilation &C, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath(DefaultAssembler)); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); // Handle the debug info splitting at object creation time if we're // creating an object. diff --git a/clang/lib/Driver/ToolChains/HIP.cpp b/clang/lib/Driver/ToolChains/HIP.cpp index 4d1e0f9f2fdfc7..a06835eee0243e 100644 --- a/clang/lib/Driver/ToolChains/HIP.cpp +++ b/clang/lib/Driver/ToolChains/HIP.cpp @@ -98,7 +98,7 @@ void AMDGCN::Linker::constructLldCommand(Compilation &C, const JobAction &JA, LldArgs.push_back(Input.getFilename()); const char *Lld = Args.MakeArgString(getToolChain().GetProgramPath("lld")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Lld, LldArgs, Inputs)); + Lld, LldArgs, Inputs, Output)); } // Construct a clang-offload-bundler command to bundle code objects for @@ -127,14 +127,16 @@ void AMDGCN::constructHIPFatbinCommand(Compilation &C, const JobAction &JA, BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); - auto BundlerOutputArg = Args.MakeArgString( - std::string("-outputs=").append(std::string(OutputFileName))); + std::string Output = std::string(OutputFileName); + auto BundlerOutputArg = + Args.MakeArgString(std::string("-outputs=").append(Output)); BundlerArgs.push_back(BundlerOutputArg); const char *Bundler = Args.MakeArgString( T.getToolChain().GetProgramPath("clang-offload-bundler")); - C.addCommand(std::make_unique(JA, T, ResponseFileSupport::None(), - Bundler, 
BundlerArgs, Inputs)); + C.addCommand(std::make_unique( + JA, T, ResponseFileSupport::None(), Bundler, BundlerArgs, Inputs, + InputInfo(&JA, Args.MakeArgString(Output)))); } /// Add Generated HIP Object File which has device images embedded into the @@ -205,7 +207,7 @@ void AMDGCN::Linker::constructGenerateObjFileFromHIPFatBinary( McinFile, "--filetype=obj"}; const char *Mc = Args.MakeArgString(TC.GetProgramPath("llvm-mc")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Mc, McArgs, Inputs)); + Mc, McArgs, Inputs, Output)); } // For amdgcn the inputs of the linker job are device bitcode and output is diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 775f6e1094fa66..fb54f73bcd4c89 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -189,8 +189,9 @@ void hexagon::Assembler::ConstructJob(Compilation &C, const JobAction &JA, } auto *Exec = Args.MakeArgString(HTC.GetProgramPath(AsName)); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void hexagon::Linker::RenderExtraToolArgs(const JobAction &JA, @@ -407,8 +408,9 @@ void hexagon::Linker::ConstructJob(Compilation &C, const JobAction &JA, LinkingOutput); const char *Exec = Args.MakeArgString(HTC.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } // Hexagon tools end. 
diff --git a/clang/lib/Driver/ToolChains/InterfaceStubs.cpp b/clang/lib/Driver/ToolChains/InterfaceStubs.cpp index f7c11421e80945..57acf338df5c42 100644 --- a/clang/lib/Driver/ToolChains/InterfaceStubs.cpp +++ b/clang/lib/Driver/ToolChains/InterfaceStubs.cpp @@ -56,7 +56,7 @@ void Merger::ConstructJob(Compilation &C, const JobAction &JA, C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Args.MakeArgString(Merger), CmdArgs, - Inputs)); + Inputs, Output)); } } // namespace ifstool } // namespace tools diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp index 6d663e4909e596..f3ed9967a81a1d 100644 --- a/clang/lib/Driver/ToolChains/MSP430.cpp +++ b/clang/lib/Driver/ToolChains/MSP430.cpp @@ -312,7 +312,7 @@ void msp430::Linker::ConstructJob(Compilation &C, const JobAction &JA, Args.AddAllArgs(CmdArgs, options::OPT_T); - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), - Args.MakeArgString(Linker), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Linker), + CmdArgs, Inputs, Output)); } diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 7faccdff6beed0..ba2c7146b924eb 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -606,9 +606,9 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, linkPath = TC.GetProgramPath(Linker.str().c_str()); } - auto LinkCmd = - std::make_unique(JA, *this, ResponseFileSupport::AtFileUTF16(), - Args.MakeArgString(linkPath), CmdArgs, Inputs); + auto LinkCmd = std::make_unique( + JA, *this, ResponseFileSupport::AtFileUTF16(), + Args.MakeArgString(linkPath), CmdArgs, Inputs, Output); if (!Environment.empty()) LinkCmd->setEnvironment(Environment); C.addCommand(std::move(LinkCmd)); @@ -748,9 +748,9 @@ std::unique_ptr visualstudio::Compiler::GetCommand( CmdArgs.push_back(Fo); 
std::string Exec = FindVisualStudioExecutable(getToolChain(), "cl.exe"); - return std::make_unique(JA, *this, - ResponseFileSupport::AtFileUTF16(), - Args.MakeArgString(Exec), CmdArgs, Inputs); + return std::make_unique( + JA, *this, ResponseFileSupport::AtFileUTF16(), Args.MakeArgString(Exec), + CmdArgs, Inputs, Output); } MSVCToolChain::MSVCToolChain(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 4267af60bf031e..49fef4298bfecd 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -51,7 +51,7 @@ void tools::MinGW::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); if (Args.hasArg(options::OPT_gsplit_dwarf)) SplitDebugInfo(getToolChain(), C, *this, JA, Args, Output, @@ -167,9 +167,10 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, // that lacks an extension. // GCC used to do this only when the compiler itself runs on windows, but // since GCC 8 it does the same when cross compiling as well. 
- if (!llvm::sys::path::has_extension(OutputFile)) + if (!llvm::sys::path::has_extension(OutputFile)) { CmdArgs.push_back(Args.MakeArgString(Twine(OutputFile) + ".exe")); - else + OutputFile = CmdArgs.back(); + } else CmdArgs.push_back(OutputFile); Args.AddAllArgs(CmdArgs, options::OPT_e); @@ -318,8 +319,9 @@ void tools::MinGW::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } const char *Exec = Args.MakeArgString(TC.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } // Simplified from Generic_GCC::GCCInstallationDetector::ScanLibDirForGCCTriple. diff --git a/clang/lib/Driver/ToolChains/Minix.cpp b/clang/lib/Driver/ToolChains/Minix.cpp index d0314795620ce1..44479a24ebe78b 100644 --- a/clang/lib/Driver/ToolChains/Minix.cpp +++ b/clang/lib/Driver/ToolChains/Minix.cpp @@ -36,8 +36,9 @@ void tools::minix::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void tools::minix::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -89,8 +90,9 @@ void tools::minix::Linker::ConstructJob(Compilation &C, const JobAction &JA, } const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } /// Minix - Minix tool chain which can call as(1) and ld(1) directly. 
diff --git a/clang/lib/Driver/ToolChains/Myriad.cpp b/clang/lib/Driver/ToolChains/Myriad.cpp index 84fe4748b6faff..ab0df5d8f1683e 100644 --- a/clang/lib/Driver/ToolChains/Myriad.cpp +++ b/clang/lib/Driver/ToolChains/Myriad.cpp @@ -79,7 +79,7 @@ void tools::SHAVE::Compiler::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString(getToolChain().GetProgramPath("moviCompile")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Args.MakeArgString(Exec), CmdArgs, - Inputs)); + Inputs, Output)); } void tools::SHAVE::Assembler::ConstructJob(Compilation &C, const JobAction &JA, @@ -115,7 +115,7 @@ void tools::SHAVE::Assembler::ConstructJob(Compilation &C, const JobAction &JA, Args.MakeArgString(getToolChain().GetProgramPath("moviAsm")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), Args.MakeArgString(Exec), CmdArgs, - Inputs)); + Inputs, Output)); } void tools::Myriad::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -200,9 +200,9 @@ void tools::Myriad::Linker::ConstructJob(Compilation &C, const JobAction &JA, std::string Exec = Args.MakeArgString(TC.GetProgramPath("sparc-myriad-rtems-ld")); - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), - Args.MakeArgString(Exec), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Exec), + CmdArgs, Inputs, Output)); } MyriadToolChain::MyriadToolChain(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/lib/Driver/ToolChains/NaCl.cpp b/clang/lib/Driver/ToolChains/NaCl.cpp index 15a773675299a1..8a150c39475320 100644 --- a/clang/lib/Driver/ToolChains/NaCl.cpp +++ b/clang/lib/Driver/ToolChains/NaCl.cpp @@ -193,8 +193,9 @@ void nacltools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, 
Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } /// NaCl Toolchain diff --git a/clang/lib/Driver/ToolChains/NetBSD.cpp b/clang/lib/Driver/ToolChains/NetBSD.cpp index 253ee6ce0f7210..48bf061c6650d9 100644 --- a/clang/lib/Driver/ToolChains/NetBSD.cpp +++ b/clang/lib/Driver/ToolChains/NetBSD.cpp @@ -103,8 +103,9 @@ void netbsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString((getToolChain().GetProgramPath("as"))); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -338,8 +339,9 @@ void netbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, ToolChain.addProfileRTLibs(Args, CmdArgs); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } /// NetBSD - NetBSD tool chain which can call as(1) and ld(1) directly. 
diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 5ca2fa0850e632..f155d74632f93c 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -82,8 +82,9 @@ void openbsd::Assembler::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(II.getFilename()); const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -221,8 +222,9 @@ void openbsd::Linker::ConstructJob(Compilation &C, const JobAction &JA, } const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Exec, CmdArgs, Inputs, Output)); } SanitizerMask OpenBSD::getSupportedSanitizers() const { diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 6dc81899cbaacb..fab1b2ac62dfdb 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -66,8 +66,9 @@ void tools::PS4cpu::Assemble::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("orbis-as")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } static void AddPS4SanitizerArgs(const ToolChain &TC, ArgStringList &CmdArgs) { @@ -152,8 +153,9 @@ void tools::PS4cpu::Link::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = 
Args.MakeArgString(ToolChain.GetProgramPath("orbis-ld")); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileUTF8(), Exec, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileUTF8(), + Exec, CmdArgs, Inputs, Output)); } toolchains::PS4CPU::PS4CPU(const Driver &D, const llvm::Triple &Triple, diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index cc912d94cb92f3..312c8b52c5e834 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -191,8 +191,8 @@ void RISCV::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - C.addCommand( - std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), - Args.MakeArgString(Linker), CmdArgs, Inputs)); + C.addCommand(std::make_unique( + JA, *this, ResponseFileSupport::AtFileCurCP(), Args.MakeArgString(Linker), + CmdArgs, Inputs, Output)); } // RISCV tools end. 
diff --git a/clang/lib/Driver/ToolChains/Solaris.cpp b/clang/lib/Driver/ToolChains/Solaris.cpp index b8fdc87478bc66..4ed4d839ad106c 100644 --- a/clang/lib/Driver/ToolChains/Solaris.cpp +++ b/clang/lib/Driver/ToolChains/Solaris.cpp @@ -42,7 +42,7 @@ void solaris::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("as")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -152,7 +152,7 @@ void solaris::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } static StringRef getSolarisLibSuffix(const llvm::Triple &Triple) { diff --git a/clang/lib/Driver/ToolChains/WebAssembly.cpp b/clang/lib/Driver/ToolChains/WebAssembly.cpp index d953082470aab6..6b654886e7746c 100644 --- a/clang/lib/Driver/ToolChains/WebAssembly.cpp +++ b/clang/lib/Driver/ToolChains/WebAssembly.cpp @@ -114,8 +114,9 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-o"); CmdArgs.push_back(Output.getFilename()); - C.addCommand(std::make_unique( - JA, *this, ResponseFileSupport::AtFileCurCP(), Linker, CmdArgs, Inputs)); + C.addCommand(std::make_unique(JA, *this, + ResponseFileSupport::AtFileCurCP(), + Linker, CmdArgs, Inputs, Output)); // When optimizing, if wasm-opt is available, run it. 
if (Arg *A = Args.getLastArg(options::OPT_O_Group)) { @@ -139,7 +140,7 @@ void wasm::Linker::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back(Output.getFilename()); C.addCommand(std::make_unique( JA, *this, ResponseFileSupport::AtFileCurCP(), WasmOpt, CmdArgs, - Inputs)); + Inputs, Output)); } } } diff --git a/clang/lib/Driver/ToolChains/XCore.cpp b/clang/lib/Driver/ToolChains/XCore.cpp index 5030c73c7d8251..5f94f83d36919d 100644 --- a/clang/lib/Driver/ToolChains/XCore.cpp +++ b/clang/lib/Driver/ToolChains/XCore.cpp @@ -53,7 +53,7 @@ void tools::XCore::Assembler::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("xcc")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } void tools::XCore::Linker::ConstructJob(Compilation &C, const JobAction &JA, @@ -82,7 +82,7 @@ void tools::XCore::Linker::ConstructJob(Compilation &C, const JobAction &JA, const char *Exec = Args.MakeArgString(getToolChain().GetProgramPath("xcc")); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - Exec, CmdArgs, Inputs)); + Exec, CmdArgs, Inputs, Output)); } /// XCore tool chain diff --git a/clang/unittests/Driver/ToolChainTest.cpp b/clang/unittests/Driver/ToolChainTest.cpp index 67bf545b14e4b3..227f7c76b8a1ec 100644 --- a/clang/unittests/Driver/ToolChainTest.cpp +++ b/clang/unittests/Driver/ToolChainTest.cpp @@ -259,4 +259,34 @@ TEST(ToolChainTest, GetTargetAndMode) { EXPECT_STREQ(Res.DriverMode, "--driver-mode=cl"); EXPECT_FALSE(Res.TargetIsValid); } + +TEST(ToolChainTest, CommandOutput) { + IntrusiveRefCntPtr DiagOpts = new DiagnosticOptions(); + + IntrusiveRefCntPtr DiagID(new DiagnosticIDs()); + struct TestDiagnosticConsumer : public DiagnosticConsumer {}; + DiagnosticsEngine Diags(DiagID, &*DiagOpts, new TestDiagnosticConsumer); + IntrusiveRefCntPtr InMemoryFileSystem( + new 
llvm::vfs::InMemoryFileSystem); + + Driver CCDriver("/home/test/bin/clang", "arm-linux-gnueabi", Diags, + "clang LLVM compiler", InMemoryFileSystem); + CCDriver.setCheckInputsExist(false); + std::unique_ptr CC( + CCDriver.BuildCompilation({"/home/test/bin/clang", "foo.cpp"})); + const JobList &Jobs = CC->getJobs(); + + const auto &CmdCompile = Jobs.getJobs().front(); + const auto &InFile = CmdCompile->getInputFilenames().front(); + EXPECT_STREQ(InFile, "foo.cpp"); + auto ObjFile = CmdCompile->getOutputFilenames().front(); + EXPECT_TRUE(StringRef(ObjFile).endswith(".o")); + + const auto &CmdLink = Jobs.getJobs().back(); + const auto LinkInFile = CmdLink->getInputFilenames().front(); + EXPECT_EQ(ObjFile, LinkInFile); + auto ExeFile = CmdLink->getOutputFilenames().front(); + EXPECT_EQ("a.out", ExeFile); +} + } // end anonymous namespace. diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index e57abea427530f..4e9b1f6c233223 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -1494,6 +1494,7 @@ def relpath(p):\n if os.path.splitdrive(p)[0] != os.path.splitdrive(base)[0]: return p\n if haslink(p) or haslink(base): return p\n return os.path.relpath(p, base)\n +if len(sys.argv) < 3: sys.exit(0)\n sys.stdout.write(';'.join(relpath(p) for p in sys.argv[2].split(';')))" ${basedir} ${pathlist_escaped} diff --git a/llvm/docs/CommandGuide/llvm-readobj.rst b/llvm/docs/CommandGuide/llvm-readobj.rst index 9b1b5ba92bc071..ba5511bb765a64 100644 --- a/llvm/docs/CommandGuide/llvm-readobj.rst +++ b/llvm/docs/CommandGuide/llvm-readobj.rst @@ -286,6 +286,10 @@ The following options are implemented only for the PE/COFF file format. Display the debug directory. +.. option:: --coff-tls-directory + + Display the TLS directory. + .. option:: --coff-directives Display the .drectve section. 
diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h index 158257a5aa9a10..ac6090a30d2ff5 100644 --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1117,6 +1117,15 @@ class ScalarEvolution { const SCEV *S, const Loop *L, SmallPtrSetImpl &Preds); + /// Compute \p LHS - \p RHS and returns the result as an APInt if it is a + /// constant, and None if it isn't. + /// + /// This is intended to be a cheaper version of getMinusSCEV. We can be + /// frugal here since we just bail out of actually constructing and + /// canonicalizing an expression in the cases where the result isn't going + /// to be a constant. + Optional computeConstantDifference(const SCEV *LHS, const SCEV *RHS); + private: /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a /// Value is deleted. @@ -1799,15 +1808,6 @@ class ScalarEvolution { bool splitBinaryAdd(const SCEV *Expr, const SCEV *&L, const SCEV *&R, SCEV::NoWrapFlags &Flags); - /// Compute \p LHS - \p RHS and returns the result as an APInt if it is a - /// constant, and None if it isn't. - /// - /// This is intended to be a cheaper version of getMinusSCEV. We can be - /// frugal here since we just bail out of actually constructing and - /// canonicalizing an expression in the cases where the result isn't going - /// to be a constant. - Optional computeConstantDifference(const SCEV *LHS, const SCEV *RHS); - /// Drop memoized information computed for S. 
void forgetMemoizedResults(const SCEV *S); diff --git a/llvm/include/llvm/BinaryFormat/COFF.h b/llvm/include/llvm/BinaryFormat/COFF.h index 1919d7f0dece96..716d649f7c51c5 100644 --- a/llvm/include/llvm/BinaryFormat/COFF.h +++ b/llvm/include/llvm/BinaryFormat/COFF.h @@ -311,6 +311,7 @@ enum SectionCharacteristics : uint32_t { IMAGE_SCN_ALIGN_2048BYTES = 0x00C00000, IMAGE_SCN_ALIGN_4096BYTES = 0x00D00000, IMAGE_SCN_ALIGN_8192BYTES = 0x00E00000, + IMAGE_SCN_ALIGN_MASK = 0x00F00000, IMAGE_SCN_LNK_NRELOC_OVFL = 0x01000000, IMAGE_SCN_MEM_DISCARDABLE = 0x02000000, IMAGE_SCN_MEM_NOT_CACHED = 0x04000000, diff --git a/llvm/include/llvm/Object/COFF.h b/llvm/include/llvm/Object/COFF.h index 8aef00a8809dcd..505aab8bff5b39 100644 --- a/llvm/include/llvm/Object/COFF.h +++ b/llvm/include/llvm/Object/COFF.h @@ -786,6 +786,8 @@ class COFFObjectFile : public ObjectFile { const coff_base_reloc_block_header *BaseRelocEnd; const debug_directory *DebugDirectoryBegin; const debug_directory *DebugDirectoryEnd; + const coff_tls_directory32 *TLSDirectory32; + const coff_tls_directory64 *TLSDirectory64; // Either coff_load_configuration32 or coff_load_configuration64. 
const void *LoadConfig = nullptr; @@ -805,6 +807,7 @@ class COFFObjectFile : public ObjectFile { Error initExportTablePtr(); Error initBaseRelocPtr(); Error initDebugDirectoryPtr(); + Error initTLSDirectoryPtr(); Error initLoadConfigPtr(); public: @@ -976,6 +979,13 @@ class COFFObjectFile : public ObjectFile { return make_range(debug_directory_begin(), debug_directory_end()); } + const coff_tls_directory32 *getTLSDirectory32() const { + return TLSDirectory32; + } + const coff_tls_directory64 *getTLSDirectory64() const { + return TLSDirectory64; + } + const dos_header *getDOSHeader() const { if (!PE32Header && !PE32PlusHeader) return nullptr; diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp index c26d7721b3fe9c..cd10e67af239e8 100644 --- a/llvm/lib/Object/COFFObjectFile.cpp +++ b/llvm/lib/Object/COFFObjectFile.cpp @@ -649,6 +649,38 @@ Error COFFObjectFile::initDebugDirectoryPtr() { return Error::success(); } +Error COFFObjectFile::initTLSDirectoryPtr() { + // Get the RVA of the TLS directory. Do nothing if it does not exist. + const data_directory *DataEntry = getDataDirectory(COFF::TLS_TABLE); + if (!DataEntry) + return Error::success(); + + // Do nothing if the RVA is NULL. + if (DataEntry->RelativeVirtualAddress == 0) + return Error::success(); + + uint64_t DirSize = + is64() ? sizeof(coff_tls_directory64) : sizeof(coff_tls_directory32); + + // Check that the size is correct. + if (DataEntry->Size != DirSize) + return createStringError( + object_error::parse_failed, + "TLS Directory size (%u) is not the expected size (%u).", + static_cast(DataEntry->Size), DirSize); + + uintptr_t IntPtr = 0; + if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr)) + return E; + + if (is64()) + TLSDirectory64 = reinterpret_cast(IntPtr); + else + TLSDirectory32 = reinterpret_cast(IntPtr); + + return Error::success(); +} + Error COFFObjectFile::initLoadConfigPtr() { // Get the RVA of the debug directory. 
Do nothing if it does not exist. const data_directory *DataEntry = getDataDirectory(COFF::LOAD_CONFIG_TABLE); @@ -682,7 +714,8 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object) ImportDirectory(nullptr), DelayImportDirectory(nullptr), NumberOfDelayImportDirectory(0), ExportDirectory(nullptr), BaseRelocHeader(nullptr), BaseRelocEnd(nullptr), - DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr) {} + DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr), + TLSDirectory32(nullptr), TLSDirectory64(nullptr) {} Error COFFObjectFile::initialize() { // Check that we at least have enough room for a header. @@ -809,10 +842,14 @@ Error COFFObjectFile::initialize() { if (Error E = initBaseRelocPtr()) return E; - // Initialize the pointer to the export table. + // Initialize the pointer to the debug directory. if (Error E = initDebugDirectoryPtr()) return E; + // Initialize the pointer to the TLS directory. + if (Error E = initTLSDirectoryPtr()) + return E; + if (Error E = initLoadConfigPtr()) return E; diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index dc44980ce218c8..a355cbf30d315a 100644 --- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -154,19 +154,6 @@ static unsigned AdrImmBits(unsigned Value) { return (hi19 << 5) | (lo2 << 29); } -static bool valueFitsIntoFixupKind(unsigned Kind, uint64_t Value) { - unsigned NumBits; - switch(Kind) { - case FK_Data_1: NumBits = 8; break; - case FK_Data_2: NumBits = 16; break; - case FK_Data_4: NumBits = 32; break; - case FK_Data_8: NumBits = 64; break; - default: return true; - } - return isUIntN(NumBits, Value) || - isIntN(NumBits, static_cast(Value)); -} - static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, uint64_t Value, MCContext &Ctx, const Triple &TheTriple, bool IsResolved) { @@ -341,9 +328,6 @@ static uint64_t 
adjustFixupValue(const MCFixup &Fixup, const MCValue &Target, case FK_Data_2: case FK_Data_4: case FK_Data_8: - if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value)) - Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!"); - LLVM_FALLTHROUGH; case FK_SecRel_2: case FK_SecRel_4: return Value; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d1182015d581b8..822b9556eb4677 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8028,13 +8028,6 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - MemVT, *Load->getMemOperand())) { - SDValue Ops[2]; - std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); - return DAG.getMergeValues(Ops, DL); - } - unsigned Alignment = Load->getAlignment(); unsigned AS = Load->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && @@ -8146,6 +8139,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { return SplitVectorLoad(Op, DAG); } } + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + MemVT, *Load->getMemOperand())) { + SDValue Ops[2]; + std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); + return DAG.getMergeValues(Ops, DL); + } + return SDValue(); } @@ -8551,11 +8552,6 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), - VT, *Store->getMemOperand())) { - return expandUnalignedStore(Store, DAG); - } - unsigned AS = Store->getAddressSpace(); if (Subtarget->hasLDSMisalignedBug() && AS == AMDGPUAS::FLAT_ADDRESS && @@ 
-8580,6 +8576,11 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // v3 stores not supported on SI. if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) return SplitVectorStore(Op, DAG); + + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) + return expandUnalignedStore(Store, DAG); + return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -8619,6 +8620,13 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); } + if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(), + VT, *Store->getMemOperand())) { + if (VT.isVector()) + return SplitVectorStore(Op, DAG); + return expandUnalignedStore(Store, DAG); + } + return SDValue(); } else { llvm_unreachable("unhandled address space"); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 3451c238918116..4c263de673d671 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -92,6 +92,7 @@ class VOP2_Real : // copy relevant pseudo op flags let SubtargetPredicate = ps.SubtargetPredicate; + let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; let AsmVariantName = ps.AsmVariantName; let Constraints = ps.Constraints; @@ -494,14 +495,14 @@ defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN, or>; defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN, xor>; let mayRaiseFPException = 0 in { -let SubtargetPredicate = HasMadMacF32Insts in { +let OtherPredicates = [HasMadMacF32Insts] in { let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1 in { defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>; -} // End SubtargetPredicate = HasMadMacF32Insts +} // End 
OtherPredicates = [HasMadMacF32Insts] } // No patterns so that the scalar instructions are always selected. @@ -873,6 +874,7 @@ class Base_VOP2_DPP16 op, VOP2_DPP_Pseudo ps, VOP2_DPP { let AssemblerPredicate = HasDPP16; let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; } class VOP2_DPP16 op, VOP2_DPP_Pseudo ps, @@ -899,6 +901,7 @@ class VOP2_DPP8 op, VOP2_Pseudo ps, let AssemblerPredicate = HasDPP8; let SubtargetPredicate = HasDPP8; + let OtherPredicates = ps.OtherPredicates; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index edb2dc8881c7b9..52b81ad2164a62 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -1701,27 +1701,8 @@ static Instruction *foldOrToXor(BinaryOperator &I, /// Return true if a constant shift amount is always less than the specified /// bit-width. If not, the shift could create poison in the narrower type. static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) { - if (auto *ScalarC = dyn_cast(C)) - return ScalarC->getZExtValue() < BitWidth; - - if (C->getType()->isVectorTy()) { - // Check each element of a constant vector. - unsigned NumElts = cast(C->getType())->getNumElements(); - for (unsigned i = 0; i != NumElts; ++i) { - Constant *Elt = C->getAggregateElement(i); - if (!Elt) - return false; - if (isa(Elt)) - continue; - auto *CI = dyn_cast(Elt); - if (!CI || CI->getZExtValue() >= BitWidth) - return false; - } - return true; - } - - // The constant is a constant expression or unknown. 
- return false; + APInt Threshold(C->getType()->getScalarSizeInBits(), BitWidth); + return match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold)); } /// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and @@ -2071,22 +2052,22 @@ Instruction *InstCombinerImpl::matchBSwap(BinaryOperator &Or) { return LastInst; } -/// Transform UB-safe variants of bitwise rotate to the funnel shift intrinsic. -static Instruction *matchRotate(Instruction &Or) { +/// Match UB-safe variants of the funnel shift intrinsic. +static Instruction *matchFunnelShift(Instruction &Or) { // TODO: Can we reduce the code duplication between this and the related // rotate matching code under visitSelect and visitTrunc? unsigned Width = Or.getType()->getScalarSizeInBits(); - // First, find an or'd pair of opposite shifts with the same shifted operand: - // or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1) + // First, find an or'd pair of opposite shifts: + // or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1) BinaryOperator *Or0, *Or1; if (!match(Or.getOperand(0), m_BinOp(Or0)) || !match(Or.getOperand(1), m_BinOp(Or1))) return nullptr; - Value *ShVal, *ShAmt0, *ShAmt1; - if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal), m_Value(ShAmt0)))) || - !match(Or1, m_OneUse(m_LogicalShift(m_Specific(ShVal), m_Value(ShAmt1))))) + Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1; + if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) || + !match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1))))) return nullptr; BinaryOperator::BinaryOps ShiftOpcode0 = Or0->getOpcode(); @@ -2094,15 +2075,30 @@ static Instruction *matchRotate(Instruction &Or) { if (ShiftOpcode0 == ShiftOpcode1) return nullptr; - // Match the shift amount operands for a rotate pattern. This always matches - // a subtraction on the R operand. - auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * { + // Match the shift amount operands for a funnel shift pattern. 
This always + // matches a subtraction on the R operand. + auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * { // Check for constant shift amounts that sum to the bitwidth. - // TODO: Support non-uniform shift amounts. - const APInt *LC, *RC; - if (match(L, m_APIntAllowUndef(LC)) && match(R, m_APIntAllowUndef(RC))) - if (LC->ult(Width) && RC->ult(Width) && (*LC + *RC) == Width) - return ConstantInt::get(L->getType(), *LC); + const APInt *LI, *RI; + if (match(L, m_APIntAllowUndef(LI)) && match(R, m_APIntAllowUndef(RI))) + if (LI->ult(Width) && RI->ult(Width) && (*LI + *RI) == Width) + return ConstantInt::get(L->getType(), *LI); + + // TODO: Support undefs in non-uniform shift amounts. + Constant *LC, *RC; + if (match(L, m_Constant(LC)) && !LC->containsUndefElement() && + match(R, m_Constant(RC)) && !RC->containsUndefElement() && + match(L, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width))) && + match(R, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, APInt(Width, Width)))) { + if (match(ConstantExpr::getAdd(LC, RC), m_SpecificInt(Width))) + return L; + } + + // For non-constant cases, the following patterns currently only work for + // rotation patterns. + // TODO: Add general funnel-shift compatible patterns. + if (ShVal0 != ShVal1) + return nullptr; // For non-constant cases we don't support non-pow2 shift masks. // TODO: Is it worth matching urem as well? @@ -2140,7 +2136,8 @@ static Instruction *matchRotate(Instruction &Or) { (SubIsOnLHS && ShiftOpcode1 == BinaryOperator::Shl); Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr; Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType()); - return IntrinsicInst::Create(F, { ShVal, ShVal, ShAmt }); + return IntrinsicInst::Create( + F, {IsFshl ? ShVal0 : ShVal1, IsFshl ? ShVal1 : ShVal0, ShAmt}); } /// Attempt to combine or(zext(x),shl(zext(y),bw/2) concat packing patterns. 
@@ -2593,8 +2590,8 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (Instruction *BSwap = matchBSwap(I)) return BSwap; - if (Instruction *Rotate = matchRotate(I)) - return Rotate; + if (Instruction *Funnel = matchFunnelShift(I)) + return Funnel; if (Instruction *Concat = matchOrConcat(I, Builder)) return replaceInstUsesWith(I, Concat); diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 537838e2bdc19f..93b9917b5972b1 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -59,6 +59,7 @@ #include "llvm/ADT/Hashing.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetOperations.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallBitVector.h" #include "llvm/ADT/SmallPtrSet.h" @@ -80,6 +81,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/GlobalValue.h" @@ -5776,6 +5778,27 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, if (MSSA) MSSAU = std::make_unique<MemorySSAUpdater>(MSSA); + // Debug preservation - record all llvm.dbg.value from the loop as well as + // the SCEV of their variable location. Since salvageDebugInfo may change the + // DIExpression we need to store the original here as well (i.e. it needs to + // be in sync with the SCEV). + SmallVector< + std::tuple<DbgValueInst *, Type *, const SCEV *, DIExpression *>, + 32> + DbgValues; + for (auto &B : L->getBlocks()) { + for (auto &I : *B) { + if (DbgValueInst *D = dyn_cast<DbgValueInst>(&I)) { + auto V = D->getVariableLocation(); + if (!V || !SE.isSCEVable(V->getType())) + continue; + auto DS = SE.getSCEV(V); + DbgValues.push_back( + std::make_tuple(D, V->getType(), DS, D->getExpression())); + } + } + } + // Run the main LSR transformation. 
Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged(); @@ -5797,6 +5820,40 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE, DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get()); } } + // Debug preservation - go through all recorded llvm.dbg.value and for those + // that now have an undef variable location use the recorded SCEV to try and + // update it. Compare with SCEV of Phi-nodes of loop header to find a + // suitable update candidate. SCEV match with constant offset is allowed and + // will be compensated for in the DIExpression. + if (Changed) { + for (auto &D : DbgValues) { + auto DbgValue = std::get<0>(D); + auto DbgValueType = std::get<1>(D); + auto DbgValueSCEV = std::get<2>(D); + auto DbgDIExpr = std::get<3>(D); + if (!isa<UndefValue>(DbgValue->getVariableLocation())) + continue; + for (PHINode &Phi : L->getHeader()->phis()) { + if (DbgValueType != Phi.getType()) + continue; + if (!SE.isSCEVable(Phi.getType())) + continue; + auto PhiSCEV = SE.getSCEV(&Phi); + if (Optional<APInt> Offset = + SE.computeConstantDifference(DbgValueSCEV, PhiSCEV)) { + auto &Ctx = DbgValue->getContext(); + DbgValue->setOperand( + 0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(&Phi))); + if (Offset.getValue().getSExtValue()) { + SmallVector<uint64_t, 2> Ops; + DIExpression::appendOffset(Ops, Offset.getValue().getSExtValue()); + DbgDIExpr = DIExpression::prependOpcodes(DbgDIExpr, Ops, true); + } + DbgValue->setOperand(2, MetadataAsValue::get(Ctx, DbgDIExpr)); + } + } + } + } return Changed; } diff --git a/llvm/test/CodeGen/AArch64/2s-complement-asm.ll b/llvm/test/CodeGen/AArch64/2s-complement-asm.ll new file mode 100644 index 00000000000000..cf646d13602042 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/2s-complement-asm.ll @@ -0,0 +1,9 @@ +; RUN: llc -mtriple=arm64-apple-ios %s -filetype=obj -o - | llvm-objdump --macho --section __DATA,__data - | FileCheck %s + +; CHECK: Contents of (__DATA,__data) section +; CHECK: 0000002a 59ed145d +@other = global i32 42 +@var = 
global i32 sub(i32 646102975, + i32 add (i32 trunc(i64 sub(i64 ptrtoint(i32* @var to i64), + i64 ptrtoint(i32* @other to i64)) to i32), + i32 3432360802)) diff --git a/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll new file mode 100644 index 00000000000000..2de1423e5eea5b --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/load-local-redundant-copies.ll @@ -0,0 +1,124 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s + +; Test that checks for redundant copies to temporary stack slot produced by +; expandUnalignedLoad. + +define amdgpu_vs void @test(<4 x i32> inreg %arg1, <6 x float> addrspace(3)* %arg2) { +; CHECK-LABEL: test: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 12, v0 +; CHECK-NEXT: v_add_i32_e32 v1, vcc, 8, v0 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v2, v1 +; CHECK-NEXT: ds_read_b32 v1, v4 +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: exp mrt0 off, off, off, off +; CHECK-NEXT: v_mov_b32_e32 v4, 0 +; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v4, s[0:3], 0 format:[BUF_DATA_FORMAT_32_32_32_32,BUF_NUM_FORMAT_FLOAT] idxen +; CHECK-NEXT: s_endpgm + call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false) + %var1 = load <6 x float>, <6 x float> addrspace(3)* %arg2, align 4 + %var2 = shufflevector <6 x float> %var1, <6 x float> undef, <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %var2, <4 x i32> %arg1, i32 0, i32 0, i32 0, i32 immarg 126, i32 immarg 0) + ret void +} + +define amdgpu_vs void @test_2(<4 x i32> inreg %arg1, i32 %arg2, i32 inreg %arg3, <8 x float> addrspace(3)* %arg4) { +; CHECK-LABEL: test_2: +; CHECK: ; %bb.0: +; CHECK-NEXT: 
v_add_i32_e32 v5, vcc, 28, v1 +; CHECK-NEXT: v_add_i32_e32 v2, vcc, 24, v1 +; CHECK-NEXT: v_add_i32_e32 v3, vcc, 20, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 12, v1 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 8, v1 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 4, v1 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v4, v2 +; CHECK-NEXT: ds_read_b32 v3, v3 +; CHECK-NEXT: ds_read_b32 v2, v6 +; CHECK-NEXT: ds_read_b32 v9, v7 +; CHECK-NEXT: ds_read_b32 v8, v8 +; CHECK-NEXT: ds_read_b32 v7, v10 +; CHECK-NEXT: ds_read_b32 v6, v1 +; CHECK-NEXT: ds_read_b32 v5, v5 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: tbuffer_store_format_xyzw v[6:9], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xyzw v[2:5], v0, s[0:3], s4 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:16 glc slc +; CHECK-NEXT: s_endpgm + %load = load <8 x float>, <8 x float> addrspace(3)* %arg4, align 4 + %vec1 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec1, <4 x i32> %arg1, i32 %arg2, i32 0, i32 %arg3, i32 immarg 77, i32 immarg 3) + %vec2 = shufflevector <8 x float> %load, <8 x float> undef, <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec2, <4 x i32> %arg1, i32 %arg2, i32 16, i32 %arg3, i32 immarg 77, i32 immarg 3) + ret void +} + +define amdgpu_vs void @test_3(i32 inreg %arg1, i32 inreg %arg2, <4 x i32> inreg %arg3, i32 %arg4, <6 x float> addrspace(3)* %arg5, <6 x float> addrspace(3)* %arg6) { +; CHECK-LABEL: test_3: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b32 s7, s5 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s5, s3 +; CHECK-NEXT: s_mov_b32 s4, s2 +; CHECK-NEXT: v_add_i32_e32 v0, vcc, 16, v1 +; CHECK-NEXT: v_add_i32_e32 v6, vcc, 12, v1 +; CHECK-NEXT: v_add_i32_e32 v4, vcc, 8, v1 +; CHECK-NEXT: v_add_i32_e32 
v7, vcc, 4, v1 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v1 +; CHECK-NEXT: v_mov_b32_e32 v9, s0 +; CHECK-NEXT: v_add_i32_e32 v10, vcc, 16, v2 +; CHECK-NEXT: v_add_i32_e32 v11, vcc, 12, v2 +; CHECK-NEXT: v_add_i32_e32 v12, vcc, 8, v2 +; CHECK-NEXT: s_mov_b32 m0, -1 +; CHECK-NEXT: ds_read_b32 v3, v1 +; CHECK-NEXT: ds_read_b32 v5, v4 +; CHECK-NEXT: ds_read_b32 v4, v7 +; CHECK-NEXT: ds_read_b32 v1, v8 +; CHECK-NEXT: ds_read_b32 v6, v6 +; CHECK-NEXT: ds_read_b32 v0, v0 +; CHECK-NEXT: v_add_i32_e32 v7, vcc, 4, v2 +; CHECK-NEXT: v_add_i32_e32 v8, vcc, 20, v2 +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: tbuffer_store_format_xyzw v[3:6], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:264 glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xy v[0:1], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:280 glc slc +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: ds_read_b32 v0, v2 +; CHECK-NEXT: ds_read_b32 v2, v12 +; CHECK-NEXT: ds_read_b32 v1, v7 +; CHECK-NEXT: ds_read_b32 v5, v8 +; CHECK-NEXT: ds_read_b32 v3, v11 +; CHECK-NEXT: ds_read_b32 v4, v10 +; CHECK-NEXT: s_waitcnt lgkmcnt(5) +; CHECK-NEXT: exp mrt0 off, off, off, off +; CHECK-NEXT: s_waitcnt lgkmcnt(1) +; CHECK-NEXT: tbuffer_store_format_xyzw v[0:3], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_32_32_32,BUF_NUM_FORMAT_UINT] idxen offset:240 glc slc +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: tbuffer_store_format_xy v[4:5], v9, s[4:7], s1 format:[BUF_DATA_FORMAT_INVALID,BUF_NUM_FORMAT_UINT] idxen offset:256 glc slc +; CHECK-NEXT: s_endpgm + %load1 = load <6 x float>, <6 x float> addrspace(3)* %arg5, align 4 + %vec11 = shufflevector <6 x float> %load1, <6 x float> undef, <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec11, <4 x i32> %arg3, i32 %arg1, i32 264, i32 %arg2, i32 immarg 77, i32 immarg 3) + %vec12 = shufflevector <6 x float> %load1, <6 x float> undef, <2 x i32> + call void 
@llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec12, <4 x i32> %arg3, i32 %arg1, i32 280, i32 %arg2, i32 immarg 64, i32 immarg 3) + + call void @llvm.amdgcn.exp.f32(i32 immarg 0, i32 immarg 0, float undef, float undef, float undef, float undef, i1 immarg false, i1 immarg false) + + %load2 = load <6 x float>, <6 x float> addrspace(3)* %arg6, align 4 + %vec21 = shufflevector <6 x float> %load2, <6 x float> undef, <4 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float> %vec21, <4 x i32> %arg3, i32 %arg1, i32 240, i32 %arg2, i32 immarg 77, i32 immarg 3) + %vec22 = shufflevector <6 x float> %load2, <6 x float> undef, <2 x i32> + call void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float> %vec22, <4 x i32> %arg3, i32 %arg1, i32 256, i32 %arg2, i32 immarg 64, i32 immarg 3) + + ret void +} + +declare void @llvm.amdgcn.struct.tbuffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.struct.tbuffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32, i32 immarg, i32 immarg) +declare void @llvm.amdgcn.exp.f32(i32 immarg, i32 immarg, float, float, float, float, i1 immarg, i1 immarg) diff --git a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll index 08aecdac5b794c..e8f37a370666c9 100644 --- a/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll +++ b/llvm/test/DebugInfo/COFF/fpo-shrink-wrap.ll @@ -33,7 +33,7 @@ ; ASM: popl %ebx ; ASM: [[EPILOGUE]]: # %return ; ASM: retl $8 -; ASM: Ltmp10: +; ASM: Ltmp11: ; ASM: .cv_fpo_endproc ; Note how RvaStart advances 7 bytes to skip the shrink-wrapped portion. 
diff --git a/llvm/test/MC/AMDGPU/gfx1030_err.s b/llvm/test/MC/AMDGPU/gfx1030_err.s index 246548f1668398..c6af1736371aee 100644 --- a/llvm/test/MC/AMDGPU/gfx1030_err.s +++ b/llvm/test/MC/AMDGPU/gfx1030_err.s @@ -26,7 +26,7 @@ s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK) // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: specified hardware register is not supported on this GPU v_mac_f32 v0, v1, v2 -// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: operands are not valid for this GPU or mode +// GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU v_mad_f32 v0, v1, v2, v3 // GFX10: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll index 085d9ee20e1c0c..00e655942c3635 100644 --- a/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll +++ b/llvm/test/Transforms/IndVarSimplify/eliminate-comparison.ll @@ -942,4 +942,111 @@ leave: ret void } +declare i1 @cond_func() + +define i32 @func_25(i32 %start) { +; CHECK-LABEL: @func_25( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[C1:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] +; CHECK: checked.1: +; CHECK-NEXT: [[C2:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK: checked.2: +; CHECK-NEXT: [[C3:%.*]] = icmp ne i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 +; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], 
[[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %backedge] + %c1 = icmp ne i32 %iv, 0 + br i1 %c1, label %checked.1, label %fail + +checked.1: + %c2 = icmp ne i32 %iv, 0 + br i1 %c2, label %checked.2, label %fail + +checked.2: + %c3 = icmp ne i32 %iv, 0 + br i1 %c3, label %backedge, label %fail + +backedge: + %iv.next = add i32 %iv, 758394 + %loop.cond = call i1 @cond_func() + br i1 %loop.cond, label %loop, label %exit + +fail: + unreachable + +exit: + ret i32 %iv +} + +define i32 @func_26(i32 %start) { +; CHECK-LABEL: @func_26( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[BACKEDGE:%.*]] ] +; CHECK-NEXT: [[C1:%.*]] = icmp slt i32 [[IV]], 0 +; CHECK-NEXT: br i1 [[C1]], label [[CHECKED_1:%.*]], label [[FAIL:%.*]] +; CHECK: checked.1: +; CHECK-NEXT: [[C2:%.*]] = icmp slt i32 [[IV]], 1 +; CHECK-NEXT: br i1 [[C2]], label [[CHECKED_2:%.*]], label [[FAIL]] +; CHECK: checked.2: +; CHECK-NEXT: [[C3:%.*]] = icmp slt i32 [[IV]], 2 +; CHECK-NEXT: br i1 [[C3]], label [[BACKEDGE]], label [[FAIL]] +; CHECK: backedge: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 758394 +; CHECK-NEXT: [[LOOP_COND:%.*]] = call i1 @cond_func() +; CHECK-NEXT: br i1 [[LOOP_COND]], label [[LOOP]], label [[EXIT:%.*]] +; CHECK: fail: +; CHECK-NEXT: unreachable +; CHECK: exit: +; CHECK-NEXT: [[IV_LCSSA1:%.*]] = phi i32 [ [[IV]], [[BACKEDGE]] ] +; CHECK-NEXT: ret i32 [[IV_LCSSA1]] +; +entry: + br label %loop + +loop: + %iv = phi i32 [%start, %entry], [%iv.next, %backedge] + %c1 = icmp slt i32 %iv, 0 + br i1 %c1, label %checked.1, label %fail + +checked.1: + %c2 = icmp slt i32 %iv, 1 + br i1 %c2, label %checked.2, label %fail + +checked.2: + %c3 = icmp slt i32 %iv, 2 + br i1 %c3, label %backedge, label %fail + +backedge: + %iv.next = add i32 %iv, 758394 + %loop.cond = call i1 @cond_func() + br i1 
%loop.cond, label %loop, label %exit + +fail: + unreachable + +exit: + ret i32 %iv +} + + !0 = !{i32 0, i32 2147483647} diff --git a/llvm/test/Transforms/InstCombine/funnel.ll b/llvm/test/Transforms/InstCombine/funnel.ll index fca73a4ffb884f..d56bb74119102b 100644 --- a/llvm/test/Transforms/InstCombine/funnel.ll +++ b/llvm/test/Transforms/InstCombine/funnel.ll @@ -3,16 +3,14 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" -; TODO: Canonicalize or(shl,lshr) by constant to funnel shift intrinsics. +; Canonicalize or(shl,lshr) by constant to funnel shift intrinsics. ; This should help cost modeling for vectorization, inlining, etc. ; If a target does not have a fshl instruction, the expansion will ; be exactly these same 3 basic ops (shl/lshr/or). define i32 @fshl_i32_constant(i32 %x, i32 %y) { ; CHECK-LABEL: @fshl_i32_constant( -; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11 -; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[Y:%.*]], 21 -; CHECK-NEXT: [[R:%.*]] = or i32 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[Y:%.*]], i32 11) ; CHECK-NEXT: ret i32 [[R]] ; %shl = shl i32 %x, 11 @@ -23,9 +21,7 @@ define i32 @fshl_i32_constant(i32 %x, i32 %y) { define i42 @fshr_i42_constant(i42 %x, i42 %y) { ; CHECK-LABEL: @fshr_i42_constant( -; CHECK-NEXT: [[SHR:%.*]] = lshr i42 [[X:%.*]], 31 -; CHECK-NEXT: [[SHL:%.*]] = shl i42 [[Y:%.*]], 11 -; CHECK-NEXT: [[R:%.*]] = or i42 [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call i42 @llvm.fshl.i42(i42 [[Y:%.*]], i42 [[X:%.*]], i42 11) ; CHECK-NEXT: ret i42 [[R]] ; %shr = lshr i42 %x, 31 @@ -34,13 +30,11 @@ define i42 @fshr_i42_constant(i42 %x, i42 %y) { ret i42 %r } -; TODO: Vector types are allowed. +; Vector types are allowed. 
define <2 x i16> @fshl_v2i16_constant_splat(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: @fshl_v2i16_constant_splat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i16> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, @@ -51,9 +45,7 @@ define <2 x i16> @fshl_v2i16_constant_splat(<2 x i16> %x, <2 x i16> %y) { define <2 x i16> @fshl_v2i16_constant_splat_undef0(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: @fshl_v2i16_constant_splat_undef0( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i16> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, @@ -64,9 +56,7 @@ define <2 x i16> @fshl_v2i16_constant_splat_undef0(<2 x i16> %x, <2 x i16> %y) { define <2 x i16> @fshl_v2i16_constant_splat_undef1(<2 x i16> %x, <2 x i16> %y) { ; CHECK-LABEL: @fshl_v2i16_constant_splat_undef1( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i16> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[Y:%.*]], <2 x i16> ) ; CHECK-NEXT: ret <2 x i16> [[R]] ; %shl = shl <2 x i16> %x, @@ -75,13 +65,11 @@ define <2 x i16> @fshl_v2i16_constant_splat_undef1(<2 x i16> %x, <2 x i16> %y) { ret <2 x i16> %r } -; TODO: Non-power-of-2 vector types are allowed. +; Non-power-of-2 vector types are allowed. 
define <2 x i17> @fshr_v2i17_constant_splat(<2 x i17> %x, <2 x i17> %y) { ; CHECK-LABEL: @fshr_v2i17_constant_splat( -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i17> [[X:%.*]], -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i17> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; %shr = lshr <2 x i17> %x, @@ -92,9 +80,7 @@ define <2 x i17> @fshr_v2i17_constant_splat(<2 x i17> %x, <2 x i17> %y) { define <2 x i17> @fshr_v2i17_constant_splat_undef0(<2 x i17> %x, <2 x i17> %y) { ; CHECK-LABEL: @fshr_v2i17_constant_splat_undef0( -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i17> [[X:%.*]], -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i17> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; %shr = lshr <2 x i17> %x, @@ -105,9 +91,7 @@ define <2 x i17> @fshr_v2i17_constant_splat_undef0(<2 x i17> %x, <2 x i17> %y) { define <2 x i17> @fshr_v2i17_constant_splat_undef1(<2 x i17> %x, <2 x i17> %y) { ; CHECK-LABEL: @fshr_v2i17_constant_splat_undef1( -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i17> [[X:%.*]], -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i17> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[Y:%.*]], <2 x i17> [[X:%.*]], <2 x i17> ) ; CHECK-NEXT: ret <2 x i17> [[R]] ; %shr = lshr <2 x i17> %x, @@ -116,13 +100,12 @@ define <2 x i17> @fshr_v2i17_constant_splat_undef1(<2 x i17> %x, <2 x i17> %y) { ret <2 x i17> %r } -; TODO: Allow arbitrary shift constants. +; Allow arbitrary shift constants. +; TODO: Support undef elements. 
define <2 x i32> @fshr_v2i32_constant_nonsplat(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @fshr_v2i32_constant_nonsplat( -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i32> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[Y:%.*]], <2 x i32> [[X:%.*]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shr = lshr <2 x i32> %x, @@ -159,9 +142,7 @@ define <2 x i32> @fshr_v2i32_constant_nonsplat_undef1(<2 x i32> %x, <2 x i32> %y define <2 x i36> @fshl_v2i36_constant_nonsplat(<2 x i36> %x, <2 x i36> %y) { ; CHECK-LABEL: @fshl_v2i36_constant_nonsplat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i36> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i36> [[Y:%.*]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i36> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i36> @llvm.fshl.v2i36(<2 x i36> [[X:%.*]], <2 x i36> [[Y:%.*]], <2 x i36> ) ; CHECK-NEXT: ret <2 x i36> [[R]] ; %shl = shl <2 x i36> %x, diff --git a/llvm/test/Transforms/InstCombine/rotate.ll b/llvm/test/Transforms/InstCombine/rotate.ll index d08fe07784224f..667b5f087c8b53 100644 --- a/llvm/test/Transforms/InstCombine/rotate.ll +++ b/llvm/test/Transforms/InstCombine/rotate.ll @@ -122,13 +122,12 @@ define <2 x i17> @rotr_v2i17_constant_splat_undef1(<2 x i17> %x) { ret <2 x i17> %r } -; TODO: Allow arbitrary shift constants. +; Allow arbitrary shift constants. +; TODO: Support undef elements. 
define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) { ; CHECK-LABEL: @rotr_v2i32_constant_nonsplat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i32> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i32> [[X]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i32> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i32> @llvm.fshl.v2i32(<2 x i32> [[X:%.*]], <2 x i32> [[X]], <2 x i32> ) ; CHECK-NEXT: ret <2 x i32> [[R]] ; %shl = shl <2 x i32> %x, @@ -165,9 +164,7 @@ define <2 x i32> @rotr_v2i32_constant_nonsplat_undef1(<2 x i32> %x) { define <2 x i36> @rotl_v2i36_constant_nonsplat(<2 x i36> %x) { ; CHECK-LABEL: @rotl_v2i36_constant_nonsplat( -; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i36> [[X:%.*]], -; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i36> [[X]], -; CHECK-NEXT: [[R:%.*]] = or <2 x i36> [[SHL]], [[SHR]] +; CHECK-NEXT: [[R:%.*]] = call <2 x i36> @llvm.fshl.v2i36(<2 x i36> [[X:%.*]], <2 x i36> [[X]], <2 x i36> ) ; CHECK-NEXT: ret <2 x i36> [[R]] ; %shl = shl <2 x i36> %x, diff --git a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll new file mode 100644 index 00000000000000..71031aabb95b74 --- /dev/null +++ b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll @@ -0,0 +1,74 @@ +; RUN: opt < %s -loop-reduce -S | FileCheck %s + +; Test that LSR preserves debug-info for induction variables. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" + +define dso_local void @foo(i8* nocapture %p) local_unnamed_addr !dbg !7 { +; CHECK-LABEL: @foo( +entry: + call void @llvm.dbg.value(metadata i8* %p, metadata !13, metadata !DIExpression()), !dbg !16 + call void @llvm.dbg.value(metadata i8 0, metadata !14, metadata !DIExpression()), !dbg !17 + br label %for.body, !dbg !18 + +for.cond.cleanup: ; preds = %for.body + ret void, !dbg !19 + +for.body: ; preds = %entry, %for.body +; CHECK-LABEL: for.body: + %i.06 = phi i8 [ 0, %entry ], [ %inc, %for.body ] + %p.addr.05 = phi i8* [ %p, %entry ], [ %add.ptr, %for.body ] + call void @llvm.dbg.value(metadata i8 %i.06, metadata !14, metadata !DIExpression()), !dbg !17 + call void @llvm.dbg.value(metadata i8* %p.addr.05, metadata !13, metadata !DIExpression()), !dbg !16 +; CHECK-NOT: call void @llvm.dbg.value(metadata i8* undef +; CHECK: call void @llvm.dbg.value(metadata i8* %lsr.iv, metadata ![[MID_p:[0-9]+]], metadata !DIExpression(DW_OP_constu, 3, DW_OP_minus, DW_OP_stack_value)), !dbg !16 + %add.ptr = getelementptr inbounds i8, i8* %p.addr.05, i64 3, !dbg !20 + call void @llvm.dbg.value(metadata i8* %add.ptr, metadata !13, metadata !DIExpression()), !dbg !16 +; CHECK-NOT: call void @llvm.dbg.value(metadata i8* undef +; CHECK: call void @llvm.dbg.value(metadata i8* %lsr.iv, metadata ![[MID_p]], metadata !DIExpression()), !dbg !16 + store i8 %i.06, i8* %add.ptr, align 1, !dbg !23, !tbaa !24 + %inc = add nuw nsw i8 %i.06, 1, !dbg !27 + call void @llvm.dbg.value(metadata i8 %inc, metadata !14, metadata !DIExpression()), !dbg !17 + %exitcond.not = icmp eq i8 %inc, 32, !dbg !28 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !dbg !18, !llvm.loop !29 +} + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: 
"clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "lsrdbg.c", directory: "/") +!2 = !{} +!3 = !{i32 7, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 12.0.0"} +!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !12) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIBasicType(name: "unsigned char", size: 8, encoding: DW_ATE_unsigned_char) +!12 = !{!13, !14} +!13 = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !1, line: 2, type: !10) +; CHECK: ![[MID_p]] = !DILocalVariable(name: "p", arg: 1, scope: !7, file: !1, line: 2, type: !10) +!14 = !DILocalVariable(name: "i", scope: !15, file: !1, line: 4, type: !11) +!15 = distinct !DILexicalBlock(scope: !7, file: !1, line: 4, column: 3) +!16 = !DILocation(line: 0, scope: !7) +!17 = !DILocation(line: 0, scope: !15) +!18 = !DILocation(line: 4, column: 3, scope: !15) +!19 = !DILocation(line: 8, column: 1, scope: !7) +!20 = !DILocation(line: 5, column: 7, scope: !21) +!21 = distinct !DILexicalBlock(scope: !22, file: !1, line: 4, column: 42) +!22 = distinct !DILexicalBlock(scope: !15, file: !1, line: 4, column: 3) +!23 = !DILocation(line: 6, column: 8, scope: !21) +!24 = !{!25, !25, i64 0} +!25 = !{!"omnipotent char", !26, i64 0} +!26 = !{!"Simple C/C++ TBAA"} +!27 = !DILocation(line: 4, column: 38, scope: !22) +!28 = !DILocation(line: 4, column: 31, scope: !22) +!29 = distinct !{!29, !18, !30, !31} +!30 = !DILocation(line: 7, column: 3, scope: !15) +!31 = !{!"llvm.loop.unroll.disable"} diff --git a/llvm/test/tools/llvm-readobj/COFF/tls-directory.test 
b/llvm/test/tools/llvm-readobj/COFF/tls-directory.test new file mode 100644 index 00000000000000..d553130e0a017d --- /dev/null +++ b/llvm/test/tools/llvm-readobj/COFF/tls-directory.test @@ -0,0 +1,162 @@ +## Tests for the --coff-tls-directory flag. + +## Test that the output of --coff-tls-directory works on x86. +## The binary created from this yaml definition is such that .rdata contains +## only the IMAGE_TLS_DIRECTORY structure and hence we should have that +## TlsTable.RelativeVirtualAddress == .rdata section VirtualAddress. +## Also note that the .rdata section VirtualSize == sizeof(coff_tls_directory32) == sizeof(IMAGE_TLS_DIRECTORY32) == 24 + +# RUN: yaml2obj %s --docnum=1 -o %t.32.exe -DTLSRVA=10000 -DTLSSIZE=24 +# RUN: llvm-readobj --coff-tls-directory %t.32.exe | FileCheck %s --check-prefix I386 + +# I386: Arch: i386 +# I386-NEXT: AddressSize: 32bit +# I386-NEXT: TLSDirectory { +# I386-NEXT: StartAddressOfRawData: 0x404000 +# I386-NEXT: EndAddressOfRawData: 0x404008 +# I386-NEXT: AddressOfIndex: 0x402000 +# I386-NEXT: AddressOfCallBacks: 0x0 +# I386-NEXT: SizeOfZeroFill: 0x0 +# I386-NEXT: Characteristics [ (0x300000) +# I386-NEXT: IMAGE_SCN_ALIGN_4BYTES (0x300000) +# I386-NEXT: ] +# I386-NEXT: } + + +## Test that the output of --coff-tls-directory errors on malformed input. +## On x86, the TLS directory should be 24 bytes. +## This test has a truncated TLS directory. + +# RUN: yaml2obj %s --docnum=1 -o %t.wrong-size.32.exe -DTLSRVA=10000 -DTLSSIZE=10 +# RUN: not llvm-readobj --coff-tls-directory %t.wrong-size.32.exe 2>&1 | FileCheck %s --check-prefix I386-WRONG-SIZE-ERR + +# I386-WRONG-SIZE-ERR: error: '{{.*}}': TLS Directory size (10) is not the expected size (24). 
+ +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 0 + ImageBase: 0 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 0 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 0 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [] + SizeOfStackReserve: 0 + SizeOfStackCommit: 0 + SizeOfHeapReserve: 0 + SizeOfHeapCommit: 0 + TlsTable: + RelativeVirtualAddress: [[TLSRVA]] + Size: [[TLSSIZE]] +header: + Machine: IMAGE_FILE_MACHINE_I386 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_32BIT_MACHINE ] +sections: + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + VirtualAddress: 10000 + VirtualSize: 24 + SectionData: '004040000840400000204000000000000000000000003000' +symbols: [] + + +## Test that the output of --coff-tls-directory works on x86_64. +## The binary created from this yaml definition is such that .rdata contains +## only the IMAGE_TLS_DIRECTORY structure and hence we should have that +## TlsTable.RelativeVirtualAddress == .rdata section VirtualAddress. +## Also note that the .rdata section VirtualSize == sizeof(coff_tls_directory64) == sizeof(IMAGE_TLS_DIRECTORY64) == 40 + +# RUN: yaml2obj %s --docnum=2 -o %t.64.exe -DTLSRVA=10000 -DTLSSIZE=40 +# RUN: llvm-readobj --coff-tls-directory %t.64.exe | FileCheck %s --check-prefix X86-64 + +# X86-64: Arch: x86_64 +# X86-64-NEXT: AddressSize: 64bit +# X86-64-NEXT: TLSDirectory { +# X86-64-NEXT: StartAddressOfRawData: 0x140004000 +# X86-64-NEXT: EndAddressOfRawData: 0x140004008 +# X86-64-NEXT: AddressOfIndex: 0x140002000 +# X86-64-NEXT: AddressOfCallBacks: 0x0 +# X86-64-NEXT: SizeOfZeroFill: 0x0 +# X86-64-NEXT: Characteristics [ (0x300000) +# X86-64-NEXT: IMAGE_SCN_ALIGN_4BYTES (0x300000) +# X86-64-NEXT: ] +# X86-64-NEXT: } + + +## Test that the output of --coff-tls-directory errors on malformed input. 
+ +## On x86-64, the TLS directory should be 40 bytes. +## This test has an erroneously lengthened TLS directory. + +# RUN: yaml2obj %s --docnum=2 -o %t.wrong-size.64.exe -DTLSRVA=10000 -DTLSSIZE=80 +# RUN: not llvm-readobj --coff-tls-directory %t.wrong-size.64.exe 2>&1 | FileCheck %s --check-prefix X86-64-WRONG-SIZE-ERR + +# X86-64-WRONG-SIZE-ERR: error: '{{.*}}': TLS Directory size (80) is not the expected size (40). + + +## This test has a correct TLS Directory size but the RVA is invalid. + +# RUN: yaml2obj %s --docnum=2 -o %t.bad-tls-rva.exe -DTLSRVA=999999 -DTLSSIZE=40 +# RUN: not llvm-readobj --coff-tls-directory %t.bad-tls-rva.exe 2>&1 | FileCheck %s --check-prefix BAD-TLS-RVA-ERR + +# BAD-TLS-RVA-ERR: error: '{{.*}}': Invalid data was encountered while parsing the file + +--- !COFF +OptionalHeader: + AddressOfEntryPoint: 0 + ImageBase: 0 + SectionAlignment: 4096 + FileAlignment: 512 + MajorOperatingSystemVersion: 0 + MinorOperatingSystemVersion: 0 + MajorImageVersion: 0 + MinorImageVersion: 0 + MajorSubsystemVersion: 0 + MinorSubsystemVersion: 0 + Subsystem: IMAGE_SUBSYSTEM_WINDOWS_CUI + DLLCharacteristics: [] + SizeOfStackReserve: 0 + SizeOfStackCommit: 0 + SizeOfHeapReserve: 0 + SizeOfHeapCommit: 0 + TlsTable: + RelativeVirtualAddress: [[TLSRVA]] + Size: [[TLSSIZE]] +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ] +sections: + - Name: .rdata + Characteristics: [ IMAGE_SCN_CNT_INITIALIZED_DATA, IMAGE_SCN_MEM_READ, IMAGE_SCN_MEM_WRITE ] + VirtualAddress: 10000 + VirtualSize: 40 + SectionData: '00400040010000000840004001000000002000400100000000000000000000000000000000003000' +symbols: [] + + +## Test that --coff-tls-directory doesn't output anything if there's no TLS directory. + +## Case 1: TlsTable.RelativeVirtualAddress/Size = 0. 
+ +# RUN: yaml2obj %s --docnum=2 -o %t.no-tls1.exe -DTLSRVA=0 -DTLSSIZE=0 +# RUN: llvm-readobj --coff-tls-directory %t.no-tls1.exe | FileCheck %s --check-prefix NO-TLS + +## Case 2: There's no TlsTable listed in the COFF header. + +# RUN: yaml2obj %s --docnum=3 -o %t.no-tls2.exe +# RUN: llvm-readobj --coff-tls-directory %t.no-tls2.exe | FileCheck %s --check-prefix NO-TLS + +# NO-TLS: TLSDirectory { +# NO-TLS-NEXT: } + +--- !COFF +header: + Machine: IMAGE_FILE_MACHINE_AMD64 + Characteristics: [ IMAGE_FILE_EXECUTABLE_IMAGE, IMAGE_FILE_LARGE_ADDRESS_AWARE ] +sections: [] +symbols: [] diff --git a/llvm/test/tools/llvm-readobj/ELF/hash-table.test b/llvm/test/tools/llvm-readobj/ELF/hash-table.test index 1102d848f03e46..b8d44e3cdf7191 100644 --- a/llvm/test/tools/llvm-readobj/ELF/hash-table.test +++ b/llvm/test/tools/llvm-readobj/ELF/hash-table.test @@ -45,9 +45,13 @@ ProgramHeaders: ## Check we can dump the SHT_HASH section even when an object ## does not have the section header table. -# RUN: yaml2obj --docnum=2 %s -o %t.noshdr -# RUN: llvm-readobj --hash-table %t.noshdr | FileCheck %s --check-prefix=NOSHDR -# RUN: llvm-readelf --hash-table %t.noshdr | FileCheck %s --check-prefix=NOSHDR +# RUN: yaml2obj --docnum=2 -DNOHEADERS=true %s -o %t.noshdr +# RUN: llvm-readobj --hash-table %t.noshdr 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.noshdr --check-prefix=NOSHDR --implicit-check-not=warning: +# RUN: llvm-readelf --hash-table %t.noshdr 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.noshdr --check-prefix=NOSHDR --implicit-check-not=warning: + +# NOSHDR: warning: '[[FILE]]': string table was not found # NOSHDR: HashTable { # NOSHDR-NEXT: Num Buckets: 1 @@ -58,37 +62,57 @@ ProgramHeaders: --- !ELF FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN -## We simulate no section header table by -## overriding the ELF header properties. 
- EShOff: 0x0 - EShNum: 0x0 + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN Sections: - Name: .hash Type: SHT_HASH Flags: [ SHF_ALLOC ] Bucket: [ 0 ] Chain: [ 1 ] + EntSize: [[ENTSIZE=4]] - Name: .dynamic Type: SHT_DYNAMIC Flags: [ SHF_ALLOC ] Entries: - - Tag: DT_HASH + - Tag: [[DYNTAG=DT_HASH]] Value: 0x0 - Tag: DT_NULL Value: 0x0 +SectionHeaderTable: + NoHeaders: [[NOHEADERS=false]] ProgramHeaders: - Type: PT_LOAD Sections: - Section: .hash - Section: .dynamic - Type: PT_DYNAMIC - VAddr: 0x1010 + VAddr: 0x10 Sections: - Section: .dynamic +## Document we don't report a warning when the value of the sh_entsize field of the SHT_HASH section is not 4. + +# RUN: yaml2obj --docnum=2 -DENTSIZE=0xff %s -o %t.ent.size +# RUN: llvm-readobj --hash-table %t.ent.size 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.ent.size --check-prefix=NOSHDR --implicit-check-not=warning: +# RUN: llvm-readelf --hash-table %t.ent.size 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.ent.size --check-prefix=NOSHDR --implicit-check-not=warning: + +## Document we need the DT_HASH dynamic tag to locate the hash table. + +# RUN: yaml2obj --docnum=2 -DDYNTAG=DT_NULL %s -o %t.no.dyntag +# RUN: llvm-readobj --hash-table %t.no.dyntag 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.no.dyntag --check-prefix=NODYNTAG --implicit-check-not=warning: +# RUN: llvm-readelf --hash-table %t.no.dyntag 2>&1 | \ +# RUN: FileCheck %s -DFILE=%t.no.dyntag --check-prefix=NODYNTAG --implicit-check-not=warning: + +# NODYNTAG: warning: '[[FILE]]': string table was not found + +# NODYNTAG: HashTable { +# NODYNTAG-NEXT: } + ## Each SHT_HASH section starts with two 32-bit fields: nbucket and nchain. ## Check we report an error when a DT_HASH value points to data that has size less than 8 bytes. 
diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 46ed7414dbb31e..d57ea8ef94e788 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1737,8 +1737,8 @@ static void disassembleObject(const Target *TheTarget, const ObjectFile *Obj, // the output. StringSet<> FoundDisasmSymbolSet; for (std::pair &SecSyms : AllSymbols) - stable_sort(SecSyms.second); - stable_sort(AbsoluteSymbols); + llvm::stable_sort(SecSyms.second); + llvm::stable_sort(AbsoluteSymbols); std::unique_ptr DICtx; LiveVariablePrinter LVP(*Ctx.getRegisterInfo(), *STI); diff --git a/llvm/tools/llvm-readobj/COFFDumper.cpp b/llvm/tools/llvm-readobj/COFFDumper.cpp index 22e27b3e5a29e1..f59bfd8b7cb6e3 100644 --- a/llvm/tools/llvm-readobj/COFFDumper.cpp +++ b/llvm/tools/llvm-readobj/COFFDumper.cpp @@ -89,6 +89,7 @@ class COFFDumper : public ObjDumper { void printCOFFDirectives() override; void printCOFFBaseReloc() override; void printCOFFDebugDirectory() override; + void printCOFFTLSDirectory() override; void printCOFFResources() override; void printCOFFLoadConfig() override; void printCodeViewDebugInfo() override; @@ -116,6 +117,8 @@ class COFFDumper : public ObjDumper { void printBaseOfDataField(const pe32plus_header *Hdr); template void printCOFFLoadConfig(const T *Conf, LoadConfigTables &Tables); + template + void printCOFFTLSDirectory(const coff_tls_directory *TlsTable); typedef void (*PrintExtraCB)(raw_ostream &, const uint8_t *); void printRVATable(uint64_t TableVA, uint64_t Count, uint64_t EntrySize, PrintExtraCB PrintExtra = 0); @@ -2018,3 +2021,27 @@ void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer, Writer.flush(); } } + +void COFFDumper::printCOFFTLSDirectory() { + if (Obj->is64()) + printCOFFTLSDirectory(Obj->getTLSDirectory64()); + else + printCOFFTLSDirectory(Obj->getTLSDirectory32()); +} + +template +void COFFDumper::printCOFFTLSDirectory( + const coff_tls_directory *TlsTable) { + 
DictScope D(W, "TLSDirectory"); + if (!TlsTable) + return; + + W.printHex("StartAddressOfRawData", TlsTable->StartAddressOfRawData); + W.printHex("EndAddressOfRawData", TlsTable->EndAddressOfRawData); + W.printHex("AddressOfIndex", TlsTable->AddressOfIndex); + W.printHex("AddressOfCallBacks", TlsTable->AddressOfCallBacks); + W.printHex("SizeOfZeroFill", TlsTable->SizeOfZeroFill); + W.printFlags("Characteristics", TlsTable->Characteristics, + makeArrayRef(ImageSectionCharacteristics), + COFF::SectionCharacteristics(COFF::IMAGE_SCN_ALIGN_MASK)); +} diff --git a/llvm/tools/llvm-readobj/ObjDumper.h b/llvm/tools/llvm-readobj/ObjDumper.h index 9e45062ccda8d1..943299a121fc53 100644 --- a/llvm/tools/llvm-readobj/ObjDumper.h +++ b/llvm/tools/llvm-readobj/ObjDumper.h @@ -80,6 +80,7 @@ class ObjDumper { virtual void printCOFFDirectives() { } virtual void printCOFFBaseReloc() { } virtual void printCOFFDebugDirectory() { } + virtual void printCOFFTLSDirectory() {} virtual void printCOFFResources() {} virtual void printCOFFLoadConfig() { } virtual void printCodeViewDebugInfo() { } diff --git a/llvm/tools/llvm-readobj/llvm-readobj.cpp b/llvm/tools/llvm-readobj/llvm-readobj.cpp index 173ee3a7f140d5..1546ce7926a401 100644 --- a/llvm/tools/llvm-readobj/llvm-readobj.cpp +++ b/llvm/tools/llvm-readobj/llvm-readobj.cpp @@ -272,6 +272,10 @@ namespace opts { COFFDebugDirectory("coff-debug-directory", cl::desc("Display the PE/COFF debug directory")); + // --coff-tls-directory + cl::opt COFFTLSDirectory("coff-tls-directory", + cl::desc("Display the PE/COFF TLS directory")); + // --coff-resources cl::opt COFFResources("coff-resources", cl::desc("Display the PE/COFF .rsrc section")); @@ -533,6 +537,8 @@ static void dumpObject(const ObjectFile &Obj, ScopedPrinter &Writer, Dumper->printCOFFBaseReloc(); if (opts::COFFDebugDirectory) Dumper->printCOFFDebugDirectory(); + if (opts::COFFTLSDirectory) + Dumper->printCOFFTLSDirectory(); if (opts::COFFResources) Dumper->printCOFFResources(); if 
(opts::COFFLoadConfig) diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h index 2e566c941894f9..395db396dadca1 100644 --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -16,6 +16,9 @@ #include "llvm/ADT/SmallBitVector.h" namespace mlir { + +class BufferAssignmentTypeConverter; + namespace linalg { struct LinalgFusionOptions; @@ -45,6 +48,12 @@ void populateConvVectorizationPatterns( MLIRContext *context, SmallVectorImpl &patterns, ArrayRef tileSizes); +/// Populates the given list with patterns to convert Linalg operations on +/// tensors to buffers. +void populateConvertLinalgOnTensorsToBuffersPatterns( + MLIRContext *context, BufferAssignmentTypeConverter *converter, + OwningRewritePatternList *patterns); + /// Performs standalone tiling of a single LinalgOp by `tileSizes`. /// and permute the loop nest according to `interchangeVector` /// The permutation is expressed as a list of integers that specify @@ -246,6 +255,16 @@ Optional promoteSubViews(OpBuilder &b, LinalgOp op, LinalgPromotionOptions options, OperationFolder *folder = nullptr); +/// Creates a number of ranges equal to the number of dimensions in the `map`. +/// The returned ranges correspond to the loop ranges, in the proper order, for +/// which new loops will be created. +/// The function supports only maps that are invertible and have results of type +/// DimExpr or (DimExpr + DimExpr - SymbolExpr floordiv ConstExpr). +/// It expects a non-inverted, concatenated map and last values in +/// allViewSizes will be applied to the symbols in the map if it contains any. +SmallVector emitLoopRanges(OpBuilder &b, Location loc, AffineMap map, + ValueRange viewSizes); + /// Emit a suitable vector form for a Linalg op with fully static shape. 
void vectorizeLinalgOp(OpBuilder &builder, Operation *op); diff --git a/mlir/include/mlir/Support/LLVM.h b/mlir/include/mlir/Support/LLVM.h index 17e020442eb484..e8595ae29ed748 100644 --- a/mlir/include/mlir/Support/LLVM.h +++ b/mlir/include/mlir/Support/LLVM.h @@ -60,6 +60,8 @@ template class SmallVectorImpl; template class StringSet; +template +class StringSwitch; template class TinyPtrVector; template @@ -111,6 +113,8 @@ using llvm::SmallPtrSet; using llvm::SmallPtrSetImpl; using llvm::SmallVector; using llvm::SmallVectorImpl; +template +using StringSwitch = llvm::StringSwitch; using llvm::TinyPtrVector; template using TypeSwitch = llvm::TypeSwitch; diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h index 93381054dd213d..f4b7cedeb0e120 100644 --- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h +++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h @@ -27,7 +27,7 @@ struct GPUIndexIntrinsicOpLowering : public ConvertToLLVMPattern { unsigned indexBitwidth; static dimension dimensionToIndex(Op op) { - return llvm::StringSwitch(op.dimension()) + return StringSwitch(op.dimension()) .Case("x", X) .Case("y", Y) .Case("z", Z) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp index aa611d76a67abf..574d0aa8c37f0e 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMTypeSyntax.cpp @@ -440,7 +440,7 @@ static LLVMType parseTypeImpl(DialectAsmParser &parser, if (failed(parser.parseKeyword(&key))) return LLVMType(); - return llvm::StringSwitch>(key) + return StringSwitch>(key) .Case("void", [&] { return LLVMVoidType::get(ctx); }) .Case("half", [&] { return LLVMHalfType::get(ctx); }) .Case("bfloat", [&] { return LLVMBFloatType::get(ctx); }) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp index 9e96c8cdc69194..b95469d8a95549 
100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Loops.cpp @@ -58,77 +58,6 @@ static SmallVector permuteIvs(ArrayRef ivs, : SmallVector(ivs.begin(), ivs.end()); } -/// Creates a number of ranges equal to the number of dimensions in the `map`. -/// The returned ranges correspond to the loop ranges, in the proper order, for -/// which new loops will be created. -/// The function supports only maps that are invertible and have results of type -/// DimExpr or (DimExpr + DimExpr - SymbolExpr floordiv ConstExpr). -/// It expects a non-inverted, concatenated map and last values in -/// allViewSizes will be applied to the symbols in the map if it contains any. -static SmallVector emitLoopRanges(OpBuilder &b, Location loc, - AffineMap map, - ValueRange viewSizes) { - unsigned numDims = map.getNumDims(), numRes = map.getNumResults(); - unsigned numSym = map.getNumSymbols(); - assert(viewSizes.size() == numRes + numSym && - "viewSizes must contain sizes of all views and values for symbols"); - SmallVector res(numDims); - for (unsigned idx = 0; idx < numRes; ++idx) { - auto result = map.getResult(idx); - if (auto d = result.dyn_cast()) { - if (res[d.getPosition()].offset) - continue; - res[d.getPosition()] = - Range{std_constant_index(0), viewSizes[idx], std_constant_index(1)}; - } - - // If the access pattern is of form (m, n)[s] -> (m + n - s floordiv 2), - // then the bounds are: - // (s floordiv 2) <= m <= (size(m) + s floordiv 2 - s + 1). - // where size(n) is applied to the symbol s. - // This is done statically now. 
- if (auto binOp = result.dyn_cast()) { - auto lhs = binOp.getLHS().dyn_cast(); - auto rhs = binOp.getRHS().dyn_cast(); - if (!lhs || !rhs || binOp.getKind() != AffineExprKind::Add || - lhs.getKind() != AffineExprKind::Add || - rhs.getKind() != mlir::AffineExprKind::Mul) - continue; - - auto m = lhs.getLHS().dyn_cast(); - auto n = lhs.getRHS().dyn_cast(); - auto fDiv = rhs.getLHS().dyn_cast(); - auto minusOne = rhs.getRHS().dyn_cast(); - if (!m || !n || !fDiv || !minusOne || - fDiv.getKind() != AffineExprKind::FloorDiv || - fDiv.getLHS().getKind() != AffineExprKind::SymbolId || - fDiv.getRHS().getKind() != AffineExprKind::Constant) - continue; - - auto s = fDiv.getLHS().dyn_cast(); - if (minusOne.getValue() != -1) - continue; - - int mPos = m.getPosition(); - AffineExpr one = getAffineConstantExpr(1, s.getContext()); - AffineExpr sizeOfM = getAffineSymbolExpr(numSym, s.getContext()); - // Construction of upper bound (size(m) + s floordiv 2 - s + 1). - AffineExpr upperOffsetExpr = sizeOfM + fDiv + one - s; - AffineMap fromMap = AffineMap::get(numDims, numSym + 1, fDiv); - AffineMap toMap = AffineMap::get(numDims, numSym + 1, upperOffsetExpr); - SmallVector values(viewSizes.begin(), - viewSizes.begin() + numDims); - values.insert(values.end(), viewSizes.begin() + numRes, viewSizes.end()); - values.push_back(viewSizes[mPos]); - // Construction of the lower bound (s floordiv 2). 
- Value from = applyMapToValues(b, loc, fromMap, values).front(); - Value to = applyMapToValues(b, loc, toMap, values).front(); - res[mPos] = Range{from, to, std_constant_index(1)}; - } - } - return res; -} - template static void inlineRegionAndEmitStore(OpType op, ArrayRef indexedValues, ArrayRef> indexing, @@ -708,6 +637,70 @@ static Optional linalgOpToLoopsImplSwitch(Operation *op, llvm_unreachable("Unexpected op in linalgOpToLoopsImpl"); } +SmallVector mlir::linalg::emitLoopRanges(OpBuilder &b, Location loc, + AffineMap map, + ValueRange viewSizes) { + unsigned numDims = map.getNumDims(), numRes = map.getNumResults(); + unsigned numSym = map.getNumSymbols(); + assert(viewSizes.size() == numRes + numSym && + "viewSizes must contain sizes of all views and values for symbols"); + SmallVector res(numDims); + for (unsigned idx = 0; idx < numRes; ++idx) { + auto result = map.getResult(idx); + if (auto d = result.dyn_cast()) { + if (res[d.getPosition()].offset) + continue; + res[d.getPosition()] = + Range{std_constant_index(0), viewSizes[idx], std_constant_index(1)}; + } + + // If the access pattern is of form (m, n)[s] -> (m + n - s floordiv 2), + // then the bounds are: + // (s floordiv 2) <= m <= (size(m) + s floordiv 2 - s + 1). + // where size(n) is applied to the symbol s. + // This is done statically now. 
+ if (auto binOp = result.dyn_cast()) { + auto lhs = binOp.getLHS().dyn_cast(); + auto rhs = binOp.getRHS().dyn_cast(); + if (!lhs || !rhs || binOp.getKind() != AffineExprKind::Add || + lhs.getKind() != AffineExprKind::Add || + rhs.getKind() != mlir::AffineExprKind::Mul) + continue; + + auto m = lhs.getLHS().dyn_cast(); + auto n = lhs.getRHS().dyn_cast(); + auto fDiv = rhs.getLHS().dyn_cast(); + auto minusOne = rhs.getRHS().dyn_cast(); + if (!m || !n || !fDiv || !minusOne || + fDiv.getKind() != AffineExprKind::FloorDiv || + fDiv.getLHS().getKind() != AffineExprKind::SymbolId || + fDiv.getRHS().getKind() != AffineExprKind::Constant) + continue; + + auto s = fDiv.getLHS().dyn_cast(); + if (minusOne.getValue() != -1) + continue; + + int mPos = m.getPosition(); + AffineExpr one = getAffineConstantExpr(1, s.getContext()); + AffineExpr sizeOfM = getAffineSymbolExpr(numSym, s.getContext()); + // Construction of upper bound (size(m) + s floordiv 2 - s + 1). + AffineExpr upperOffsetExpr = sizeOfM + fDiv + one - s; + AffineMap fromMap = AffineMap::get(numDims, numSym + 1, fDiv); + AffineMap toMap = AffineMap::get(numDims, numSym + 1, upperOffsetExpr); + SmallVector values(viewSizes.begin(), + viewSizes.begin() + numDims); + values.insert(values.end(), viewSizes.begin() + numRes, viewSizes.end()); + values.push_back(viewSizes[mPos]); + // Construction of the lower bound (s floordiv 2). + Value from = applyMapToValues(b, loc, fromMap, values).front(); + Value to = applyMapToValues(b, loc, toMap, values).front(); + res[mPos] = Range{from, to, std_constant_index(1)}; + } + } + return res; +} + /// Emits a loop nest with the proper body for `op`. 
template Optional mlir::linalg::linalgLowerOpToLoops(OpBuilder &builder, diff --git a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp index b714a1f6c64288..3282358f5f4149 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/TensorsToBuffers.cpp @@ -14,14 +14,119 @@ #include "PassDetail.h" #include "mlir/Dialect/Linalg/IR/LinalgOps.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Linalg/Transforms/Transforms.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" #include "mlir/IR/Function.h" #include "mlir/IR/Operation.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/BufferPlacement.h" -using namespace mlir; - namespace { + +using namespace ::mlir; +using namespace ::mlir::linalg; + +SmallVector +computeLoopRanges(Location loc, linalg::GenericOp linalgOp, OpBuilder *b) { + auto indexingMaps = llvm::to_vector<4>( + linalgOp.indexing_maps().getAsValueRange()); + auto inputIndexingMaps = + llvm::makeArrayRef(indexingMaps).take_front(linalgOp.getNumInputs()); + + mlir::edsc::ScopedContext scope(*b, loc); + return emitLoopRanges(scope.getBuilderRef(), loc, + concatAffineMaps(inputIndexingMaps), + getShape(*b, linalgOp)); +} + +Value maybeConvertToIndex(Location loc, Value val, OpBuilder *b) { + if (val.getType().isIndex()) + return val; + return b->create(loc, val, b->getIndexType()); +} + +LogicalResult allocateBuffersForResults(Location loc, + linalg::GenericOp linalgOp, + linalg::GenericOpAdaptor &adaptor, + SmallVectorImpl *resultBuffers, + OpBuilder *b) { + // Lazily compute loopRanges. + SmallVector loopRanges; + + // Allocate a buffer for every tensor result. 
+ for (auto en : llvm::enumerate(linalgOp.getResultTypes())) { + size_t resultIndex = en.index(); + Type resultType = en.value(); + + auto tensorType = resultType.dyn_cast(); + if (tensorType == nullptr) { + linalgOp.emitOpError() + << "tensor to buffer conversion expects ranked tensor results"; + return failure(); + } + auto tensorShape = tensorType.getShape(); + auto memrefType = MemRefType::get(tensorShape, tensorType.getElementType()); + + // Allocate buffers for init tensors that are assumed to fold onto the first + // results. + // TODO: update this assumption because the reality is more complex + // under linalg on tensor based transformations. + bool foldedInitTensor = resultIndex < linalgOp.getNumInitTensors(); + if (foldedInitTensor) { + // Dealing with an init tensor requires distinguishing between 1-use + // and many-use cases which would create aliasing and WAR hazards. + Value initTensor = linalgOp.getInitTensor(resultIndex); + Value initBuffer = adaptor.init_tensors()[resultIndex]; + if (initTensor.hasOneUse()) { + resultBuffers->push_back(initBuffer); + continue; + } + SmallVector dynOperands; + for (auto dim : llvm::enumerate(tensorShape)) { + if (dim.value() == TensorType::kDynamicSize) { + dynOperands.push_back(b->create(loc, initTensor, dim.index())); + } + } + auto alloc = b->create(loc, memrefType, dynOperands); + b->create(loc, initBuffer, alloc); + resultBuffers->push_back(alloc); + continue; + } + + // Allocate buffers for statically-shaped results. + if (memrefType.hasStaticShape()) { + resultBuffers->push_back(b->create(loc, memrefType)); + continue; + } + + // Perform a naive shape inference for the dynamically-shaped results. + // Extract the required element out of the vector. 
+ SmallVector dynOperands; + auto resultIndexingMap = linalgOp.getOutputIndexingMap(resultIndex); + for (auto shapeElement : llvm::enumerate(tensorType.getShape())) { + if (loopRanges.empty()) + loopRanges = computeLoopRanges(loc, linalgOp, b); + + if (shapeElement.value() != ShapedType::kDynamicSize) + continue; + + AffineExpr expr = resultIndexingMap.getResult(shapeElement.index()); + switch (expr.getKind()) { + case AffineExprKind::DimId: { + int64_t loopIndex = expr.cast().getPosition(); + Value size = maybeConvertToIndex(loc, loopRanges[loopIndex].size, b); + dynOperands.push_back(size); + break; + } + default: + return failure(); + } + } + resultBuffers->push_back(b->create(loc, memrefType, dynOperands)); + } + return success(); +} + /// A pattern to convert Generic Linalg operations which work on tensors to /// use buffers. A buffer is allocated using BufferAssignmentPlacer for /// each operation result. BufferPlacement pass should be later used to move @@ -34,10 +139,10 @@ class GenericOpConverter linalg::GenericOp>::BufferAssignmentOpConversionPattern; LogicalResult - matchAndRewrite(linalg::GenericOp op, ArrayRef operands, + matchAndRewrite(linalg::GenericOp linalgOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const final { - linalg::GenericOpAdaptor adaptor(operands, - op.getOperation()->getAttrDictionary()); + linalg::GenericOpAdaptor adaptor( + operands, linalgOp.getOperation()->getAttrDictionary()); // All inputs need to be turned into buffers first. Until then, bail out. if (llvm::any_of(adaptor.inputs(), @@ -50,93 +155,54 @@ class GenericOpConverter [](Value in) { return !in.getType().isa(); })) return failure(); - Location loc = op.getLoc(); - SmallVector newOutputBuffers; - newOutputBuffers.reserve(op.getNumOutputs()); - newOutputBuffers.append(adaptor.output_buffers().begin(), - adaptor.output_buffers().end()); - - // Update all types to memref types. - // Assume the init tensors fold onto the first results. 
- // TODO: update this assumption because the reality is more complex under - // linalg on tensor based transformations. - for (auto en : llvm::enumerate(op.getResultTypes())) { - auto type = en.value().cast(); - if (!type.hasStaticShape()) - return rewriter.notifyMatchFailure( - op, "dynamic shapes not currently supported"); - auto memrefType = MemRefType::get(type.getShape(), type.getElementType()); - bool foldedInitTensor = en.index() < op.getNumInitTensors(); - if (foldedInitTensor) { - // Dealing with an init tensor requires distinguishing between 1-use - // and many-use cases which would create aliasing and WAR hazards. - Value initTensor = op.getInitTensor(en.index()); - Value initBuffer = adaptor.init_tensors()[en.index()]; - if (initTensor.hasOneUse()) { - newOutputBuffers.push_back(initBuffer); - continue; - } - auto alloc = rewriter.create(loc, memrefType); - rewriter.create(loc, initBuffer, alloc); - newOutputBuffers.push_back(alloc); - } else { - auto alloc = rewriter.create(loc, memrefType); - newOutputBuffers.push_back(alloc); - } + Location loc = linalgOp.getLoc(); + SmallVector newOutputBuffers(adaptor.output_buffers().begin(), + adaptor.output_buffers().end()); + + if (failed(allocateBuffersForResults(loc, linalgOp, adaptor, + &newOutputBuffers, &rewriter))) { + linalgOp.emitOpError() + << "Failed to allocate buffers for tensor results."; + return failure(); } // Generate a new linalg operation that works on buffers. 
- auto linalgOp = rewriter.create( + auto newLinalgOp = rewriter.create( loc, - /*resultTensorTypes=*/ArrayRef{}, + /*resultTensorTypes=*/llvm::None, /*inputs=*/adaptor.inputs(), /*outputBuffers=*/newOutputBuffers, - /*initTensors=*/ValueRange{}, op.indexing_maps(), op.iterator_types(), - op.docAttr(), op.library_callAttr(), op.symbol_sourceAttr()); + /*initTensors=*/llvm::None, linalgOp.indexing_maps(), + linalgOp.iterator_types(), linalgOp.docAttr(), + linalgOp.library_callAttr(), linalgOp.symbol_sourceAttr()); // Create a new block in the region of the new Generic Op. - Block &oldBlock = op.getRegion().front(); - Region &newRegion = linalgOp.region(); + Block *oldBlock = linalgOp.getBody(); + Region &newRegion = newLinalgOp.region(); Block *newBlock = rewriter.createBlock(&newRegion, newRegion.begin(), - oldBlock.getArgumentTypes()); - - // Add the result arguments that do not come from init_tensors to the new - // block. - // TODO: update this assumption because the reality is more complex under - // linalg on tensor based transformations. - for (Value v : - ValueRange(newOutputBuffers).drop_front(adaptor.init_tensors().size())) + oldBlock->getArgumentTypes()); + + // Add the result arguments to the new block. + for (Value v : newOutputBuffers) newBlock->addArgument(v.getType().cast().getElementType()); // Clone the body of the old block to the new block. BlockAndValueMapping mapping; - for (unsigned i = 0; i < oldBlock.getNumArguments(); i++) - mapping.map(oldBlock.getArgument(i), newBlock->getArgument(i)); + mapping.map(oldBlock->getArguments(), newBlock->getArguments()); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPointToEnd(newBlock); - for (auto &op : oldBlock.getOperations()) { + for (auto &op : oldBlock->getOperations()) { Operation *clonedOp = rewriter.clone(op, mapping); mapping.map(op.getResults(), clonedOp->getResults()); } // Replace the results of the old op with the new output buffers. 
- rewriter.replaceOp(op, newOutputBuffers); + rewriter.replaceOp(linalgOp, newOutputBuffers); return success(); } }; -/// Populate the given list with patterns to convert Linalg operations on -/// tensors to buffers. -static void populateConvertLinalgOnTensorsToBuffersPattern( - MLIRContext *context, BufferAssignmentTypeConverter *converter, - OwningRewritePatternList *patterns) { - populateWithBufferAssignmentOpConversionPatterns< - mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(context, converter, - patterns); - patterns->insert(context, converter); -} - /// Converts Linalg operations that work on tensor-type operands or results to /// work on buffers. struct ConvertLinalgOnTensorsToBuffers @@ -176,8 +242,11 @@ struct ConvertLinalgOnTensorsToBuffers BufferAssignmentTypeConverter::AppendToArgumentsList); OwningRewritePatternList patterns; - populateConvertLinalgOnTensorsToBuffersPattern(&context, &converter, - &patterns); + populateConvertLinalgOnTensorsToBuffersPatterns(&context, &converter, + &patterns); + populateWithBufferAssignmentOpConversionPatterns< + mlir::ReturnOp, mlir::ReturnOp, linalg::CopyOp>(&context, &converter, + &patterns); if (failed(applyFullConversion(this->getOperation(), target, patterns))) this->signalPassFailure(); } @@ -188,3 +257,9 @@ std::unique_ptr> mlir::createConvertLinalgOnTensorsToBuffersPass() { return std::make_unique(); } + +void mlir::linalg::populateConvertLinalgOnTensorsToBuffersPatterns( + MLIRContext *context, BufferAssignmentTypeConverter *converter, + OwningRewritePatternList *patterns) { + patterns->insert(context, converter); +} diff --git a/mlir/lib/Dialect/PDL/IR/PDL.cpp b/mlir/lib/Dialect/PDL/IR/PDL.cpp index a0b9c969becf61..ba1eb7b9957494 100644 --- a/mlir/lib/Dialect/PDL/IR/PDL.cpp +++ b/mlir/lib/Dialect/PDL/IR/PDL.cpp @@ -34,7 +34,7 @@ Type PDLDialect::parseType(DialectAsmParser &parser) const { return Type(); Builder &builder = parser.getBuilder(); - Type result = llvm::StringSwitch(keyword) + Type result = 
StringSwitch(keyword) .Case("attribute", builder.getType()) .Case("operation", builder.getType()) .Case("type", builder.getType()) diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp index f2823c564ccef6..f445a0cce242cd 100644 --- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp +++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp @@ -2823,19 +2823,30 @@ static SmallVector extractFromI64ArrayAttr(Attribute attr) { })); } +enum SubViewVerificationResult { + Success, + RankTooLarge, + SizeMismatch, + StrideMismatch, + ElemTypeMismatch, + MemSpaceMismatch, + AffineMapMismatch +}; + /// Checks if `original` Type type can be rank reduced to `reduced` type. /// This function is slight variant of `is subsequence` algorithm where /// not matching dimension must be 1. -static bool isRankReducedType(Type originalType, Type reducedType) { +static SubViewVerificationResult isRankReducedType(Type originalType, + Type reducedType) { if (originalType == reducedType) - return true; + return SubViewVerificationResult::Success; if (!originalType.isa() && !originalType.isa()) - return true; + return SubViewVerificationResult::Success; if (originalType.isa() && !reducedType.isa()) - return true; + return SubViewVerificationResult::Success; if (originalType.isa() && !reducedType.isa()) - return true; + return SubViewVerificationResult::Success; ShapedType originalShapedType = originalType.cast(); ShapedType reducedShapedType = reducedType.cast(); @@ -2846,7 +2857,7 @@ static bool isRankReducedType(Type originalType, Type reducedType) { unsigned originalRank = originalShape.size(), reducedRank = reducedShape.size(); if (reducedRank > originalRank) - return false; + return SubViewVerificationResult::RankTooLarge; unsigned reducedIdx = 0; SmallVector keepMask(originalRank); @@ -2858,41 +2869,78 @@ static bool isRankReducedType(Type originalType, Type reducedType) { reducedIdx++; // 1 is the only non-matching allowed. 
else if (originalShape[originalIdx] != 1) - return false; + return SubViewVerificationResult::SizeMismatch; } // Must match the reduced rank. if (reducedIdx != reducedRank) - return false; + return SubViewVerificationResult::SizeMismatch; // We are done for the tensor case. if (originalType.isa()) - return true; + return SubViewVerificationResult::Success; // Strided layout logic is relevant for MemRefType only. MemRefType original = originalType.cast(); MemRefType reduced = reducedType.cast(); MLIRContext *c = original.getContext(); - int64_t originalOffset, symCounter = 0, dimCounter = 0; - SmallVector originalStrides; + int64_t originalOffset, reducedOffset; + SmallVector originalStrides, reducedStrides, keepStrides; getStridesAndOffset(original, originalStrides, originalOffset); - auto getSymbolOrConstant = [&](int64_t offset) { - return offset == ShapedType::kDynamicStrideOrOffset - ? getAffineSymbolExpr(symCounter++, c) - : getAffineConstantExpr(offset, c); - }; - - AffineExpr expr = getSymbolOrConstant(originalOffset); - for (unsigned i = 0, e = originalStrides.size(); i < e; i++) { - if (keepMask[i]) - expr = expr + getSymbolOrConstant(originalStrides[i]) * - getAffineDimExpr(dimCounter++, c); + getStridesAndOffset(reduced, reducedStrides, reducedOffset); + + // Filter strides based on the mask and check that they are the same + // as reduced ones. 
+ reducedIdx = 0; + for (unsigned originalIdx = 0; originalIdx < originalRank; ++originalIdx) { + if (keepMask[originalIdx]) { + if (originalStrides[originalIdx] != reducedStrides[reducedIdx++]) + return SubViewVerificationResult::StrideMismatch; + keepStrides.push_back(originalStrides[originalIdx]); + } } - auto reducedMap = AffineMap::get(dimCounter, symCounter, expr, c); - return original.getElementType() == reduced.getElementType() && - original.getMemorySpace() == reduced.getMemorySpace() && - (reduced.getAffineMaps().empty() || - reducedMap == reduced.getAffineMaps().front()); + if (original.getElementType() != reduced.getElementType()) + return SubViewVerificationResult::ElemTypeMismatch; + + if (original.getMemorySpace() != reduced.getMemorySpace()) + return SubViewVerificationResult::MemSpaceMismatch; + + auto reducedMap = makeStridedLinearLayoutMap(keepStrides, originalOffset, c); + if (!reduced.getAffineMaps().empty() && + reducedMap != reduced.getAffineMaps().front()) + return SubViewVerificationResult::AffineMapMismatch; + + return SubViewVerificationResult::Success; +} + +template +static LogicalResult produceSubViewErrorMsg(SubViewVerificationResult result, + OpTy op, Type expectedType) { + auto memrefType = expectedType.cast(); + switch (result) { + case SubViewVerificationResult::Success: + return success(); + case SubViewVerificationResult::RankTooLarge: + return op.emitError("expected result rank to be smaller or equal to ") + << "the source rank."; + case SubViewVerificationResult::SizeMismatch: + return op.emitError("expected result type to be ") + << expectedType + << " or a rank-reduced version. (mismatch of result sizes)"; + case SubViewVerificationResult::StrideMismatch: + return op.emitError("expected result type to be ") + << expectedType + << " or a rank-reduced version. 
(mismatch of result strides)"; + case SubViewVerificationResult::ElemTypeMismatch: + return op.emitError("expected result element type to be ") + << memrefType.getElementType(); + case SubViewVerificationResult::MemSpaceMismatch: + return op.emitError("expected result and source memory spaces to match."); + case SubViewVerificationResult::AffineMapMismatch: + return op.emitError("expected result type to be ") + << expectedType + << " or a rank-reduced version. (mismatch of result affine map)"; + } } template @@ -2937,11 +2985,9 @@ static LogicalResult verify(SubViewOp op) { baseType, extractFromI64ArrayAttr(op.static_offsets()), extractFromI64ArrayAttr(op.static_sizes()), extractFromI64ArrayAttr(op.static_strides())); - if (!isRankReducedType(expectedType, subViewType)) - return op.emitError("expected result type to be ") - << expectedType << " or a rank-reduced version."; - return success(); + auto result = isRankReducedType(expectedType, subViewType); + return produceSubViewErrorMsg(result, op, expectedType); } raw_ostream &mlir::operator<<(raw_ostream &os, Range &range) { @@ -3352,11 +3398,8 @@ static LogicalResult verify(SubTensorOp op) { op.getSourceType(), extractFromI64ArrayAttr(op.static_offsets()), extractFromI64ArrayAttr(op.static_sizes()), extractFromI64ArrayAttr(op.static_strides())); - if (!isRankReducedType(expectedType, op.getType())) - return op.emitError("expected result type to be ") - << expectedType << " or a rank-reduced version."; - - return success(); + auto result = isRankReducedType(expectedType, op.getType()); + return produceSubViewErrorMsg(result, op, expectedType); } void SubTensorOp::getCanonicalizationPatterns(OwningRewritePatternList &results, diff --git a/mlir/lib/ExecutionEngine/JitRunner.cpp b/mlir/lib/ExecutionEngine/JitRunner.cpp index 2b18adb3734713..7d141e90edda3b 100644 --- a/mlir/lib/ExecutionEngine/JitRunner.cpp +++ b/mlir/lib/ExecutionEngine/JitRunner.cpp @@ -291,7 +291,7 @@ int mlir::JitRunnerMain( Error (*)(Options &, 
ModuleOp, StringRef, std::function); auto compileAndExecuteFn = - llvm::StringSwitch(options.mainFuncType.getValue()) + StringSwitch(options.mainFuncType.getValue()) .Case("i32", compileAndExecuteSingleReturnFunction) .Case("i64", compileAndExecuteSingleReturnFunction) .Case("f32", compileAndExecuteSingleReturnFunction) diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp index b064d83b5faadc..e18e691f8cc808 100644 --- a/mlir/lib/IR/SymbolTable.cpp +++ b/mlir/lib/IR/SymbolTable.cpp @@ -166,7 +166,7 @@ SymbolTable::Visibility SymbolTable::getSymbolVisibility(Operation *symbol) { return Visibility::Public; // Otherwise, switch on the string value. - return llvm::StringSwitch(vis.getValue()) + return StringSwitch(vis.getValue()) .Case("private", Visibility::Private) .Case("nested", Visibility::Nested) .Case("public", Visibility::Public); diff --git a/mlir/lib/Parser/Lexer.cpp b/mlir/lib/Parser/Lexer.cpp index 9a3418eaf83275..ee31ff0cf9e4cd 100644 --- a/mlir/lib/Parser/Lexer.cpp +++ b/mlir/lib/Parser/Lexer.cpp @@ -212,7 +212,7 @@ Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) { isAllDigit(spelling.drop_front(2)))) return Token(Token::inttype, spelling); - Token::Kind kind = llvm::StringSwitch(spelling) + Token::Kind kind = StringSwitch(spelling) #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING) #include "TokenKinds.def" .Default(Token::bare_identifier); diff --git a/mlir/lib/TableGen/Format.cpp b/mlir/lib/TableGen/Format.cpp index 12735875c1c199..7d17a0aef3f978 100644 --- a/mlir/lib/TableGen/Format.cpp +++ b/mlir/lib/TableGen/Format.cpp @@ -60,7 +60,7 @@ Optional FmtContext::getSubstFor(StringRef placeholder) const { } FmtContext::PHKind FmtContext::getPlaceHolderKind(StringRef str) { - return llvm::StringSwitch(str) + return StringSwitch(str) .Case("_builder", FmtContext::PHKind::Builder) .Case("_op", FmtContext::PHKind::Op) .Case("_self", FmtContext::PHKind::Self) diff --git a/mlir/lib/TableGen/Predicate.cpp 
b/mlir/lib/TableGen/Predicate.cpp index 8927296af223bd..a37847f0d48930 100644 --- a/mlir/lib/TableGen/Predicate.cpp +++ b/mlir/lib/TableGen/Predicate.cpp @@ -119,7 +119,7 @@ static PredCombinerKind getPredCombinerKind(const Pred &pred) { return PredCombinerKind::Leaf; const auto &combinedPred = static_cast(pred); - return llvm::StringSwitch( + return StringSwitch( combinedPred.getCombinerDef()->getName()) .Case("PredCombinerAnd", PredCombinerKind::And) .Case("PredCombinerOr", PredCombinerKind::Or) diff --git a/mlir/test/Dialect/Linalg/tensors-to-buffers.mlir b/mlir/test/Dialect/Linalg/tensors-to-buffers.mlir index 654a13fca743f3..4339b33a237920 100644 --- a/mlir/test/Dialect/Linalg/tensors-to-buffers.mlir +++ b/mlir/test/Dialect/Linalg/tensors-to-buffers.mlir @@ -2,11 +2,13 @@ #map0 = affine_map<(d0) -> (d0)> -// CHECK-LABEL: func @multiple_results_generic_op -func @multiple_results_generic_op(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) { - %0, %1 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel"]} - ins(%arg0 : tensor<4xf32>) { - ^bb0(%gen_arg1: f32): +// CHECK-LABEL: func @multiple_results +func @multiple_results(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf32>) { + %0, %1 = linalg.generic { + indexing_maps = [#map0, #map0, #map0], + iterator_types = ["parallel"] + } ins(%arg0 : tensor<4xf32>) { + ^bb0(%gen_arg1: f32): %tmp1 = exp %gen_arg1 : f32 linalg.yield %tmp1, %tmp1 : f32, f32 } -> tensor<4xf32>, tensor<4xf32> @@ -34,15 +36,20 @@ func @multiple_results_generic_op(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tenso // CHECK-LABEL: func @chained_operations func @chained_operations(%arg0: tensor<4xf32>) -> tensor<4xf32> { - %0 = linalg.generic {indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} - ins(%arg0 : tensor<4xf32>) { - ^bb0(%gen_arg1: f32): + %0 = linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"] + } ins(%arg0 : tensor<4xf32>) { + ^bb0(%gen_arg1: f32): %tmp1 = 
exp %gen_arg1 : f32 linalg.yield %tmp1 : f32 } -> tensor<4xf32> - %1 = linalg.generic {args_in = 1 : i64, args_out = 1 : i64, indexing_maps = [#map0, #map0], iterator_types = ["parallel"]} - ins(%0 : tensor<4xf32>) { - ^bb0(%gen_arg2: f32): + + %1 = linalg.generic { + indexing_maps = [#map0, #map0], + iterator_types = ["parallel"] + } ins(%0 : tensor<4xf32>) { + ^bb0(%gen_arg2: f32): %tmp2 = exp %gen_arg2 : f32 linalg.yield %tmp2 : f32 } -> tensor<4xf32> @@ -73,6 +80,46 @@ func @no_linalg_op(%arg0: f32) -> (f32, f32) { %0 = mulf %arg0, %arg0 : f32 return %0, %0 : f32, f32 } -// CHECK: (%[[NEW_ARG0:.*]]: [[TYPE:.*]]) -> ([[TYPE]], [[TYPE]]) -// CHECK: %[[RESULT:.*]] = mulf %[[NEW_ARG0]], %[[NEW_ARG0]] : [[TYPE]] -// CHECK: return %[[RESULT]], %[[RESULT]] : [[TYPE]], [[TYPE]] +// CHECK: (%[[NEW_ARG0:.*]]: [[TYPE:.*]]) -> ([[TYPE]], [[TYPE]]) +// CHECK: %[[RESULT:.*]] = mulf %[[NEW_ARG0]], %[[NEW_ARG0]] : [[TYPE]] +// CHECK: return %[[RESULT]], %[[RESULT]] : [[TYPE]], [[TYPE]] + +// ----- + +#map_2d = affine_map<(d0, d1) -> (d0, d1)> +#map_2d_inv = affine_map<(d0, d1) -> (d1, d0)> + +func @dynamic_results(%arg0: tensor) + -> (tensor, tensor) { + %0, %1 = linalg.generic { + indexing_maps = [#map_2d, #map_2d, #map_2d_inv], + iterator_types = ["parallel", "parallel"] + } ins(%arg0 : tensor) { + ^bb0(%gen_arg1: f32): + %tmp1 = exp %gen_arg1 : f32 + linalg.yield %tmp1, %tmp1 : f32, f32 + } -> tensor, tensor + return %0, %1 : tensor, tensor +} + +// CHECK: #map0 = affine_map<(d0, d1) -> (d0, d1)> +// CHECK: #map1 = affine_map<(d0, d1) -> (d1, d0)> + +// CHECK-LABEL: func @dynamic_results +// CHECK-SAME: (%[[INPUT:.*]]: [[TYPE:.*]], %[[OUT_1:.*]]: [[TYPE]], %[[OUT_2:.*]]: [[TYPE]]) { +// CHECK: %[[C0:.*]] = constant 0 : index +// CHECK: %[[DIM_0:.*]] = dim %[[INPUT]], %[[C0]] : [[TYPE]] +// CHECK: %[[C1:.*]] = constant 1 : index +// CHECK: %[[DIM_1:.*]] = dim %[[INPUT]], %[[C1]] : [[TYPE]] +// CHECK: %[[OUT_BUF_1:.*]] = alloc(%[[DIM_0]], %[[DIM_1]]) : [[TYPE]] +// CHECK: 
%[[OUT_BUF_2:.*]] = alloc(%[[DIM_1]], %[[DIM_0]]) : [[TYPE]] + +// CHECK: linalg.generic {indexing_maps = [#map0, #map0, #map1], {{.*}}} +// CHECK-SAME: ins(%[[INPUT]] : [[TYPE]]) +// CHECK-SAME: outs(%[[OUT_BUF_1]], %[[OUT_BUF_2]] : [[TYPE]], [[TYPE]]) { + +// CHECK: linalg.copy(%[[OUT_BUF_1]], %[[OUT_1]]) : [[TYPE]], [[TYPE]] +// CHECK: dealloc %[[OUT_BUF_1]] : [[TYPE]] +// CHECK: linalg.copy(%[[OUT_BUF_2]], %[[OUT_2]]) : [[TYPE]], [[TYPE]] +// CHECK: dealloc %[[OUT_BUF_2]] : [[TYPE]] +// CHECK: return diff --git a/mlir/test/IR/core-ops.mlir b/mlir/test/IR/core-ops.mlir index 2590dc0105c4ea..219c3bc84d5706 100644 --- a/mlir/test/IR/core-ops.mlir +++ b/mlir/test/IR/core-ops.mlir @@ -21,6 +21,7 @@ // CHECK-DAG: #[[$SUBVIEW_MAP5:map[0-9]+]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1 * 2)> // CHECK-DAG: #[[$SUBVIEW_MAP6:map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0 * 36 + d1 * 36 + d2 * 4 + d3 * 4 + d4)> // CHECK-DAG: #[[$SUBVIEW_MAP7:map[0-9]+]] = affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4 + d4 * s5 + d5 * s6)> +// CHECK-DAG: #[[$SUBVIEW_MAP8:map[0-9]+]] = affine_map<(d0, d1, d2, d3)[s0, s1, s2, s3, s4] -> (d0 * s1 + s0 + d1 * s2 + d2 * s3 + d3 * s4)> // CHECK-LABEL: func @func_with_ops // CHECK-SAME: %[[ARG:.*]]: f32 @@ -811,11 +812,11 @@ func @memref_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %15 = alloc(%arg1, %arg2)[%c0, %c1, %arg1, %arg0, %arg0, %arg2, %arg2] : memref<1x?x5x1x?x1xf32, affine_map<(d0, d1, d2, d3, d4, d5)[s0, s1, s2, s3, s4, s5, s6] -> (s0 + s1 * d0 + s2 * d1 + s3 * d2 + s4 * d3 + s5 * d4 + s6 * d5)>> // CHECK: subview %15[0, 0, 0, 0, 0, 0] [1, %arg1, 5, 1, %arg2, 1] [1, 1, 1, 1, 1, 1] : - // CHECK-SAME: memref<1x?x5x1x?x1xf32, #[[$SUBVIEW_MAP7]]> to memref - %16 = subview %15[0, 0, 0, 0, 0, 0][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref + // CHECK-SAME: memref<1x?x5x1x?x1xf32, 
#[[$SUBVIEW_MAP7]]> to memref + %16 = subview %15[0, 0, 0, 0, 0, 0][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref // CHECK: subview %15[%arg1, %arg1, %arg1, %arg1, %arg1, %arg1] [1, %arg1, 5, 1, %arg2, 1] [1, 1, 1, 1, 1, 1] : - // CHECK-SAME: memref<1x?x5x1x?x1xf32, #[[$SUBVIEW_MAP7]]> to memref - %17 = subview %15[%arg1, %arg1, %arg1, %arg1, %arg1, %arg1][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref + // CHECK-SAME: memref<1x?x5x1x?x1xf32, #[[$SUBVIEW_MAP7]]> to memref + %17 = subview %15[%arg1, %arg1, %arg1, %arg1, %arg1, %arg1][1, %arg1, 5, 1, %arg2, 1][1, 1, 1, 1, 1, 1] : memref<1x?x5x1x?x1xf32, offset: ?, strides: [?, ?, ?, ?, ?, ?]> to memref %18 = alloc() : memref<1x8xf32> // CHECK: subview %18[0, 0] [1, 8] [1, 1] : memref<1x8xf32> to memref<8xf32> diff --git a/mlir/test/IR/invalid-ops.mlir b/mlir/test/IR/invalid-ops.mlir index 7356c07577dbaf..b59353aa2f7c51 100644 --- a/mlir/test/IR/invalid-ops.mlir +++ b/mlir/test/IR/invalid-ops.mlir @@ -1011,7 +1011,7 @@ func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = alloc() : memref<8x16x4xf32> - // expected-error@+1 {{expected result type to be 'memref (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>'}} + // expected-error@+1 {{expected result type to be 'memref (d0 * s1 + s0 + d1 * s2 + d2 * s3)>>' or a rank-reduced version. 
(mismatch of result strides)}} %1 = subview %0[%arg0, %arg1, %arg2][%arg0, %arg1, %arg2][%arg0, %arg1, %arg2] : memref<8x16x4xf32> to memref @@ -1020,9 +1020,31 @@ func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { + %0 = alloc() : memref<8x16x4xf32> + // expected-error@+1 {{expected result element type to be 'f32'}} + %1 = subview %0[0, 0, 0][8, 16, 4][1, 1, 1] + : memref<8x16x4xf32> to + memref<8x16x4xi32> + return +} + +// ----- + +func @invalid_subview(%arg0 : index, %arg1 : index, %arg2 : index) { + %0 = alloc() : memref<8x16x4xf32> + // expected-error@+1 {{expected result rank to be smaller or equal to the source rank.}} + %1 = subview %0[0, 0, 0][8, 16, 4][1, 1, 1] + : memref<8x16x4xf32> to + memref<8x16x4x3xi32> + return +} + +// ----- + func @invalid_rank_reducing_subview(%arg0 : index, %arg1 : index, %arg2 : index) { %0 = alloc() : memref<8x16x4xf32> - // expected-error@+1 {{expected result type to be 'memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>>'}} + // expected-error@+1 {{expected result type to be 'memref<8x16x4xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)>>' or a rank-reduced version. (mismatch of result sizes)}} %1 = subview %0[0, 0, 0][8, 16, 4][1, 1, 1] : memref<8x16x4xf32> to memref<16x4xf32> return @@ -1030,6 +1052,14 @@ func @invalid_rank_reducing_subview(%arg0 : index, %arg1 : index, %arg2 : index) // ----- +func @invalid_rank_reducing_subview(%arg0 : memref, %arg1 : index, %arg2 : index) { + // expected-error@+1 {{expected result type to be 'memref (d0 * s1 + s0 + d1)>>' or a rank-reduced version. 
(mismatch of result strides)}} + %0 = subview %arg0[0, %arg1][%arg2, 1][1, 1] : memref to memref + return +} + +// ----- + func @invalid_memref_cast(%arg0 : memref<12x4x16xf32, offset:0, strides:[64, 16, 1]>) { // expected-error@+1{{operand type 'memref<12x4x16xf32, affine_map<(d0, d1, d2) -> (d0 * 64 + d1 * 16 + d2)>>' and result type 'memref<12x4x16xf32, affine_map<(d0, d1, d2) -> (d0 * 128 + d1 * 32 + d2 * 2)>>' are cast incompatible}} %0 = memref_cast %arg0 : memref<12x4x16xf32, offset:0, strides:[64, 16, 1]> to memref<12x4x16xf32, offset:0, strides:[128, 32, 2]> @@ -1259,7 +1289,7 @@ func @imaginary_part_from_incompatible_complex_type(%cplx: complex) { // ----- func @subtensor_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index) { - // expected-error @+1 {{expected result type to be 'tensor<4x4x4xf32>'}} + // expected-error @+1 {{expected result type to be 'tensor<4x4x4xf32>' or a rank-reduced version. (mismatch of result sizes)}} %0 = subtensor %t[0, 2, 0][4, 4, 4][1, 1, 1] : tensor<8x16x4xf32> to tensor @@ -1269,7 +1299,7 @@ func @subtensor_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index) { // ----- func @subtensor_wrong_static_type(%t: tensor<8x16x4xf32>, %idx : index) { - // expected-error @+1 {{expected result type to be 'tensor'}} + // expected-error @+1 {{expected result type to be 'tensor' or a rank-reduced version. (mismatch of result sizes)}} %0 = subtensor %t[0, 0, 0][%idx, 3, %idx][1, 1, 1] : tensor<8x16x4xf32> to tensor<4x4x4xf32> diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index c84a7717abe784..4ca89bced5eb93 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -685,7 +685,7 @@ void SideEffectOp::getEffects( // Get the specific memory effect. 
MemoryEffects::Effect *effect = - llvm::StringSwitch( + StringSwitch( effectElement.get("effect").cast().getValue()) .Case("allocate", MemoryEffects::Allocate::get()) .Case("free", MemoryEffects::Free::get()) diff --git a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp index 4fe3cd1ee174b9..64424b4ac3d2f8 100644 --- a/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp +++ b/mlir/tools/mlir-linalg-ods-gen/mlir-linalg-ods-gen.cpp @@ -288,7 +288,7 @@ Token Lexer::lexIdentifier(const char *tokStart) { // Check to see if this identifier is a keyword. StringRef str(tokStart, curPtr - tokStart); - Token::Kind kind = llvm::StringSwitch(str) + Token::Kind kind = StringSwitch(str) .Case("def", Token::Kind::kw_def) .Case("ods_def", Token::Kind::kw_ods_def) .Case("floordiv", Token::Kind::kw_floordiv) diff --git a/mlir/tools/mlir-tblgen/OpFormatGen.cpp b/mlir/tools/mlir-tblgen/OpFormatGen.cpp index 336c9111677b4e..9b8f249232401f 100644 --- a/mlir/tools/mlir-tblgen/OpFormatGen.cpp +++ b/mlir/tools/mlir-tblgen/OpFormatGen.cpp @@ -719,7 +719,7 @@ static void genLiteralParser(StringRef value, OpMethodBody &body) { body << "Keyword(\"" << value << "\")"; return; } - body << (StringRef)llvm::StringSwitch(value) + body << (StringRef)StringSwitch(value) .Case("->", "Arrow()") .Case(":", "Colon()") .Case(",", "Comma()") @@ -1936,7 +1936,7 @@ Token FormatLexer::lexIdentifier(const char *tokStart) { // Check to see if this identifier is a keyword. StringRef str(tokStart, curPtr - tokStart); Token::Kind kind = - llvm::StringSwitch(str) + StringSwitch(str) .Case("attr-dict", Token::kw_attr_dict) .Case("attr-dict-with-keyword", Token::kw_attr_dict_w_keyword) .Case("custom", Token::kw_custom)